simple_regex.cxx

Go to the documentation of this file.
00001 //
00002 // Copyright 2002, 2009 Lowell Boggs Jr.
00003 //
00004 // This file or directory, containing source code for a computer program,
00005 // is Copyrighted by Lowell Boggs, Jr.  987 Regency Drive, Lewisville
00006 // TX (USA), 75067.  You may use, copy, modify, and distribute this
00007 // source file without charge or obligation so long as you agree to
00008 // the following:
00009 //
00010 //  1.  You must indemnify Lowell Boggs against any and all financial
00011 //      obligations caused by its use, misuse, function, or malfunction.
00012 //      Further, you acknowledge that there is no warranty of any kind,
00013 //      whatsoever.
00014 //
00015 //  2.  You agree not to attempt to patent any portion of this original
00016 //      work -- though you may attempt to patent your own extensions to
00017 //      it if you so choose.
00018 //
00019 //  3.  You keep this copyright notice with the file and all copies
00020 //      of the file and do not change it anyway except language translation.
00021 //
00022 // You are responsible for enforcing your own compliance with these
00023 // conditions and may not use this source file if you cannot agree to the
00024 // above terms and conditions.
00025 
00026 #include <cxxtls/simple_regex.h>
00027 #include <regex.h>
00028 #include <memory.h>
00029 #include <string>
00030 
00031 #ifdef RE_DUP_MAX
00032 # undef RE_DUP_MAX
00033 #endif
00034 
00035 #include <algorithm>
00036 
00037 using namespace std;
00038 
00039 namespace cxxtls
00040 {
00041 
00042 
//
// Human-readable text for the numeric status codes produced by the regex
// routines; the wording is taken from regex.h.  A status code is used
// directly as an index into this table.  Slot 0 stands for 'no error' and
// therefore holds a null pointer instead of a message.
//
static const char *errors[] =
{
  0,                                                //  0  (no error)
  "no match",                                       //  1
  "invalid pattern",                                //  2
  "collation error",                                //  3
  "Invalid character class name",                   //  4
  "Trailing backslash",                             //  5
  "Invalid back reference",                         //  6
  "Unmatched left bracket",                         //  7
  "Parenthesis imbalance",                          //  8
  "Unmatched \\{",                                  //  9
  "Invalid contents of \\{\\}",                     // 10
  "Invalid range end",                              // 11
  "Out of memory",                                  // 12
  "No preceding re for repetition operator",        // 13
  "Premature end",                                  // 14
  "Compiled pattern bigger than 2^16 bytes",        // 15
  "Unmatched ) or \\); not returned from regcomp"   // 16
};
00067 
00068 static char const * compile_helper(char const *string, 
00069                                    regex_t *compiledPattern,
00070                                    std::string const &options
00071                                   )
00072   // perform the actual compilation of the regular expression
00073 {
00074     static int initialized=0;
00075 
00076     if(!initialized)
00077         {
00078             initialized=1;
00079             re_set_syntax(RE_SYNTAX_GREP);
00080         }
00081 
00082     int opts=0;
00083 
00084     if(options.find_first_of('i') < options.size())
00085     {
00086       opts |= REG_ICASE;
00087     }
00088 
00089     int err = regcomp(compiledPattern, string, opts);
00090 
00091     return errors[err];
00092 }
00093 
00094 //
00095 //  Regular expression constructors
00096 //
00097 
00098 SimpleRegex::
00099 SimpleRegex(string const &r, string const &options)
00100 :  expression_(r)
00101 ,  options_(options)
00102 {
00103   match_count_ = 0;
00104 
00105   if(r.size() == 2  && r[0] != '.' ) // correct for an invalid regex:  'd*' is equivalent to '.' but causes an infinite loop
00106   {
00107 
00108     // huh?  This code makes no sense, but I do remember some bug involving having the first letter
00109     // be optional -- I'll bet the bug is in my code, but maybe no.  It was too long ago.
00110 
00111     if(r[1] == '*' )
00112     {
00113       expression_ = ".";
00114     }
00115   }
00116 
00117   error_ = compile_helper(expression_.c_str(), 
00118                           &compiled_expression, 
00119                           options_.c_str()
00120                          );
00121 
00122   ok_ = (error_ == 0);
00123 
00124 }
00125 
00126 SimpleRegex::
00127 SimpleRegex(char const *r, char const *options)
00128 :  expression_(r)
00129 ,  options_(options)
00130 {
00131   match_count_ = 0;
00132 
00133   if(expression_.size() == 2) // correct for an invalid regex:  'd*' is equivalent to '.' but causes an infinite loop
00134   {
00135     if(r[1] == '*' )
00136     {
00137       expression_ = ".";
00138     }
00139   }
00140 
00141   error_ = compile_helper(expression_.c_str(), 
00142                           &compiled_expression,
00143                           options_.c_str()
00144                          );
00145 
00146   ok_ = (error_ == 0);
00147 
00148 }
00149 
00150 SimpleRegex::
00151 SimpleRegex(SimpleRegex const &r)
00152 :  expression_(r.expression_)
00153 ,  options_(r.options_)
00154 {
00155   match_count_ = 0;
00156 
00157   error_ = compile_helper(expression_.c_str(), 
00158                           &compiled_expression,
00159                           options_.c_str()
00160                          );
00161 
00162   ok_ = (error_ == 0);
00163 
00164 }
00165 
00166 //
00167 // Since the text string of the expression is kept in a SimpleRegex we can
00168 // treat it as a 'value object' and the following assignment operators
00169 // make life easier -- although they all cost a compilation
00170 //
00171 SimpleRegex &
00172 SimpleRegex::
00173 operator=(SimpleRegex const &r)
00174 {
00175   match_count_ = 0;
00176   expression_  = r.expression_;
00177   options_     = r.options_;
00178 
00179   error_ = compile_helper(expression_.c_str(), 
00180                           &compiled_expression,
00181                           options_.c_str()
00182                          );
00183 
00184   return *this;
00185 }
00186 
00187 SimpleRegex &
00188 SimpleRegex::
00189 operator=(char const * r)
00190 {
00191   match_count_ = 0;
00192   expression_ = r;
00193   options_="";
00194 
00195   error_ = compile_helper(expression_.c_str(), 
00196                           &compiled_expression,
00197                           options_.c_str()
00198                          );
00199 
00200   return *this;
00201 
00202 }
00203 
00204 SimpleRegex &
00205 SimpleRegex::
00206 operator=(string const &r)
00207 {
00208   match_count_ = 0;
00209   expression_  = r;
00210   options_     = "";
00211 
00212   error_ = compile_helper(expression_.c_str(), &compiled_expression, "");
00213 
00214   return *this;
00215 
00216 }
00217 
00218 SimpleRegex::
00219 ~SimpleRegex()
00220 {
00221    regfree(&compiled_expression);
00222 }
00223 
00224 int
00225 SimpleRegex::
00226 operator() (char const *s, int sLength) const
00227   //
00228   // returns a count of matched strings and sets the error() value.
00229   // matches can be found by looking at matches()
00230   //
00231 {
00232   match_count_ = 0;
00233 
00234   if(!ok_)
00235     return 0;
00236 
00237   regmatch_t matches[max_matches];
00238 
00239   int rc = regexec1(&compiled_expression, s, sLength, max_matches, matches, 0);
00240 
00241   if(rc != 0)
00242   {
00243      error_ = errors[rc];
00244 
00245      return 0;
00246   }
00247 
00248   int i;
00249 
00250   for(i=0; i < max_matches; ++i)
00251   {
00252 
00253       if( matches[i].rm_so >= 0 )
00254           {
00255               matches_[i].offset = matches[i].rm_so;
00256               matches_[i].length = matches[i].rm_eo - matches[i].rm_so;
00257           }
00258       else
00259           {
00260               break;
00261           }
00262   }
00263 
00264   if(i == 0)
00265     error_ = errors[1];
00266 
00267 
00268   match_count_ = i;
00269 
00270   return i;
00271 
00272 
00273 }
00274 
00275 int
00276 SimpleRegex::
00277 operator() (string const &s, int offset, int length) const
00278 {
00279   match_count_ = 0;
00280 
00281   if(!ok_)
00282     return 0;
00283 
00284   int string_length = s.size();
00285 
00286   if(offset >= string_length)
00287   {
00288     // the offset into the string where the search is
00289     // supposed to start is beyond the length of the
00290     // string.  Normally this is an error, but
00291     // what if the search is for an empty string?
00292 
00293     if( 
00294           (expression_ == "^$")
00295        || (   (expression_.size() == 4)
00296            && (expression_[0] == '^')
00297            && (expression_[2] == '*')
00298            && (expression_[3] == '$')
00299           )
00300       )
00301     {
00302       // in this case, the user is check for empty
00303       // string, so call it a find
00304     }
00305     else
00306     {
00307       error_ = errors[1];   // not found
00308       return 0;
00309     }
00310   }
00311 
00312   if(offset + length > string_length)
00313     length = string_length - offset;
00314 
00315   char const *p = s.c_str();
00316 
00317   p += offset;
00318 
00319   int count = SimpleRegex::operator() (p, length);
00320 
00321   if(count)
00322   {
00323     int i;
00324 
00325     for(i=0; i < count; ++i)
00326     {
00327       matches_[i].offset += offset; // incorporate the input offset
00328     }
00329 
00330   }
00331 
00332   return count;
00333 
00334 }
00335 
00336 
00337 string
00338 SimpleRegex::
00339 expand(string const &original, string const &in) const
00340 {
00341   // expand a given string by replacing \0 - \9 with matched
00342   // components of the input string.
00343 
00344   string rv;
00345 
00346   size_t offset=0;
00347 
00348   size_t in_size = in.size();
00349 
00350   while(offset < in_size)
00351   {
00352      string::const_iterator start  = in.begin() + offset;
00353      string::const_iterator end    = in.end();
00354      string::const_iterator slash = find(start, end, '\\');
00355 
00356      if(slash == end)
00357      {
00358        // whoops, out of text
00359 
00360        rv.append(start, end);
00361        return rv;
00362 
00363      }
00364      else
00365      {
00366        // perform a substitituion
00367 
00368        rv.append(start, slash);
00369 
00370        ++slash;
00371 
00372        if(slash == end)  // last char in string was slash, thats baaaaad
00373          return rv;
00374         
00375        char c = *slash;
00376 
00377        if(c >= '0' && c < '9')
00378        {
00379          // parm substitution
00380         
00381          ++slash;
00382         
00383          c -= '0';
00384         
00385          // c is now a parm index
00386         
00387          if( c < match_count_)
00388          {
00389         
00390            rv.append(original.begin() + matches_[int(c)].offset,
00391                      original.begin() + matches_[int(c)].offset +
00392                                         matches_[int(c)].length
00393                     );
00394          }
00395         
00396        }
00397        else
00398        {
00399          rv += *slash++;
00400         
00401        }
00402 
00403        offset = slash - in.begin();
00404 
00405      }
00406   }
00407 
00408   return rv;
00409 
00410 }
00411 
00412 
00413 string
00414 SimpleRegex::
00415 substitute(string const &original, string const &replacement) const
00416 {
00417   if(match_count_ == 0)
00418     return original;
00419 
00420   // first, compute the replacement value
00421 
00422   string real_replacement = expand(original, replacement);
00423 
00424   string rv;
00425 
00426   size_t start = matches_[0].offset;
00427   size_t end   = start + matches_[0].length;
00428 
00429   rv.append(original.begin(),
00430             original.begin() + start
00431            );
00432         
00433   rv.append(real_replacement);
00434 
00435   rv.append(original.begin() + end, original.end() );
00436 
00437   return rv;
00438 }
00439 
00440 bool
00441 SimpleRegex::
00442 anchored() const
00443 {
00444   if(expression_.size() && expression_[0] == '^')
00445     return true;
00446 
00447   return false;
00448 }
00449 
00450 bool 
00451 SimpleRegex::
00452 replace(string &rv, string const &source, string const &replacement, int maxIterations) const
00453   //  Replace pattern within source with replacement at most maxReplacements times and return true if at least one replacement
00454   //  occurred.
00455 {
00456     SimpleRegex const &expr = *this;
00457 
00458     bool matchFound=false;
00459 
00460     string const &pattern = expr; 
00461 
00462     if(pattern.size() && pattern[0] == '^')
00463     {
00464       // if the regex is nailed to the front of the expression, then at most
00465       // one substitution is possible.
00466 
00467       maxIterations = 1;
00468     }
00469 
00470 
00471     // begin a loop to perform as many substitutions as are necessary
00472     {
00473       rv=source;
00474 
00475       int offset = 0;
00476       int length = rv.size();
00477 
00478       if(expr.empty())
00479         maxIterations = 0;
00480       else
00481         if(maxIterations && expr.anchored())
00482           maxIterations = 1;
00483 
00484       while(maxIterations--)
00485       {
00486         int matchCount = expr(rv, offset, length);
00487 
00488         if(matchCount)
00489         {
00490           // a match was found
00491 
00492           string result = expr.substitute(rv, replacement);
00493 
00494           int sizeDelta = int(result.size()) - int(rv.size());
00495 
00496           SimpleRegex::match const *matches = expr.matches();
00497 
00498           int endOfMatch = matches[0].offset + matches[0].length;
00499 
00500           offset = endOfMatch;
00501 
00502           offset += sizeDelta;
00503 
00504           length = result.size() - offset;
00505 
00506           matchFound = true;
00507 
00508           rv = result;
00509 
00510         }
00511         else
00512         {
00513           break;
00514         }
00515 
00516       } // maxIterations--
00517 
00518     }
00519 
00520     return matchFound;
00521 }
00522 } // namespace cxxtls
Generated on Wed Feb 29 22:50:05 2012 for CXXUtilities by  doxygen 1.6.3