qregexp.cpp
Go to the documentation of this file.
1 /****************************************************************************
2 **
3 **
4 ** Implementation of QRegExp class
5 **
6 ** Created : 950126
7 **
8 ** Copyright (C) 1992-2000 Trolltech AS. All rights reserved.
9 **
10 ** This file is part of the tools module of the Qt GUI Toolkit.
11 **
12 ** This file may be distributed under the terms of the Q Public License
13 ** as defined by Trolltech AS of Norway and appearing in the file
14 ** LICENSE.QPL included in the packaging of this file.
15 **
16 ** This file may be distributed and/or modified under the terms of the
17 ** GNU General Public License version 2 as published by the Free Software
18 ** Foundation and appearing in the file LICENSE.GPL included in the
19 ** packaging of this file.
20 **
21 ** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
22 ** licenses may use this file in accordance with the Qt Commercial License
23 ** Agreement provided with the Software.
24 **
25 ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
26 ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
27 **
28 ** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
29 ** information about Qt Commercial License Agreements.
30 ** See http://www.trolltech.com/qpl/ for QPL licensing information.
31 ** See http://www.trolltech.com/gpl/ for GPL licensing information.
32 **
33 ** Contact info@trolltech.com if any conditions of this licensing are
34 ** not clear to you.
35 **
36 **********************************************************************/
37 
38 #include "qregexp.h"
39 #include <ctype.h>
40 #include <stdlib.h>
41 
42 // NOT REVISED
43 /*!
44  \class QRegExp qregexp.h
45  \ingroup tools
46  \ingroup misc
47  \brief The QRegExp class provides pattern matching using regular
48  expressions or wildcards.
49 
50  QRegExp knows these regexp primitives:
51  <ul plain>
52  <li><dfn>c</dfn> matches the character 'c'
53  <li><dfn>.</dfn> matches any character
54  <li><dfn>^</dfn> matches start of input
55  <li><dfn>$</dfn> matches end of input
56  <li><dfn>[]</dfn> matches a defined set of characters - see below.
57  <li><dfn>a*</dfn> matches a sequence of zero or more a's
58  <li><dfn>a+</dfn> matches a sequence of one or more a's
59  <li><dfn>a?</dfn> matches an optional a
60  <li><dfn>\c</dfn> escape code for matching special characters such
61  as \, [, *, +, . etc.
62  <li><dfn>\t</dfn> matches the TAB character (9)
63  <li><dfn>\n</dfn> matches newline (10)
64  <li><dfn>\r</dfn> matches return (13)
65  <li><dfn>\s</dfn> matches a white space (defined as any character
66  for which QChar::isSpace() returns TRUE. This includes at least
67  ASCII characters 9 (TAB), 10 (LF), 11 (VT), 12(FF), 13 (CR) and 32
68  (Space)).
69  <li><dfn>\d</dfn> matches a digit (defined as any character for
70  which QChar::isDigit() returns TRUE. This includes at least ASCII
71  characters '0'-'9').
72  <li><dfn>\x1f6b</dfn> matches the character with unicode point U1f6b
73  (hexadecimal 1f6b). \x0012 will match the ASCII/Latin1 character
74  0x12 (18 decimal, 12 hexadecimal).
75  <li><dfn>\022</dfn> matches the ASCII/Latin1 character 022 (18
76  decimal, 22 octal).
77  </ul>
78 
79  In wildcard mode, it only knows four primitives:
80  <ul plain>
81  <li><dfn>c</dfn> matches the character 'c'
82  <li><dfn>?</dfn> matches any character
83  <li><dfn>*</dfn> matches any sequence of characters
84  <li><dfn>[]</dfn> matches a defined set of characters - see below.
85  </ul>
86 
87  QRegExp supports Unicode both in the pattern strings and in the
88  strings to be matched.
89 
90  When writing regular expressions in C++ code, remember that C++
91  processes \ characters. So in order to match e.g. a "." character,
92  you must write "\\." in C++ source, not "\.".
93 
94  A character set matches a defined set of characters. For example,
95  [BSD] matches any of 'B', 'D' and 'S'. Within a character set, the
96  special characters '.', '*', '?', '^', '$', '+' and '[' lose their
97  special meanings. The following special characters apply:
98  <ul plain>
99  <li><dfn>^</dfn> When placed first in the list, changes the
100  character set to match any character \e not in the list. To include
101  the character '^' itself in the set, escape it or place it anywhere
102  but first.
103  <li><dfn>-</dfn> Defines a range of characters. To include the
104  character '-' itself in the set, escape it or place it last.
105  <li><dfn>]</dfn> Ends the character set definition. To include the
106  character ']' itself in the set, escape it or place it first (but
107  after the negation operator '^', if present)
108  </ul>
109  Thus, [a-zA-Z0-9.] matches upper and lower case ASCII letters,
110  digits and dot; and [^\s] matches everything except white space.
111 
112  \bug Case insensitive matching is not supported for non-ASCII/Latin1
113  (non-8bit) characters. Any character with a non-zero QChar.row() is
114  matched case sensitively even if the QRegExp is in case insensitive
115  mode.
116 
117  \note In Qt 3.0, the language of regular expressions will contain
118  five more special characters, namely '(', ')', '{', '|' and '}'. To
119  ease porting, it's a good idea to escape these characters with a
120  backslash in all the regular expressions you'll write from now on.
121 */
122 
123 
124 //
125 // The regexp pattern is internally represented as an array of uints,
126 // each element containing a 16-bit character or a 32-bit code
127 // (listed below). User-defined character classes (e.g. [a-zA-Z])
128 // are encoded like this:
129 //   uint no:   1          2            3           ...
130 //   value:     CCL | n    from | to    from | to
131 //
132 // where n is the (16-bit) number of following range definitions and
133 // from and to define the ranges, inclusive. from <= to is always true;
134 // otherwise the element is a built-in charclass (Pxx, e.g. \s - PWS).
135 // Single characters in the class are coded as from==to. Negated classes
136 // (e.g. [^a-z]) use CCN instead of CCL.
137 
138 const uint END = 0x00000000;
139 const uint PWS = 0x10010000; // predef charclass: whitespace (\s)
140 const uint PDG = 0x10020000; // predef charclass: digit (\d)
141 const uint CCL = 0x20010000; // character class []
142 const uint CCN = 0x20020000; // neg character class [^]
143 const uint CHR = 0x40000000; // character
144 const uint BOL = 0x80010000; // beginning of line ^
145 const uint EOL = 0x80020000; // end of line $
146 const uint BOW = 0x80030000; // beginning of word <
147 const uint EOW = 0x80040000; // end of word >
148 const uint ANY = 0x80050000; // any character .
149 const uint CLO = 0x80070000; // Kleene closure *
150 const uint OPT = 0x80080000; // Optional closure ?
151 
152 const uint MCC = 0x20000000; // character class bitmask
153 const uint MCD = 0xffff0000; // code mask
154 const uint MVL = 0x0000ffff; // value mask
155 
156 //
157 // QRegExp::error codes (internal)
158 //
159 
160 const int PatOk = 0; // pattern ok
161 const int PatNull = 1; // no pattern defined
162 const int PatSyntax = 2; // pattern syntax error
163 const int PatOverflow = 4; // pattern too long
164 
165 
166 /*****************************************************************************
167  QRegExp member functions
168  *****************************************************************************/
169 
170 /*!
171  Constructs an empty regular expression.
172 */
173 
175 {
176  rxdata = 0;
177  cs = TRUE;
178  wc = FALSE;
179  error = PatOk;
180 }
181 
182 /*!
183  Constructs a regular expression.
184 
185  \arg \e pattern is the regular expression pattern string.
186  \arg \e caseSensitive specifies whether or not to use case sensitive
187  matching.
188  \arg \e wildcard specifies whether the pattern string should be used for
189  wildcard matching (also called globbing expression), normally used for
190  matching file names.
191 
192  \sa setWildcard()
193 */
194 
196 {
197  rxstring = pattern;
198  rxdata = 0;
199  cs = caseSensitive;
200  wc = wildcard;
201  compile();
202 }
203 
204 /*!
205  Constructs a regular expression which is a copy of \e r.
206  \sa operator=(const QRegExp&)
207 */
208 
210 {
211  rxstring = r.pattern();
212  rxdata = 0;
213  cs = r.caseSensitive();
214  wc = r.wildcard();
215  compile();
216 }
217 
218 /*!
219  Destructs the regular expression and cleans up its internal data.
220 */
221 
223 {
224  if ( rxdata ) // Avoid purify complaints
225  delete [] rxdata;
226 }
227 
228 /*!
229  Copies the regexp \e r and returns a reference to this regexp.
230  The case sensitivity and wildcard options are copied, as well.
231 */
232 
234 {
235  rxstring = r.rxstring;
236  cs = r.cs;
237  wc = r.wc;
238  compile();
239  return *this;
240 }
241 
242 /*!
243  \obsolete
244  Consider using setPattern() instead of this method.
245 
246  Sets the pattern string to \e pattern and returns a reference to this regexp.
247  The case sensitivity or wildcard options do not change.
248 */
249 
251 {
252  rxstring = pattern;
253  compile();
254  return *this;
255 }
256 
257 
258 /*!
259  Returns TRUE if this regexp is equal to \e r.
260 
261  Two regexp objects are equal if they have equal pattern strings,
262  case sensitivity options and wildcard options.
263 */
264 
265 bool QRegExp::operator==( const QRegExp &r ) const
266 {
267  return rxstring == r.rxstring && cs == r.cs && wc == r.wc;
268 }
269 
270 /*!
271  \fn bool QRegExp::operator!=( const QRegExp &r ) const
272 
273  Returns TRUE if this regexp is \e not equal to \e r.
274 
275  \sa operator==()
276 */
277 
278 /*!
279  \fn bool QRegExp::isEmpty() const
280  Returns TRUE if the regexp is empty.
281 */
282 
283 /*!
284  \fn bool QRegExp::isValid() const
285  Returns TRUE if the regexp is valid, or FALSE if it is invalid.
286 
287  The pattern "[a-z" is an example of an invalid pattern, since it lacks a
288  closing bracket.
289 */
290 
291 
292 /*!
293  \fn bool QRegExp::wildcard() const
294  Returns TRUE if wildcard mode is on, otherwise FALSE. \sa setWildcard().
295 */
296 
297 /*!
298  Sets the wildcard option for the regular expression. The default
299  is FALSE.
300 
301  Setting \e wildcard to TRUE makes it convenient to match filenames
302  instead of plain text.
303 
304  For example, "qr*.cpp" matches the string "qregexp.cpp" in wildcard mode,
305  but not "qicpp" (which would be matched in normal mode).
306 
307  \sa wildcard()
308 */
309 
311 {
312  if ( wildcard != wc ) {
313  wc = wildcard;
314  compile();
315  }
316 }
317 
318 /*!
319  \fn bool QRegExp::caseSensitive() const
320 
321  Returns TRUE if case sensitivity is enabled, otherwise FALSE. The
322  default is TRUE.
323 
324  \sa setCaseSensitive()
325 */
326 
327 /*!
328  Enables or disables case sensitive matching.
329 
330  In case sensitive mode, "a.e" matches "axe" but not "Axe".
331 
332  \sa caseSensitive()
333 */
334 
335 void QRegExp::setCaseSensitive( bool enable )
336 {
337  if ( cs != enable ) {
338  cs = enable;
339  compile();
340  }
341 }
342 
343 
344 /*!
345  \fn QCString QRegExp::pattern() const
346  Returns the pattern string of the regexp.
347 */
348 
349 
350 /*!
351  \fn void QRegExp::setPattern(const QCString & pattern)
352  Sets the pattern string to \a pattern.
353  The case sensitivity or wildcard options do not change.
354 */
355 
// Returns TRUE if x is a "word" character: an alphanumeric (as classified
// by isalnum(), 8-bit only) or an underscore.
static inline bool iswordchar( int x )
{
    // Callers pass plain chars, which may be negative; the <ctype.h>
    // functions are only defined for unsigned char values (or EOF).
    return isalnum( (unsigned char)x ) || x == '_';
}
360 
361 
362 /*!
363  \internal
364  Match character class
365 */
366 
367 static bool matchcharclass( uint *rxd, char c )
368 {
369  uint *d = rxd;
370  uint clcode = *d & MCD;
371  bool neg = clcode == CCN;
372  if ( clcode != CCL && clcode != CCN)
373  qWarning("QRegExp: Internal error, please report to qt-bugs@trolltech.com");
374  uint numFields = *d & MVL;
375  uint cval = (unsigned char)c; //(((uint)(c.row())) << 8) | ((uint)c.cell());
376  bool found = FALSE;
377  for ( int i = 0; i < (int)numFields; i++ ) {
378  d++;
379  if ( *d == PWS && isspace(c) ) {
380  found = TRUE;
381  break;
382  }
383  if ( *d == PDG && isdigit(c) ) {
384  found = TRUE;
385  break;
386  }
387  else {
388  uint from = ( *d & MCD ) >> 16;
389  uint to = *d & MVL;
390  if ( (cval >= from) && (cval <= to) ) {
391  found = TRUE;
392  break;
393  }
394  }
395  }
396  return neg ? !found : found;
397 }
398 
399 
400 
401 /*
402  Internal: Recursively match string.
403 */
404 
405 static int matchstring( uint *rxd, const char *str, uint strlength,
406  const char *bol, bool cs )
407 {
408  const char *p = str;
409  const char *start = p;
410  uint pl = strlength;
411  uint *d = rxd;
412 
413  //### in all cases here: handle pl == 0! (don't read past strlen)
414  while ( *d ) {
415  if ( *d & CHR ) { // match char
416  if ( !pl )
417  return -1;
418  char c = *d;
419  if ( !cs /*&& !c.row()*/ ) { // case insensitive, #Only 8bit
420  if ( tolower(*p) != c )
421  return -1;
422  p++;
423  pl--;
424  } else { // case insensitive
425  if ( *p != c )
426  return -1;
427  p++;
428  pl--;
429  }
430  d++;
431  }
432  else if ( *d & MCC ) { // match char class
433  if ( !pl )
434  return -1;
435  if ( !matchcharclass( d, *p ) )
436  return -1;
437  p++;
438  pl--;
439  d += (*d & MVL) + 1;
440  }
441  else switch ( *d++ ) {
442  case PWS: // match whitespace
443  if ( !pl || !isspace(*p) )
444  return -1;
445  p++;
446  pl--;
447  break;
448  case PDG: // match digits
449  if ( !pl || !isdigit(*p) )
450  return -1;
451  p++;
452  pl--;
453  break;
454  case ANY: // match anything
455  if ( !pl )
456  return -1;
457  p++;
458  pl--;
459  break;
460  case BOL: // match beginning of line
461  if ( p != bol )
462  return -1;
463  break;
464  case EOL: // match end of line
465  if ( pl )
466  return -1;
467  break;
468  case BOW: // match beginning of word
469  if ( !iswordchar(*p) || (p > bol && iswordchar(*(p-1)) ) )
470  return -1;
471  break;
472  case EOW: // match end of word
473  if ( iswordchar(*p) || p == bol || !iswordchar(*(p-1)) )
474  return -1;
475  break;
476  case CLO: // Kleene closure
477  {
478  const char *first_p = p;
479  if ( *d & CHR ) { // match char
480  char c = *d;
481  if ( !cs /*&& !c.row()*/ ) { // case insensitive, #only 8bit
482  while ( pl /*&& !p->row()*/ && tolower(*p)==c ) {
483  p++;
484  pl--;
485  }
486  }
487  else { // case sensitive
488  while ( pl && *p == c ) {
489  p++;
490  pl--;
491  }
492  }
493  d++;
494  }
495  else if ( *d & MCC ) { // match char class
496  while( pl && matchcharclass( d, *p ) ) {
497  p++;
498  pl--;
499  }
500  d += (*d & MVL) + 1;
501  }
502  else if ( *d == PWS ) {
503  while ( pl && isspace(*p) ) {
504  p++;
505  pl--;
506  }
507  d++;
508  }
509  else if ( *d == PDG ) {
510  while ( pl && isdigit(*p) ) {
511  p++;
512  pl--;
513  }
514  d++;
515  }
516  else if ( *d == ANY ) {
517  p += pl;
518  pl = 0;
519  d++;
520  }
521  else {
522  return -1; // error
523  }
524  d++; // skip CLO's END
525  while ( p >= first_p ) { // go backwards
526  int end = matchstring( d, p, pl, bol, cs );
527  if ( end >= 0 )
528  return ( (int)(p - start) ) + end;
529  if ( !p )
530  return -1;
531  --p;
532  ++pl;
533  }
534  }
535  return -1;
536  case OPT: // optional closure
537  {
538  const char *first_p = p;
539  if ( *d & CHR ) { // match char
540  char c = *d;
541  if ( !cs /*&& !c.row()*/ ) { // case insensitive, #only 8bit
542  if ( pl && /*!p->row() &&*/ tolower(*p) == c ) {
543  p++;
544  pl--;
545  }
546  }
547  else { // case sensitive
548  if ( pl && *p == c ) {
549  p++;
550  pl--;
551  }
552  }
553  d++;
554  }
555  else if ( *d & MCC ) { // match char class
556  if ( pl && matchcharclass( d, *p ) ) {
557  p++;
558  pl--;
559  }
560  d += (*d & MVL) + 1;
561  }
562  else if ( *d == PWS ) {
563  if ( pl && isspace(*p) ) {
564  p++;
565  pl--;
566  }
567  d++;
568  }
569  else if ( *d == PDG ) {
570  if ( pl && isdigit(*p) ) {
571  p++;
572  pl--;
573  }
574  d++;
575  }
576  else if ( *d == ANY ) {
577  if ( pl ) {
578  p++;
579  pl--;
580  }
581  d++;
582  }
583  else {
584  return -1; // error
585  }
586  d++; // skip OPT's END
587  while ( p >= first_p ) { // go backwards
588  int end = matchstring( d, p, pl, bol, cs );
589  if ( end >= 0 )
590  return ( (int)(p - start) ) + end;
591  if ( !p )
592  return -1;
593  --p;
594  ++pl;
595  }
596  }
597  return -1;
598 
599  default: // error
600  return -1;
601  }
602  }
603  return (int)(p - start);
604 }
605 
606 
607 /*!
608  \internal
609  Recursively match string.
610 */
611 
612 // This is obsolete now, but since it is protected (not private), it
613 // is still implemented on the off-chance that somebody has made a
614 // class derived from QRegExp and calls this directly.
615 // Qt 3.0: Remove this?
616 
617 #if 0
618 const char *QRegExp::matchstr( uint *rxd, const QChar *str, uint strlength,
619  const QChar *bol ) const
620 {
621  int len = matchstring( rxd, str, strlength, bol, cs );
622  if ( len < 0 )
623  return 0;
624  return str + len;
625 }
626 #endif
627 
628 /*!
629  Attempts to match in \e str, starting from position \e index.
630  Returns the position of the match, or -1 if there was no match.
631 
632  If \e len is not a null pointer, the length of the match is stored in
633  \e *len.
634 
635  If \e indexIsStart is TRUE (the default), the position \e index in
636  the string will match the start-of-input primitive (^) in the
637  regexp, if present. Otherwise, position 0 in \e str will match.
638 
639  Example:
640  \code
641  QRegExp r("[0-9]*\\.[0-9]+"); // matches floating point
642  int len;
643  r.match("pi = 3.1416", 0, &len); // returns 5, len == 6
644  \endcode
645 
646  \note In Qt 3.0, this function will be replaced by find().
647 */
648 
649 int QRegExp::match( const QCString &str, int index, int *len,
650  bool indexIsStart ) const
651 {
652  if ( !isValid() || isEmpty() )
653  return -1;
654  if ( str.length() < (uint)index )
655  return -1;
656  const char *start = str.data();
657  const char *p = start + index;
658  uint pl = str.length() - index;
659  uint *d = rxdata;
660  int ep = -1;
661 
662  if ( *d == BOL ) { // match from beginning of line
663  ep = matchstring( d, p, pl, indexIsStart ? p : start, cs );
664  } else {
665  if ( *d & CHR ) {
666  char c = *d;
667  if ( !cs /*&& !c.row()*/ ) { // case sensitive, # only 8bit
668  while ( pl && ( /*p->row() ||*/ tolower(*p) != c ) ) {
669  p++;
670  pl--;
671  }
672  } else { // case insensitive
673  while ( pl && *p != c ) {
674  p++;
675  pl--;
676  }
677  }
678  }
679  while( 1 ) { // regular match
680  ep = matchstring( d, p, pl, indexIsStart ? start+index : start, cs );
681  if ( ep >= 0 )
682  break;
683  if ( !pl )
684  break;
685  p++;
686  pl--;
687  }
688  }
689  if ( len )
690  *len = ep >= 0 ? ep : 0; // No match -> 0, for historical reasons
691  return ep >= 0 ? (int)(p - start) : -1; // return index;
692 }
693 
694 /*! \fn int QRegExp::find( const QCString& str, int index )
695 
696  Attempts to match in \e str, starting from position \e index.
697  Returns the position of the match, or -1 if there was no match.
698 
699  \sa match()
700 */
701 
702 //
703 // Translate wildcard pattern to standard regexp pattern.
704 // Ex: *.cpp ==> ^.*\.cpp$
705 //
706 
707 static QCString wc2rx( const QCString &pattern )
708 {
709  int patlen = (int)pattern.length();
710  QCString wcpattern("^");
711 
712  char c;
713  for( int i = 0; i < patlen; i++ ) {
714  c = pattern[i];
715  switch ( (char)c ) {
716  case '*': // '*' ==> '.*'
717  wcpattern += '.';
718  break;
719  case '?': // '?' ==> '.'
720  c = '.';
721  break;
722  case '.': // quote special regexp chars
723  case '+':
724  case '\\':
725  case '$':
726  case '^':
727  wcpattern += '\\';
728  break;
729  case '[':
730  if ( (char)pattern[i+1] == '^' ) { // don't quote '^' after '['
731  wcpattern += '[';
732  c = pattern[i+1];
733  i++;
734  }
735  break;
736  }
737  wcpattern += c;
738 
739  }
740  wcpattern += '$';
741  return wcpattern; // return new regexp pattern
742 }
743 
744 
745 //
746 // Internal: Get char value and increment pointer.
747 //
748 
749 static uint char_val( const char **str, uint *strlength ) // get char value
750 {
751  const char *p = *str;
752  uint pl = *strlength;
753  uint len = 1;
754  uint v = 0;
755  if ( (char)*p == '\\' ) { // escaped code
756  p++;
757  pl--;
758  if ( !pl ) { // it is just a '\'
759  (*str)++;
760  (*strlength)--;
761  return '\\';
762  }
763  len++; // length at least 2
764  int i;
765  char c;
766  char ch = tolower((char)*p);
767  switch ( ch ) {
768  case 'b': v = '\b'; break; // bell
769  case 'f': v = '\f'; break; // form feed
770  case 'n': v = '\n'; break; // newline
771  case 'r': v = '\r'; break; // return
772  case 't': v = '\t'; break; // tab
773  case 's': v = PWS; break; // whitespace charclass
774  case 'd': v = PDG; break; // digit charclass
775  case '<': v = BOW; break; // word beginning matcher
776  case '>': v = EOW; break; // word ending matcher
777 
778  case 'x': { // hex code
779  p++;
780  pl--;
781  for ( i = 0; (i < 4) && pl; i++ ) { //up to 4 hex digits
782  c = tolower((char)*p);
783  bool a = ( c >= 'a' && c <= 'f' );
784  if ( (c >= '0' && c <= '9') || a ) {
785  v <<= 4;
786  v += a ? 10 + c - 'a' : c - '0';
787  len++;
788  }
789  else {
790  break;
791  }
792  p++;
793  pl--;
794  }
795  }
796  break;
797 
798  default: {
799  if ( ch >= '0' && ch <= '7' ) { //octal code
800  len--;
801  for ( i = 0; (i < 3) && pl; i++ ) { // up to 3 oct digits
802  c = (char)*p;
803  if ( c >= '0' && c <= '7' ) {
804  v <<= 3;
805  v += c - '0';
806  len++;
807  }
808  else {
809  break;
810  }
811  p++;
812  pl--;
813  }
814  }
815  else { // not an octal number
816  v = (uint)*p; //(((uint)(p->row())) << 8) | ((uint)p->cell());
817  }
818  }
819  }
820  } else {
821  v = (uint)*p; //(((uint)(p->row())) << 8) | ((uint)p->cell());
822  }
823  *str += len;
824  *strlength -= len;
825  return v;
826 }
827 
828 
829 #if 0 //defined(DEBUG)
830 static uint *dump( uint *p )
831 {
832  while ( *p != END ) {
833  if ( *p & CHR ) {
834  uchar uc = (uchar)*p;
835  char c = (char)uc;
836  uint u = (uint)uc; //(((uint)(uc.row())) << 8) | ((uint)uc.cell());
837  qDebug( "\tCHR\tU%04x (%c)", u, (c ? c : ' '));
838  p++;
839  }
840  else if ( *p & MCC ) {
841  uint clcode = *p & MCD;
842  uint numFields = *p & MVL;
843  if ( clcode == CCL )
844  qDebug( "\tCCL\t%i", numFields );
845  else if ( clcode == CCN )
846  qDebug( "\tCCN\t%i", numFields );
847  else
848  qDebug("coding error!");
849  for ( int i = 0; i < (int)numFields; i++ ) {
850  p++;
851  if ( *p == PWS )
852  qDebug( "\t\tPWS" );
853  else if ( *p == PDG )
854  qDebug( "\t\tPDG" );
855  else {
856  uint from = ( *p & MCD ) >> 16;
857  uint to = *p & MVL;
858  char fc = (char)from;
859  char tc = (char)to;
860  qDebug( "\t\tU%04x (%c) - U%04x (%c)", from,
861  (fc ? fc : ' '), to, (tc ? tc : ' ') );
862  }
863  }
864  p++;
865  }
866  else switch ( *p++ ) {
867  case PWS:
868  qDebug( "\tPWS" );
869  break;
870  case PDG:
871  qDebug( "\tPDG" );
872  break;
873  case BOL:
874  qDebug( "\tBOL" );
875  break;
876  case EOL:
877  qDebug( "\tEOL" );
878  break;
879  case BOW:
880  qDebug( "\tBOW" );
881  break;
882  case EOW:
883  qDebug( "\tEOW" );
884  break;
885  case ANY:
886  qDebug( "\tANY" );
887  break;
888  case CLO:
889  qDebug( "\tCLO" );
890  p = dump( p );
891  break;
892  case OPT:
893  qDebug( "\tOPT" );
894  p = dump( p );
895  break;
896  }
897  }
898  qDebug( "\tEND" );
899  return p+1;
900 }
901 #endif // DEBUG
902 
903 
904 static const int maxlen = 1024; // max length of regexp array
905 static uint rxarray[ maxlen ]; // tmp regexp array
906 
907 /*!
908  \internal
909  Compiles the regular expression and stores the result in rxdata.
910  The 'error' flag is set to non-zero if an error is detected.
911  NOTE! This function is not reentrant!
912 */
913 
915 {
916  if ( rxdata ) { // delete old data
917  delete [] rxdata;
918  rxdata = 0;
919  }
920  if ( rxstring.isEmpty() ) { // no regexp pattern set
921  error = PatNull;
922  return;
923  }
924 
925  error = PatOk; // assume pattern is ok
926 
928  if ( wc )
929  pattern = wc2rx(rxstring);
930  else
931  pattern = rxstring;
932  const char *start = pattern.data(); // pattern pointer
933  const char *p = start; // pattern pointer
934  uint pl = pattern.length();
935  uint *d = rxarray; // data pointer
936  uint *prev_d = 0;
937 
938 #define GEN(x) *d++ = (x)
939 
940  while ( pl ) {
941  char ch = (char)*p;
942  switch ( ch ) {
943 
944  case '^': // beginning of line
945  prev_d = d;
946  GEN( p == start ? BOL : (CHR | ch) );
947  p++;
948  pl--;
949  break;
950 
951  case '$': // end of line
952  prev_d = d;
953  GEN( pl == 1 ? EOL : (CHR | ch) );
954  p++;
955  pl--;
956  break;
957 
958  case '.': // any char
959  prev_d = d;
960  GEN( ANY );
961  p++;
962  pl--;
963  break;
964 
965  case '[': // character class
966  {
967  prev_d = d;
968  p++;
969  pl--;
970  if ( !pl ) {
971  error = PatSyntax;
972  return;
973  }
974  bool firstIsEscaped = ( (char)*p == '\\' );
975  uint cch = char_val( &p, &pl );
976  if ( cch == '^' && !firstIsEscaped ) { // negate!
977  GEN( CCN );
978  if ( !pl ) {
979  error = PatSyntax;
980  return;
981  }
982  cch = char_val( &p, &pl );
983  } else {
984  GEN( CCL );
985  }
986  uint numFields = 0;
987  while ( pl ) {
988  if ((pl>2) && ((char)*p == '-') && ((char)*(p+1) != ']')) {
989  // Found a range
990  char_val( &p, &pl ); // Read the '-'
991  uint cch2 = char_val( &p, &pl ); // Read the range end
992  if ( cch > cch2 ) { // swap start and stop
993  int tmp = cch;
994  cch = cch2;
995  cch2 = tmp;
996  }
997  GEN( (cch << 16) | cch2 ); // from < to
998  numFields++;
999  }
1000  else {
1001  // Found a single character
1002  if ( cch & MCD ) // It's a code; will not be mistaken
1003  GEN( cch ); // for a range, since from > to
1004  else
1005  GEN( (cch << 16) | cch ); // from == to range
1006  numFields++;
1007  }
1008  if ( d >= rxarray + maxlen ) { // pattern too long
1009  error = PatOverflow;
1010  return;
1011  }
1012  if ( !pl ) { // At least ']' should be left
1013  error = PatSyntax;
1014  return;
1015  }
1016  bool nextIsEscaped = ( (char)*p == '\\' );
1017  cch = char_val( &p, &pl );
1018  if ( cch == (uint)']' && !nextIsEscaped )
1019  break;
1020  if ( !pl ) { // End, should have seen ']'
1021  error = PatSyntax;
1022  return;
1023  }
1024  }
1025  *prev_d |= numFields; // Store number of fields
1026  }
1027  break;
1028 
1029  case '*': // Kleene closure, or
1030  case '+': // positive closure, or
1031  case '?': // optional closure
1032  {
1033  if ( prev_d == 0 ) { // no previous expression
1034  error = PatSyntax; // empty closure
1035  return;
1036  }
1037  switch ( *prev_d ) { // test if invalid closure
1038  case BOL:
1039  case BOW:
1040  case EOW:
1041  case CLO:
1042  case OPT:
1043  error = PatSyntax;
1044  return;
1045  }
1046  int ddiff = (int)(d - prev_d);
1047  if ( *p == '+' ) { // convert to Kleene closure
1048  if ( d + ddiff >= rxarray + maxlen ) {
1049  error = PatOverflow; // pattern too long
1050  return;
1051  }
1052  memcpy( d, prev_d, ddiff*sizeof(uint) );
1053  d += ddiff;
1054  prev_d += ddiff;
1055  }
1056  memmove( prev_d+1, prev_d, ddiff*sizeof(uint) );
1057  *prev_d = ch == '?' ? OPT : CLO;
1058  d++;
1059  GEN( END );
1060  p++;
1061  pl--;
1062  }
1063  break;
1064 
1065  default:
1066  {
1067  prev_d = d;
1068  uint cv = char_val( &p, &pl );
1069  if ( cv & MCD ) { // It's a code
1070  GEN( cv );
1071  }
1072  else {
1073  if ( !cs && cv <= 0xff ) // #only 8bit support
1074  cv = tolower( cv );
1075  GEN( CHR | cv );
1076  }
1077  }
1078  }
1079  if ( d >= rxarray + maxlen ) { // oops!
1080  error = PatOverflow; // pattern too long
1081  return;
1082  }
1083  }
1084  GEN( END );
1085  int len = (int)(d - rxarray);
1086  rxdata = new uint[ len ]; // copy from rxarray to rxdata
1087  CHECK_PTR( rxdata );
1088  memcpy( rxdata, rxarray, len*sizeof(uint) );
1089 #if defined(DEBUG)
1090  //dump( rxdata ); // uncomment this line for debugging
1091 #endif
1092 }
end
while True: pbar.update(maxval-len(onlies[E][S])) #print iS, "/", len(onlies[E][S]) found = False for...
#define GEN(x)
void qDebug(const char *msg,...)
Definition: qglobal.cpp:376
const uint END
Definition: qregexp.cpp:138
~QRegExp()
Definition: qregexp.cpp:222
static int matchstring(uint *rxd, const char *str, uint strlength, const char *bol, bool cs)
Definition: qregexp.cpp:405
static uint char_val(const char **str, uint *strlength)
Definition: qregexp.cpp:749
bool isEmpty() const
Definition: qcstring.h:189
The QRegExp class provides pattern matching using regular expressions or wildcards.
Definition: qregexp.h:46
static const int maxlen
Definition: qregexp.cpp:904
uint length() const
Definition: qcstring.h:195
const uint CLO
Definition: qregexp.cpp:149
const uint PWS
Definition: qregexp.cpp:139
bool wc
Definition: qregexp.h:88
const uint BOL
Definition: qregexp.cpp:144
const bool FALSE
Definition: qglobal.h:370
error
Definition: include.cc:26
void qWarning(const char *msg,...)
Definition: qglobal.cpp:409
void compile()
Definition: qregexp.cpp:914
The QChar class provides a light-weight Unicode character.
Definition: qstring.h:56
static bool matchcharclass(uint *rxd, char c)
Definition: qregexp.cpp:367
QCString rxstring
Definition: qregexp.h:84
const uint MVL
Definition: qregexp.cpp:154
unsigned char uchar
Definition: nybbler.cc:11
bool isEmpty() const
Definition: qregexp.h:60
const uint EOL
Definition: qregexp.cpp:145
uint * rxdata
Definition: qregexp.h:85
bool operator==(const QRegExp &) const
Definition: qregexp.cpp:265
const uint PDG
Definition: qregexp.cpp:140
const int PatOverflow
Definition: qregexp.cpp:163
QRegExp & operator=(const QRegExp &)
Definition: qregexp.cpp:233
const uint BOW
Definition: qregexp.cpp:146
const uint CHR
Definition: qregexp.cpp:143
const char * matchstr(uint *, const char *, uint, const char *) const
const double a
static bool iswordchar(int x)
Definition: qregexp.cpp:356
def dump(input_file, output_file)
Definition: dumpTree.py:102
const uint CCN
Definition: qregexp.cpp:142
QRegExp()
Definition: qregexp.cpp:174
p
Definition: test.py:223
const int PatOk
Definition: qregexp.cpp:160
void setCaseSensitive(bool)
Definition: qregexp.cpp:335
const char * data() const
Definition: qcstring.h:207
string tmp
Definition: languages.py:63
const uint MCC
Definition: qregexp.cpp:152
int match(const QCString &str, int index=0, int *len=0, bool indexIsStart=TRUE) const
Definition: qregexp.cpp:649
const int PatNull
Definition: qregexp.cpp:161
const uint ANY
Definition: qregexp.cpp:148
const uint MCD
Definition: qregexp.cpp:153
static QCString wc2rx(const QCString &pattern)
Definition: qregexp.cpp:707
#define CHECK_PTR(p)
Definition: qglobal.h:601
static uint rxarray[maxlen]
Definition: qregexp.cpp:905
bool wildcard() const
Definition: qregexp.h:66
list x
Definition: train.py:276
bool caseSensitive() const
Definition: qregexp.h:63
const uint CCL
Definition: qregexp.cpp:141
bool cs
Definition: qregexp.h:87
bool isValid() const
Definition: qregexp.h:61
unsigned uint
Definition: qglobal.h:351
QCString pattern() const
Definition: qregexp.h:69
const bool TRUE
Definition: qglobal.h:371
const int PatSyntax
Definition: qregexp.cpp:162
static QCString str
void setWildcard(bool)
Definition: qregexp.cpp:310
const uint EOW
Definition: qregexp.cpp:147
const uint OPT
Definition: qregexp.cpp:150