--- NEWS 2007-02-28 19:24:08.000000000 +0100 +++ NEWS 2007-04-20 11:05:40.632933546 +0200 @@ -2,6 +2,7 @@ ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 28 Feb 2007, Version 4.4.6 +- Downgraded PCRE from version 7.0 to 6.6 (7.0 segfaults with SPIP, Joomla Mambot, etc.). (Baco) - Updated PCRE to version 7.0. (Nuno) - Fixed segfault in ext/session when register_globals=On. (Tony) - Fixed bug #40635 (segfault in cURL extension). (Tony) --- configure 2007-02-28 19:28:15.000000000 +0100 +++ configure 2006-08-15 14:01:18.000000000 +0200 @@ -18364,7 +18346,7 @@ ext_builddir=ext/pcre ext_srcdir=$abs_srcdir/ext/pcre - ac_extra=`echo "-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000 -DEBCDIC=0 -I@ext_srcdir@/pcrelib"|sed s#@ext_srcdir@#$ext_srcdir#g` + ac_extra=`echo "-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -I@ext_srcdir@/pcrelib"|sed s#@ext_srcdir@#$ext_srcdir#g` if test "$ext_shared" != "shared" && test "$ext_shared" != "yes" && test "" != "cli"; then @@ -18388,7 +18370,7 @@ old_IFS=$IFS - for ac_src in pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_newline.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c; do + for ac_src in pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_dfa_exec.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c 
pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c; do IFS=. set $ac_src @@ -18434,7 +18416,7 @@ old_IFS=$IFS - for ac_src in pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_newline.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c; do + for ac_src in pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_dfa_exec.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c; do IFS=. 
set $ac_src @@ -18551,7 +18533,7 @@ old_IFS=$IFS - for ac_src in pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_newline.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c; do + for ac_src in pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_dfa_exec.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c; do IFS=. 
set $ac_src @@ -18594,7 +18576,7 @@ old_IFS=$IFS - for ac_src in pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_newline.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c; do + for ac_src in pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_dfa_exec.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c; do IFS=. 
set $ac_src @@ -18795,7 +18777,7 @@ ext_builddir=ext/pcre ext_srcdir=$abs_srcdir/ext/pcre - ac_extra=`echo "-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000"|sed s#@ext_srcdir@#$ext_srcdir#g` + ac_extra=`echo "-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000"|sed s#@ext_srcdir@#$ext_srcdir#g` if test "$ext_shared" != "shared" && test "$ext_shared" != "yes" && test "" != "cli"; then --- ext/pcre/config0.m4 2007-02-13 21:23:28.000000000 +0100 +++ ext/pcre/config0.m4 2006-03-06 22:57:53.000000000 +0100 @@ -1,5 +1,5 @@ dnl -dnl $Id: config0.m4,v 1.29.2.7.2.4 2007/02/13 20:23:28 nlopess Exp $ +dnl $Id: config0.m4,v 1.29.2.7.2.2 2006/03/06 21:57:53 andrei Exp $ dnl dnl By default we'll compile and link against the bundled PCRE library @@ -13,7 +13,7 @@ if test "$PHP_PCRE_REGEX" != "no"; then if test "$PHP_PCRE_REGEX" = "yes"; then - PHP_NEW_EXTENSION(pcre, pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_newline.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c, $ext_shared,,-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000 -DEBCDIC=0 -I@ext_srcdir@/pcrelib) + PHP_NEW_EXTENSION(pcre, pcrelib/pcre_chartables.c pcrelib/pcre_ucp_searchfuncs.c pcrelib/pcre_compile.c pcrelib/pcre_config.c 
pcrelib/pcre_dfa_exec.c pcrelib/pcre_exec.c pcrelib/pcre_fullinfo.c pcrelib/pcre_get.c pcrelib/pcre_globals.c pcrelib/pcre_info.c pcrelib/pcre_maketables.c pcrelib/pcre_ord2utf8.c pcrelib/pcre_refcount.c pcrelib/pcre_study.c pcrelib/pcre_tables.c pcrelib/pcre_try_flipped.c pcrelib/pcre_valid_utf8.c pcrelib/pcre_version.c pcrelib/pcre_xclass.c php_pcre.c, $ext_shared,,-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -I@ext_srcdir@/pcrelib) PHP_ADD_BUILD_DIR($ext_builddir/pcrelib) AC_DEFINE(HAVE_BUNDLED_PCRE, 1, [ ]) else @@ -50,7 +50,7 @@ AC_DEFINE(HAVE_PCRE, 1, [ ]) PHP_ADD_INCLUDE($PCRE_INCDIR) - PHP_NEW_EXTENSION(pcre, php_pcre.c, $ext_shared,,-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000 -DMAX_NAME_SIZE=32 -DMAX_NAME_COUNT=10000 -DMAX_DUPLENGTH=30000) + PHP_NEW_EXTENSION(pcre, php_pcre.c, $ext_shared,,-DEXPORT= -DNEWLINE=10 -DSUPPORT_UTF8 -DSUPPORT_UCP -DLINK_SIZE=2 -DPOSIX_MALLOC_THRESHOLD=10 -DMATCH_LIMIT=10000000 -DMATCH_LIMIT_RECURSION=10000000) fi PHP_SUBST(PCRE_SHARED_LIBADD) fi --- ext/pcre/pcrelib/AUTHORS 2006-08-30 22:06:53.000000000 +0200 +++ ext/pcre/pcrelib/AUTHORS 2005-08-09 19:41:56.000000000 +0200 @@ -8,7 +8,7 @@ University of Cambridge Computing Service, Cambridge, England. Phone: +44 1223 334714. -Copyright (c) 1997-2006 University of Cambridge +Copyright (c) 1997-2005 University of Cambridge All rights reserved @@ -17,7 +17,7 @@ Written by: Google Inc. -Copyright (c) 2006 Google Inc +Copyright (c) 2005 Google Inc All rights reserved #### --- ext/pcre/pcrelib/COPYING 2007-02-13 21:23:28.000000000 +0100 +++ ext/pcre/pcrelib/COPYING 2005-08-09 19:41:56.000000000 +0200 @@ -4,7 +4,7 @@ PCRE is a library of functions to support regular expressions whose syntax and semantics are as close as possible to those of the Perl 5 language. 
-Release 7 of PCRE is distributed under the terms of the "BSD" licence, as +Release 6 of PCRE is distributed under the terms of the "BSD" licence, as specified below. The documentation for PCRE, supplied in the "doc" directory, is distributed under the same terms as the software itself. @@ -22,7 +22,7 @@ University of Cambridge Computing Service, Cambridge, England. Phone: +44 1223 334714. -Copyright (c) 1997-2006 University of Cambridge +Copyright (c) 1997-2005 University of Cambridge All rights reserved. @@ -31,7 +31,7 @@ Contributed by: Google Inc. -Copyright (c) 2006, Google Inc. +Copyright (c) 2005, Google Inc. All rights reserved. --- ext/pcre/pcrelib/ChangeLog 2007-02-13 21:23:28.000000000 +0100 +++ ext/pcre/pcrelib/ChangeLog 2006-03-06 22:57:53.000000000 +0100 @@ -1,448 +1,6 @@ ChangeLog for PCRE ------------------ -Version 7.0 19-Dec-06 ---------------------- - - 1. Fixed a signed/unsigned compiler warning in pcre_compile.c, shown up by - moving to gcc 4.1.1. - - 2. The -S option for pcretest uses setrlimit(); I had omitted to #include - sys/time.h, which is documented as needed for this function. It doesn't - seem to matter on Linux, but it showed up on some releases of OS X. - - 3. It seems that there are systems where bytes whose values are greater than - 127 match isprint() in the "C" locale. The "C" locale should be the - default when a C program starts up. In most systems, only ASCII printing - characters match isprint(). This difference caused the output from pcretest - to vary, making some of the tests fail. I have changed pcretest so that: - - (a) When it is outputting text in the compiled version of a pattern, bytes - other than 32-126 are always shown as hex escapes. - - (b) When it is outputting text that is a matched part of a subject string, - it does the same, unless a different locale has been set for the match - (using the /L modifier). In this case, it uses isprint() to decide. - - 4. 
Fixed a major bug that caused incorrect computation of the amount of memory - required for a compiled pattern when options that changed within the - pattern affected the logic of the preliminary scan that determines the - length. The relevant options are -x, and -i in UTF-8 mode. The result was - that the computed length was too small. The symptoms of this bug were - either the PCRE error "internal error: code overflow" from pcre_compile(), - or a glibc crash with a message such as "pcretest: free(): invalid next - size (fast)". Examples of patterns that provoked this bug (shown in - pcretest format) are: - - /(?-x: )/x - /(?x)(?-x: \s*#\s*)/ - /((?i)[\x{c0}])/8 - /(?i:[\x{c0}])/8 - - HOWEVER: Change 17 below makes this fix obsolete as the memory computation - is now done differently. - - 5. Applied patches from Google to: (a) add a QuoteMeta function to the C++ - wrapper classes; (b) implement a new function in the C++ scanner that is - more efficient than the old way of doing things because it avoids levels of - recursion in the regex matching; (c) add a paragraph to the documentation - for the FullMatch() function. - - 6. The escape sequence \n was being treated as whatever was defined as - "newline". Not only was this contrary to the documentation, which states - that \n is character 10 (hex 0A), but it also went horribly wrong when - "newline" was defined as CRLF. This has been fixed. - - 7. In pcre_dfa_exec.c the value of an unsigned integer (the variable called c) - was being set to -1 for the "end of line" case (supposedly a value that no - character can have). Though this value is never used (the check for end of - line is "zero bytes in current character"), it caused compiler complaints. - I've changed it to 0xffffffff. - - 8. In pcre_version.c, the version string was being built by a sequence of - C macros that, in the event of PCRE_PRERELEASE being defined as an empty - string (as it is for production releases) called a macro with an empty - argument. 
The C standard says the result of this is undefined. The gcc - compiler treats it as an empty string (which was what was wanted) but it is - reported that Visual C gives an error. The source has been hacked around to - avoid this problem. - - 9. On the advice of a Windows user, included and in Windows - builds of pcretest, and changed the call to _setmode() to use _O_BINARY - instead of 0x8000. Made all the #ifdefs test both _WIN32 and WIN32 (not all - of them did). - -10. Originally, pcretest opened its input and output without "b"; then I was - told that "b" was needed in some environments, so it was added for release - 5.0 to both the input and output. (It makes no difference on Unix-like - systems.) Later I was told that it is wrong for the input on Windows. I've - now abstracted the modes into two macros, to make it easier to fiddle with - them, and removed "b" from the input mode under Windows. - -11. Added pkgconfig support for the C++ wrapper library, libpcrecpp. - -12. Added -help and --help to pcretest as an official way of being reminded - of the options. - -13. Removed some redundant semicolons after macro calls in pcrecpparg.h.in - and pcrecpp.cc because they annoy compilers at high warning levels. - -14. A bit of tidying/refactoring in pcre_exec.c in the main bumpalong loop. - -15. Fixed an occurrence of == in configure.ac that should have been = (shell - scripts are not C programs :-) and which was not noticed because it works - on Linux. - -16. pcretest is supposed to handle any length of pattern and data line (as one - line or as a continued sequence of lines) by extending its input buffer if - necessary. This feature was broken for very long pattern lines, leading to - a string of junk being passed to pcre_compile() if the pattern was longer - than about 50K. - -17. I have done a major re-factoring of the way pcre_compile() computes the - amount of memory needed for a compiled pattern. 
Previously, there was code - that made a preliminary scan of the pattern in order to do this. That was - OK when PCRE was new, but as the facilities have expanded, it has become - harder and harder to keep it in step with the real compile phase, and there - have been a number of bugs (see for example, 4 above). I have now found a - cunning way of running the real compile function in a "fake" mode that - enables it to compute how much memory it would need, while actually only - ever using a few hundred bytes of working memory and without too many - tests of the mode. This should make future maintenance and development - easier. A side effect of this work is that the limit of 200 on the nesting - depth of parentheses has been removed (though this was never a serious - limitation, I suspect). However, there is a downside: pcre_compile() now - runs more slowly than before (30% or more, depending on the pattern). I - hope this isn't a big issue. There is no effect on runtime performance. - -18. Fixed a minor bug in pcretest: if a pattern line was not terminated by a - newline (only possible for the last line of a file) and it was a - pattern that set a locale (followed by /Lsomething), pcretest crashed. - -19. Added additional timing features to pcretest. (1) The -tm option now times - matching only, not compiling. (2) Both -t and -tm can be followed, as a - separate command line item, by a number that specifies the number of - repeats to use when timing. The default is 50000; this gives better - precision, but takes uncomfortably long for very large patterns. - -20. Extended pcre_study() to be more clever in cases where a branch of a - subpattern has no definite first character. For example, (a*|b*)[cd] would - previously give no result from pcre_study(). Now it recognizes that the - first character must be a, b, c, or d. - -21. 
There was an incorrect error "recursive call could loop indefinitely" if - a subpattern (or the entire pattern) that was being tested for matching an - empty string contained only one non-empty item after a nested subpattern. - For example, the pattern (?>\x{100}*)\d(?R) provoked this error - incorrectly, because the \d was being skipped in the check. - -22. The pcretest program now has a new pattern option /B and a command line - option -b, which is equivalent to adding /B to every pattern. This causes - it to show the compiled bytecode, without the additional information that - -d shows. The effect of -d is now the same as -b with -i (and similarly, /D - is the same as /B/I). - -23. A new optimization is now able automatically to treat some sequences such - as a*b as a*+b. More specifically, if something simple (such as a character - or a simple class like \d) has an unlimited quantifier, and is followed by - something that cannot possibly match the quantified thing, the quantifier - is automatically "possessified". - -24. A recursive reference to a subpattern whose number was greater than 39 - went wrong under certain circumstances in UTF-8 mode. This bug could also - have affected the operation of pcre_study(). - -25. Realized that a little bit of performance could be had by replacing - (c & 0xc0) == 0xc0 with c >= 0xc0 when processing UTF-8 characters. - -26. Timing data from pcretest is now shown to 4 decimal places instead of 3. - -27. Possessive quantifiers such as a++ were previously implemented by turning - them into atomic groups such as ($>a+). Now they have their own opcodes, - which improves performance. This includes the automatically created ones - from 23 above. - -28. A pattern such as (?=(\w+))\1: which simulates an atomic group using a - lookahead was broken if it was not anchored. PCRE was mistakenly expecting - the first matched character to be a colon. This applied both to named and - numbered groups. - -29. 
The ucpinternal.h header file was missing its idempotency #ifdef. - -30. I was sent a "project" file called libpcre.a.dev which I understand makes - building PCRE on Windows easier, so I have included it in the distribution. - -31. There is now a check in pcretest against a ridiculously large number being - returned by pcre_exec() or pcre_dfa_exec(). If this happens in a /g or /G - loop, the loop is abandoned. - -32. Forward references to subpatterns in conditions such as (?(2)...) where - subpattern 2 is defined later cause pcre_compile() to search forwards in - the pattern for the relevant set of parentheses. This search went wrong - when there were unescaped parentheses in a character class, parentheses - escaped with \Q...\E, or parentheses in a #-comment in /x mode. - -33. "Subroutine" calls and backreferences were previously restricted to - referencing subpatterns earlier in the regex. This restriction has now - been removed. - -34. Added a number of extra features that are going to be in Perl 5.10. On the - whole, these are just syntactic alternatives for features that PCRE had - previously implemented using the Python syntax or my own invention. The - other formats are all retained for compatibility. - - (a) Named groups can now be defined as (?...) or (?'name'...) as well - as (?P...). The new forms, as well as being in Perl 5.10, are - also .NET compatible. - - (b) A recursion or subroutine call to a named group can now be defined as - (?&name) as well as (?P>name). - - (c) A backreference to a named group can now be defined as \k or - \k'name' as well as (?P=name). The new forms, as well as being in Perl - 5.10, are also .NET compatible. - - (d) A conditional reference to a named group can now use the syntax - (?() or (?('name') as well as (?(name). - - (e) A "conditional group" of the form (?(DEFINE)...) can be used to define - groups (named and numbered) that are never evaluated inline, but can be - called as "subroutines" from elsewhere. 
In effect, the DEFINE condition - is always false. There may be only one alternative in such a group. - - (f) A test for recursion can be given as (?(R1).. or (?(R&name)... as well - as the simple (?(R). The condition is true only if the most recent - recursion is that of the given number or name. It does not search out - through the entire recursion stack. - - (g) The escape \gN or \g{N} has been added, where N is a positive or - negative number, specifying an absolute or relative reference. - -35. Tidied to get rid of some further signed/unsigned compiler warnings and - some "unreachable code" warnings. - -36. Updated the Unicode property tables to Unicode version 5.0.0. Amongst other - things, this adds five new scripts. - -37. Perl ignores orphaned \E escapes completely. PCRE now does the same. - There were also incompatibilities regarding the handling of \Q..\E inside - character classes, for example with patterns like [\Qa\E-\Qz\E] where the - hyphen was adjacent to \Q or \E. I hope I've cleared all this up now. - -38. Like Perl, PCRE detects when an indefinitely repeated parenthesized group - matches an empty string, and forcibly breaks the loop. There were bugs in - this code in non-simple cases. For a pattern such as ^(a()*)* matched - against aaaa the result was just "a" rather than "aaaa", for example. Two - separate and independent bugs (that affected different cases) have been - fixed. - -39. Refactored the code to abolish the use of different opcodes for small - capturing bracket numbers. This is a tidy that I avoided doing when I - removed the limit on the number of capturing brackets for 3.5 back in 2001. - The new approach is not only tidier, it makes it possible to reduce the - memory needed to fix the previous bug (38). - -40. 
Implemented PCRE_NEWLINE_ANY to recognize any of the Unicode newline - sequences (http://unicode.org/unicode/reports/tr18/) as "newline" when - processing dot, circumflex, or dollar metacharacters, or #-comments in /x - mode. - -41. Add \R to match any Unicode newline sequence, as suggested in the Unicode - report. - -42. Applied patch, originally from Ari Pollak, modified by Google, to allow - copy construction and assignment in the C++ wrapper. - -43. Updated pcregrep to support "--newline=any". In the process, I fixed a - couple of bugs that could have given wrong results in the "--newline=crlf" - case. - -44. Added a number of casts and did some reorganization of signed/unsigned int - variables following suggestions from Dair Grant. Also renamed the variable - "this" as "item" because it is a C++ keyword. - -45. Arranged for dftables to add - - #include "pcre_internal.h" - - to pcre_chartables.c because without it, gcc 4.x may remove the array - definition from the final binary if PCRE is built into a static library and - dead code stripping is activated. - -46. For an unanchored pattern, if a match attempt fails at the start of a - newline sequence, and the newline setting is CRLF or ANY, and the next two - characters are CRLF, advance by two characters instead of one. - - -Version 6.7 04-Jul-06 ---------------------- - - 1. In order to handle tests when input lines are enormously long, pcretest has - been re-factored so that it automatically extends its buffers when - necessary. The code is crude, but this _is_ just a test program. The - default size has been increased from 32K to 50K. - - 2. The code in pcre_study() was using the value of the re argument before - testing it for NULL. (Of course, in any sensible call of the function, it - won't be NULL.) - - 3. The memmove() emulation function in pcre_internal.h, which is used on - systems that lack both memmove() and bcopy() - that is, hardly ever - - was missing a "static" storage class specifier. - - 4. 
When UTF-8 mode was not set, PCRE looped when compiling certain patterns - containing an extended class (one that cannot be represented by a bitmap - because it contains high-valued characters or Unicode property items, e.g. - [\pZ]). Almost always one would set UTF-8 mode when processing such a - pattern, but PCRE should not loop if you do not (it no longer does). - [Detail: two cases were found: (a) a repeated subpattern containing an - extended class; (b) a recursive reference to a subpattern that followed a - previous extended class. It wasn't skipping over the extended class - correctly when UTF-8 mode was not set.] - - 5. A negated single-character class was not being recognized as fixed-length - in lookbehind assertions such as (?<=[^f]), leading to an incorrect - compile error "lookbehind assertion is not fixed length". - - 6. The RunPerlTest auxiliary script was showing an unexpected difference - between PCRE and Perl for UTF-8 tests. It turns out that it is hard to - write a Perl script that can interpret lines of an input file either as - byte characters or as UTF-8, which is what "perltest" was being required to - do for the non-UTF-8 and UTF-8 tests, respectively. Essentially what you - can't do is switch easily at run time between having the "use utf8;" pragma - or not. In the end, I fudged it by using the RunPerlTest script to insert - "use utf8;" explicitly for the UTF-8 tests. - - 7. In multiline (/m) mode, PCRE was matching ^ after a terminating newline at - the end of the subject string, contrary to the documentation and to what - Perl does. This was true of both matching functions. Now it matches only at - the start of the subject and immediately after *internal* newlines. - - 8. A call of pcre_fullinfo() from pcretest to get the option bits was passing - a pointer to an int instead of a pointer to an unsigned long int. This - caused problems on 64-bit systems. - - 9. 
Applied a patch from the folks at Google to pcrecpp.cc, to fix "another - instance of the 'standard' template library not being so standard". - -10. There was no check on the number of named subpatterns nor the maximum - length of a subpattern name. The product of these values is used to compute - the size of the memory block for a compiled pattern. By supplying a very - long subpattern name and a large number of named subpatterns, the size - computation could be caused to overflow. This is now prevented by limiting - the length of names to 32 characters, and the number of named subpatterns - to 10,000. - -11. Subpatterns that are repeated with specific counts have to be replicated in - the compiled pattern. The size of memory for this was computed from the - length of the subpattern and the repeat count. The latter is limited to - 65535, but there was no limit on the former, meaning that integer overflow - could in principle occur. The compiled length of a repeated subpattern is - now limited to 30,000 bytes in order to prevent this. - -12. Added the optional facility to have named substrings with the same name. - -13. Added the ability to use a named substring as a condition, using the - Python syntax: (?(name)yes|no). This overloads (?(R)... and names that - are numbers (not recommended). Forward references are permitted. - -14. Added forward references in named backreferences (if you see what I mean). - -15. In UTF-8 mode, with the PCRE_DOTALL option set, a quantified dot in the - pattern could run off the end of the subject. For example, the pattern - "(?s)(.{1,5})"8 did this with the subject "ab". - -16. If PCRE_DOTALL or PCRE_MULTILINE were set, pcre_dfa_exec() behaved as if - PCRE_CASELESS was set when matching characters that were quantified with ? - or *. - -17. A character class other than a single negated character that had a minimum - but no maximum quantifier - for example [ab]{6,} - was not handled - correctly by pce_dfa_exec(). 
It would match only one character. - -18. A valid (though odd) pattern that looked like a POSIX character - class but used an invalid character after [ (for example [[,abc,]]) caused - pcre_compile() to give the error "Failed: internal error: code overflow" or - in some cases to crash with a glibc free() error. This could even happen if - the pattern terminated after [[ but there just happened to be a sequence of - letters, a binary zero, and a closing ] in the memory that followed. - -19. Perl's treatment of octal escapes in the range \400 to \777 has changed - over the years. Originally (before any Unicode support), just the bottom 8 - bits were taken. Thus, for example, \500 really meant \100. Nowadays the - output from "man perlunicode" includes this: - - The regular expression compiler produces polymorphic opcodes. That - is, the pattern adapts to the data and automatically switches to - the Unicode character scheme when presented with Unicode data--or - instead uses a traditional byte scheme when presented with byte - data. - - Sadly, a wide octal escape does not cause a switch, and in a string with - no other multibyte characters, these octal escapes are treated as before. - Thus, in Perl, the pattern /\500/ actually matches \100 but the pattern - /\500|\x{1ff}/ matches \500 or \777 because the whole thing is treated as a - Unicode string. - - I have not perpetrated such confusion in PCRE. Up till now, it took just - the bottom 8 bits, as in old Perl. I have now made octal escapes with - values greater than \377 illegal in non-UTF-8 mode. In UTF-8 mode they - translate to the appropriate multibyte character. - -29. Applied some refactoring to reduce the number of warnings from Microsoft - and Borland compilers. This has included removing the fudge introduced - seven years ago for the OS/2 compiler (see 2.02/2 below) because it caused - a warning about an unused variable. - -21. 
PCRE has not included VT (character 0x0b) in the set of whitespace - characters since release 4.0, because Perl (from release 5.004) does not. - [Or at least, is documented not to: some releases seem to be in conflict - with the documentation.] However, when a pattern was studied with - pcre_study() and all its branches started with \s, PCRE still included VT - as a possible starting character. Of course, this did no harm; it just - caused an unnecessary match attempt. - -22. Removed a now-redundant internal flag bit that recorded the fact that case - dependency changed within the pattern. This was once needed for "required - byte" processing, but is no longer used. This recovers a now-scarce options - bit. Also moved the least significant internal flag bit to the most- - significant bit of the word, which was not previously used (hangover from - the days when it was an int rather than a uint) to free up another bit for - the future. - -23. Added support for CRLF line endings as well as CR and LF. As well as the - default being selectable at build time, it can now be changed at runtime - via the PCRE_NEWLINE_xxx flags. There are now options for pcregrep to - specify that it is scanning data with non-default line endings. - -24. Changed the definition of CXXLINK to make it agree with the definition of - LINK in the Makefile, by replacing LDFLAGS to CXXFLAGS. - -25. Applied Ian Taylor's patches to avoid using another stack frame for tail - recursions. This makes a big different to stack usage for some patterns. - -26. If a subpattern containing a named recursion or subroutine reference such - as (?P>B) was quantified, for example (xxx(?P>B)){3}, the calculation of - the space required for the compiled pattern went wrong and gave too small a - value. Depending on the environment, this could lead to "Failed: internal - error: code overflow at offset 49" or "glibc detected double free or - corruption" errors. - -27. 
Applied patches from Google (a) to support the new newline modes and (b) to - advance over multibyte UTF-8 characters in GlobalReplace. - -28. Change free() to pcre_free() in pcredemo.c. Apparently this makes a - difference for some implementation of PCRE in some Windows version. - -29. Added some extra testing facilities to pcretest: - - \q<number> in a data line sets the "match limit" value - \Q<number> in a data line sets the "match recursion limt" value - -S <number> sets the stack size, where <number> is in megabytes - - The -S option isn't available for Windows. - - Version 6.6 06-Feb-06 --------------------- --- ext/pcre/pcrelib/LICENCE 2007-02-13 21:23:28.000000000 +0100 +++ ext/pcre/pcrelib/LICENCE 2006-03-06 22:57:53.000000000 +0100 @@ -4,7 +4,7 @@ PCRE is a library of functions to support regular expressions whose syntax and semantics are as close as possible to those of the Perl 5 language. -Release 7 of PCRE is distributed under the terms of the "BSD" licence, as +Release 6 of PCRE is distributed under the terms of the "BSD" licence, as specified below. The documentation for PCRE, supplied in the "doc" directory, is distributed under the same terms as the software itself. @@ -31,7 +31,7 @@ Contributed by: Google Inc. -Copyright (c) 2006, Google Inc. +Copyright (c) 2005, Google Inc. All rights reserved. --- ext/pcre/pcrelib/NEWS 2007-02-13 21:23:28.000000000 +0100 +++ ext/pcre/pcrelib/NEWS 2006-03-06 22:57:53.000000000 +0100 @@ -1,47 +1,6 @@ News about PCRE releases ------------------------ -Release 7.0 23-Nov-06 ---------------------- - -This release has a new major number because there have been some internal -upheavals to facilitate the addition of new optimizations and other facilities, -and to make subsequent maintenance and extension easier. Compilation is likely -to be a bit slower, but there should be no major effect on runtime performance. -Previously compiled patterns are NOT upwards compatible with this release.
If -you have saved compiled patterns from a previous release, you will have to -re-compile them. Important changes that are visible to users are: - -1. The Unicode property tables have been updated to Unicode 5.0.0, which adds - some more scripts. - -2. The option PCRE_NEWLINE_ANY causes PCRE to recognize any Unicode newline - sequence as a newline. - -3. The \R escape matches a single Unicode newline sequence as a single unit. - -4. New features that will appear in Perl 5.10 are now in PCRE. These include - alternative Perl syntax for named parentheses, and Perl syntax for - recursion. - -5. The C++ wrapper interface has been extended by the addition of a - QuoteMeta function and the ability to allow copy construction and - assignment. - -For a complete list of changes, see the ChangeLog file. - - -Release 6.7 04-Jul-06 ---------------------- - -The main additions to this release are the ability to use the same name for -multiple sets of parentheses, and support for CRLF line endings in both the -library and pcregrep (and in pcretest for testing). - -Thanks to Ian Taylor, the stack usage for many kinds of pattern has been -significantly reduced for certain subject strings. - - Release 6.5 01-Feb-06 --------------------- --- ext/pcre/pcrelib/NON-UNIX-USE 2007-02-13 21:23:28.000000000 +0100 +++ ext/pcre/pcrelib/NON-UNIX-USE 2006-03-06 22:57:53.000000000 +0100 @@ -22,7 +22,7 @@ indented commands are suggestions from Mark Tetrode as to which commands you might use on a Windows system to build a static library. -(1) Copy or rename the file config.h.in as config.h, and change the macros that +(1) Copy or rename the file config.in as config.h, and change the macros that define HAVE_STRERROR and HAVE_MEMMOVE to define them as 1 rather than 0. Unfortunately, because of the way Unix autoconf works, the default setting has to be 0. You may also want to make changes to other macros in config.h. In @@ -31,7 +31,7 @@ your compiler gives to '\n'. 
rem Mark Tetrode's commands - copy config.h.in config.h + copy config.in config.h rem Use write, because notepad cannot handle UNIX files. Change values. write config.h @@ -56,7 +56,6 @@ pcre_globals.c pcre_info.c pcre_maketables.c - pcre_newline.c pcre_ord2utf8.c pcre_refcount.c pcre_study.c @@ -94,10 +93,10 @@ cl /F0x400000 pcretest.c pcre.lib pcreposix.lib (6) Run pcretest on the testinput files in the testdata directory, and check -that the output matches the corresponding testoutput files. Note that the -supplied files are in Unix format, with just LF characters as line terminators. -You may need to edit them to change this if your system uses a different -convention. +that the output matches the corresponding testoutput files. You must use the +-i option when checking testinput2. Note that the supplied files are in Unix +format, with just LF characters as line terminators. You may need to edit them +to change this if your system uses a different convention. rem Mark Tetrode's commands pcretest testdata\testinput1 testdata\myoutput1 @@ -136,17 +135,6 @@ Makefile.in to create Makefile, substituting suitable values for the variables at the head of the file. -Michael Roy sent these comments about building PCRE under Windows with BCC5.5: - - Some of the core BCC libraries have a version of PCRE from 1998 built in, - which can lead to pcre_exec() giving an erroneous PCRE_ERROR_NULL from a - version mismatch. I'm including an easy workaround below, if you'd like to - include it in the non-unix instructions: - - When linking a project with BCC5.5, pcre.lib must be included before any of - the libraries cw32.lib, cw32i.lib, cw32mt.lib, and cw32mti.lib on the command - line. - Some help in building a Win32 DLL of PCRE in GnuWin32 environments was contributed by Paul Sokolovsky. 
These environments are Mingw32 (http://www.xraylith.wisc.edu/~khan/software/gnu-win32/) and CygWin --- ext/pcre/pcrelib/README 2007-02-13 21:23:28.000000000 +0100 +++ ext/pcre/pcrelib/README 2006-03-06 22:57:53.000000000 +0100 @@ -34,7 +34,7 @@ ---------------------- If you install PCRE in the normal way, you will end up with an installed set of -man pages whose names all start with "pcre". The one that is just called "pcre" +man pages whose names all start with "pcre". The one that is called "pcre" lists all the others. In addition to these man pages, the PCRE documentation is supplied in two other forms; however, as there is no standard place to install them, they are left in the doc directory of the unpacked source distribution. @@ -114,17 +114,15 @@ . If, in addition to support for UTF-8 character strings, you want to include support for the \P, \p, and \X sequences that recognize Unicode character properties, you must add --enable-unicode-properties to the "configure" - command. This adds about 30K to the size of the library (in the form of a + command. This adds about 90K to the size of the library (in the form of a property table); only the basic two-letter properties such as Lu are supported. -. You can build PCRE to recognize either CR or LF or the sequence CRLF or any - of the Unicode newline sequences as indicating the end of a line. Whatever - you specify at build time is the default; the caller of PCRE can change the - selection at run time. The default newline indicator is a single LF character - (the Unix standard). You can specify the default newline indicator by adding - --newline-is-cr or --newline-is-lf or --newline-is-crlf or --newline-is-any - to the "configure" command, respectively. +. You can build PCRE to recognize either CR or LF as the newline character, + instead of whatever your compiler uses for "\n", by adding --newline-is-cr or + --newline-is-lf to the "configure" command, respectively. 
Only do this if you + really understand what you are doing. On traditional Unix-like systems, the + newline character is LF. . When called via the POSIX interface, PCRE uses malloc() to get additional storage for processing capturing parentheses if there are more than 10 of @@ -144,16 +142,6 @@ pcre_exec() can supply their own value. There is discussion on the pcreapi man page. -. There is a separate counter that limits the depth of recursive function calls - during a matching process. This also has a default of ten million, which is - essentially "unlimited". You can change the default by setting, for example, - - --with-match-limit-recursion=500000 - - Recursive function calls use up the runtime stack; running out of stack can - cause programs to crash in strange ways. There is a discussion about stack - sizes in the pcrestack man page. - . The default maximum compiled pattern size is around 64K. You can increase this by adding --with-link-size=3 to the "configure" command. You can increase it even more by setting --with-link-size=4, but this is unlikely @@ -177,6 +165,7 @@ The "configure" script builds eight files for the basic C library: +. pcre.h is the header file for C programs that call PCRE . Makefile is the makefile that builds the library . config.h contains build-time configuration options for the library . pcre-config is a script that shows the settings of "configure" options @@ -283,7 +272,7 @@ Using HP's ANSI C++ compiler (aCC) ---------------------------------- -Unless C++ support is disabled by specifying the "--disable-cpp" option of the +Unless C++ support is disabled by specifiying the "--disable-cpp" option of the "configure" script, you *must* include the "-AA" option in the CXXFLAGS environment variable in order for the C++ components to compile correctly. @@ -305,8 +294,8 @@ PCRE has been compiled on Windows systems and on Macintoshes, but I don't know the details because I don't use those systems. 
It should be straightforward to -build PCRE on any system that has a Standard C compiler and library, because it -uses only Standard C functions. +build PCRE on any system that has a Standard C compiler, because it uses only +Standard C functions. Testing PCRE @@ -325,15 +314,15 @@ The RunTest script runs the pcretest test program (which is documented in its own man page) on each of the testinput files (in the testdata directory) in turn, and compares the output with the contents of the corresponding testoutput -files. A file called testtry is used to hold the main output from pcretest +file. A file called testtry is used to hold the main output from pcretest (testsavedregex is also used as a working file). To run pcretest on just one of the test files, give its number as an argument to RunTest, for example: RunTest 2 -The first test file can also be fed directly into the perltest script to check -that Perl gives the same results. The only difference you should see is in the -first few lines, where the Perl version is given instead of the PCRE version. +The first file can also be fed directly into the perltest script to check that +Perl gives the same results. The only difference you should see is in the first +few lines, where the Perl version is given instead of the PCRE version. 
The second set of tests check pcre_fullinfo(), pcre_info(), pcre_study(), pcre_copy_substring(), pcre_get_substring(), pcre_get_substring_list(), error @@ -442,26 +431,26 @@ pcre_globals.c ) and some internal functions that they use pcre_info.c ) pcre_maketables.c ) - pcre_newline.c ) pcre_ord2utf8.c ) - pcre_refcount.c ) + pcre_printint.c ) pcre_study.c ) pcre_tables.c ) pcre_try_flipped.c ) - pcre_ucp_searchfuncs.c) + pcre_ucp_findchar.c ) pcre_valid_utf8.c ) pcre_version.c ) pcre_xclass.c ) - ucptable.c ) - pcre_printint.src ) debugging function that is #included in pcretest, and - ) can also be #included in pcre_compile() + ucp_findchar.c ) + ucp.h ) source for the code that is used for + ucpinternal.h ) Unicode property handling + ucptable.c ) + ucptypetable.c ) - pcre.h the public PCRE header file + pcre.in "source" for the header for the external API; pcre.h + is built from this by "configure" pcreposix.h header for the external POSIX wrapper API pcre_internal.h header for internal use - ucp.h ) headers concerned with - ucpinternal.h ) Unicode property handling config.in template for config.h, which is built by configure pcrecpp.h the header file for the C++ wrapper @@ -488,9 +477,8 @@ RunGrepTest.in template for a Unix shell script for pcregrep tests config.guess ) files used by libtool, config.sub ) used only when building a shared library - config.h.in "source" for the config.h header file configure a configuring shell script (built by autoconf) - configure.ac the autoconf input used to build configure + configure.in the autoconf input used to build configure doc/Tech.Notes notes on the encoding doc/*.3 man page sources for the PCRE functions doc/*.1 man page sources for pcregrep and pcretest @@ -518,6 +506,7 @@ libpcre.def libpcreposix.def + pcre.def (D) Auxiliary file for VPASCAL @@ -526,4 +515,4 @@ Philip Hazel Email local part: ph10 Email domain: cam.ac.uk -November 2006 +January 2006 --- ext/pcre/pcrelib/dftables.c 2007-02-13 21:23:28.000000000 +0100 
+++ ext/pcre/pcrelib/dftables.c 2006-03-06 22:57:53.000000000 +0100 @@ -86,16 +86,7 @@ fprintf(f, "This file contains the default tables for characters with codes less than\n" "128 (ASCII characters). These tables are used when no external tables are\n" - "passed to PCRE.\n\n"); -fprintf(f, - "The following #include is present because without it gcc 4.x may remove\n" - "the array definition from the final binary if PCRE is built into a static\n" - "library and dead code stripping is activated. This leads to link errors.\n" - "Pulling in the header ensures that the array gets flagged as \"someone\n" - "outside this compilation unit might reference this\" and so it will always\n" - "be supplied to the linker. */\n\n" - "#include \"pcre_internal.h\"\n\n"); -fprintf(f, + "passed to PCRE. */\n\n" "const unsigned char _pcre_default_tables[] = {\n\n" "/* This table is a lower casing table. */\n\n"); --- ext/pcre/pcrelib/doc/Tech.Notes 2007-02-13 21:23:29.000000000 +0100 +++ ext/pcre/pcrelib/doc/Tech.Notes 2006-03-06 22:57:53.000000000 +0100 @@ -1,9 +1,6 @@ Technical Notes about PCRE -------------------------- -These are very rough technical notes that record potentially useful information -about PCRE internals. - Historical note 1 ----------------- @@ -16,23 +13,21 @@ Perl code does, but instead checked all possibilities simultaneously by keeping a list of current states and checking all of them as it advanced through the subject string. In the terminology of Jeffrey Friedl's book, it was a "DFA -algorithm", though it was not a traditional Finite State Machine (FSM). When -the pattern was all used up, all remaining states were possible matches, and -the one matching the longest subset of the subject string was chosen. This did -not necessarily maximize the individual wild portions of the pattern, as is -expected in Unix and Perl-style regular expressions. +algorithm". 
When the pattern was all used up, all remaining states were +possible matches, and the one matching the longest subset of the subject string +was chosen. This did not necessarily maximize the individual wild portions of +the pattern, as is expected in Unix and Perl-style regular expressions. Historical note 2 ----------------- -By contrast, the code originally written by Henry Spencer (which was -subsequently heavily modified for Perl) compiles the expression twice: once in -a dummy mode in order to find out how much store will be needed, and then for -real. (The Perl version probably doesn't do this any more; I'm talking about -the original library.) The execution function operates by backtracking and -maximizing (or, optionally, minimizing in Perl) the amount of the subject that -matches individual wild portions of the pattern. This is an "NFA algorithm" in -Friedl's terminology. +By contrast, the code originally written by Henry Spencer and subsequently +heavily modified for Perl actually compiles the expression twice: once in a +dummy mode in order to find out how much store will be needed, and then for +real. The execution function operates by backtracking and maximizing (or, +optionally, minimizing in Perl) the amount of the subject that matches +individual wild portions of the pattern. This is an "NFA algorithm" in Friedl's +terminology. OK, here's the real stuff ------------------------- @@ -42,38 +37,14 @@ that used an amount of store bounded by a multiple of the number of characters in the pattern, to save on compiling time. However, because of the greater complexity in Perl regular expressions, I couldn't do this. In any case, a -first pass through the pattern is helpful for other reasons. 
- -Computing the memory requirement: how it was --------------------------------------------- - -Up to and including release 6.7, PCRE worked by running a very degenerate first -pass to calculate a maximum store size, and then a second pass to do the real -compile - which might use a bit less than the predicted amount of memory. The -idea was that this would turn out faster than the Henry Spencer code because -the first pass is degenerate and the second pass can just store stuff straight -into the vector, which it knows is big enough. - -Computing the memory requirement: how it is -------------------------------------------- - -By the time I was working on a potential 6.8 release, the degenerate first pass -had become very complicated and hard to maintain. Indeed one of the early -things I did for 6.8 was to fix Yet Another Bug in the memory computation. Then -I had a flash of inspiration as to how I could run the real compile function in -a "fake" mode that enables it to compute how much memory it would need, while -actually only ever using a few hundred bytes of working memory, and without too -many tests of the mode that might slow it down. So I re-factored the compiling -functions to work this way. This got rid of about 600 lines of source. It -should make future maintenance and development easier. As this was such a major -change, I never released 6.8, instead upping the number to 7.0 (other quite -major changes are also present in the 7.0 release). - -A side effect of this work is that the previous limit of 200 on the nesting -depth of parentheses was removed. However, there is a downside: pcre_compile() -runs more slowly than before (30% or more, depending on the pattern) because it -is doing a full analysis of the pattern. My hope is that this is not a big -issue. +first pass through the pattern is needed, for a number of reasons. 
PCRE works +by running a very degenerate first pass to calculate a maximum store size, and +then a second pass to do the real compile - which may use a bit less than the +predicted amount of store. The idea is that this is going to turn out faster +because the first pass is degenerate and the second pass can just store stuff +straight into the vector, which it knows is big enough. It does make the +compiling functions bigger, of course, but they have got quite big anyway to +handle all the Perl stuff. Traditional matching function ----------------------------- @@ -92,15 +63,9 @@ simultaneously for all possible matches that start at one point in the subject string. (Going back to my roots: see Historical Note 1 above.) This function intreprets the same compiled pattern data as pcre_exec(); however, not all the -facilities are available, and those that are do not always work in quite the +facilities are available, and those that are don't always work in quite the same way. See the user documentation for details. -The algorithm that is used for pcre_dfa_exec() is not a traditional FSM, -because it may have a number of states active at one time. More work would be -needed at compile time to produce a traditional FSM where only one state is -ever active at once. I believe some other regex matchers work this way. - - Format of compiled patterns --------------------------- @@ -110,12 +75,10 @@ follow it. In many cases below "two-byte" data values are specified. This is in fact just -a default when the number is an offset within the compiled pattern. PCRE can be -compiled to use 3-byte or 4-byte values for these offsets (impairing the +a default. PCRE can be compiled to use 3-byte or 4-byte values (impairing the performance). This is necessary only when patterns whose compiled length is -greater than 64K are going to be processed. In this description, we assume the -"normal" compilation options. "Two-byte" data values that are counts (e.g. 
for -quantifiers) are always just two bytes. +greater than 64K are going to be processed. In this description, we assume the +"normal" compilation options. A list of all the opcodes follows: @@ -142,7 +105,6 @@ OP_EOD match end of data: \z OP_DOLL $ (end of data, or before \n in multiline) OP_EXTUNI match an extended Unicode character - OP_ANYNL match any Unicode newline sequence Repeating single characters @@ -153,28 +115,23 @@ OP_STAR OP_MINSTAR - OP_POSSTAR OP_PLUS OP_MINPLUS - OP_POSPLUS OP_QUERY OP_MINQUERY - OP_POSQUERY In ASCII mode, these are two-byte items; in UTF-8 mode, the length is variable. -Those with "MIN" in their name are the minimizing versions. Those with "POS" in -their names are possessive versions. Each is followed by the character that is -to be repeated. Other repeats make use of +Those with "MIN" in their name are the minimizing versions. Each is followed by +the character that is to be repeated. Other repeats make use of OP_UPTO OP_MINUPTO - OP_POSUPTO OP_EXACT which are followed by a two-byte count (most significant first) and the repeated character. OP_UPTO matches from 0 to the given number. A repeat with a non-zero minimum and a fixed maximum is coded as an OP_EXACT followed by an -OP_UPTO (or OP_MINUPTO or OPT_POSUPTO). +OP_UPTO (or OP_MINUPTO). Repeating character types @@ -186,16 +143,12 @@ OP_TYPESTAR OP_TYPEMINSTAR - OP_TYPEPOSSTAR OP_TYPEPLUS OP_TYPEMINPLUS - OP_TYPEPOSPLUS OP_TYPEQUERY OP_TYPEMINQUERY - OP_TYPEPOSQUERY OP_TYPEUPTO OP_TYPEMINUPTO - OP_TYPEPOSUPTO OP_TYPEEXACT @@ -204,12 +157,10 @@ OP_PROP and OP_NOTPROP are used for positive and negative matches of a character by testing its Unicode property (the \p and \P escape sequences). -Each is followed by two bytes that encode the desired property as a type and a -value. +Each is followed by a single byte that encodes the desired property value. -Repeats of these items use the OP_TYPESTAR etc. 
set of opcodes, followed by -three bytes: OP_PROP or OP_NOTPROP and then the desired property type and -value. +Repeats of these items use the OP_TYPESTAR etc. set of opcodes, followed by two +bytes: OP_PROP or OP_NOTPROP and then the desired property value. Matching literal characters @@ -259,10 +210,9 @@ Repeating character classes and back references ----------------------------------------------- -Single-character classes are handled specially (see above). This section -applies to OP_CLASS and OP_REF. In both cases, the repeat information follows -the base item. The matching code looks at the following opcode to see if it is -one of +Single-character classes are handled specially (see above). This applies to +OP_CLASS and OP_REF. In both cases, the repeat information follows the base +item. The matching code looks at the following opcode to see if it is one of OP_CRSTAR OP_CRMINSTAR @@ -274,9 +224,7 @@ OP_CRMINRANGE All but the last two are just single-byte items. The others are followed by -four bytes of data, comprising the minimum and maximum repeat counts. There are -no special possessive opcodes for these repeats; a possessive repeat is -compiled into an atomic group. +four bytes of data, comprising the minimum and maximum repeat counts. Brackets and alternation @@ -285,25 +233,29 @@ A pair of non-capturing (round) brackets is wrapped round each expression at compile time, so alternation always happens in the context of brackets. -[Note for North Americans: "bracket" to some English speakers, including -myself, can be round, square, curly, or pointy. Hence this usage.] - -Non-capturing brackets use the opcode OP_BRA. Originally PCRE was limited to 99 -capturing brackets and it used a different opcode for each one. From release -3.5, the limit was removed by putting the bracket number into the data for -higher-numbered brackets. From release 7.0 all capturing brackets are handled -this way, using the single opcode OP_CBRA. 
+Non-capturing brackets use the opcode OP_BRA, while capturing brackets use +OP_BRA+1, OP_BRA+2, etc. [Note for North Americans: "bracket" to some English +speakers, including myself, can be round, square, curly, or pointy. Hence this +usage.] + +Originally PCRE was limited to 99 capturing brackets (so as not to use up all +the opcodes). From release 3.5, there is no limit. What happens is that the +first ones, up to EXTRACT_BASIC_MAX are handled with separate opcodes, as +above. If there are more, the opcode is set to EXTRACT_BASIC_MAX+1, and the +first operation in the bracket is OP_BRANUMBER, followed by a 2-byte bracket +number. This opcode is ignored while matching, but is fished out when handling +the bracket itself. (They could have all been done like this, but I was making +minimal changes.) A bracket opcode is followed by LINK_SIZE bytes which give the offset to the next alternative OP_ALT or, if there aren't any branches, to the matching OP_KET opcode. Each OP_ALT is followed by LINK_SIZE bytes giving the offset to -the next one, or to the OP_KET opcode. For capturing brackets, the bracket -number immediately follows the offset, always as a 2-byte item. +the next one, or to the OP_KET opcode. OP_KET is used for subpatterns that do not repeat indefinitely, while OP_KETRMIN and OP_KETRMAX are used for indefinite repetitions, minimally or maximally respectively. All three are followed by LINK_SIZE bytes giving (as a -positive number) the offset back to the matching bracket opcode. +positive number) the offset back to the matching OP_BRA opcode. If a subpattern is quantified such that it is permitted to match zero times, it is preceded by one of OP_BRAZERO or OP_BRAMINZERO. 
These are single-byte @@ -318,14 +270,7 @@ A subpattern with a bounded maximum repetition is replicated in a nested fashion up to the maximum number of times, with OP_BRAZERO or OP_BRAMINZERO before each replication after the minimum, so that, for example, (abc){2,5} is -compiled as (abc)(abc)((abc)((abc)(abc)?)?)?, except that each bracketed group -has the same number. - -When a repeated subpattern has an unbounded upper limit, it is checked to see -whether it could match an empty string. If this is the case, the opcode in the -final replication is changed to OP_SBRA or OP_SCBRA. This tells the matcher -that it needs to check for matching an empty string when it hits OP_KETRMIN or -OP_KETRMAX, and if so, to break the loop. +compiled as (abc)(abc)((abc)((abc)(abc)?)?)?. Assertions @@ -341,27 +286,22 @@ fixed lengths. -Once-only (atomic) subpatterns ------------------------------- +Once-only subpatterns +--------------------- These are also just like other subpatterns, but they start with the opcode -OP_ONCE. The check for matching an empty string in an unbounded repeat is -handled entirely at runtime, so there is just this one opcode. +OP_ONCE. Conditional subpatterns ----------------------- -These are like other subpatterns, but they start with the opcode OP_COND, or -OP_SCOND for one that might match an empty string in an unbounded repeat. If +These are like other subpatterns, but they start with the opcode OP_COND. If the condition is a back reference, this is stored at the start of the subpattern using the opcode OP_CREF followed by two bytes containing the -reference number. If the condition is "in recursion" (coded as "(?(R)"), or "in -recursion of group x" (coded as "(?(Rx)"), the group number is stored at the -start of the subpattern using the opcode OP_RREF, and a value of zero for "the -whole pattern". For a DEFINE condition, just the single byte OP_DEF is used (it -has no associated data). 
Otherwise, a conditional subpattern always starts with -one of the assertions. +reference number. If the condition is "in recursion" (coded as "(?(R)"), the +same scheme is used, with a "reference number" of 0xffff. Otherwise, a +conditional subpattern always starts with one of the assertions. Recursion @@ -399,4 +339,4 @@ data. Philip Hazel -November 2006 +January 2006 --- ext/pcre/pcrelib/doc/pcre.txt 2007-02-13 21:23:29.000000000 +0100 +++ ext/pcre/pcrelib/doc/pcre.txt 2006-03-06 22:57:53.000000000 +0100 @@ -18,57 +18,52 @@ The PCRE library is a set of functions that implement regular expres- sion pattern matching using the same syntax and semantics as Perl, with - just a few differences. (Certain features that appeared in Python and - PCRE before they appeared in Perl are also available using the Python - syntax.) - - The current implementation of PCRE (release 7.x) corresponds approxi- - mately with Perl 5.10, including support for UTF-8 encoded strings and - Unicode general category properties. However, UTF-8 and Unicode support - has to be explicitly enabled; it is not the default. The Unicode tables - correspond to Unicode release 5.0.0. - - In addition to the Perl-compatible matching function, PCRE contains an - alternative matching function that matches the same compiled patterns - in a different way. In certain circumstances, the alternative function - has some advantages. For a discussion of the two matching algorithms, - see the pcrematching page. - - PCRE is written in C and released as a C library. A number of people - have written wrappers and interfaces of various kinds. In particular, - Google Inc. have provided a comprehensive C++ wrapper. This is now + just a few differences. The current implementation of PCRE (release + 6.x) corresponds approximately with Perl 5.8, including support for + UTF-8 encoded strings and Unicode general category properties. However, + this support has to be explicitly enabled; it is not the default. 
+ + In addition to the Perl-compatible matching function, PCRE also con- + tains an alternative matching function that matches the same compiled + patterns in a different way. In certain circumstances, the alternative + function has some advantages. For a discussion of the two matching + algorithms, see the pcrematching page. + + PCRE is written in C and released as a C library. A number of people + have written wrappers and interfaces of various kinds. In particular, + Google Inc. have provided a comprehensive C++ wrapper. This is now included as part of the PCRE distribution. The pcrecpp page has details - of this interface. Other people's contributions can be found in the + of this interface. Other people's contributions can be found in the Contrib directory at the primary FTP site, which is: ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre - Details of exactly which Perl regular expression features are and are + Details of exactly which Perl regular expression features are and are not supported by PCRE are given in separate documents. See the pcrepat- tern and pcrecompat pages. - Some features of PCRE can be included, excluded, or changed when the - library is built. The pcre_config() function makes it possible for a - client to discover which features are available. The features them- - selves are described in the pcrebuild page. Documentation about build- - ing PCRE for various operating systems can be found in the README file + Some features of PCRE can be included, excluded, or changed when the + library is built. The pcre_config() function makes it possible for a + client to discover which features are available. The features them- + selves are described in the pcrebuild page. Documentation about build- + ing PCRE for various operating systems can be found in the README file in the source distribution. 
- The library contains a number of undocumented internal functions and - data tables that are used by more than one of the exported external - functions, but which are not intended for use by external callers. - Their names all begin with "_pcre_", which hopefully will not provoke + The library contains a number of undocumented internal functions and + data tables that are used by more than one of the exported external + functions, but which are not intended for use by external callers. + Their names all begin with "_pcre_", which hopefully will not provoke any name clashes. In some environments, it is possible to control which - external symbols are exported when a shared library is built, and in + external symbols are exported when a shared library is built, and in these cases the undocumented symbols are not exported. USER DOCUMENTATION - The user documentation for PCRE comprises a number of different sec- - tions. In the "man" format, each of these is a separate "man page". In - the HTML format, each is a separate page, linked from the index page. - In the plain text format, all the sections are concatenated, for ease + The user documentation for PCRE comprises a number of different sec- + tions. In the "man" format, each of these is a separate "man page". In + the HTML format, each is a separate page, linked from the index page. + In the plain text format, all the sections are concatenated, for ease of searching. The sections are as follows: pcre this document @@ -86,105 +81,98 @@ pcreposix the POSIX-compatible C API pcreprecompile details of saving and re-using precompiled patterns pcresample discussion of the sample program - pcrestack discussion of stack usage pcretest description of the pcretest testing command - In addition, in the "man" and HTML formats, there is a short page for + In addition, in the "man" and HTML formats, there is a short page for each C library function, listing its arguments and results. 
LIMITATIONS - There are some size limitations in PCRE but it is hoped that they will + There are some size limitations in PCRE but it is hoped that they will never in practice be relevant. - The maximum length of a compiled pattern is 65539 (sic) bytes if PCRE + The maximum length of a compiled pattern is 65539 (sic) bytes if PCRE is compiled with the default internal linkage size of 2. If you want to - process regular expressions that are truly enormous, you can compile - PCRE with an internal linkage size of 3 or 4 (see the README file in - the source distribution and the pcrebuild documentation for details). - In these cases the limit is substantially larger. However, the speed - of execution is slower. - - All values in repeating quantifiers must be less than 65536. The maxi- - mum compiled length of subpattern with an explicit repeat count is - 30000 bytes. The maximum number of capturing subpatterns is 65535. + process regular expressions that are truly enormous, you can compile + PCRE with an internal linkage size of 3 or 4 (see the README file in + the source distribution and the pcrebuild documentation for details). + In these cases the limit is substantially larger. However, the speed + of execution will be slower. + + All values in repeating quantifiers must be less than 65536. The maxi- + mum number of capturing subpatterns is 65535. + + There is no limit to the number of non-capturing subpatterns, but the + maximum depth of nesting of all kinds of parenthesized subpattern, + including capturing subpatterns, assertions, and other types of subpat- + tern, is 200. - There is no limit to the number of parenthesized subpatterns, but there - can be no more than 65535 capturing subpatterns. - - The maximum length of name for a named subpattern is 32 characters, and - the maximum number of named subpatterns is 10000. - - The maximum length of a subject string is the largest positive number - that an integer variable can hold. 
However, when using the traditional + The maximum length of a subject string is the largest positive number + that an integer variable can hold. However, when using the traditional matching function, PCRE uses recursion to handle subpatterns and indef- - inite repetition. This means that the available stack space may limit + inite repetition. This means that the available stack space may limit the size of a subject string that can be processed by certain patterns. - For a discussion of stack issues, see the pcrestack documentation. UTF-8 AND UNICODE PROPERTY SUPPORT - From release 3.3, PCRE has had some support for character strings - encoded in the UTF-8 format. For release 4.0 this was greatly extended - to cover most common requirements, and in release 5.0 additional sup- + From release 3.3, PCRE has had some support for character strings + encoded in the UTF-8 format. For release 4.0 this was greatly extended + to cover most common requirements, and in release 5.0 additional sup- port for Unicode general category properties was added. - In order process UTF-8 strings, you must build PCRE to include UTF-8 - support in the code, and, in addition, you must call pcre_compile() - with the PCRE_UTF8 option flag. When you do this, both the pattern and - any subject strings that are matched against it are treated as UTF-8 + In order process UTF-8 strings, you must build PCRE to include UTF-8 + support in the code, and, in addition, you must call pcre_compile() + with the PCRE_UTF8 option flag. When you do this, both the pattern and + any subject strings that are matched against it are treated as UTF-8 strings instead of just strings of bytes. - If you compile PCRE with UTF-8 support, but do not use it at run time, - the library will be a bit bigger, but the additional run time overhead - is limited to testing the PCRE_UTF8 flag occasionally, so should not be - very big. 
+ If you compile PCRE with UTF-8 support, but do not use it at run time, + the library will be a bit bigger, but the additional run time overhead + is limited to testing the PCRE_UTF8 flag in several places, so should + not be very large. If PCRE is built with Unicode character property support (which implies - UTF-8 support), the escape sequences \p{..}, \P{..}, and \X are sup- + UTF-8 support), the escape sequences \p{..}, \P{..}, and \X are sup- ported. The available properties that can be tested are limited to the - general category properties such as Lu for an upper case letter or Nd - for a decimal number, the Unicode script names such as Arabic or Han, - and the derived properties Any and L&. A full list is given in the + general category properties such as Lu for an upper case letter or Nd + for a decimal number, the Unicode script names such as Arabic or Han, + and the derived properties Any and L&. A full list is given in the pcrepattern documentation. Only the short names for properties are sup- - ported. For example, \p{L} matches a letter. Its Perl synonym, \p{Let- - ter}, is not supported. Furthermore, in Perl, many properties may - optionally be prefixed by "Is", for compatibility with Perl 5.6. PCRE + ported. For example, \p{L} matches a letter. Its Perl synonym, \p{Let- + ter}, is not supported. Furthermore, in Perl, many properties may + optionally be prefixed by "Is", for compatibility with Perl 5.6. PCRE does not support this. The following comments apply when PCRE is running in UTF-8 mode: - 1. When you set the PCRE_UTF8 flag, the strings passed as patterns and - subjects are checked for validity on entry to the relevant functions. + 1. When you set the PCRE_UTF8 flag, the strings passed as patterns and + subjects are checked for validity on entry to the relevant functions. If an invalid UTF-8 string is passed, an error return is given. 
In some - situations, you may already know that your strings are valid, and + situations, you may already know that your strings are valid, and therefore want to skip these checks in order to improve performance. If - you set the PCRE_NO_UTF8_CHECK flag at compile time or at run time, - PCRE assumes that the pattern or subject it is given (respectively) - contains only valid UTF-8 codes. In this case, it does not diagnose an - invalid UTF-8 string. If you pass an invalid UTF-8 string to PCRE when - PCRE_NO_UTF8_CHECK is set, the results are undefined. Your program may + you set the PCRE_NO_UTF8_CHECK flag at compile time or at run time, + PCRE assumes that the pattern or subject it is given (respectively) + contains only valid UTF-8 codes. In this case, it does not diagnose an + invalid UTF-8 string. If you pass an invalid UTF-8 string to PCRE when + PCRE_NO_UTF8_CHECK is set, the results are undefined. Your program may crash. - 2. An unbraced hexadecimal escape sequence (such as \xb3) matches a + 2. An unbraced hexadecimal escape sequence (such as \xb3) matches a two-byte UTF-8 character if the value is greater than 127. - 3. Octal numbers up to \777 are recognized, and match two-byte UTF-8 - characters for values greater than \177. - - 4. Repeat quantifiers apply to complete UTF-8 characters, not to indi- + 3. Repeat quantifiers apply to complete UTF-8 characters, not to indi- vidual bytes, for example: \x{100}{3}. - 5. The dot metacharacter matches one UTF-8 character instead of a sin- + 4. The dot metacharacter matches one UTF-8 character instead of a sin- gle byte. - 6. The escape sequence \C can be used to match a single byte in UTF-8 + 5. The escape sequence \C can be used to match a single byte in UTF-8 mode, but its use can lead to some strange effects. This facility is not available in the alternative matching function, pcre_dfa_exec(). - 7. The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly + 6. 
The character escapes \b, \B, \d, \D, \s, \S, \w, and \W correctly test characters of any code value, but the characters that PCRE recog- nizes as digits, spaces, or word characters remain the same set as before, all with values less than 256. This remains true even when PCRE @@ -193,10 +181,10 @@ sense of, say, "digit", you must use Unicode property tests such as \p{Nd}. - 8. Similarly, characters that match the POSIX named character classes + 7. Similarly, characters that match the POSIX named character classes are all low-valued characters. - 9. Case-insensitive matching applies only to characters whose values + 8. Case-insensitive matching applies only to characters whose values are less than 128, unless PCRE is built with Unicode property support. Even when Unicode property support is available, PCRE still uses its own character tables when checking the case of low-valued characters, @@ -212,13 +200,13 @@ Philip Hazel University Computing Service, - Cambridge CB2 3QH, England. + Cambridge CB2 3QG, England. Putting an actual email address here seems to have been a spam magnet, so I've taken it away. If you want to email me, use my initial and sur- name, separated by a dot, at the domain ucs.cam.ac.uk. -Last updated: 23 November 2006 +Last updated: 24 January 2006 Copyright (c) 1997-2006 University of Cambridge. ------------------------------------------------------------------------------ @@ -293,36 +281,21 @@ CODE VALUE OF NEWLINE - By default, PCRE interprets character 10 (linefeed, LF) as indicating - the end of a line. This is the normal newline character on Unix-like - systems. You can compile PCRE to use character 13 (carriage return, CR) - instead, by adding + By default, PCRE treats character 10 (linefeed) as the newline charac- + ter. This is the normal newline character on Unix-like systems. You can + compile PCRE to use character 13 (carriage return) instead by adding --enable-newline-is-cr - to the configure command. 
There is also a --enable-newline-is-lf - option, which explicitly specifies linefeed as the newline character. - - Alternatively, you can specify that line endings are to be indicated by - the two character sequence CRLF. If you want this, add - - --enable-newline-is-crlf - - to the configure command. There is a fourth option, specified by - - --enable-newline-is-any - - which causes PCRE to recognize any Unicode newline sequence. - - Whatever line ending convention is selected when PCRE is built can be - overridden when the library functions are called. At build time it is - conventional to use the standard for your operating system. + to the configure command. For completeness there is also a --enable- + newline-is-lf option, which explicitly specifies linefeed as the new- + line character. BUILDING SHARED AND STATIC LIBRARIES - The PCRE building process uses libtool to build both shared and static - Unix libraries by default. You can suppress one of these by adding one + The PCRE building process uses libtool to build both shared and static + Unix libraries by default. You can suppress one of these by adding one of --disable-shared @@ -334,9 +307,9 @@ POSIX MALLOC USAGE When PCRE is called through the POSIX interface (see the pcreposix doc- - umentation), additional working storage is required for holding the - pointers to capturing substrings, because PCRE requires three integers - per substring, whereas the POSIX interface provides only two. If the + umentation), additional working storage is required for holding the + pointers to capturing substrings, because PCRE requires three integers + per substring, whereas the POSIX interface provides only two. If the number of expected substrings is small, the wrapper function uses space on the stack, because this is faster than using malloc() for each call. The default threshold above which the stack is no longer used is 10; it @@ -347,85 +320,70 @@ to the configure command. 
+LIMITING PCRE RESOURCE USAGE + + Internally, PCRE has a function called match(), which it calls repeat- + edly (possibly recursively) when matching a pattern with the + pcre_exec() function. By controlling the maximum number of times this + function may be called during a single matching operation, a limit can + be placed on the resources used by a single call to pcre_exec(). The + limit can be changed at run time, as described in the pcreapi documen- + tation. The default is 10 million, but this can be changed by adding a + setting such as + + --with-match-limit=500000 + + to the configure command. This setting has no effect on the + pcre_dfa_exec() matching function. + + HANDLING VERY LARGE PATTERNS - Within a compiled pattern, offset values are used to point from one - part to another (for example, from an opening parenthesis to an alter- - nation metacharacter). By default, two-byte values are used for these - offsets, leading to a maximum size for a compiled pattern of around - 64K. This is sufficient to handle all but the most gigantic patterns. - Nevertheless, some people do want to process enormous patterns, so it - is possible to compile PCRE to use three-byte or four-byte offsets by + Within a compiled pattern, offset values are used to point from one + part to another (for example, from an opening parenthesis to an alter- + nation metacharacter). By default, two-byte values are used for these + offsets, leading to a maximum size for a compiled pattern of around + 64K. This is sufficient to handle all but the most gigantic patterns. + Nevertheless, some people do want to process enormous patterns, so it + is possible to compile PCRE to use three-byte or four-byte offsets by adding a setting such as --with-link-size=3 - to the configure command. The value given must be 2, 3, or 4. Using - longer offsets slows down the operation of PCRE because it has to load + to the configure command. The value given must be 2, 3, or 4. 
Using + longer offsets slows down the operation of PCRE because it has to load additional bytes when handling them. - If you build PCRE with an increased link size, test 2 (and test 5 if - you are using UTF-8) will fail. Part of the output of these tests is a - representation of the compiled pattern, and this changes with the link + If you build PCRE with an increased link size, test 2 (and test 5 if + you are using UTF-8) will fail. Part of the output of these tests is a + representation of the compiled pattern, and this changes with the link size. AVOIDING EXCESSIVE STACK USAGE When matching with the pcre_exec() function, PCRE implements backtrack- - ing by making recursive calls to an internal function called match(). - In environments where the size of the stack is limited, this can se- - verely limit PCRE's operation. (The Unix environment does not usually - suffer from this problem, but it may sometimes be necessary to increase - the maximum stack size. There is a discussion in the pcrestack docu- - mentation.) An alternative approach to recursion that uses memory from - the heap to remember data, instead of using recursive function calls, - has been implemented to work round the problem of limited stack size. - If you want to build a version of PCRE that works this way, add + ing by making recursive calls to an internal function called match(). + In environments where the size of the stack is limited, this can se- + verely limit PCRE's operation. (The Unix environment does not usually + suffer from this problem.) An alternative approach that uses memory + from the heap to remember data, instead of using recursive function + calls, has been implemented to work round this problem. If you want to + build a version of PCRE that works this way, add --disable-stack-for-recursion - to the configure command. With this configuration, PCRE will use the - pcre_stack_malloc and pcre_stack_free variables to call memory manage- - ment functions. 
Separate functions are provided because the usage is - very predictable: the block sizes requested are always the same, and - the blocks are always freed in reverse order. A calling program might - be able to implement optimized functions that perform better than the - standard malloc() and free() functions. PCRE runs noticeably more + to the configure command. With this configuration, PCRE will use the + pcre_stack_malloc and pcre_stack_free variables to call memory manage- + ment functions. Separate functions are provided because the usage is + very predictable: the block sizes requested are always the same, and + the blocks are always freed in reverse order. A calling program might + be able to implement optimized functions that perform better than the + standard malloc() and free() functions. PCRE runs noticeably more slowly when built in this way. This option affects only the pcre_exec() function; it is not relevant for the the pcre_dfa_exec() function. -LIMITING PCRE RESOURCE USAGE - - Internally, PCRE has a function called match(), which it calls repeat- - edly (sometimes recursively) when matching a pattern with the - pcre_exec() function. By controlling the maximum number of times this - function may be called during a single matching operation, a limit can - be placed on the resources used by a single call to pcre_exec(). The - limit can be changed at run time, as described in the pcreapi documen- - tation. The default is 10 million, but this can be changed by adding a - setting such as - - --with-match-limit=500000 - - to the configure command. This setting has no effect on the - pcre_dfa_exec() matching function. - - In some environments it is desirable to limit the depth of recursive - calls of match() more strictly than the total number of calls, in order - to restrict the maximum amount of stack (or heap, if --disable-stack- - for-recursion is specified) that is used. 
A second limit controls this; - it defaults to the value that is set for --with-match-limit, which - imposes no additional constraints. However, you can set a lower limit - by adding, for example, - - --with-match-limit-recursion=10000 - - to the configure command. This value can also be overridden at run - time. - - USING EBCDIC CODE PCRE assumes by default that it will run in an environment where the @@ -437,13 +395,8 @@ to the configure command. - -SEE ALSO - - pcreapi(3), pcre_config(3). - -Last updated: 30 November 2006 -Copyright (c) 1997-2006 University of Cambridge. +Last updated: 15 August 2005 +Copyright (c) 1997-2005 University of Cambridge. ------------------------------------------------------------------------------ @@ -479,7 +432,7 @@ there are three possible answers. The standard algorithm finds only one - of them, whereas the alternative algorithm finds all three. + of them, whereas the DFA algorithm finds all three. REGULAR EXPRESSIONS AS TREES @@ -488,9 +441,9 @@ resented as a tree structure. An unlimited repetition in the pattern makes the tree of infinite size, but it is still a tree. Matching the pattern to a given subject string (from a given starting point) can be - thought of as a search of the tree. There are two ways to search a - tree: depth-first and breadth-first, and these correspond to the two - matching algorithms provided by PCRE. + thought of as a search of the tree. There are two standard ways to + search a tree: depth-first and breadth-first, and these correspond to + the two matching algorithms provided by PCRE. THE STANDARD MATCHING ALGORITHM @@ -520,22 +473,21 @@ This provides support for capturing parentheses and back references. -THE ALTERNATIVE MATCHING ALGORITHM +THE DFA MATCHING ALGORITHM - This algorithm conducts a breadth-first search of the tree. 
Starting - from the first matching point in the subject, it scans the subject - string from left to right, once, character by character, and as it does - this, it remembers all the paths through the tree that represent valid - matches. In Friedl's terminology, this is a kind of "DFA algorithm", - though it is not implemented as a traditional finite state machine (it - keeps multiple states active simultaneously). - - The scan continues until either the end of the subject is reached, or - there are no more unterminated paths. At this point, terminated paths - represent the different matching possibilities (if there are none, the - match has failed). Thus, if there is more than one possible match, + DFA stands for "deterministic finite automaton", but you do not need to + understand the origins of that name. This algorithm conducts a breadth- + first search of the tree. Starting from the first matching point in the + subject, it scans the subject string from left to right, once, charac- + ter by character, and as it does this, it remembers all the paths + through the tree that represent valid matches. + + The scan continues until either the end of the subject is reached, or + there are no more unterminated paths. At this point, terminated paths + represent the different matching possibilities (if there are none, the + match has failed). Thus, if there is more than one possible match, this algorithm finds all of them, and in particular, it finds the long- - est. In PCRE, there is an option to stop the algorithm after the first + est. In PCRE, there is an option to stop the algorithm after the first match (which is necessarily the shortest) has been found. Note that all the matches that are found start at the same point in the @@ -543,87 +495,76 @@ cat(er(pillar)?) 
- is matched against the string "the caterpillar catchment", the result - will be the three strings "cat", "cater", and "caterpillar" that start + is matched against the string "the caterpillar catchment", the result + will be the three strings "cat", "cater", and "caterpillar" that start at the fourth character of the subject. The algorithm does not automat- ically move on to find matches that start at later positions. There are a number of features of PCRE regular expressions that are not - supported by the alternative matching algorithm. They are as follows: + supported by the DFA matching algorithm. They are as follows: - 1. Because the algorithm finds all possible matches, the greedy or - ungreedy nature of repetition quantifiers is not relevant. Greedy and - ungreedy quantifiers are treated in exactly the same way. However, pos- - sessive quantifiers can make a difference when what follows could also - match what is quantified, for example in a pattern like this: - - ^a++\w! - - This pattern matches "aaab!" but not "aaa!", which would be matched by - a non-possessive quantifier. Similarly, if an atomic group is present, - it is matched as if it were a standalone pattern at the current point, - and the longest match is then "locked in" for the rest of the overall - pattern. + 1. Because the algorithm finds all possible matches, the greedy or + ungreedy nature of repetition quantifiers is not relevant. Greedy and + ungreedy quantifiers are treated in exactly the same way. 2. When dealing with multiple paths through the tree simultaneously, it - is not straightforward to keep track of captured substrings for the - different matching possibilities, and PCRE's implementation of this + is not straightforward to keep track of captured substrings for the + different matching possibilities, and PCRE's implementation of this algorithm does not attempt to do this. This means that no captured sub- strings are available. - 3. 
Because no substrings are captured, back references within the pat- + 3. Because no substrings are captured, back references within the pat- tern are not supported, and cause errors if encountered. - 4. For the same reason, conditional expressions that use a backrefer- - ence as the condition or test for a specific group recursion are not - supported. + 4. For the same reason, conditional expressions that use a backrefer- + ence as the condition are not supported. 5. Callouts are supported, but the value of the capture_top field is always 1, and the value of the capture_last field is always -1. 6. The \C escape sequence, which (in the standard algorithm) matches a - single byte, even in UTF-8 mode, is not supported because the alterna- - tive algorithm moves through the subject string one character at a - time, for all active paths through the tree. + single byte, even in UTF-8 mode, is not supported because the DFA algo- + rithm moves through the subject string one character at a time, for all + active paths through the tree. -ADVANTAGES OF THE ALTERNATIVE ALGORITHM +ADVANTAGES OF THE DFA ALGORITHM - Using the alternative matching algorithm provides the following advan- - tages: + Using the DFA matching algorithm provides the following advantages: 1. All possible matches (at a single point in the subject) are automat- - ically found, and in particular, the longest match is found. To find + ically found, and in particular, the longest match is found. To find more than one match using the standard algorithm, you have to do kludgy things with callouts. - 2. There is much better support for partial matching. The restrictions - on the content of the pattern that apply when using the standard algo- - rithm for partial matching do not apply to the alternative algorithm. - For non-anchored patterns, the starting position of a partial match is - available. + 2. There is much better support for partial matching. 
The restrictions + on the content of the pattern that apply when using the standard algo- + rithm for partial matching do not apply to the DFA algorithm. For non- + anchored patterns, the starting position of a partial match is avail- + able. - 3. Because the alternative algorithm scans the subject string just - once, and never needs to backtrack, it is possible to pass very long - subject strings to the matching function in several pieces, checking - for partial matching each time. + 3. Because the DFA algorithm scans the subject string just once, and + never needs to backtrack, it is possible to pass very long subject + strings to the matching function in several pieces, checking for par- + tial matching each time. -DISADVANTAGES OF THE ALTERNATIVE ALGORITHM +DISADVANTAGES OF THE DFA ALGORITHM - The alternative algorithm suffers from a number of disadvantages: + The DFA algorithm suffers from a number of disadvantages: - 1. It is substantially slower than the standard algorithm. This is - partly because it has to search for all possible matches, but is also + 1. It is substantially slower than the standard algorithm. This is + partly because it has to search for all possible matches, but is also because it is less susceptible to optimization. 2. Capturing parentheses and back references are not supported. - 3. Although atomic groups are supported, their use does not provide the - performance advantage that it does for the standard algorithm. + 3. The "atomic group" feature of PCRE regular expressions is supported, + but does not provide the advantage that it does for the standard algo- + rithm. -Last updated: 24 November 2006 -Copyright (c) 1997-2006 University of Cambridge. +Last updated: 28 February 2005 +Copyright (c) 1997-2005 University of Cambridge. 
------------------------------------------------------------------------------ @@ -676,9 +617,6 @@ int pcre_get_stringnumber(const pcre *code, const char *name); - int pcre_get_stringtable_entries(const pcre *code, - const char *name, char **first, char **last); - int pcre_get_substring(const char *subject, int *ovector, int stringcount, int stringnumber, const char **stringptr); @@ -717,7 +655,7 @@ PCRE API OVERVIEW PCRE has its own native API, which is described in this document. There - are also some wrapper functions that correspond to the POSIX regular + is also a set of wrapper functions that correspond to the POSIX regular expression API. These are described in the pcreposix documentation. Both of these APIs define a set of C function calls. A C++ wrapper is distributed with PCRE. It is documented in the pcrecpp page. @@ -739,11 +677,11 @@ A second matching function, pcre_dfa_exec(), which is not Perl-compati- ble, is also provided. This uses a different algorithm for the match- - ing. The alternative algorithm finds all possible matches (at a given - point in the subject), and scans the subject just once. However, this - algorithm does not return captured substrings. A description of the two - matching algorithms and their advantages and disadvantages is given in - the pcrematching documentation. + ing. This allows it to find all possible matches (at a given point in + the subject), not just one. However, this algorithm does not return + captured substrings. A description of the two matching algorithms and + their advantages and disadvantages is given in the pcrematching docu- + mentation. 
In addition to the main compiling and matching functions, there are convenience functions for extracting captured substrings from a subject @@ -755,7 +693,6 @@ pcre_get_named_substring() pcre_get_substring_list() pcre_get_stringnumber() - pcre_get_stringtable_entries() pcre_free_substring() and pcre_free_substring_list() are also provided, to free the memory used for extracted strings. @@ -787,15 +724,12 @@ indirections to memory management functions. These special functions are used only when PCRE is compiled to use the heap for remembering data, instead of recursive function calls, when running the pcre_exec() - function. See the pcrebuild documentation for details of how to do - this. It is a non-standard way of building PCRE, for use in environ- - ments that have limited stacks. Because of the greater use of memory - management, it runs more slowly. Separate functions are provided so - that special-purpose external code can be used for this case. When - used, these functions are always called in a stack-like manner (last - obtained, first freed), and always for memory blocks of the same size. - There is a discussion about PCRE's stack usage in the pcrestack docu- - mentation. + function. This is a non-standard way of building PCRE, for use in envi- + ronments that have limited stacks. Because of the greater use of memory + management, it runs more slowly. Separate functions are provided so + that special-purpose external code can be used for this case. When + used, these functions are always called in a stack-like manner (last + obtained, first freed), and always for memory blocks of the same size. The global variable pcre_callout initially contains NULL. It can be set by the caller to a "callout" function, which PCRE will then call at @@ -803,31 +737,6 @@ pcrecallout documentation. 
-NEWLINES - - PCRE supports four different conventions for indicating line breaks in - strings: a single CR (carriage return) character, a single LF (line- - feed) character, the two-character sequence CRLF, or any Unicode new- - line sequence. The Unicode newline sequences are the three just men- - tioned, plus the single characters VT (vertical tab, U+000B), FF (form- - feed, U+000C), NEL (next line, U+0085), LS (line separator, U+2028), - and PS (paragraph separator, U+2029). - - Each of the first three conventions is used by at least one operating - system as its standard newline sequence. When PCRE is built, a default - can be specified. The default default is LF, which is the Unix stan- - dard. When PCRE is run, the default can be overridden, either when a - pattern is compiled, or when it is matched. - - In the PCRE documentation the word "newline" is used to mean "the char- - acter or pair of characters that indicate a line break". The choice of - newline convention affects the handling of the dot, circumflex, and - dollar metacharacters, the handling of #-comments in /x mode, and, when - CRLF is a recognized line ending sequence, the match position advance- - ment for a non-anchored pattern. The choice of newline convention does - not affect the interpretation of the \n or \r escape sequences. - - MULTITHREADING The PCRE functions can be used in multi-threading applications, with @@ -874,47 +783,46 @@ PCRE_CONFIG_NEWLINE - The output is an integer whose value specifies the default character - sequence that is recognized as meaning "newline". The four values that - are supported are: 10 for LF, 13 for CR, 3338 for CRLF, and -1 for ANY. - The default should normally be the standard sequence for your operating - system. + The output is an integer that is set to the value of the code that is + used for the newline character. It is either linefeed (10) or carriage + return (13), and should normally be the standard character for your + operating system. 
PCRE_CONFIG_LINK_SIZE - The output is an integer that contains the number of bytes used for + The output is an integer that contains the number of bytes used for internal linkage in compiled regular expressions. The value is 2, 3, or - 4. Larger values allow larger regular expressions to be compiled, at - the expense of slower matching. The default value of 2 is sufficient - for all but the most massive patterns, since it allows the compiled + 4. Larger values allow larger regular expressions to be compiled, at + the expense of slower matching. The default value of 2 is sufficient + for all but the most massive patterns, since it allows the compiled pattern to be up to 64K in size. PCRE_CONFIG_POSIX_MALLOC_THRESHOLD - The output is an integer that contains the threshold above which the - POSIX interface uses malloc() for output vectors. Further details are + The output is an integer that contains the threshold above which the + POSIX interface uses malloc() for output vectors. Further details are given in the pcreposix documentation. PCRE_CONFIG_MATCH_LIMIT The output is an integer that gives the default limit for the number of - internal matching function calls in a pcre_exec() execution. Further + internal matching function calls in a pcre_exec() execution. Further details are given with pcre_exec() below. PCRE_CONFIG_MATCH_LIMIT_RECURSION - The output is an integer that gives the default limit for the depth of - recursion when calling the internal matching function in a pcre_exec() + The output is an integer that gives the default limit for the depth of + recursion when calling the internal matching function in a pcre_exec() execution. Further details are given with pcre_exec() below. PCRE_CONFIG_STACKRECURSE - The output is an integer that is set to one if internal recursion when + The output is an integer that is set to one if internal recursion when running pcre_exec() is implemented by recursive function calls that use - the stack to remember their state. 
This is the usual way that PCRE is + the stack to remember their state. This is the usual way that PCRE is compiled. The output is zero if PCRE was compiled to use blocks of data - on the heap instead of recursive function calls. In this case, - pcre_stack_malloc and pcre_stack_free are called to manage memory + on the heap instead of recursive function calls. In this case, + pcre_stack_malloc and pcre_stack_free are called to manage memory blocks on the heap, thus avoiding the use of the stack. @@ -931,55 +839,55 @@ Either of the functions pcre_compile() or pcre_compile2() can be called to compile a pattern into an internal form. The only difference between - the two interfaces is that pcre_compile2() has an additional argument, + the two interfaces is that pcre_compile2() has an additional argument, errorcodeptr, via which a numerical error code can be returned. The pattern is a C string terminated by a binary zero, and is passed in - the pattern argument. A pointer to a single block of memory that is - obtained via pcre_malloc is returned. This contains the compiled code + the pattern argument. A pointer to a single block of memory that is + obtained via pcre_malloc is returned. This contains the compiled code and related data. The pcre type is defined for the returned block; this is a typedef for a structure whose contents are not externally defined. - It is up to the caller to free the memory (via pcre_free) when it is no - longer required. + It is up to the caller to free the memory when it is no longer + required. - Although the compiled code of a PCRE regex is relocatable, that is, it + Although the compiled code of a PCRE regex is relocatable, that is, it does not depend on memory location, the complete pcre data block is not - fully relocatable, because it may contain a copy of the tableptr argu- + fully relocatable, because it may contain a copy of the tableptr argu- ment, which is an address (see below). 
- The options argument contains various bit settings that affect the com- - pilation. It should be zero if no options are required. The available - options are described below. Some of them, in particular, those that - are compatible with Perl, can also be set and unset from within the - pattern (see the detailed description in the pcrepattern documenta- - tion). For these options, the contents of the options argument speci- - fies their initial settings at the start of compilation and execution. - The PCRE_ANCHORED and PCRE_NEWLINE_xxx options can be set at the time - of matching as well as at compile time. + The options argument contains independent bits that affect the compila- + tion. It should be zero if no options are required. The available + options are described below. Some of them, in particular, those that + are compatible with Perl, can also be set and unset from within the + pattern (see the detailed description in the pcrepattern documenta- + tion). For these options, the contents of the options argument speci- + fies their initial settings at the start of compilation and execution. + The PCRE_ANCHORED option can be set at the time of matching as well as + at compile time. If errptr is NULL, pcre_compile() returns NULL immediately. Otherwise, - if compilation of a pattern fails, pcre_compile() returns NULL, and + if compilation of a pattern fails, pcre_compile() returns NULL, and sets the variable pointed to by errptr to point to a textual error mes- sage. This is a static string that is part of the library. You must not try to free it. The offset from the start of the pattern to the charac- ter where the error was discovered is placed in the variable pointed to - by erroffset, which must not be NULL. If it is, an immediate error is + by erroffset, which must not be NULL. If it is, an immediate error is given. 
- If pcre_compile2() is used instead of pcre_compile(), and the error- - codeptr argument is not NULL, a non-zero error code number is returned - via this argument in the event of an error. This is in addition to the + If pcre_compile2() is used instead of pcre_compile(), and the error- + codeptr argument is not NULL, a non-zero error code number is returned + via this argument in the event of an error. This is in addition to the textual error message. Error codes and messages are listed below. - If the final argument, tableptr, is NULL, PCRE uses a default set of - character tables that are built when PCRE is compiled, using the - default C locale. Otherwise, tableptr must be an address that is the - result of a call to pcre_maketables(). This value is stored with the - compiled pattern, and used again by pcre_exec(), unless another table + If the final argument, tableptr, is NULL, PCRE uses a default set of + character tables that are built when PCRE is compiled, using the + default C locale. Otherwise, tableptr must be an address that is the + result of a call to pcre_maketables(). This value is stored with the + compiled pattern, and used again by pcre_exec(), unless another table pointer is passed to it. For more discussion, see the section on locale support below. - This code fragment shows a typical straightforward call to pcre_com- + This code fragment shows a typical straightforward call to pcre_com- pile(): pcre *re; @@ -992,95 +900,86 @@ &erroffset, /* for error offset */ NULL); /* use default character tables */ - The following names for option bits are defined in the pcre.h header + The following names for option bits are defined in the pcre.h header file: PCRE_ANCHORED If this bit is set, the pattern is forced to be "anchored", that is, it - is constrained to match only at the first matching point in the string - that is being searched (the "subject string"). 
This effect can also be - achieved by appropriate constructs in the pattern itself, which is the + is constrained to match only at the first matching point in the string + that is being searched (the "subject string"). This effect can also be + achieved by appropriate constructs in the pattern itself, which is the only way to do it in Perl. PCRE_AUTO_CALLOUT If this bit is set, pcre_compile() automatically inserts callout items, - all with number 255, before each pattern item. For discussion of the + all with number 255, before each pattern item. For discussion of the callout facility, see the pcrecallout documentation. PCRE_CASELESS - If this bit is set, letters in the pattern match both upper and lower - case letters. It is equivalent to Perl's /i option, and it can be - changed within a pattern by a (?i) option setting. In UTF-8 mode, PCRE - always understands the concept of case for characters whose values are - less than 128, so caseless matching is always possible. For characters - with higher values, the concept of case is supported if PCRE is com- - piled with Unicode property support, but not otherwise. If you want to - use caseless matching for characters 128 and above, you must ensure - that PCRE is compiled with Unicode property support as well as with + If this bit is set, letters in the pattern match both upper and lower + case letters. It is equivalent to Perl's /i option, and it can be + changed within a pattern by a (?i) option setting. In UTF-8 mode, PCRE + always understands the concept of case for characters whose values are + less than 128, so caseless matching is always possible. For characters + with higher values, the concept of case is supported if PCRE is com- + piled with Unicode property support, but not otherwise. If you want to + use caseless matching for characters 128 and above, you must ensure + that PCRE is compiled with Unicode property support as well as with UTF-8 support. 
PCRE_DOLLAR_ENDONLY - If this bit is set, a dollar metacharacter in the pattern matches only - at the end of the subject string. Without this option, a dollar also - matches immediately before a newline at the end of the string (but not - before any other newlines). The PCRE_DOLLAR_ENDONLY option is ignored - if PCRE_MULTILINE is set. There is no equivalent to this option in - Perl, and no way to set it within a pattern. + If this bit is set, a dollar metacharacter in the pattern matches only + at the end of the subject string. Without this option, a dollar also + matches immediately before the final character if it is a newline (but + not before any other newlines). The PCRE_DOLLAR_ENDONLY option is + ignored if PCRE_MULTILINE is set. There is no equivalent to this option + in Perl, and no way to set it within a pattern. PCRE_DOTALL If this bit is set, a dot metacharater in the pattern matches all char- - acters, including those that indicate newline. Without it, a dot does - not match when the current position is at a newline. This option is - equivalent to Perl's /s option, and it can be changed within a pattern - by a (?s) option setting. A negative class such as [^a] always matches - newline characters, independent of the setting of this option. - - PCRE_DUPNAMES - - If this bit is set, names used to identify capturing subpatterns need - not be unique. This can be helpful for certain types of pattern when it - is known that only one instance of the named subpattern can ever be - matched. There are more details of named subpatterns below; see also - the pcrepattern documentation. + acters, including newlines. Without it, newlines are excluded. This + option is equivalent to Perl's /s option, and it can be changed within + a pattern by a (?s) option setting. A negative class such as [^a] + always matches a newline character, independent of the setting of this + option. 
PCRE_EXTENDED - If this bit is set, whitespace data characters in the pattern are + If this bit is set, whitespace data characters in the pattern are totally ignored except when escaped or inside a character class. White- space does not include the VT character (code 11). In addition, charac- ters between an unescaped # outside a character class and the next new- - line, inclusive, are also ignored. This is equivalent to Perl's /x - option, and it can be changed within a pattern by a (?x) option set- - ting. - - This option makes it possible to include comments inside complicated - patterns. Note, however, that this applies only to data characters. - Whitespace characters may never appear within special character - sequences in a pattern, for example within the sequence (?( which + line character, inclusive, are also ignored. This is equivalent to + Perl's /x option, and it can be changed within a pattern by a (?x) + option setting. + + This option makes it possible to include comments inside complicated + patterns. Note, however, that this applies only to data characters. + Whitespace characters may never appear within special character + sequences in a pattern, for example within the sequence (?( which introduces a conditional subpattern. PCRE_EXTRA - This option was invented in order to turn on additional functionality - of PCRE that is incompatible with Perl, but it is currently of very - little use. When set, any backslash in a pattern that is followed by a - letter that has no special meaning causes an error, thus reserving - these combinations for future expansion. By default, as in Perl, a - backslash followed by a letter with no special meaning is treated as a - literal. (Perl can, however, be persuaded to give a warning for this.) - There are at present no other features controlled by this option. It - can also be set by a (?X) option setting within a pattern. 
+ This option was invented in order to turn on additional functionality + of PCRE that is incompatible with Perl, but it is currently of very + little use. When set, any backslash in a pattern that is followed by a + letter that has no special meaning causes an error, thus reserving + these combinations for future expansion. By default, as in Perl, a + backslash followed by a letter with no special meaning is treated as a + literal. There are at present no other features controlled by this + option. It can also be set by a (?X) option setting within a pattern. PCRE_FIRSTLINE If this option is set, an unanchored pattern is required to match - before or at the first newline in the subject string, though the - matched text may continue over the newline. + before or at the first newline character in the subject string, though + the matched text may continue over the newline. PCRE_MULTILINE @@ -1092,91 +991,55 @@ is set). This is the same as Perl. When PCRE_MULTILINE it is set, the "start of line" and "end of line" - constructs match immediately following or immediately before internal - newlines in the subject string, respectively, as well as at the very - start and end. This is equivalent to Perl's /m option, and it can be - changed within a pattern by a (?m) option setting. If there are no new- - lines in a subject string, or no occurrences of ^ or $ in a pattern, + constructs match immediately following or immediately before any new- + line in the subject string, respectively, as well as at the very start + and end. This is equivalent to Perl's /m option, and it can be changed + within a pattern by a (?m) option setting. If there are no "\n" charac- + ters in a subject string, or no occurrences of ^ or $ in a pattern, setting PCRE_MULTILINE has no effect. - PCRE_NEWLINE_CR - PCRE_NEWLINE_LF - PCRE_NEWLINE_CRLF - PCRE_NEWLINE_ANY - - These options override the default newline definition that was chosen - when PCRE was built. 
Setting the first or the second specifies that a - newline is indicated by a single character (CR or LF, respectively). - Setting PCRE_NEWLINE_CRLF specifies that a newline is indicated by the - two-character CRLF sequence. Setting PCRE_NEWLINE_ANY specifies that - any Unicode newline sequence should be recognized. The Unicode newline - sequences are the three just mentioned, plus the single characters VT - (vertical tab, U+000B), FF (formfeed, U+000C), NEL (next line, U+0085), - LS (line separator, U+2028), and PS (paragraph separator, U+2029). The - last two are recognized only in UTF-8 mode. - - The newline setting in the options word uses three bits that are - treated as a number, giving eight possibilities. Currently only five - are used (default plus the four values above). This means that if you - set more than one newline option, the combination may or may not be - sensible. For example, PCRE_NEWLINE_CR with PCRE_NEWLINE_LF is equiva- - lent to PCRE_NEWLINE_CRLF, but other combinations yield unused numbers - and cause an error. - - The only time that a line break is specially recognized when compiling - a pattern is if PCRE_EXTENDED is set, and an unescaped # outside a - character class is encountered. This indicates a comment that lasts - until after the next line break sequence. In other circumstances, line - break sequences are treated as literal data, except that in - PCRE_EXTENDED mode, both CR and LF are treated as whitespace characters - and are therefore ignored. - - The newline option that is set at compile time becomes the default that - is used for pcre_exec() and pcre_dfa_exec(), but it can be overridden. - PCRE_NO_AUTO_CAPTURE If this option is set, it disables the use of numbered capturing paren- - theses in the pattern. Any opening parenthesis that is not followed by - ? behaves as if it were followed by ?: but named parentheses can still - be used for capturing (and they acquire numbers in the usual way). + theses in the pattern. 
Any opening parenthesis that is not followed by + ? behaves as if it were followed by ?: but named parentheses can still + be used for capturing (and they acquire numbers in the usual way). There is no equivalent of this option in Perl. PCRE_UNGREEDY - This option inverts the "greediness" of the quantifiers so that they - are not greedy by default, but become greedy if followed by "?". It is - not compatible with Perl. It can also be set by a (?U) option setting + This option inverts the "greediness" of the quantifiers so that they + are not greedy by default, but become greedy if followed by "?". It is + not compatible with Perl. It can also be set by a (?U) option setting within the pattern. PCRE_UTF8 - This option causes PCRE to regard both the pattern and the subject as - strings of UTF-8 characters instead of single-byte character strings. - However, it is available only when PCRE is built to include UTF-8 sup- - port. If not, the use of this option provokes an error. Details of how - this option changes the behaviour of PCRE are given in the section on + This option causes PCRE to regard both the pattern and the subject as + strings of UTF-8 characters instead of single-byte character strings. + However, it is available only when PCRE is built to include UTF-8 sup- + port. If not, the use of this option provokes an error. Details of how + this option changes the behaviour of PCRE are given in the section on UTF-8 support in the main pcre page. PCRE_NO_UTF8_CHECK When PCRE_UTF8 is set, the validity of the pattern as a UTF-8 string is - automatically checked. If an invalid UTF-8 sequence of bytes is found, - pcre_compile() returns an error. If you already know that your pattern - is valid, and you want to skip this check for performance reasons, you - can set the PCRE_NO_UTF8_CHECK option. When it is set, the effect of + automatically checked. If an invalid UTF-8 sequence of bytes is found, + pcre_compile() returns an error. 
If you already know that your pattern + is valid, and you want to skip this check for performance reasons, you + can set the PCRE_NO_UTF8_CHECK option. When it is set, the effect of passing an invalid UTF-8 string as a pattern is undefined. It may cause - your program to crash. Note that this option can also be passed to - pcre_exec() and pcre_dfa_exec(), to suppress the UTF-8 validity check- + your program to crash. Note that this option can also be passed to + pcre_exec() and pcre_dfa_exec(), to suppress the UTF-8 validity check- ing of subject strings. COMPILATION ERROR CODES - The following table lists the error codes than may be returned by - pcre_compile2(), along with the error messages that may be returned by - both compiling functions. As PCRE has developed, some error codes have - fallen out of use. To avoid confusion, they have not been re-used. + The following table lists the error codes that may be returned by + pcre_compile2(), along with the error messages that may be returned by + both compiling functions. 0 no error 1 \ at end of pattern @@ -1188,7 +1051,7 @@ 7 invalid escape sequence in character class 8 range out of order in character class 9 nothing to repeat - 10 [this code is not in use] + 10 operand of unlimited repeat could match the empty string 11 internal error: unexpected repeat 12 unrecognized character after (? 
13 POSIX named classes are supported only within a class @@ -1197,21 +1060,21 @@ 16 erroffset passed as NULL 17 unknown option bit(s) set 18 missing ) after comment - 19 [this code is not in use] + 19 parentheses nested too deeply 20 regular expression too large 21 failed to get memory 22 unmatched parentheses 23 internal error: code overflow 24 unrecognized character after (?< 25 lookbehind assertion is not fixed length - 26 malformed number or name after (?( + 26 malformed number after (?( 27 conditional group contains more than two branches 28 assertion expected after (?( 29 (?R or (?digits must be followed by ) 30 unknown POSIX class name 31 POSIX collating elements are not supported 32 this version of PCRE is not compiled with PCRE_UTF8 support - 33 [this code is not in use] + 33 spare error 34 character value in \x{...} sequence is too large 35 invalid condition (?(0) 36 \C not allowed in lookbehind assertion @@ -1220,22 +1083,12 @@ 39 closing ) for (?C expected 40 recursive call could loop indefinitely 41 unrecognized character after (?P - 42 syntax error in subpattern name (missing terminator) - 43 two named subpatterns have the same name +