hbpcre

**c:\harbour\source\hbpcre**
Type	Function	Source	Line
pcrecomp.c
STATIC CONST CHAR *	find_error_text(int n) static const char * find_error_text(int n) { const char *s = error_texts; for (; n > 0; n--) while (*s++ != 0) {}; return s; }	pcrecomp.c	454
STATIC INT	check_escape(const uschar *ptrptr, int errorcodeptr, int bracount, int options, BOOL isclass) static int check_escape(const uschar *ptrptr, int errorcodeptr, int bracount, int options, BOOL isclass) { BOOL utf8 = (options & PCRE_UTF8) != 0; const uschar ptr = ptrptr + 1; int c, i; GETCHARINCTEST(c, ptr); /* Get character value, increment pointer / ptr--; / Set pointer back to the last byte / / If backslash is at the end of the pattern, it's an error. / if (c == 0) errorcodeptr = ERR1; /* Non-alphanumerics are literals. For digits or letters, do an initial lookup in a table. A non-zero result is something that can be returned immediately. Otherwise further processing may be required. / #ifndef EBCDIC / ASCII coding / else if (c < '0' \|\| c > 'z') {} / Not alphanumeric / else if ((i = escapes[c - '0']) != 0) c = i; #else / EBCDIC coding / else if (c < 'a' \|\| (ebcdic_chartab[c] & 0x0E) == 0) {} / Not alphanumeric / else if ((i = escapes[c - 0x48]) != 0) c = i; #endif / Escapes that need further processing, or are illegal. / else { const uschar oldptr; BOOL braced, negated; switch (c) { /* A number of Perl escapes are not handled by PCRE. We give an explicit error. / case 'l': case 'L': case 'N': case 'u': case 'U': errorcodeptr = ERR37; break; /* \g must be followed by one of a number of specific things: (1) A number, either plain or braced. If positive, it is an absolute backreference. If negative, it is a relative backreference. This is a Perl 5.10 feature. (2) Perl 5.10 also supports \g{name} as a reference to a named group. This is part of Perl's movement towards a unified syntax for back references. As this is synonymous with \k{name}, we fudge it up by pretending it really was \k. (3) For Oniguruma compatibility we also support \g followed by a name or a number either in angle brackets or in single quotes. However, these are (possibly recursive) subroutine calls, _not_ backreferences. Just return the -ESC_g code (cf \k). 
/ case 'g': if (ptr[1] == '<' \|\| ptr[1] == '\'') { c = -ESC_g; break; } / Handle the Perl-compatible cases / if (ptr[1] == '{') { const uschar p; for (p = ptr+2; p != 0 && p != '}'; p++) if (p != '-' && (digitab[p] & ctype_digit) == 0) break; if (p != 0 && p != '}') { c = -ESC_k; break; } braced = TRUE; ptr++; } else braced = FALSE; if (ptr[1] == '-') { negated = TRUE; ptr++; } else negated = FALSE; c = 0; while ((digitab[ptr[1]] & ctype_digit) != 0) c = c * 10 + (++ptr) - '0'; if (c < 0) / Integer overflow / { errorcodeptr = ERR61; break; } if (braced && (++ptr) != '}') { errorcodeptr = ERR57; break; } if (c == 0) { errorcodeptr = ERR58; break; } if (negated) { if (c > bracount) { errorcodeptr = ERR15; break; } c = bracount - (c - 1); } c = -(ESC_REF + c); break; /* The handling of escape sequences consisting of a string of digits starting with one that is not zero is not straightforward. By experiment, the way Perl works seems to be as follows: Outside a character class, the digits are read as a decimal number. If the number is less than 10, or if there are that many previous extracting left brackets, then it is a back reference. Otherwise, up to three octal digits are read to form an escaped byte. Thus \123 is likely to be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal value is greater than 377, the least significant 8 bits are taken. Inside a character class, \ followed by a digit is always an octal number. / case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if (!isclass) { oldptr = ptr; c -= '0'; while ((digitab[ptr[1]] & ctype_digit) != 0) c = c 10 + (++ptr) - '0'; if (c < 0) / Integer overflow / { errorcodeptr = ERR61; break; } if (c < 10 \|\| c <= bracount) { c = -(ESC_REF + c); break; } ptr = oldptr; /* Put the pointer back and fall through / } / Handle an octal number following \. 
If the first digit is 8 or 9, Perl generates a binary zero byte and treats the digit as a following literal. Thus we have to pull back the pointer by one. / if ((c = ptr) >= '8') { ptr--; c = 0; break; } /* \0 always starts an octal number, but we may drop through to here with a larger first octal digit. The original code used just to take the least significant 8 bits of octal numbers (I think this is what early Perls used to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more than 3 octal digits. / case '0': c -= '0'; while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7') c = c 8 + (++ptr) - '0'; if (!utf8 && c > 255) errorcodeptr = ERR51; break; /* \x is complicated. \x{ddd} is a character number which can be greater than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is treated as a data character. / case 'x': if (ptr[1] == '{') { const uschar pt = ptr + 2; int count = 0; c = 0; while ((digitab[pt] & ctype_xdigit) != 0) { register int cc = pt++; if (c == 0 && cc == '0') continue; /* Leading zeroes / count++; #ifndef EBCDIC / ASCII coding / if (cc >= 'a') cc -= 32; / Convert to upper case / c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10)); #else / EBCDIC coding / if (cc >= 'a' && cc <= 'z') cc += 64; / Convert to upper case / c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10)); #endif } if (pt == '}') { if (c < 0 \|\| count > (utf8? 8 : 2)) errorcodeptr = ERR34; ptr = pt; break; } / If the sequence of hex digits does not end with '}', then we don't recognize this construct; fall through to the normal \x handling. / } / Read just a single-byte hex-defined char / c = 0; while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0) { int cc; / Some compilers don't like ++ / cc = (++ptr); /* in initializers / #ifndef EBCDIC / ASCII coding / if (cc >= 'a') cc -= 32; / Convert to upper case / c = c 16 + cc - ((cc < 'A')? '0' : ('A' - 10)); #else /* EBCDIC coding / if (cc <= 'z') cc += 64; / Convert to upper case / c = c 16 + cc - ((cc >= '0')? 
'0' : ('A' - 10)); #endif } break; /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped. This coding is ASCII-specific, but then the whole concept of \cx is ASCII-specific. (However, an EBCDIC equivalent has now been added.) / case 'c': c = (++ptr); if (c == 0) { errorcodeptr = ERR2; break; } #ifndef EBCDIC / ASCII coding / if (c >= 'a' && c <= 'z') c -= 32; c ^= 0x40; #else / EBCDIC coding / if (c >= 'a' && c <= 'z') c += 64; c ^= 0xC0; #endif break; / PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any other alphanumeric following \ is an error if PCRE_EXTRA was set; otherwise, for Perl compatibility, it is a literal. This code looks a bit odd, but there used to be some cases other than the default, and there may be again in future, so I haven't "optimized" it. / default: if ((options & PCRE_EXTRA) != 0) switch(c) { default: errorcodeptr = ERR3; break; } break; } } *ptrptr = ptr; return c; } #ifdef SUPPORT_UCP	pcrecomp.c	487
STATIC INT	get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr) static int get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr) { int c, i, bot, top; const uschar *ptr = *ptrptr; char name[32]; c = *(++ptr); if (c == 0) goto ERROR_RETURN; *negptr = FALSE; /* \P or \p can be followed by a name in {}, optionally preceded by ^ for negation. */ if (c == '{') { if (ptr[1] == '^') { *negptr = TRUE; ptr++; } for (i = 0; i < (int)sizeof(name) - 1; i++) { c = *(++ptr); if (c == 0) goto ERROR_RETURN; if (c == '}') break; name[i] = c; } if (c !='}') goto ERROR_RETURN; name[i] = 0; } /* Otherwise there is just one following character */ else { name[0] = c; name[1] = 0; } *ptrptr = ptr; /* Search for a recognized property name using binary chop */ bot = 0; top = _pcre_utt_size; while (bot < top) { i = (bot + top) >> 1; c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset); if (c == 0) { *dptr = _pcre_utt[i].value; return _pcre_utt[i].type; } if (c > 0) bot = i + 1; else top = i; } *errorcodeptr = ERR47; *ptrptr = ptr; return -1; ERROR_RETURN: *errorcodeptr = ERR46; *ptrptr = ptr; return -1; } #endif	pcrecomp.c	792
STATIC BOOL	is_counted_repeat(const uschar *p) static BOOL is_counted_repeat(const uschar *p) { if ((digitab[*p++] & ctype_digit) == 0) return FALSE; while ((digitab[*p] & ctype_digit) != 0) p++; if (*p == '}') return TRUE; if (*p++ != ',') return FALSE; if (*p == '}') return TRUE; if ((digitab[*p++] & ctype_digit) == 0) return FALSE; while ((digitab[*p] & ctype_digit) != 0) p++; return (*p == '}'); }	pcrecomp.c	881
STATIC CONST USCHAR *	read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr) static const uschar * read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr) { int min = 0; int max = -1; /* Read the minimum value and do a paranoid check: a negative value indicates an integer overflow. */ while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0'; if (min < 0 \|\| min > 65535) { *errorcodeptr = ERR5; return p; } /* Read the maximum value if there is one, and again do a paranoid on its size. Also, max must not be less than min. */ if (*p == '}') max = min; else { if (*(++p) != '}') { max = 0; while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0'; if (max < 0 \|\| max > 65535) { *errorcodeptr = ERR5; return p; } if (max < min) { *errorcodeptr = ERR4; return p; } } } /* Fill in the required variables, and pass back the pointer to the terminating '}'. */ *minp = min; *maxp = max; return p; }	pcrecomp.c	918
STATIC INT	find_parens(const uschar ptr, compile_data cd, const uschar name, int lorn, BOOL xmode) static int find_parens(const uschar ptr, compile_data cd, const uschar name, int lorn, BOOL xmode) { const uschar thisname; int count = cd->bracount; for (; ptr != 0; ptr++) { int term; /* Skip over backslashed characters and also entire \Q...\E / if (ptr == '\\') { if ((++ptr) == 0) return -1; if (ptr == 'Q') for (;;) { while ((++ptr) != 0 && ptr != '\\') {}; if (ptr == 0) return -1; if ((++ptr) == 'E') break; } continue; } /* Skip over character classes; this logic must be similar to the way they are handled for real. If the first character is '^', skip it. Also, if the first few characters (either before or after ^) are \Q\E or \E we skip them too. This makes for compatibility with Perl. / if (ptr == '[') { BOOL negate_class = FALSE; for (;;) { int c = (++ptr); if (c == '\\') { if (ptr[1] == 'E') ptr++; else if (strncmp((const char )ptr+1, "Q\\E", 3) == 0) ptr += 3; else break; } else if (!negate_class && c == '^') negate_class = TRUE; else break; } /* If the next character is ']', it is a data character that must be skipped, except in JavaScript compatibility mode. / if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0) ptr++; while ((++ptr) != ']') { if (ptr == 0) return -1; if (ptr == '\\') { if ((++ptr) == 0) return -1; if (ptr == 'Q') for (;;) { while ((++ptr) != 0 && ptr != '\\') {}; if (ptr == 0) return -1; if ((++ptr) == 'E') break; } continue; } } continue; } /* Skip comments in /x mode / if (xmode && ptr == '#') { while ((++ptr) != 0 && ptr != '\n') {}; if (ptr == 0) return -1; continue; } / An opening parens must now be a real metacharacter / if (ptr != '(') continue; if (ptr[1] != '?' && ptr[1] != '') { count++; if (name == NULL && count == lorn) return count; continue; } ptr += 2; if (ptr == 'P') ptr++; /* Allow optional P / / We have to disambiguate (? / if ((ptr != '<' \|\| ptr[1] == '!' 
\|\| ptr[1] == '=') && ptr != '\'') continue; count++; if (name == NULL && count == lorn) return count; term = ptr++; if (term == '<') term = '>'; thisname = ptr; while (ptr != term) ptr++; if (name != NULL && lorn == ptr - thisname && strncmp((const char )name, (const char *)thisname, lorn) == 0) return count; } return -1; }	pcrecomp.c	987
STATIC CONST USCHAR*	first_significant_code(const uschar *code, int *options, int optbit, BOOL skipassert) static const uschar* first_significant_code(const uschar *code, int *options, int optbit, BOOL skipassert) { for (;;) { switch ((int)*code) { case OP_OPT: if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit)) *options = (int)code[1]; code += 2; break; case OP_ASSERT_NOT: case OP_ASSERTBACK: case OP_ASSERTBACK_NOT: if (!skipassert) return code; do code += GET(code, 1); while (*code == OP_ALT); code += _pcre_OP_lengths[*code]; break; case OP_WORD_BOUNDARY: case OP_NOT_WORD_BOUNDARY: if (!skipassert) return code; /* Fall through */ case OP_CALLOUT: case OP_CREF: case OP_RREF: case OP_DEF: code += _pcre_OP_lengths[*code]; break; default: return code; } } /* Control never reaches here */ }	pcrecomp.c	1123
STATIC INT	find_fixedlength(uschar code, int options) static int find_fixedlength(uschar code, int options) { int length = -1; register int branchlength = 0; register uschar cc = code + 1 + LINK_SIZE; / Scan along the opcodes for this branch. If we get to the end of the branch, check the length against that of the other branches. / for (;;) { int d; register int op = cc; switch (op) { case OP_CBRA: case OP_BRA: case OP_ONCE: case OP_COND: d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options); if (d < 0) return d; branchlength += d; do cc += GET(cc, 1); while (cc == OP_ALT); cc += 1 + LINK_SIZE; break; / Reached end of a branch; if it's a ket it is the end of a nested call. If it's ALT it is an alternation in a nested call. If it is END it's the end of the outer call. All can be handled by the same code. / case OP_ALT: case OP_KET: case OP_KETRMAX: case OP_KETRMIN: case OP_END: if (length < 0) length = branchlength; else if (length != branchlength) return -1; if (cc != OP_ALT) return length; cc += 1 + LINK_SIZE; branchlength = 0; break; /* Skip over assertive subpatterns / case OP_ASSERT: case OP_ASSERT_NOT: case OP_ASSERTBACK: case OP_ASSERTBACK_NOT: do cc += GET(cc, 1); while (cc == OP_ALT); /* Fall through / / Skip over things that don't match chars / case OP_REVERSE: case OP_CREF: case OP_RREF: case OP_DEF: case OP_OPT: case OP_CALLOUT: case OP_SOD: case OP_SOM: case OP_EOD: case OP_EODN: case OP_CIRC: case OP_DOLL: case OP_NOT_WORD_BOUNDARY: case OP_WORD_BOUNDARY: cc += _pcre_OP_lengths[cc]; break; /* Handle literal characters / case OP_CHAR: case OP_CHARNC: case OP_NOT: branchlength++; cc += 2; #ifdef SUPPORT_UTF8 if ((options & PCRE_UTF8) != 0) { while ((cc & 0xc0) == 0x80) cc++; } #endif break; /* Handle exact repetitions. The count is already in characters, but we need to skip over a multibyte character in UTF8 mode. 
/ case OP_EXACT: branchlength += GET2(cc,1); cc += 4; #ifdef SUPPORT_UTF8 if ((options & PCRE_UTF8) != 0) { while((cc & 0x80) == 0x80) cc++; } #endif break; case OP_TYPEEXACT: branchlength += GET2(cc,1); if (cc[3] == OP_PROP \|\| cc[3] == OP_NOTPROP) cc += 2; cc += 4; break; /* Handle single-char matchers / case OP_PROP: case OP_NOTPROP: cc += 2; / Fall through / case OP_NOT_DIGIT: case OP_DIGIT: case OP_NOT_WHITESPACE: case OP_WHITESPACE: case OP_NOT_WORDCHAR: case OP_WORDCHAR: case OP_ANY: case OP_ALLANY: branchlength++; cc++; break; / The single-byte matcher isn't allowed / case OP_ANYBYTE: return -2; / Check a class for variable quantification / #ifdef SUPPORT_UTF8 case OP_XCLASS: cc += GET(cc, 1) - 33; / Fall through / #endif case OP_CLASS: case OP_NCLASS: cc += 33; switch (cc) { case OP_CRSTAR: case OP_CRMINSTAR: case OP_CRQUERY: case OP_CRMINQUERY: return -1; case OP_CRRANGE: case OP_CRMINRANGE: if (GET2(cc,1) != GET2(cc,3)) return -1; branchlength += GET2(cc,1); cc += 5; break; default: branchlength++; } break; /* Anything else is variable length / default: return -1; } } / Control never gets here */ }	pcrecomp.c	1183
STATIC CONST USCHAR *	find_bracket(const uschar *code, BOOL utf8, int number) static const uschar * find_bracket(const uschar *code, BOOL utf8, int number) { for (;;) { register int c = *code; if (c == OP_END) return NULL; /* XCLASS is used for classes that cannot be represented just by a bit map. This includes negated single high-valued characters. The length in the table is zero; the actual length is stored in the compiled code. */ if (c == OP_XCLASS) code += GET(code, 1); /* Handle capturing bracket */ else if (c == OP_CBRA) { int n = GET2(code, 1+LINK_SIZE); if (n == number) return (uschar *)code; code += _pcre_OP_lengths[c]; } /* Otherwise, we can get the item's length from the table, except that for repeated character types, we have to test for \p and \P, which have an extra two bytes of parameters. */ else { switch(c) { case OP_TYPESTAR: case OP_TYPEMINSTAR: case OP_TYPEPLUS: case OP_TYPEMINPLUS: case OP_TYPEQUERY: case OP_TYPEMINQUERY: case OP_TYPEPOSSTAR: case OP_TYPEPOSPLUS: case OP_TYPEPOSQUERY: if (code[1] == OP_PROP \|\| code[1] == OP_NOTPROP) code += 2; break; case OP_TYPEUPTO: case OP_TYPEMINUPTO: case OP_TYPEEXACT: case OP_TYPEPOSUPTO: if (code[3] == OP_PROP \|\| code[3] == OP_NOTPROP) code += 2; break; } /* Add in the fixed length from the table */ code += _pcre_OP_lengths[c]; /* In UTF-8 mode, opcodes that are followed by a character may be followed by a multi-byte character. The length in the table is a minimum, so we have to arrange to skip the extra bytes. */ #ifdef SUPPORT_UTF8 if (utf8) switch(c) { case OP_CHAR: case OP_CHARNC: case OP_EXACT: case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: case OP_STAR: case OP_MINSTAR: case OP_POSSTAR: case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; break; } #else (void)(utf8); /* Keep compiler happy by referencing function argument */ #endif } } }	pcrecomp.c	1373
STATIC CONST USCHAR *	find_recurse(const uschar *code, BOOL utf8) static const uschar * find_recurse(const uschar *code, BOOL utf8) { for (;;) { register int c = *code; if (c == OP_END) return NULL; if (c == OP_RECURSE) return code; /* XCLASS is used for classes that cannot be represented just by a bit map. This includes negated single high-valued characters. The length in the table is zero; the actual length is stored in the compiled code. */ if (c == OP_XCLASS) code += GET(code, 1); /* Otherwise, we can get the item's length from the table, except that for repeated character types, we have to test for \p and \P, which have an extra two bytes of parameters. */ else { switch(c) { case OP_TYPESTAR: case OP_TYPEMINSTAR: case OP_TYPEPLUS: case OP_TYPEMINPLUS: case OP_TYPEQUERY: case OP_TYPEMINQUERY: case OP_TYPEPOSSTAR: case OP_TYPEPOSPLUS: case OP_TYPEPOSQUERY: if (code[1] == OP_PROP \|\| code[1] == OP_NOTPROP) code += 2; break; case OP_TYPEPOSUPTO: case OP_TYPEUPTO: case OP_TYPEMINUPTO: case OP_TYPEEXACT: if (code[3] == OP_PROP \|\| code[3] == OP_NOTPROP) code += 2; break; } /* Add in the fixed length from the table */ code += _pcre_OP_lengths[c]; /* In UTF-8 mode, opcodes that are followed by a character may be followed by a multi-byte character. The length in the table is a minimum, so we have to arrange to skip the extra bytes. */ #ifdef SUPPORT_UTF8 if (utf8) switch(c) { case OP_CHAR: case OP_CHARNC: case OP_EXACT: case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: case OP_STAR: case OP_MINSTAR: case OP_POSSTAR: case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; break; } #else (void)(utf8); /* Keep compiler happy by referencing function argument */ #endif } } }	pcrecomp.c	1476
STATIC BOOL	could_be_empty_branch(const uschar code, const uschar endcode, BOOL utf8) static BOOL could_be_empty_branch(const uschar code, const uschar endcode, BOOL utf8) { register int c; for (code = first_significant_code(code + _pcre_OP_lengths[code], NULL, 0, TRUE); code < endcode; code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE)) { const uschar ccode; c = code; / Skip over forward assertions; the other assertions are skipped by first_significant_code() with a TRUE final argument. / if (c == OP_ASSERT) { do code += GET(code, 1); while (code == OP_ALT); c = code; continue; } / Groups with zero repeats can of course be empty; skip them. / if (c == OP_BRAZERO \|\| c == OP_BRAMINZERO \|\| c == OP_SKIPZERO) { code += _pcre_OP_lengths[c]; do code += GET(code, 1); while (code == OP_ALT); c = code; continue; } / For other groups, scan the branches. / if (c == OP_BRA \|\| c == OP_CBRA \|\| c == OP_ONCE \|\| c == OP_COND) { BOOL empty_branch; if (GET(code, 1) == 0) return TRUE; / Hit unclosed bracket / / Scan a closed bracket / empty_branch = FALSE; do { if (!empty_branch && could_be_empty_branch(code, endcode, utf8)) empty_branch = TRUE; code += GET(code, 1); } while (code == OP_ALT); if (!empty_branch) return FALSE; /* All branches are non-empty / c = code; continue; } /* Handle the other opcodes / switch (c) { / Check for quantifiers after a class. XCLASS is used for classes that cannot be represented just by a bit map. This includes negated single high-valued characters. The length in _pcre_OP_lengths[] is zero; the actual length is stored in the compiled code, so we must update "code" here. 
/ #ifdef SUPPORT_UTF8 case OP_XCLASS: ccode = code += GET(code, 1); goto CHECK_CLASS_REPEAT; #endif case OP_CLASS: case OP_NCLASS: ccode = code + 33; #ifdef SUPPORT_UTF8 CHECK_CLASS_REPEAT: #endif switch (ccode) { case OP_CRSTAR: /* These could be empty; continue / case OP_CRMINSTAR: case OP_CRQUERY: case OP_CRMINQUERY: break; default: / Non-repeat => class must match / case OP_CRPLUS: / These repeats aren't empty / case OP_CRMINPLUS: return FALSE; case OP_CRRANGE: case OP_CRMINRANGE: if (GET2(ccode, 1) > 0) return FALSE; / Minimum > 0 / break; } break; / Opcodes that must match a character / case OP_PROP: case OP_NOTPROP: case OP_EXTUNI: case OP_NOT_DIGIT: case OP_DIGIT: case OP_NOT_WHITESPACE: case OP_WHITESPACE: case OP_NOT_WORDCHAR: case OP_WORDCHAR: case OP_ANY: case OP_ALLANY: case OP_ANYBYTE: case OP_CHAR: case OP_CHARNC: case OP_NOT: case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: case OP_EXACT: case OP_NOTPLUS: case OP_NOTMINPLUS: case OP_NOTPOSPLUS: case OP_NOTEXACT: case OP_TYPEPLUS: case OP_TYPEMINPLUS: case OP_TYPEPOSPLUS: case OP_TYPEEXACT: return FALSE; / These are going to continue, as they may be empty, but we have to fudge the length for the \p and \P cases. 
/ case OP_TYPESTAR: case OP_TYPEMINSTAR: case OP_TYPEPOSSTAR: case OP_TYPEQUERY: case OP_TYPEMINQUERY: case OP_TYPEPOSQUERY: if (code[1] == OP_PROP \|\| code[1] == OP_NOTPROP) code += 2; break; / Same for these / case OP_TYPEUPTO: case OP_TYPEMINUPTO: case OP_TYPEPOSUPTO: if (code[3] == OP_PROP \|\| code[3] == OP_NOTPROP) code += 2; break; / End of branch / case OP_KET: case OP_KETRMAX: case OP_KETRMIN: case OP_ALT: return TRUE; / In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO, MINUPTO, and POSUPTO may be followed by a multibyte character */ #ifdef SUPPORT_UTF8 case OP_STAR: case OP_MINSTAR: case OP_POSSTAR: case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: if (utf8) while ((code[2] & 0xc0) == 0x80) code++; break; #endif } } return TRUE; }	pcrecomp.c	1577
STATIC BOOL	could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr, BOOL utf8) static BOOL could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr, BOOL utf8) { while (bcptr != NULL && bcptr->current >= code) { if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE; bcptr = bcptr->outer; } return TRUE; }	pcrecomp.c	1776
STATIC BOOL	check_posix_syntax(const uschar *ptr, const uschar **endptr) static BOOL check_posix_syntax(const uschar *ptr, const uschar **endptr) { int terminator; /* Don't combine these lines; the Solaris cc */ terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ for (++ptr; *ptr != 0; ptr++) { if (*ptr == '\\' && ptr[1] == ']') ptr++; else { if (*ptr == ']') return FALSE; if (*ptr == terminator && ptr[1] == ']') { *endptr = ptr; return TRUE; } } } return FALSE; }	pcrecomp.c	1821
STATIC INT	check_posix_name(const uschar *ptr, int len) static int check_posix_name(const uschar *ptr, int len) { const char *pn = posix_names; register int yield = 0; while (posix_name_lengths[yield] != 0) { if (len == posix_name_lengths[yield] && strncmp((const char *)ptr, pn, len) == 0) return yield; pn += posix_name_lengths[yield] + 1; yield++; } return -1; }	pcrecomp.c	1858
STATIC VOID	adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd, uschar *save_hwm) static void adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd, uschar *save_hwm) { uschar *ptr = group; while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL) { int offset; uschar *hc; /* See if this recursion is on the forward reference list. If so, adjust the reference. */ for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE) { offset = GET(hc, 0); if (cd->start_code + offset == ptr + 1) { PUT(hc, 0, offset + adjust); break; } } /* Otherwise, adjust the recursion offset if it's after the start of this group. */ if (hc >= cd->hwm) { offset = GET(ptr, 1); if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust); } ptr += 1 + LINK_SIZE; } }	pcrecomp.c	1905
STATIC USCHAR *	auto_callout(uschar *code, const uschar *ptr, compile_data *cd) static uschar * auto_callout(uschar *code, const uschar *ptr, compile_data *cd) { *code++ = OP_CALLOUT; *code++ = 255; PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */ PUT(code, LINK_SIZE, 0); /* Default length */ return code + 2*LINK_SIZE; }	pcrecomp.c	1959
STATIC VOID	complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd) static void complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd) { int length = ptr - cd->start_pattern - GET(previous_callout, 2); PUT(previous_callout, 2 + LINK_SIZE, length); } #ifdef SUPPORT_UCP	pcrecomp.c	1987
STATIC BOOL	get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr, unsigned int *odptr) static BOOL get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr, unsigned int *odptr) { unsigned int c, othercase, next; for (c = *cptr; c <= d; c++) { if ((othercase = UCD_OTHERCASE(c)) != c) break; } if (c > d) return FALSE; *ocptr = othercase; next = othercase + 1; for (++c; c <= d; c++) { if (UCD_OTHERCASE(c) != next) break; next++; } *odptr = next - 1; *cptr = c; return TRUE; } #endif /* SUPPORT_UCP */	pcrecomp.c	2015
STATIC BOOL	check_auto_possessive(int op_code, int item, BOOL utf8, uschar utf8_char, const uschar ptr, int options, compile_data cd) static BOOL check_auto_possessive(int op_code, int item, BOOL utf8, uschar utf8_char, const uschar ptr, int options, compile_data cd) { int next; /* Skip whitespace and comments in extended mode / if ((options & PCRE_EXTENDED) != 0) { for (;;) { while ((cd->ctypes[ptr] & ctype_space) != 0) ptr++; if (ptr == '#') { while ((++ptr) != 0) if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } } else break; } } /* If the next item is one that we can handle, get its value. A non-negative value is a character, a negative value is an escape value. / if (ptr == '\\') { int temperrorcode = 0; next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE); if (temperrorcode != 0) return FALSE; ptr++; /* Point after the escape sequence / } else if ((cd->ctypes[ptr] & ctype_meta) == 0) { #ifdef SUPPORT_UTF8 if (utf8) { GETCHARINC(next, ptr); } else #endif next = ptr++; } else return FALSE; / Skip whitespace and comments in extended mode / if ((options & PCRE_EXTENDED) != 0) { for (;;) { while ((cd->ctypes[ptr] & ctype_space) != 0) ptr++; if (ptr == '#') { while ((++ptr) != 0) if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } } else break; } } /* If the next thing is itself optional, we have to give up. / if (ptr == '' \|\| ptr == '?' \|\| strncmp((char )ptr, "{0,", 3) == 0) return FALSE; / Now compare the next item with the previous opcode. If the previous is a positive single character match, "item" either contains the character or, if "item" is greater than 127 in utf8 mode, the character's bytes are in utf8_char. / / Handle cases when the next item is a character. 
/ if (next >= 0) switch(op_code) { case OP_CHAR: #ifdef SUPPORT_UTF8 if (utf8 && item > 127) { GETCHAR(item, utf8_char); } #else (void)(utf8_char); / Keep compiler happy by referencing function argument / #endif return item != next; / For CHARNC (caseless character) we must check the other case. If we have Unicode property support, we can use it to test the other case of high-valued characters. / case OP_CHARNC: #ifdef SUPPORT_UTF8 if (utf8 && item > 127) { GETCHAR(item, utf8_char); } #endif if (item == next) return FALSE; #ifdef SUPPORT_UTF8 if (utf8) { unsigned int othercase; if (next < 128) othercase = cd->fcc[next]; else #ifdef SUPPORT_UCP othercase = UCD_OTHERCASE((unsigned int)next); #else othercase = NOTACHAR; #endif return (unsigned int)item != othercase; } else #endif / SUPPORT_UTF8 / return (item != cd->fcc[next]); / Non-UTF-8 mode / / For OP_NOT, "item" must be a single-byte character. / case OP_NOT: if (item == next) return TRUE; if ((options & PCRE_CASELESS) == 0) return FALSE; #ifdef SUPPORT_UTF8 if (utf8) { unsigned int othercase; if (next < 128) othercase = cd->fcc[next]; else #ifdef SUPPORT_UCP othercase = UCD_OTHERCASE(next); #else othercase = NOTACHAR; #endif return (unsigned int)item == othercase; } else #endif / SUPPORT_UTF8 / return (item == cd->fcc[next]); / Non-UTF-8 mode / case OP_DIGIT: return next > 127 \|\| (cd->ctypes[next] & ctype_digit) == 0; case OP_NOT_DIGIT: return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0; case OP_WHITESPACE: return next > 127 \|\| (cd->ctypes[next] & ctype_space) == 0; case OP_NOT_WHITESPACE: return next <= 127 && (cd->ctypes[next] & ctype_space) != 0; case OP_WORDCHAR: return next > 127 \|\| (cd->ctypes[next] & ctype_word) == 0; case OP_NOT_WORDCHAR: return next <= 127 && (cd->ctypes[next] & ctype_word) != 0; case OP_HSPACE: case OP_NOT_HSPACE: switch(next) { case 0x09: case 0x20: case 0xa0: case 0x1680: case 0x180e: case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004: case 0x2005: case 
0x2006: case 0x2007: case 0x2008: case 0x2009: case 0x200A: case 0x202f: case 0x205f: case 0x3000: return op_code != OP_HSPACE; default: return op_code == OP_HSPACE; } case OP_VSPACE: case OP_NOT_VSPACE: switch(next) { case 0x0a: case 0x0b: case 0x0c: case 0x0d: case 0x85: case 0x2028: case 0x2029: return op_code != OP_VSPACE; default: return op_code == OP_VSPACE; } default: return FALSE; } / Handle the case when the next item is \d, \s, etc. / switch(op_code) { case OP_CHAR: case OP_CHARNC: #ifdef SUPPORT_UTF8 if (utf8 && item > 127) { GETCHAR(item, utf8_char); } #endif switch(-next) { case ESC_d: return item > 127 \|\| (cd->ctypes[item] & ctype_digit) == 0; case ESC_D: return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0; case ESC_s: return item > 127 \|\| (cd->ctypes[item] & ctype_space) == 0; case ESC_S: return item <= 127 && (cd->ctypes[item] & ctype_space) != 0; case ESC_w: return item > 127 \|\| (cd->ctypes[item] & ctype_word) == 0; case ESC_W: return item <= 127 && (cd->ctypes[item] & ctype_word) != 0; case ESC_h: case ESC_H: switch(item) { case 0x09: case 0x20: case 0xa0: case 0x1680: case 0x180e: case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004: case 0x2005: case 0x2006: case 0x2007: case 0x2008: case 0x2009: case 0x200A: case 0x202f: case 0x205f: case 0x3000: return -next != ESC_h; default: return -next == ESC_h; } case ESC_v: case ESC_V: switch(item) { case 0x0a: case 0x0b: case 0x0c: case 0x0d: case 0x85: case 0x2028: case 0x2029: return -next != ESC_v; default: return -next == ESC_v; } default: return FALSE; } case OP_DIGIT: return next == -ESC_D \|\| next == -ESC_s \|\| next == -ESC_W \|\| next == -ESC_h \|\| next == -ESC_v; case OP_NOT_DIGIT: return next == -ESC_d; case OP_WHITESPACE: return next == -ESC_S \|\| next == -ESC_d \|\| next == -ESC_w; case OP_NOT_WHITESPACE: return next == -ESC_s \|\| next == -ESC_h \|\| next == -ESC_v; case OP_HSPACE: return next == -ESC_S \|\| next == -ESC_H \|\| next == -ESC_d \|\| next == 
-ESC_w; case OP_NOT_HSPACE: return next == -ESC_h; / Can't have \S in here because VT matches \S (Perl anomaly) / case OP_VSPACE: return next == -ESC_V \|\| next == -ESC_d \|\| next == -ESC_w; case OP_NOT_VSPACE: return next == -ESC_v; case OP_WORDCHAR: return next == -ESC_W \|\| next == -ESC_s \|\| next == -ESC_h \|\| next == -ESC_v; case OP_NOT_WORDCHAR: return next == -ESC_w \|\| next == -ESC_d; default: return FALSE; } / Control does not reach here */ }	pcrecomp.c	2064
STATIC BOOL	compile_branch(int optionsptr, uschar codeptr, const uschar ptrptr, int errorcodeptr, int firstbyteptr, int reqbyteptr, branch_chain bcptr, compile_data cd, int lengthptr) static BOOL compile_branch(int optionsptr, uschar codeptr, const uschar ptrptr, int errorcodeptr, int firstbyteptr, int reqbyteptr, branch_chain bcptr, compile_data cd, int lengthptr) { int repeat_type, op_type; int repeat_min = 0, repeat_max = 0; /* To please picky compilers / int bravalue = 0; int greedy_default, greedy_non_default; int firstbyte, reqbyte; int zeroreqbyte, zerofirstbyte; int req_caseopt, reqvary, tempreqvary; int options = optionsptr; int after_manual_callout = 0; int length_prevgroup = 0; register int c; register uschar code = codeptr; uschar last_code = code; uschar orig_code = code; uschar tempcode; BOOL inescq = FALSE; BOOL groupsetfirstbyte = FALSE; const uschar ptr = ptrptr; const uschar tempptr; uschar previous = NULL; uschar previous_callout = NULL; uschar save_hwm = NULL; uschar classbits[32]; #ifdef SUPPORT_UTF8 BOOL class_utf8; BOOL utf8 = (options & PCRE_UTF8) != 0; uschar class_utf8data; uschar class_utf8data_base; uschar utf8_char[6]; #else BOOL utf8 = FALSE; uschar utf8_char = NULL; #endif #ifdef DEBUG if (lengthptr != NULL) DPRINTF((">> start branch\n")); #endif /* Set up the default and non-default settings for greediness / greedy_default = ((options & PCRE_UNGREEDY) != 0); greedy_non_default = greedy_default ^ 1; / Initialize no first byte, no required byte. REQ_UNSET means "no char matching encountered yet". It gets changed to REQ_NONE if we hit something that matches a non-fixed char first char; reqbyte just remains unset if we never find one. When we hit a repeat whose minimum is zero, we may have to adjust these values to take the zero repeat into account. This is implemented by setting them to zerofirstbyte and zeroreqbyte when such a repeat is encountered. 
The individual item types that can be repeated set these backoff variables appropriately. / firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET; / The variable req_caseopt contains either the REQ_CASELESS value or zero, according to the current setting of the caseless flag. REQ_CASELESS is a bit value > 255. It is added into the firstbyte or reqbyte variables to record the case status of the value. This is used only for ASCII characters. / req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; / Switch on next character until the end of the branch / for (;; ptr++) { BOOL negate_class; BOOL should_flip_negation; BOOL possessive_quantifier; BOOL is_quantifier; BOOL is_recurse; BOOL reset_bracount; int class_charcount; int class_lastchar; int newoptions; int recno; int refsign; int skipbytes; int subreqbyte; int subfirstbyte; int terminator; int mclength; uschar mcbuffer[8]; / Get next byte in the pattern / c = ptr; /* If we are in the pre-compile phase, accumulate the length used for the previous cycle of this loop. / if (lengthptr != NULL) { #ifdef DEBUG if (code > cd->hwm) cd->hwm = code; / High water info / #endif if (code > cd->start_workspace + COMPILE_WORK_SIZE) / Check for overrun / { errorcodeptr = ERR52; goto FAILED; } /* There is at least one situation where code goes backwards: this is the case of a zero quantifier after a class (e.g. [ab]{0}). At compile time, the class is simply eliminated. However, it is created first, so we have to allow memory for it. Therefore, don't ever reduce the length at this point. / if (code < last_code) code = last_code; / Paranoid check for integer overflow / if (OFLOW_MAX - lengthptr < code - last_code) { errorcodeptr = ERR20; goto FAILED; } lengthptr += code - last_code; DPRINTF(("length=%d added %d c=%c\n", lengthptr, code - last_code, c)); / If "previous" is set and it is not at the start of the work space, move it back to there, in order to avoid filling up the work space. 
Otherwise, if "previous" is NULL, reset the current code pointer to the start. / if (previous != NULL) { if (previous > orig_code) { memmove(orig_code, previous, code - previous); code -= previous - orig_code; previous = orig_code; } } else code = orig_code; / Remember where this code item starts so we can pick up the length next time round. / last_code = code; } / In the real compile phase, just check the workspace used by the forward reference list. / else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE) { errorcodeptr = ERR52; goto FAILED; } /* If in \Q...\E, check for the end; if not, we have a literal / if (inescq && c != 0) { if (c == '\\' && ptr[1] == 'E') { inescq = FALSE; ptr++; continue; } else { if (previous_callout != NULL) { if (lengthptr == NULL) / Don't attempt in pre-compile phase / complete_callout(previous_callout, ptr, cd); previous_callout = NULL; } if ((options & PCRE_AUTO_CALLOUT) != 0) { previous_callout = code; code = auto_callout(code, ptr, cd); } goto NORMAL_CHAR; } } / Fill in length of a previous callout, except when the next thing is a quantifier. / is_quantifier = c == '' \|\| c == '+' \|\| c == '?' \|\| (c == '{' && is_counted_repeat(ptr+1)); if (!is_quantifier && previous_callout != NULL && after_manual_callout-- <= 0) { if (lengthptr == NULL) /* Don't attempt in pre-compile phase / complete_callout(previous_callout, ptr, cd); previous_callout = NULL; } / In extended mode, skip white space and comments / if ((options & PCRE_EXTENDED) != 0) { if ((cd->ctypes[c] & ctype_space) != 0) continue; if (c == '#') { while ((++ptr) != 0) { if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } } if (ptr != 0) continue; / Else fall through to handle end of string / c = 0; } } / No auto callout for quantifiers. 
/ if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier) { previous_callout = code; code = auto_callout(code, ptr, cd); } switch(c) { / ===================================================================/ case 0: / The branch terminates at string end / case '\|': / or \| or ) / case ')': firstbyteptr = firstbyte; reqbyteptr = reqbyte; codeptr = code; ptrptr = ptr; if (lengthptr != NULL) { if (OFLOW_MAX - lengthptr < code - last_code) { errorcodeptr = ERR20; goto FAILED; } lengthptr += code - last_code; /* To include callout length / DPRINTF((">> end branch\n")); } return TRUE; / ===================================================================/ / Handle single-character metacharacters. In multiline mode, ^ disables the setting of any following char as a first character. / case '^': if ((options & PCRE_MULTILINE) != 0) { if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; } previous = NULL; code++ = OP_CIRC; break; case '$': previous = NULL; code++ = OP_DOLL; break; / There can never be a first char if '.' is first, whatever happens about repeats. The value of reqbyte doesn't change either. / case '.': if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; zerofirstbyte = firstbyte; zeroreqbyte = reqbyte; previous = code; code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY; break; /* ===================================================================/ / Character classes. If the included characters are all < 256, we build a 32-byte bitmap of the permitted characters, except in the special case where there is only one such character. For negated classes, we build the map as usual, then invert it at the end. However, we use a different opcode so that data characters > 255 can be handled correctly. If the class contains characters outside the 0-255 range, a different opcode is compiled. It may optionally have a bit map for characters < 256, but those above are are explicitly listed afterwards. 
A flag byte tells whether the bitmap is present, and whether this is a negated class or not. In JavaScript compatibility mode, an isolated ']' causes an error. In default (Perl) mode, it is treated as a data character. / case ']': if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) { errorcodeptr = ERR64; goto FAILED; } goto NORMAL_CHAR; case '[': previous = code; /* PCRE supports POSIX class stuff inside a class. Perl gives an error if they are encountered at the top level, so we'll do that too. / if ((ptr[1] == ':' \|\| ptr[1] == '.' \|\| ptr[1] == '=') && check_posix_syntax(ptr, &tempptr)) { errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31; goto FAILED; } /* If the first character is '^', set the negation flag and skip it. Also, if the first few characters (either before or after ^) are \Q\E or \E we skip them too. This makes for compatibility with Perl. / negate_class = FALSE; for (;;) { c = (++ptr); if (c == '\\') { if (ptr[1] == 'E') ptr++; else if (strncmp((const char )ptr+1, "Q\\E", 3) == 0) ptr += 3; else break; } else if (!negate_class && c == '^') negate_class = TRUE; else break; } / Empty classes are allowed in JavaScript compatibility mode. Otherwise, an initial ']' is taken as a data character -- the code below handles that. In JS mode, [] must always fail, so generate OP_FAIL, whereas [^] must match any character, so generate OP_ALLANY. / if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) { code++ = negate_class? OP_ALLANY : OP_FAIL; if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; zerofirstbyte = firstbyte; break; } /* If a class contains a negative special such as \S, we need to flip the negation flag at the end, so that support for characters > 255 works correctly (they are all included in the class). / should_flip_negation = FALSE; / Keep a count of chars with values < 256 so that we can optimize the case of just a single character (as long as it's < 256). 
However, For higher valued UTF-8 characters, we don't yet do any optimization. / class_charcount = 0; class_lastchar = -1; / Initialize the 32-char bit map to all zeros. We build the map in a temporary bit of memory, in case the class contains only 1 character (less than 256), because in that case the compiled code doesn't use the bit map. / memset(classbits, 0, 32 sizeof(uschar)); #ifdef SUPPORT_UTF8 class_utf8 = FALSE; /* No chars >= 256 / class_utf8data = code + LINK_SIZE + 2; / For UTF-8 items / class_utf8data_base = class_utf8data; / For resetting in pass 1 / #endif / Process characters until ] is reached. By writing this as a "do" it means that an initial ] is taken as a data character. At the start of the loop, c contains the first byte of the character. / if (c != 0) do { const uschar oldptr; #ifdef SUPPORT_UTF8 if (utf8 && c > 127) { /* Braces are required because the / GETCHARLEN(c, ptr, ptr); / macro generates multiple statements / } / In the pre-compile phase, accumulate the length of any UTF-8 extra data and reset the pointer. This is so that very large classes that contain a zillion UTF-8 characters no longer overwrite the work space (which is on the stack). / if (lengthptr != NULL) { lengthptr += class_utf8data - class_utf8data_base; class_utf8data = class_utf8data_base; } #endif /* Inside \Q...\E everything is literal except \E / if (inescq) { if (c == '\\' && ptr[1] == 'E') / If we are at \E / { inescq = FALSE; / Reset literal state / ptr++; / Skip the 'E' / continue; / Carry on with next / } goto CHECK_RANGE; / Could be range if \E follows / } / Handle POSIX class names. Perl allows a negation extension of the form [:^name:]. A square bracket that doesn't match the syntax is treated as a literal. We also recognize the POSIX constructions [.ch.] and [=ch=] ("collating elements") and fault them, as Perl 5.6 and 5.8 do. / if (c == '[' && (ptr[1] == ':' \|\| ptr[1] == '.' 
\|\| ptr[1] == '=') && check_posix_syntax(ptr, &tempptr)) { BOOL local_negate = FALSE; int posix_class, taboffset, tabopt; register const uschar cbits = cd->cbits; uschar pbits[32]; if (ptr[1] != ':') { errorcodeptr = ERR31; goto FAILED; } ptr += 2; if (ptr == '^') { local_negate = TRUE; should_flip_negation = TRUE; /* Note negative special / ptr++; } posix_class = check_posix_name(ptr, tempptr - ptr); if (posix_class < 0) { errorcodeptr = ERR30; goto FAILED; } /* If matching is caseless, upper and lower are converted to alpha. This relies on the fact that the class table starts with alpha, lower, upper as the first 3 entries. / if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) posix_class = 0; / We build the bit map for the POSIX class in a chunk of local store because we may be adding and subtracting from it, and we don't want to subtract bits that may be in the main map already. At the end we or the result into the bit map that is being built. / posix_class = 3; /* Copy in the first table (always present) / memcpy(pbits, cbits + posix_class_maps[posix_class], 32 sizeof(uschar)); /* If there is a second table, add or remove it as required. / taboffset = posix_class_maps[posix_class + 1]; tabopt = posix_class_maps[posix_class + 2]; if (taboffset >= 0) { if (tabopt >= 0) for (c = 0; c < 32; c++) pbits[c] \|= cbits[c + taboffset]; else for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset]; } / Not see if we need to remove any special characters. An option value of 1 removes vertical space and 2 removes underscore. / if (tabopt < 0) tabopt = -tabopt; if (tabopt == 1) pbits[1] &= ~0x3c; else if (tabopt == 2) pbits[11] &= 0x7f; / Add the POSIX table or its complement into the main table that is being built and we are done. 
/ if (local_negate) for (c = 0; c < 32; c++) classbits[c] \|= ~pbits[c]; else for (c = 0; c < 32; c++) classbits[c] \|= pbits[c]; ptr = tempptr + 1; class_charcount = 10; / Set > 1; assumes more than 1 per class / continue; / End of POSIX syntax handling / } / Backslash may introduce a single character, or it may introduce one of the specials, which just set a flag. The sequence \b is a special case. Inside a class (and only there) it is treated as backspace. Elsewhere it marks a word boundary. Other escapes have preset maps ready to 'or' into the one we are building. We assume they have more than one character in them, so set class_charcount bigger than one. / if (c == '\\') { c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); if (errorcodeptr != 0) goto FAILED; if (-c == ESC_b) c = '\b'; /* \b is backspace in a class / else if (-c == ESC_X) c = 'X'; / \X is literal X in a class / else if (-c == ESC_R) c = 'R'; / \R is literal R in a class / else if (-c == ESC_Q) / Handle start of quoted string / { if (ptr[1] == '\\' && ptr[2] == 'E') { ptr += 2; / avoid empty string / } else inescq = TRUE; continue; } else if (-c == ESC_E) continue; / Ignore orphan \E / if (c < 0) { register const uschar cbits = cd->cbits; class_charcount += 2; /* Greater than 1 is what matters / / Save time by not doing this in the pre-compile phase. 
/ if (lengthptr == NULL) switch (-c) { case ESC_d: for (c = 0; c < 32; c++) classbits[c] \|= cbits[c+cbit_digit]; continue; case ESC_D: should_flip_negation = TRUE; for (c = 0; c < 32; c++) classbits[c] \|= ~cbits[c+cbit_digit]; continue; case ESC_w: for (c = 0; c < 32; c++) classbits[c] \|= cbits[c+cbit_word]; continue; case ESC_W: should_flip_negation = TRUE; for (c = 0; c < 32; c++) classbits[c] \|= ~cbits[c+cbit_word]; continue; case ESC_s: for (c = 0; c < 32; c++) classbits[c] \|= cbits[c+cbit_space]; classbits[1] &= ~0x08; / Perl 5.004 onwards omits VT from \s / continue; case ESC_S: should_flip_negation = TRUE; for (c = 0; c < 32; c++) classbits[c] \|= ~cbits[c+cbit_space]; classbits[1] \|= 0x08; / Perl 5.004 onwards omits VT from \s / continue; default: / Not recognized; fall through / break; / Need "default" setting to stop compiler warning. / } / In the pre-compile phase, just do the recognition. / else if (c == -ESC_d \|\| c == -ESC_D \|\| c == -ESC_w \|\| c == -ESC_W \|\| c == -ESC_s \|\| c == -ESC_S) continue; / We need to deal with \H, \h, \V, and \v in both phases because they use extra memory. 
/ if (-c == ESC_h) { SETBIT(classbits, 0x09); / VT / SETBIT(classbits, 0x20); / SPACE / SETBIT(classbits, 0xa0); / NSBP / #ifdef SUPPORT_UTF8 if (utf8) { class_utf8 = TRUE; class_utf8data++ = XCL_SINGLE; class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data); class_utf8data++ = XCL_SINGLE; class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data); class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data); class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data); class_utf8data++ = XCL_SINGLE; class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data); class_utf8data++ = XCL_SINGLE; class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data); class_utf8data++ = XCL_SINGLE; class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data); } #endif continue; } if (-c == ESC_H) { for (c = 0; c < 32; c++) { int x = 0xff; switch (c) { case 0x09/8: x ^= 1 << (0x09%8); break; case 0x20/8: x ^= 1 << (0x20%8); break; case 0xa0/8: x ^= 1 << (0xa0%8); break; default: break; } classbits[c] \|= x; } #ifdef SUPPORT_UTF8 if (utf8) { class_utf8 = TRUE; class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data); class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data); class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data); class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data); class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data); class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data); class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data); class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data); class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data); class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data); class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data); class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data); class_utf8data++ = XCL_RANGE; class_utf8data += 
_pcre_ord2utf8(0x3001, class_utf8data); class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data); } #endif continue; } if (-c == ESC_v) { SETBIT(classbits, 0x0a); /* LF / SETBIT(classbits, 0x0b); / VT / SETBIT(classbits, 0x0c); / FF / SETBIT(classbits, 0x0d); / CR / SETBIT(classbits, 0x85); / NEL / #ifdef SUPPORT_UTF8 if (utf8) { class_utf8 = TRUE; class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data); class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data); } #endif continue; } if (-c == ESC_V) { for (c = 0; c < 32; c++) { int x = 0xff; switch (c) { case 0x0a/8: x ^= 1 << (0x0a%8); x ^= 1 << (0x0b%8); x ^= 1 << (0x0c%8); x ^= 1 << (0x0d%8); break; case 0x85/8: x ^= 1 << (0x85%8); break; default: break; } classbits[c] \|= x; } #ifdef SUPPORT_UTF8 if (utf8) { class_utf8 = TRUE; class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data); class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data); class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data); class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data); } #endif continue; } /* We need to deal with \P and \p in both phases. / #ifdef SUPPORT_UCP if (-c == ESC_p \|\| -c == ESC_P) { BOOL negated; int pdata; int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); if (ptype < 0) goto FAILED; class_utf8 = TRUE; class_utf8data++ = ((-c == ESC_p) != negated)? XCL_PROP : XCL_NOTPROP; class_utf8data++ = ptype; class_utf8data++ = pdata; class_charcount -= 2; /* Not a < 256 character / continue; } #endif / Unrecognized escapes are faulted if PCRE is running in its strict mode. By default, for compatibility with Perl, they are treated as literals. / if ((options & PCRE_EXTRA) != 0) { errorcodeptr = ERR7; goto FAILED; } class_charcount -= 2; /* Undo the default count from above / c = ptr; /* Get the final character and fall through / } / Fall through if we have a single character (c >= 0). 
This may be greater than 256 in UTF-8 mode. / } / End of backslash handling / / A single character may be followed by '-' to form a range. However, Perl does not permit ']' to be the end of the range. A '-' character at the end is treated as a literal. Perl ignores orphaned \E sequences entirely. The code for handling \Q and \E is messy. / CHECK_RANGE: while (ptr[1] == '\\' && ptr[2] == 'E') { inescq = FALSE; ptr += 2; } oldptr = ptr; / Remember \r or \n / if (c == '\r' \|\| c == '\n') cd->external_flags \|= PCRE_HASCRORLF; / Check for range / if (!inescq && ptr[1] == '-') { int d; ptr += 2; while (ptr == '\\' && ptr[1] == 'E') ptr += 2; /* If we hit \Q (not followed by \E) at this point, go into escaped mode. / while (ptr == '\\' && ptr[1] == 'Q') { ptr += 2; if (ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; } inescq = TRUE; break; } if (ptr == 0 \|\| (!inescq && ptr == ']')) { ptr = oldptr; goto LONE_SINGLE_CHARACTER; } #ifdef SUPPORT_UTF8 if (utf8) { / Braces are required because the / GETCHARLEN(d, ptr, ptr); / macro generates multiple statements / } else #endif d = ptr; /* Not UTF-8 mode / / The second part of a range can be a single-character escape, but not any of the other escapes. Perl 5.6 treats a hyphen as a literal in such circumstances. / if (!inescq && d == '\\') { d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); if (errorcodeptr != 0) goto FAILED; /* \b is backspace; \X is literal X; \R is literal R; any other special means the '-' was literal / if (d < 0) { if (d == -ESC_b) d = '\b'; else if (d == -ESC_X) d = 'X'; else if (d == -ESC_R) d = 'R'; else { ptr = oldptr; goto LONE_SINGLE_CHARACTER; / A few lines below / } } } / Check that the two values are in the correct order. 
Optimize one-character ranges / if (d < c) { errorcodeptr = ERR8; goto FAILED; } if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below / / Remember \r or \n / if (d == '\r' \|\| d == '\n') cd->external_flags \|= PCRE_HASCRORLF; / In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless matching, we have to use an XCLASS with extra data items. Caseless matching for characters > 127 is available only if UCP support is available. / #ifdef SUPPORT_UTF8 if (utf8 && (d > 255 \|\| ((options & PCRE_CASELESS) != 0 && d > 127))) { class_utf8 = TRUE; / With UCP support, we can find the other case equivalents of the relevant characters. There may be several ranges. Optimize how they fit with the basic range. / #ifdef SUPPORT_UCP if ((options & PCRE_CASELESS) != 0) { unsigned int occ, ocd; unsigned int cc = c; unsigned int origd = d; while (get_othercase_range(&cc, origd, &occ, &ocd)) { if (occ >= (unsigned int)c && ocd <= (unsigned int)d) continue; / Skip embedded ranges / if (occ < (unsigned int)c && ocd >= (unsigned int)c - 1) / Extend the basic range / { / if there is overlap, / c = occ; / noting that if occ < c / continue; / we can't have ocd > d / } / because a subrange is / if (ocd > (unsigned int)d && occ <= (unsigned int)d + 1) / always shorter than / { / the basic range. / d = ocd; continue; } if (occ == ocd) { class_utf8data++ = XCL_SINGLE; } else { class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(occ, class_utf8data); } class_utf8data += _pcre_ord2utf8(ocd, class_utf8data); } } #endif / SUPPORT_UCP / / Now record the original range, possibly modified for UCP caseless overlapping ranges. / class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(c, class_utf8data); class_utf8data += _pcre_ord2utf8(d, class_utf8data); /* With UCP support, we are done. Without UCP support, there is no caseless matching for UTF-8 characters > 127; we can use the bit map for the smaller ones. 
/ #ifdef SUPPORT_UCP continue; / With next character in the class / #else if ((options & PCRE_CASELESS) == 0 \|\| c > 127) continue; / Adjust upper limit and fall through to set up the map / d = 127; #endif / SUPPORT_UCP / } #endif / SUPPORT_UTF8 / / We use the bit map for all cases when not in UTF-8 mode; else ranges that lie entirely within 0-127 when there is UCP support; else for partial ranges without UCP support. / class_charcount += d - c + 1; class_lastchar = d; / We can save a bit of time by skipping this in the pre-compile. / if (lengthptr == NULL) for (; c <= d; c++) { classbits[c/8] \|= (1 << (c&7)); if ((options & PCRE_CASELESS) != 0) { int uc = cd->fcc[c]; / flip case / classbits[uc/8] \|= (1 << (uc&7)); } } continue; / Go get the next char in the class / } / Handle a lone single character - we can get here for a normal non-escape char, or after \ that introduces a single character or for an apparent range that isn't. / LONE_SINGLE_CHARACTER: / Handle a character that cannot go in the bit map / #ifdef SUPPORT_UTF8 if (utf8 && (c > 255 \|\| ((options & PCRE_CASELESS) != 0 && c > 127))) { class_utf8 = TRUE; class_utf8data++ = XCL_SINGLE; class_utf8data += _pcre_ord2utf8(c, class_utf8data); #ifdef SUPPORT_UCP if ((options & PCRE_CASELESS) != 0) { unsigned int othercase; if ((othercase = UCD_OTHERCASE(c)) != c) { class_utf8data++ = XCL_SINGLE; class_utf8data += _pcre_ord2utf8(othercase, class_utf8data); } } #endif / SUPPORT_UCP / } else #endif / SUPPORT_UTF8 / / Handle a single-byte character / { classbits[c/8] \|= (1 << (c&7)); if ((options & PCRE_CASELESS) != 0) { c = cd->fcc[c]; / flip case / classbits[c/8] \|= (1 << (c&7)); } class_charcount++; class_lastchar = c; } } / Loop until ']' reached. This "while" is the end of the "do" above. 
/ while ((c = (++ptr)) != 0 && (c != ']' \|\| inescq)); if (c == 0) /* Missing terminating ']' / { errorcodeptr = ERR6; goto FAILED; } /* This code has been disabled because it would mean that \s counts as an explicit \r or \n reference, and that's not really what is wanted. Now we set the flag only if there is a literal "\r" or "\n" in the class. / #if 0 / Remember whether \r or \n are in this class / if (negate_class) { if ((classbits[1] & 0x24) != 0x24) cd->external_flags \|= PCRE_HASCRORLF; } else { if ((classbits[1] & 0x24) != 0) cd->external_flags \|= PCRE_HASCRORLF; } #endif / If class_charcount is 1, we saw precisely one character whose value is less than 256. As long as there were no characters >= 128 and there was no use of \p or \P, in other words, no use of any XCLASS features, we can optimize. In UTF-8 mode, we can optimize the negative case only if there were no characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR operate on single-bytes only. This is an historical hangover. Maybe one day we can tidy these opcodes to handle multi-byte characters. The optimization throws away the bit map. We turn the item into a 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note that OP_NOT does not support multibyte characters. In the positive case, it can cause firstbyte to be set. Otherwise, there can be no first char if this item is first, whatever repeat count may follow. In the case of reqbyte, save the previous value for reinstating. / #ifdef SUPPORT_UTF8 if (class_charcount == 1 && !class_utf8 && (!utf8 \|\| !negate_class \|\| class_lastchar < 128)) #else if (class_charcount == 1) #endif { zeroreqbyte = reqbyte; / The OP_NOT opcode works on one-byte characters only. 
/ if (negate_class) { if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; zerofirstbyte = firstbyte; code++ = OP_NOT; code++ = class_lastchar; break; } / For a single, positive character, get the value into mcbuffer, and then we can handle this with the normal one-character code. / #ifdef SUPPORT_UTF8 if (utf8 && class_lastchar > 127) mclength = _pcre_ord2utf8(class_lastchar, mcbuffer); else #endif { mcbuffer[0] = class_lastchar; mclength = 1; } goto ONE_CHAR; } / End of 1-char optimization / / The general case - not the one-char optimization. If this is the first thing in the branch, there can be no first char setting, whatever the repeat count. Any reqbyte setting must remain unchanged after any kind of repeat. / if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; zerofirstbyte = firstbyte; zeroreqbyte = reqbyte; / If there are characters with values > 255, we have to compile an extended class, with its own opcode, unless there was a negated special such as \S in the class, because in that case all characters > 255 are in the class, so any that were explicitly given as well can be ignored. If (when there are explicit characters > 255 that must be listed) there are no characters < 256, we can omit the bitmap in the actual compiled code. / #ifdef SUPPORT_UTF8 if (class_utf8 && !should_flip_negation) { class_utf8data++ = XCL_END; /* Marks the end of extra data / code++ = OP_XCLASS; code += LINK_SIZE; code = negate_class? XCL_NOT : 0; / If the map is required, move up the extra data to make room for it; otherwise just move the code pointer to the end of the extra data. 
/ if (class_charcount > 0) { code++ \|= XCL_MAP; memmove(code + 32, code, class_utf8data - code); memcpy(code, classbits, 32); code = class_utf8data + 32; } else code = class_utf8data; /* Now fill in the complete length of the item / PUT(previous, 1, code - previous); break; / End of class handling / } #endif / If there are no characters > 255, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the whole class was negated and whether there were negative specials such as \S in the class. Then copy the 32-byte map into the code vector, negating it if necessary. / code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; if (negate_class) { if (lengthptr == NULL) /* Save time in the pre-compile phase / for (c = 0; c < 32; c++) code[c] = ~classbits[c]; } else { memcpy(code, classbits, 32); } code += 32; break; / ===================================================================/ / Various kinds of repeat; '{' is not necessarily a quantifier, but this has been tested above. / case '{': if (!is_quantifier) goto NORMAL_CHAR; ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr); if (errorcodeptr != 0) goto FAILED; goto REPEAT; case '': repeat_min = 0; repeat_max = -1; goto REPEAT; case '+': repeat_min = 1; repeat_max = -1; goto REPEAT; case '?': repeat_min = 0; repeat_max = 1; REPEAT: if (previous == NULL) { errorcodeptr = ERR9; goto FAILED; } if (repeat_min == 0) { firstbyte = zerofirstbyte; /* Adjust for zero repeat / reqbyte = zeroreqbyte; / Ditto / } / Remember whether this is a variable length repeat / reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY; op_type = 0; / Default single-char op codes / possessive_quantifier = FALSE; / Default not possessive quantifier / / Save start of previous item, in case we have to move it up to make space for an inserted OP_ONCE for the additional '+' extension. / tempcode = previous; / If the next character is '+', we have a possessive quantifier. 
This implies greediness, whatever the setting of the PCRE_UNGREEDY option. If the next character is '?' this is a minimizing repeat, by default, but if PCRE_UNGREEDY is set, it works the other way round. We change the repeat type to the non-default. / if (ptr[1] == '+') { repeat_type = 0; / Force greedy / possessive_quantifier = TRUE; ptr++; } else if (ptr[1] == '?') { repeat_type = greedy_non_default; ptr++; } else repeat_type = greedy_default; / If previous was a character match, abolish the item and generate a repeat item instead. If a char item has a minumum of more than one, ensure that it is set in reqbyte - it might not be if a sequence such as x{3} is the first thing in a branch because the x will have gone into firstbyte instead. / if (previous == OP_CHAR \|\| previous == OP_CHARNC) { / Deal with UTF-8 characters that take up more than one byte. It's easier to write this out separately than try to macrify it. Use c to hold the length of the character in bytes, plus 0x80 to flag that it's a length rather than a small character. / #ifdef SUPPORT_UTF8 if (utf8 && (code[-1] & 0x80) != 0) { uschar lastchar = code - 1; while((lastchar & 0xc0) == 0x80) lastchar--; c = code - lastchar; / Length of UTF-8 character / memcpy(utf8_char, lastchar, c); / Save the char / c \|= 0x80; / Flag c as a length / } else #endif / Handle the case of a single byte - either with no UTF8 support, or with UTF-8 disabled, or for a UTF-8 character < 128. / { c = code[-1]; if (repeat_min > 1) reqbyte = c \| req_caseopt \| cd->req_varyopt; } / If the repetition is unlimited, it pays to see if the next thing on the line is something that cannot possibly match this character. If so, automatically possessifying this item gains some performance in the case where the match fails. 
/ if (!possessive_quantifier && repeat_max < 0 && check_auto_possessive(previous, c, utf8, utf8_char, ptr + 1, options, cd)) { repeat_type = 0; /* Force greedy / possessive_quantifier = TRUE; } goto OUTPUT_SINGLE_REPEAT; / Code shared with single character types / } / If previous was a single negated character ([^a] or similar), we use one of the special opcodes, replacing it. The code is shared with single- character repeats by setting opt_type to add a suitable offset into repeat_type. We can also test for auto-possessification. OP_NOT is currently used only for single-byte chars. / else if (previous == OP_NOT) { op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes / c = previous[1]; if (!possessive_quantifier && repeat_max < 0 && check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd)) { repeat_type = 0; / Force greedy / possessive_quantifier = TRUE; } goto OUTPUT_SINGLE_REPEAT; } / If previous was a character type match (\d or similar), abolish it and create a suitable repeat item. The code is shared with single-character repeats by setting op_type to add a suitable offset into repeat_type. Note the the Unicode property types will be present only when SUPPORT_UCP is defined, but we don't wrap the little bits of code here because it just makes it horribly messy. / else if (previous < OP_EODN) { uschar oldcode; int prop_type, prop_value; op_type = OP_TYPESTAR - OP_STAR; / Use type opcodes / c = previous; if (!possessive_quantifier && repeat_max < 0 && check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd)) { repeat_type = 0; /* Force greedy / possessive_quantifier = TRUE; } OUTPUT_SINGLE_REPEAT: if (previous == OP_PROP \|\| previous == OP_NOTPROP) { prop_type = previous[1]; prop_value = previous[2]; } else prop_type = prop_value = -1; oldcode = code; code = previous; / Usually overwrite previous item / / If the maximum is zero then the minimum must also be zero; Perl allows this case, so we do too - by simply omitting the item altogether. 
/ if (repeat_max == 0) goto END_REPEAT; / All real repeats make it impossible to handle partial matching (maybe one day we will be able to remove this restriction). / if (repeat_max != 1) cd->external_flags \|= PCRE_NOPARTIAL; / Combine the op_type with the repeat_type / repeat_type += op_type; / A minimum of zero is handled either as the special case * or ?, or as an UPTO, with the maximum given. / if (repeat_min == 0) { if (repeat_max == -1) code++ = OP_STAR + repeat_type; else if (repeat_max == 1) code++ = OP_QUERY + repeat_type; else { code++ = OP_UPTO + repeat_type; PUT2INC(code, 0, repeat_max); } } /* A repeat minimum of 1 is optimized into some special cases. If the maximum is unlimited, we use OP_PLUS. Otherwise, the original item is left in place and, if the maximum is greater than 1, we use OP_UPTO with one less than the maximum. / else if (repeat_min == 1) { if (repeat_max == -1) code++ = OP_PLUS + repeat_type; else { code = oldcode; /* leave previous item in place / if (repeat_max == 1) goto END_REPEAT; code++ = OP_UPTO + repeat_type; PUT2INC(code, 0, repeat_max - 1); } } /* The case {n,n} is just an EXACT, while the general case {n,m} is handled as an EXACT followed by an UPTO. / else { code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type / PUT2INC(code, 0, repeat_min); / If the maximum is unlimited, insert an OP_STAR. Before doing so, we have to insert the character for the previous code. For a repeated Unicode property match, there are two extra bytes that define the required property. In UTF-8 mode, long characters have their length in c, with the 0x80 bit as a flag. / if (repeat_max < 0) { #ifdef SUPPORT_UTF8 if (utf8 && c >= 128) { memcpy(code, utf8_char, c & 7); code += c & 7; } else #endif { code++ = c; if (prop_type >= 0) { code++ = prop_type; code++ = prop_value; } } code++ = OP_STAR + repeat_type; } / Else insert an UPTO if the max is greater than the min, again preceded by the character, for the previously inserted code. 
If the UPTO is just for 1 instance, we can use QUERY instead. / else if (repeat_max != repeat_min) { #ifdef SUPPORT_UTF8 if (utf8 && c >= 128) { memcpy(code, utf8_char, c & 7); code += c & 7; } else #endif code++ = c; if (prop_type >= 0) { code++ = prop_type; code++ = prop_value; } repeat_max -= repeat_min; if (repeat_max == 1) { code++ = OP_QUERY + repeat_type; } else { code++ = OP_UPTO + repeat_type; PUT2INC(code, 0, repeat_max); } } } /* The character or character type itself comes last in all cases. / #ifdef SUPPORT_UTF8 if (utf8 && c >= 128) { memcpy(code, utf8_char, c & 7); code += c & 7; } else #endif code++ = c; /* For a repeated Unicode property match, there are two extra bytes that define the required property. / #ifdef SUPPORT_UCP if (prop_type >= 0) { code++ = prop_type; code++ = prop_value; } #endif } / If previous was a character class or a back reference, we put the repeat stuff after it, but just skip the item if the repeat was {0,0}. / else if (previous == OP_CLASS \|\| previous == OP_NCLASS \|\| #ifdef SUPPORT_UTF8 previous == OP_XCLASS \|\| #endif previous == OP_REF) { if (repeat_max == 0) { code = previous; goto END_REPEAT; } / All real repeats make it impossible to handle partial matching (maybe one day we will be able to remove this restriction). / if (repeat_max != 1) cd->external_flags \|= PCRE_NOPARTIAL; if (repeat_min == 0 && repeat_max == -1) code++ = OP_CRSTAR + repeat_type; else if (repeat_min == 1 && repeat_max == -1) code++ = OP_CRPLUS + repeat_type; else if (repeat_min == 0 && repeat_max == 1) code++ = OP_CRQUERY + repeat_type; else { code++ = OP_CRRANGE + repeat_type; PUT2INC(code, 0, repeat_min); if (repeat_max == -1) repeat_max = 0; / 2-byte encoding for max / PUT2INC(code, 0, repeat_max); } } / If previous was a bracket group, we may have to replicate it in certain cases. 
/ else if (previous == OP_BRA \|\| previous == OP_CBRA \|\| previous == OP_ONCE \|\| previous == OP_COND) { register int i; int ketoffset = 0; int len = code - previous; uschar bralink = NULL; /* Repeating a DEFINE group is pointless / if (previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF) { errorcodeptr = ERR55; goto FAILED; } / If the maximum repeat count is unlimited, find the end of the bracket by scanning through from the start, and compute the offset back to it from the current code pointer. There may be an OP_OPT setting following the final KET, so we can't find the end just by going back from the code pointer. / if (repeat_max == -1) { register uschar ket = previous; do ket += GET(ket, 1); while (ket != OP_KET); ketoffset = code - ket; } / The case of a zero minimum is special because of the need to stick OP_BRAZERO in front of it, and because the group appears once in the data, whereas in other cases it appears the minimum number of times. For this reason, it is simplest to treat this case separately, as otherwise the code gets far too messy. There are several special subcases when the minimum is zero. / if (repeat_min == 0) { / If the maximum is also zero, we used to just omit the group from the output altogether, like this: if (repeat_max == 0) { code = previous; goto END_REPEAT; ** } However, that fails when a group is referenced as a subroutine from elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it so that it is skipped on execution. As we don't have a list of which groups are referenced, we cannot do this selectively. If the maximum is 1 or unlimited, we just have to stick in the BRAZERO and do no more at this point. However, we do need to adjust any OP_RECURSE calls inside the group that refer to the group itself or any internal or forward referenced group, because the offset is from the start of the whole regex. Temporarily terminate the pattern while doing this. 
/ if (repeat_max <= 1) / Covers 0, 1, and unlimited / { code = OP_END; adjust_recurse(previous, 1, utf8, cd, save_hwm); memmove(previous+1, previous, len); code++; if (repeat_max == 0) { previous++ = OP_SKIPZERO; goto END_REPEAT; } previous++ = OP_BRAZERO + repeat_type; } /* If the maximum is greater than 1 and limited, we have to replicate in a nested fashion, sticking OP_BRAZERO before each set of brackets. The first one has to be handled carefully because it's the original copy, which has to be moved up. The remainder can be handled by code that is common with the non-zero minimum case below. We have to adjust the value or repeat_max, since one less copy is required. Once again, we may have to adjust any OP_RECURSE calls inside the group. / else { int offset; code = OP_END; adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm); memmove(previous + 2 + LINK_SIZE, previous, len); code += 2 + LINK_SIZE; previous++ = OP_BRAZERO + repeat_type; previous++ = OP_BRA; /* We chain together the bracket offset fields that have to be filled in later when the ends of the brackets are reached. / offset = (bralink == NULL)? 0 : previous - bralink; bralink = previous; PUTINC(previous, 0, offset); } repeat_max--; } / If the minimum is greater than zero, replicate the group as many times as necessary, and adjust the maximum to the number of subsequent copies that we need. If we set a first char from the group, and didn't set a required char, copy the latter from the former. If there are any forward reference subroutine calls in the group, there will be entries on the workspace list; replicate these with an appropriate increment. / else { if (repeat_min > 1) { / In the pre-compile phase, we don't actually do the replication. We just adjust the length as if we had. Do some paranoid checks for potential integer overflow. 
/ if (lengthptr != NULL) { int delta = (repeat_min - 1)length_prevgroup; if ((double)(repeat_min - 1)(double)length_prevgroup > (double)INT_MAX \|\| OFLOW_MAX - lengthptr < delta) { errorcodeptr = ERR20; goto FAILED; } lengthptr += delta; } /* This is compiling for real / else { if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte; for (i = 1; i < repeat_min; i++) { uschar hc; uschar this_hwm = cd->hwm; memcpy(code, previous, len); for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) { PUT(cd->hwm, 0, GET(hc, 0) + len); cd->hwm += LINK_SIZE; } save_hwm = this_hwm; code += len; } } } if (repeat_max > 0) repeat_max -= repeat_min; } / This code is common to both the zero and non-zero minimum cases. If the maximum is limited, it replicates the group in a nested fashion, remembering the bracket starts on a stack. In the case of a zero minimum, the first one was set up above. In all cases the repeat_max now specifies the number of additional copies needed. Again, we must remember to replicate entries on the forward reference list. / if (repeat_max >= 0) { / In the pre-compile phase, we don't actually do the replication. We just adjust the length as if we had. For each repetition we must add 1 to the length for BRAZERO and for all but the last repetition we must add 2 + 2LINKSIZE to allow for the nesting that occurs. Do some paranoid checks to avoid integer overflow. / if (lengthptr != NULL && repeat_max > 0) { int delta = repeat_max * (length_prevgroup + 1 + 2 + 2LINK_SIZE) - 2 - 2LINK_SIZE; /* Last one doesn't nest / if ((double)repeat_max (double)(length_prevgroup + 1 + 2 + 2LINK_SIZE) > (double)INT_MAX \|\| OFLOW_MAX - lengthptr < delta) { errorcodeptr = ERR20; goto FAILED; } lengthptr += delta; } /* This is compiling for real / else for (i = repeat_max - 1; i >= 0; i--) { uschar hc; uschar this_hwm = cd->hwm; code++ = OP_BRAZERO + repeat_type; /* All but the final copy start a new nesting, maintaining the chain of brackets outstanding. 
/ if (i != 0) { int offset; code++ = OP_BRA; offset = (bralink == NULL)? 0 : code - bralink; bralink = code; PUTINC(code, 0, offset); } memcpy(code, previous, len); for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) { PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1)); cd->hwm += LINK_SIZE; } save_hwm = this_hwm; code += len; } /* Now chain through the pending brackets, and fill in their length fields (which are holding the chain links pro tem). / while (bralink != NULL) { int oldlinkoffset; int offset = code - bralink + 1; uschar bra = code - offset; oldlinkoffset = GET(bra, 1); bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; code++ = OP_KET; PUTINC(code, 0, offset); PUT(bra, 1, offset); } } / If the maximum is unlimited, set a repeater in the final copy. We can't just offset backwards from the current code point, because we don't know if there's been an options resetting after the ket. The correct offset was computed above. Then, when we are doing the actual compile phase, check to see whether this group is a non-atomic one that could match an empty string. If so, convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so that runtime checking can be done. [This check is also applied to atomic groups at runtime, but in a different way.] / else { uschar ketcode = code - ketoffset; uschar bracode = ketcode - GET(ketcode, 1); ketcode = OP_KETRMAX + repeat_type; if (lengthptr == NULL && bracode != OP_ONCE) { uschar scode = bracode; do { if (could_be_empty_branch(scode, ketcode, utf8)) { bracode += OP_SBRA - OP_BRA; break; } scode += GET(scode, 1); } while (scode == OP_ALT); } } } /* If previous is OP_FAIL, it was generated by an empty class [] in JavaScript mode. The other ways in which OP_FAIL can be generated, that is by (FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat" error above. We can just ignore the repeat in JS case. 
/ else if (previous == OP_FAIL) goto END_REPEAT; / Else there's some kind of shambles / else { errorcodeptr = ERR11; goto FAILED; } /* If the character following a repeat is '+', or if certain optimization tests above succeeded, possessive_quantifier is TRUE. For some of the simpler opcodes, there is an special alternative opcode for this. For anything else, we wrap the entire repeated item inside OP_ONCE brackets. The '+' notation is just syntactic sugar, taken from Sun's Java package, but the special opcodes can optimize it a bit. The repeated item starts at tempcode, not at previous, which might be the first part of a string whose (former) last char we repeated. Possessifying an 'exact' quantifier has no effect, so we can ignore it. But an 'upto' may follow. We skip over an 'exact' item, and then test the length of what remains before proceeding. / if (possessive_quantifier) { int len; if (tempcode == OP_EXACT \|\| tempcode == OP_TYPEEXACT \|\| tempcode == OP_NOTEXACT) tempcode += _pcre_OP_lengths[tempcode] + ((tempcode == OP_TYPEEXACT && (tempcode[3] == OP_PROP \|\| tempcode[3] == OP_NOTPROP))? 
2:0); len = code - tempcode; if (len > 0) switch (tempcode) { case OP_STAR: tempcode = OP_POSSTAR; break; case OP_PLUS: tempcode = OP_POSPLUS; break; case OP_QUERY: tempcode = OP_POSQUERY; break; case OP_UPTO: tempcode = OP_POSUPTO; break; case OP_TYPESTAR: tempcode = OP_TYPEPOSSTAR; break; case OP_TYPEPLUS: tempcode = OP_TYPEPOSPLUS; break; case OP_TYPEQUERY: tempcode = OP_TYPEPOSQUERY; break; case OP_TYPEUPTO: tempcode = OP_TYPEPOSUPTO; break; case OP_NOTSTAR: tempcode = OP_NOTPOSSTAR; break; case OP_NOTPLUS: tempcode = OP_NOTPOSPLUS; break; case OP_NOTQUERY: tempcode = OP_NOTPOSQUERY; break; case OP_NOTUPTO: tempcode = OP_NOTPOSUPTO; break; default: memmove(tempcode + 1+LINK_SIZE, tempcode, len); code += 1 + LINK_SIZE; len += 1 + LINK_SIZE; tempcode[0] = OP_ONCE; code++ = OP_KET; PUTINC(code, 0, len); PUT(tempcode, 1, len); break; } } /* In all case we no longer have a previous item. We also set the "follows varying string" flag for subsequently encountered reqbytes if it isn't already set and we have just passed a varying length item. / END_REPEAT: previous = NULL; cd->req_varyopt \|= reqvary; break; / ===================================================================/ / Start of nested parenthesized sub-expression, or comment or lookahead or lookbehind or option setting or condition or all the other extended parenthesis forms. / case '(': newoptions = options; skipbytes = 0; bravalue = OP_CBRA; save_hwm = cd->hwm; reset_bracount = FALSE; / First deal with various "verbs" that can be introduced by ''. 
/ if ((++ptr) == '' && (cd->ctypes[ptr[1]] & ctype_letter) != 0) { int i, namelen; const char vn = verbnames; const uschar name = ++ptr; previous = NULL; while ((cd->ctypes[++ptr] & ctype_letter) != 0) {}; if (ptr == ':') { errorcodeptr = ERR59; / Not supported / goto FAILED; } if (ptr != ')') { errorcodeptr = ERR60; goto FAILED; } namelen = ptr - name; for (i = 0; i < verbcount; i++) { if (namelen == verbs[i].len && strncmp((char )name, vn, namelen) == 0) { code = verbs[i].op; if (code++ == OP_ACCEPT) cd->had_accept = TRUE; break; } vn += verbs[i].len + 1; } if (i < verbcount) continue; errorcodeptr = ERR60; goto FAILED; } / Deal with the extended parentheses; all are introduced by '?', and the appearance of any of them means that this is not a capturing group. / else if (ptr == '?') { int i, set, unset, namelen; int optset; const uschar name; uschar slot; switch ((++ptr)) { case '#': /* Comment; skip to ket / ptr++; while (ptr != 0 && ptr != ')') ptr++; if (ptr == 0) { errorcodeptr = ERR18; goto FAILED; } continue; / ------------------------------------------------------------ / case '\|': / Reset capture count for each branch / reset_bracount = TRUE; / Fall through / / ------------------------------------------------------------ / case ':': / Non-capturing bracket / bravalue = OP_BRA; ptr++; break; / ------------------------------------------------------------ / case '(': bravalue = OP_COND; / Conditional group / / A condition can be an assertion, a number (referring to a numbered group), a name (referring to a named group), or 'R', referring to recursion. R and R&name are also permitted for recursion tests. There are several syntaxes for testing a named group: (?(name)) is used by Python; Perl 5.10 onwards uses (?() or (?('name')). There are two unfortunate ambiguities, caused by history. (a) 'R' can be the recursive thing or the name 'R' (and similarly for 'R' followed by digits), and (b) a number could be a name that consists of digits. 
In both cases, we look for a name first; if not found, we try the other cases. / / For conditions that are assertions, check the syntax, and then exit the switch. This will take control down to where bracketed groups, including assertions, are processed. / if (ptr[1] == '?' && (ptr[2] == '=' \|\| ptr[2] == '!' \|\| ptr[2] == '<')) break; / Most other conditions use OP_CREF (a couple change to OP_RREF below), and all need to skip 3 bytes at the start of the group. / code[1+LINK_SIZE] = OP_CREF; skipbytes = 3; refsign = -1; / Check for a test for recursion in a named group. / if (ptr[1] == 'R' && ptr[2] == '&') { terminator = -1; ptr += 2; code[1+LINK_SIZE] = OP_RREF; / Change the type of test / } / Check for a test for a named group's having been set, using the Perl syntax (?() or (?('name') / else if (ptr[1] == '<') { terminator = '>'; ptr++; } else if (ptr[1] == '\'') { terminator = '\''; ptr++; } else { terminator = 0; if (ptr[1] == '-' \|\| ptr[1] == '+') refsign = (++ptr); } /* We now expect to read a name; any thing else is an error / if ((cd->ctypes[ptr[1]] & ctype_word) == 0) { ptr += 1; / To get the right offset / errorcodeptr = ERR28; goto FAILED; } /* Read the name, but also get it as a number if it's all digits / recno = 0; name = ++ptr; while ((cd->ctypes[ptr] & ctype_word) != 0) { if (recno >= 0) recno = ((digitab[ptr] & ctype_digit) != 0)? recno 10 + ptr - '0' : -1; ptr++; } namelen = ptr - name; if ((terminator > 0 && ptr++ != terminator) \|\| ptr++ != ')') { ptr--; / Error offset / errorcodeptr = ERR26; goto FAILED; } /* Do no further checking in the pre-compile phase. / if (lengthptr != NULL) break; / In the real compile we do the work of looking for the actual reference. If the string started with "+" or "-" we require the rest to be digits, in which case recno will be set. / if (refsign > 0) { if (recno <= 0) { errorcodeptr = ERR58; goto FAILED; } recno = (refsign == '-')? 
cd->bracount - recno + 1 : recno +cd->bracount; if (recno <= 0 \|\| recno > cd->final_bracount) { errorcodeptr = ERR15; goto FAILED; } PUT2(code, 2+LINK_SIZE, recno); break; } / Otherwise (did not start with "+" or "-"), start by looking for the name. / slot = cd->name_table; for (i = 0; i < cd->names_found; i++) { if (strncmp((char )name, (char )slot+2, namelen) == 0) break; slot += cd->name_entry_size; } / Found a previous named subpattern / if (i < cd->names_found) { recno = GET2(slot, 0); PUT2(code, 2+LINK_SIZE, recno); } / Search the pattern for a forward reference / else if ((i = find_parens(ptr, cd, name, namelen, (options & PCRE_EXTENDED) != 0)) > 0) { PUT2(code, 2+LINK_SIZE, i); } / If terminator == 0 it means that the name followed directly after the opening parenthesis [e.g. (?(abc)...] and in this case there are some further alternatives to try. For the cases where terminator != 0 [things like (?(... or (?('name')... or (?(R&name)... ] we have now checked all the possibilities, so give an error. / else if (terminator != 0) { errorcodeptr = ERR15; goto FAILED; } /* Check for (?(R) for recursion. Allow digits after R to specify a specific group number. / else if (name == 'R') { recno = 0; for (i = 1; i < namelen; i++) { if ((digitab[name[i]] & ctype_digit) == 0) { errorcodeptr = ERR15; goto FAILED; } recno = recno 10 + name[i] - '0'; } if (recno == 0) recno = RREF_ANY; code[1+LINK_SIZE] = OP_RREF; /* Change test type / PUT2(code, 2+LINK_SIZE, recno); } / Similarly, check for the (?(DEFINE) "condition", which is always false. / else if (namelen == 6 && strncmp((char )name, "DEFINE", 6) == 0) { code[1+LINK_SIZE] = OP_DEF; skipbytes = 1; } /* Check for the "name" actually being a subpattern number. We are in the second pass here, so final_bracount is set. / else if (recno > 0 && recno <= cd->final_bracount) { PUT2(code, 2+LINK_SIZE, recno); } / Either an unidentified subpattern, or a reference to (?(0) / else { errorcodeptr = (recno == 0)? 
ERR35: ERR15; goto FAILED; } break; /* ------------------------------------------------------------ / case '=': / Positive lookahead / bravalue = OP_ASSERT; ptr++; break; / ------------------------------------------------------------ / case '!': / Negative lookahead / ptr++; if (ptr == ')') /* Optimize (?!) / { code++ = OP_FAIL; previous = NULL; continue; } bravalue = OP_ASSERT_NOT; break; /* ------------------------------------------------------------ / case '<': / Lookbehind or named define / switch (ptr[1]) { case '=': / Positive lookbehind / bravalue = OP_ASSERTBACK; ptr += 2; break; case '!': / Negative lookbehind / bravalue = OP_ASSERTBACK_NOT; ptr += 2; break; default: / Could be name define, else bad / if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME; ptr++; / Correct offset for error / errorcodeptr = ERR24; goto FAILED; } break; /* ------------------------------------------------------------ / case '>': / One-time brackets / bravalue = OP_ONCE; ptr++; break; / ------------------------------------------------------------ / case 'C': / Callout - may be followed by digits; / previous_callout = code; / Save for later completion / after_manual_callout = 1; / Skip one item before completing / code++ = OP_CALLOUT; { int n = 0; while ((digitab[(++ptr)] & ctype_digit) != 0) n = n 10 + ptr - '0'; if (ptr != ')') { errorcodeptr = ERR39; goto FAILED; } if (n > 255) { errorcodeptr = ERR38; goto FAILED; } code++ = n; PUT(code, 0, ptr - cd->start_pattern + 1); / Pattern offset / PUT(code, LINK_SIZE, 0); / Default length / code += 2 LINK_SIZE; } previous = NULL; continue; /* ------------------------------------------------------------ / case 'P': / Python-style named subpattern handling / if ((++ptr) == '=' \|\| ptr == '>') / Reference or recursion / { is_recurse = ptr == '>'; terminator = ')'; goto NAMED_REF_OR_RECURSE; } else if (ptr != '<') / Test for Python-style definition / { errorcodeptr = ERR41; goto FAILED; } /* Fall through to handle (?P< as (?< is 
handled / / ------------------------------------------------------------ / DEFINE_NAME: / Come here from (?< handling / case '\'': { terminator = (ptr == '<')? '>' : '\''; name = ++ptr; while ((cd->ctypes[ptr] & ctype_word) != 0) ptr++; namelen = ptr - name; / In the pre-compile phase, just do a syntax check. / if (lengthptr != NULL) { if (ptr != terminator) { errorcodeptr = ERR42; goto FAILED; } if (cd->names_found >= MAX_NAME_COUNT) { errorcodeptr = ERR49; goto FAILED; } if (namelen + 3 > cd->name_entry_size) { cd->name_entry_size = namelen + 3; if (namelen > MAX_NAME_SIZE) { errorcodeptr = ERR48; goto FAILED; } } } / In the real compile, create the entry in the table / else { slot = cd->name_table; for (i = 0; i < cd->names_found; i++) { int crc = memcmp(name, slot+2, namelen); if (crc == 0) { if (slot[2+namelen] == 0) { if ((options & PCRE_DUPNAMES) == 0) { errorcodeptr = ERR43; goto FAILED; } } else crc = -1; /* Current name is substring / } if (crc < 0) { memmove(slot + cd->name_entry_size, slot, (cd->names_found - i) cd->name_entry_size); break; } slot += cd->name_entry_size; } PUT2(slot, 0, cd->bracount + 1); memcpy(slot + 2, name, namelen); slot[2+namelen] = 0; } } /* In both cases, count the number of names we've encountered. / ptr++; / Move past > or ' / cd->names_found++; goto NUMBERED_GROUP; / ------------------------------------------------------------ / case '&': / Perl recursion/subroutine syntax / terminator = ')'; is_recurse = TRUE; / Fall through / / We come here from the Python syntax above that handles both references (?P=name) and recursion (?P>name), as well as falling through from the Perl recursion syntax (?&name). We also come here from the Perl \k or \k'name' back reference syntax and the \k{name} .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. 
/ NAMED_REF_OR_RECURSE: name = ++ptr; while ((cd->ctypes[ptr] & ctype_word) != 0) ptr++; namelen = ptr - name; /* In the pre-compile phase, do a syntax check and set a dummy reference number. / if (lengthptr != NULL) { if (namelen == 0) { errorcodeptr = ERR62; goto FAILED; } if (ptr != terminator) { errorcodeptr = ERR42; goto FAILED; } if (namelen > MAX_NAME_SIZE) { errorcodeptr = ERR48; goto FAILED; } recno = 0; } / In the real compile, seek the name in the table. We check the name first, and then check that we have reached the end of the name in the table. That way, if the name that is longer than any in the table, the comparison will fail without reading beyond the table entry. / else { slot = cd->name_table; for (i = 0; i < cd->names_found; i++) { if (strncmp((char )name, (char )slot+2, namelen) == 0 && slot[2+namelen] == 0) break; slot += cd->name_entry_size; } if (i < cd->names_found) / Back reference / { recno = GET2(slot, 0); } else if ((recno = / Forward back reference / find_parens(ptr, cd, name, namelen, (options & PCRE_EXTENDED) != 0)) <= 0) { errorcodeptr = ERR15; goto FAILED; } } /* In both phases, we can now go to the code than handles numerical recursion or backreferences. / if (is_recurse) goto HANDLE_RECURSION; else goto HANDLE_REFERENCE; / ------------------------------------------------------------ / case 'R': / Recursion / ptr++; / Same as (?0) / / Fall through / / ------------------------------------------------------------ / case '-': case '+': case '0': case '1': case '2': case '3': case '4': / Recursion or / case '5': case '6': case '7': case '8': case '9': / subroutine / { const uschar called; terminator = ')'; /* Come here from the \g<...> and \g'...' code (Oniguruma compatibility). However, the syntax has been checked to ensure that the ... are a (signed) number, so that neither ERR63 nor ERR29 will be called on this path, nor with the jump to OTHER_CHAR_AFTER_QUERY ever be taken. 
/ HANDLE_NUMERICAL_RECURSION: if ((refsign = ptr) == '+') { ptr++; if ((digitab[ptr] & ctype_digit) == 0) { errorcodeptr = ERR63; goto FAILED; } } else if (refsign == '-') { if ((digitab[ptr[1]] & ctype_digit) == 0) goto OTHER_CHAR_AFTER_QUERY; ptr++; } recno = 0; while((digitab[ptr] & ctype_digit) != 0) recno = recno 10 + ptr++ - '0'; if (ptr != terminator) { errorcodeptr = ERR29; goto FAILED; } if (refsign == '-') { if (recno == 0) { errorcodeptr = ERR58; goto FAILED; } recno = cd->bracount - recno + 1; if (recno <= 0) { errorcodeptr = ERR15; goto FAILED; } } else if (refsign == '+') { if (recno == 0) { errorcodeptr = ERR58; goto FAILED; } recno += cd->bracount; } /* Come here from code above that handles a named recursion / HANDLE_RECURSION: previous = code; called = cd->start_code; / When we are actually compiling, find the bracket that is being referenced. Temporarily end the regex in case it doesn't exist before this point. If we end up with a forward reference, first check that the bracket does occur later so we can give the error (and position) now. Then remember this forward reference in the workspace so it can be filled in at the end. / if (lengthptr == NULL) { code = OP_END; if (recno != 0) called = find_bracket(cd->start_code, utf8, recno); /* Forward reference / if (called == NULL) { if (find_parens(ptr, cd, NULL, recno, (options & PCRE_EXTENDED) != 0) < 0) { errorcodeptr = ERR15; goto FAILED; } called = cd->start_code + recno; PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code); } /* If not a forward reference, and the subpattern is still open, this is a recursive call. We check to see if this is a left recursion that could loop for ever, and diagnose that case. / else if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8)) { errorcodeptr = ERR40; goto FAILED; } } /* Insert the recursion/subroutine item, automatically wrapped inside "once" brackets. Set up a "previous group" length so that a subsequent quantifier will work. 
/ code = OP_ONCE; PUT(code, 1, 2 + 2LINK_SIZE); code += 1 + LINK_SIZE; code = OP_RECURSE; PUT(code, 1, called - cd->start_code); code += 1 + LINK_SIZE; code = OP_KET; PUT(code, 1, 2 + 2LINK_SIZE); code += 1 + LINK_SIZE; length_prevgroup = 3 + 3LINK_SIZE; } / Can't determine a first byte now / if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; continue; / ------------------------------------------------------------ / default: / Other characters: check option setting / OTHER_CHAR_AFTER_QUERY: set = unset = 0; optset = &set; while (ptr != ')' && ptr != ':') { switch (ptr++) { case '-': optset = &unset; break; case 'J': /* Record that it changed in the external options / optset \|= PCRE_DUPNAMES; cd->external_flags \|= PCRE_JCHANGED; break; case 'i': optset \|= PCRE_CASELESS; break; case 'm': optset \|= PCRE_MULTILINE; break; case 's': optset \|= PCRE_DOTALL; break; case 'x': optset \|= PCRE_EXTENDED; break; case 'U': optset \|= PCRE_UNGREEDY; break; case 'X': optset \|= PCRE_EXTRA; break; default: errorcodeptr = ERR12; ptr--; / Correct the offset / goto FAILED; } } / Set up the changed option bits, but don't change anything yet. / newoptions = (options \| set) & (~unset); / If the options ended with ')' this is not the start of a nested group with option changes, so the options change at this level. If this item is right at the start of the pattern, the options can be abstracted and made external in the pre-compile phase, and ignored in the compile phase. This can be helpful when matching -- for instance in caseless checking of required bytes. If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are definitely not at the start of the pattern because something has been compiled. In the pre-compile phase, however, the code pointer can have that value after the start, because it gets reset as code is discarded during the pre-compile. However, this can happen only at top level - if we are within parentheses, the starting BRA will still be present. 
At any parenthesis level, the length value can be used to test if anything has been compiled at that level. Thus, a test for both these conditions is necessary to ensure we correctly detect the start of the pattern in both phases. If we are not at the pattern start, compile code to change the ims options if this setting actually changes any of them, and reset the greedy defaults and the case value for firstbyte and reqbyte. / if (ptr == ')') { if (code == cd->start_code + 1 + LINK_SIZE && (lengthptr == NULL \|\| lengthptr == 2 + 2LINK_SIZE)) { cd->external_options = newoptions; } else { if ((options & PCRE_IMS) != (newoptions & PCRE_IMS)) { code++ = OP_OPT; code++ = newoptions & PCRE_IMS; } greedy_default = ((newoptions & PCRE_UNGREEDY) != 0); greedy_non_default = greedy_default ^ 1; req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; } /* Change options at this level, and pass them back for use in subsequent branches. When not at the start of the pattern, this information is also necessary so that a resetting item can be compiled at the end of a group (if we are in a group). / optionsptr = options = newoptions; previous = NULL; /* This item can't be repeated / continue; / It is complete / } / If the options ended with ':' we are heading into a nested group with possible change of options. Such groups are non-capturing and are not assertions of any kind. All we need to do is skip over the ':'; the newoptions value is handled below. / bravalue = OP_BRA; ptr++; } / End of switch for character following (? / } / End of (? handling / / Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become non-capturing and behave like (?:...) brackets. / else if ((options & PCRE_NO_AUTO_CAPTURE) != 0) { bravalue = OP_BRA; } / Else we have a capturing group. / else { NUMBERED_GROUP: cd->bracount += 1; PUT2(code, 1+LINK_SIZE, cd->bracount); skipbytes = 2; } / Process nested bracketed regex. 
Assertions may not be repeated, but other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a non-register variable in order to be able to pass its address because some compilers complain otherwise. Pass in a new setting for the ims options if they have changed. / previous = (bravalue >= OP_ONCE)? code : NULL; code = bravalue; tempcode = code; tempreqvary = cd->req_varyopt; /* Save value before bracket / length_prevgroup = 0; / Initialize for pre-compile phase / if (!compile_regex( newoptions, / The complete new option state / options & PCRE_IMS, / The previous ims option state / &tempcode, / Where to put code (updated) / &ptr, / Input pointer (updated) / errorcodeptr, / Where to put an error message / (bravalue == OP_ASSERTBACK \|\| bravalue == OP_ASSERTBACK_NOT), / TRUE if back assert / reset_bracount, / True if (?\| group / skipbytes, / Skip over bracket number / &subfirstbyte, / For possible first char / &subreqbyte, / For possible last char / bcptr, / Current branch chain / cd, / Tables block / (lengthptr == NULL)? NULL : / Actual compile phase / &length_prevgroup / Pre-compile phase / )) goto FAILED; / At the end of compiling, code is still pointing to the start of the group, while tempcode has been updated to point past the end of the group and any option resetting that may follow it. The pattern pointer (ptr) is on the bracket. / / If this is a conditional bracket, check that there are no more than two branches in the group, or just one if it's a DEFINE group. We do this in the real compile phase, not in the pre-pass, where the whole group may not be available. / if (bravalue == OP_COND && lengthptr == NULL) { uschar tc = code; int condcount = 0; do { condcount++; tc += GET(tc,1); } while (tc != OP_KET); / A DEFINE group is never obeyed inline (the "condition" is always false). It must have only one branch. 
/ if (code[LINK_SIZE+1] == OP_DEF) { if (condcount > 1) { errorcodeptr = ERR54; goto FAILED; } bravalue = OP_DEF; /* Just a flag to suppress char handling below / } / A "normal" conditional group. If there is just one branch, we must not make use of its firstbyte or reqbyte, because this is equivalent to an empty second branch. / else { if (condcount > 2) { errorcodeptr = ERR27; goto FAILED; } if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE; } } /* Error if hit end of pattern / if (ptr != ')') { errorcodeptr = ERR14; goto FAILED; } / In the pre-compile phase, update the length by the length of the group, less the brackets at either end. Then reduce the compiled code to just a set of non-capturing brackets so that it doesn't use much memory if it is duplicated by a quantifier./ if (lengthptr != NULL) { if (OFLOW_MAX - lengthptr < length_prevgroup - 2 - 2LINK_SIZE) { errorcodeptr = ERR20; goto FAILED; } lengthptr += length_prevgroup - 2 - 2LINK_SIZE; code++ = OP_BRA; PUTINC(code, 0, 1 + LINK_SIZE); code++ = OP_KET; PUTINC(code, 0, 1 + LINK_SIZE); break; /* No need to waste time with special character handling / } / Otherwise update the main code pointer to the end of the group. / code = tempcode; / For a DEFINE group, required and first character settings are not relevant. / if (bravalue == OP_DEF) break; / Handle updating of the required and first characters for other types of group. Update for normal brackets of all kinds, and conditions with two branches (see code above). If the bracket is followed by a quantifier with zero repeat, we have to back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the main loop so that they can be accessed for the back off. 
/ zeroreqbyte = reqbyte; zerofirstbyte = firstbyte; groupsetfirstbyte = FALSE; if (bravalue >= OP_ONCE) { / If we have not yet set a firstbyte in this branch, take it from the subpattern, remembering that it was set here so that a repeat of more than one can replicate it as reqbyte if necessary. If the subpattern has no firstbyte, set "none" for the whole branch. In both cases, a zero repeat forces firstbyte to "none". / if (firstbyte == REQ_UNSET) { if (subfirstbyte >= 0) { firstbyte = subfirstbyte; groupsetfirstbyte = TRUE; } else firstbyte = REQ_NONE; zerofirstbyte = REQ_NONE; } / If firstbyte was previously set, convert the subpattern's firstbyte into reqbyte if there wasn't one, using the vary flag that was in existence beforehand. / else if (subfirstbyte >= 0 && subreqbyte < 0) subreqbyte = subfirstbyte \| tempreqvary; / If the subpattern set a required byte (or set a first byte that isn't really the first byte - see above), set it. / if (subreqbyte >= 0) reqbyte = subreqbyte; } / For a forward assertion, we take the reqbyte, if set. This can be helpful if the pattern that follows the assertion doesn't set a different char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte for an assertion, however because it leads to incorrect effect for patterns such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead of a firstbyte. This is overcome by a scan at the end if there's no firstbyte, looking for an asserted first char. / else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte; break; / End of processing '(' / / ===================================================================/ / Handle metasequences introduced by \. For ones like \d, the ESC_ values are arranged to be the negation of the corresponding OP_values. For the back references, the values are ESC_REF plus the reference number. Only back references and those types that consume a character may be repeated. 
We can test for values between ESC_b and ESC_Z for the latter; this may have to change if any new ones are ever created. / case '\\': tempptr = ptr; c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE); if (errorcodeptr != 0) goto FAILED; if (c < 0) { if (-c == ESC_Q) /* Handle start of quoted string / { if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; / avoid empty string / else inescq = TRUE; continue; } if (-c == ESC_E) continue; / Perl ignores an orphan \E / / For metasequences that actually match a character, we disable the setting of a first character if it hasn't already been set. / if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z) firstbyte = REQ_NONE; / Set values to reset to if this is followed by a zero repeat. / zerofirstbyte = firstbyte; zeroreqbyte = reqbyte; / \g or \g'name' is a subroutine call by name and \g or \g'n' is a subroutine call by number (Oniguruma syntax). In fact, the value -ESC_g is returned only for these cases. So we don't need to check for < or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as that is a synonym for a named back reference). / if (-c == ESC_g) { const uschar p; save_hwm = cd->hwm; /* Normally this is set when '(' is read / terminator = ((++ptr) == '<')? '>' : '\''; /* These two statements stop the compiler for warning about possibly unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In fact, because we actually check for a number below, the paths that would actually be in error are never taken. 
/ skipbytes = 0; reset_bracount = FALSE; / Test for a name / if (ptr[1] != '+' && ptr[1] != '-') { BOOL isnumber = TRUE; for (p = ptr + 1; p != 0 && p != terminator; p++) { if ((cd->ctypes[p] & ctype_digit) == 0) isnumber = FALSE; if ((cd->ctypes[p] & ctype_word) == 0) break; } if (p != terminator) { errorcodeptr = ERR57; break; } if (isnumber) { ptr++; goto HANDLE_NUMERICAL_RECURSION; } is_recurse = TRUE; goto NAMED_REF_OR_RECURSE; } / Test a signed number in angle brackets or quotes. / p = ptr + 2; while ((digitab[p] & ctype_digit) != 0) p++; if (p != terminator) { errorcodeptr = ERR57; break; } ptr++; goto HANDLE_NUMERICAL_RECURSION; } /* \k or \k'name' is a back reference by name (Perl syntax). We also support \k{name} (.NET syntax) / if (-c == ESC_k && (ptr[1] == '<' \|\| ptr[1] == '\'' \|\| ptr[1] == '{')) { is_recurse = FALSE; terminator = ((++ptr) == '<')? '>' : (ptr == '\'')? '\'' : '}'; goto NAMED_REF_OR_RECURSE; } / Back references are handled specially; must disable firstbyte if not set to cope with cases like (?=(\w+))\1: which would otherwise set ':' later. / if (-c >= ESC_REF) { recno = -c - ESC_REF; HANDLE_REFERENCE: / Come here from named backref handling / if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; previous = code; code++ = OP_REF; PUT2INC(code, 0, recno); cd->backref_map \|= (recno < 32)? (1 << recno) : 1; if (recno > cd->top_backref) cd->top_backref = recno; } /* So are Unicode property matches, if supported. / #ifdef SUPPORT_UCP else if (-c == ESC_P \|\| -c == ESC_p) { BOOL negated; int pdata; int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); if (ptype < 0) goto FAILED; previous = code; code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP; code++ = ptype; code++ = pdata; } #else /* If Unicode properties are not supported, \X, \P, and \p are not allowed. 
/ else if (-c == ESC_X \|\| -c == ESC_P \|\| -c == ESC_p) { errorcodeptr = ERR45; goto FAILED; } #endif /* For the rest (including \X when Unicode properties are supported), we can obtain the OP value by negating the escape value. / else { previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; code++ = -c; } continue; } /* We have a data character whose value is in c. In UTF-8 mode it may have a value > 127. We set its representation in the length/buffer, and then handle it as a data character. / #ifdef SUPPORT_UTF8 if (utf8 && c > 127) mclength = _pcre_ord2utf8(c, mcbuffer); else #endif { mcbuffer[0] = c; mclength = 1; } goto ONE_CHAR; / ===================================================================/ / Handle a literal character. It is guaranteed not to be whitespace or # when the extended flag is set. If we are in UTF-8 mode, it may be a multi-byte literal character. / default: NORMAL_CHAR: mclength = 1; mcbuffer[0] = c; #ifdef SUPPORT_UTF8 if (utf8 && c >= 0xc0) { while ((ptr[1] & 0xc0) == 0x80) mcbuffer[mclength++] = (++ptr); } #endif /* At this point we have the character's bytes in mcbuffer, and the length in mclength. When not in UTF-8 mode, the length is always 1. / ONE_CHAR: previous = code; code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR; for (c = 0; c < mclength; c++) code++ = mcbuffer[c]; / Remember if \r or \n were seen / if (mcbuffer[0] == '\r' \|\| mcbuffer[0] == '\n') cd->external_flags \|= PCRE_HASCRORLF; / Set the first and required bytes appropriately. If no previous first byte, set it from this character, but revert to none on a zero repeat. Otherwise, leave the firstbyte value alone, and don't change it on a zero repeat. / if (firstbyte == REQ_UNSET) { zerofirstbyte = REQ_NONE; zeroreqbyte = reqbyte; / If the character is more than one byte long, we can set firstbyte only if it is not to be matched caselessly. 
/ if (mclength == 1 \|\| req_caseopt == 0) { firstbyte = mcbuffer[0] \| req_caseopt; if (mclength != 1) reqbyte = code[-1] \| cd->req_varyopt; } else firstbyte = reqbyte = REQ_NONE; } / firstbyte was previously set; we can set reqbyte only the length is 1 or the matching is caseful. / else { zerofirstbyte = firstbyte; zeroreqbyte = reqbyte; if (mclength == 1 \|\| req_caseopt == 0) reqbyte = code[-1] \| req_caseopt \| cd->req_varyopt; } break; / End of literal character handling / } } / end of big loop / / Control never reaches here by falling through, only by a goto for all the error states. Pass back the position in the pattern so that it can be displayed to the user for diagnosing the error. / FAILED: ptrptr = ptr; return FALSE; } /************************************************* * Compile sequence of alternatives * ************************************************/ / On entry, ptr is pointing past the bracket character, but on return it points to the closing bracket, or vertical bar, or end of string. The code variable is pointing at the byte into which the BRA operator has been stored. If the ims options are changed at the start (for a (?ims: group) or during any branch, we need to insert an OP_OPT item at the start of every following branch to ensure they get set correctly at run time, and also pass the new options into every subsequent branch compile. This function is used during the pre-compile phase when we are trying to find out the amount of memory needed, as well as during the real compile phase. The value of lengthptr distinguishes the two phases. 
Arguments: options option bits, including any changes for this subpattern oldims previous settings of ims option bits codeptr -> the address of the current code pointer ptrptr -> the address of the current pattern pointer errorcodeptr -> pointer to error code variable lookbehind TRUE if this is a lookbehind assertion reset_bracount TRUE to reset the count for each branch skipbytes skip this many bytes at start (for brackets and OP_COND) firstbyteptr place to put the first required character, or a negative number reqbyteptr place to put the last required character, or a negative number bcptr pointer to the chain of currently open branches cd points to the data block with tables pointers etc. lengthptr NULL during the real compile phase points to length accumulator during pre-compile phase Returns: TRUE on success / static BOOL compile_regex(int options, int oldims, uschar codeptr, const uschar ptrptr, int errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes, int firstbyteptr, int reqbyteptr, branch_chain bcptr, compile_data cd, int lengthptr) { const uschar ptr = ptrptr; uschar code = codeptr; uschar last_branch = code; uschar start_bracket = code; uschar reverse_count = NULL; int firstbyte, reqbyte; int branchfirstbyte, branchreqbyte; int length; int orig_bracount; int max_bracount; branch_chain bc; bc.outer = bcptr; bc.current = code; firstbyte = reqbyte = REQ_UNSET; /* Accumulate the length for use in the pre-compile phase. Start with the length of the BRA and KET and any extra bytes that are required at the beginning. We accumulate in a local variable to save frequent testing of lenthptr for NULL. We cannot do this by looking at the value of code at the start and end of each alternative, because compiled items are discarded during the pre-compile phase so that the work space is not exceeded. 
/ length = 2 + 2LINK_SIZE + skipbytes; /* WARNING: If the above line is changed for any reason, you must also change the code that abstracts option settings at the start of the pattern and makes them global. It tests the value of length for (2 + 2LINK_SIZE) in the pre-compile phase to find out whether anything has yet been compiled or not. / /* Offset is set zero to mark that this bracket is still open / PUT(code, 1, 0); code += 1 + LINK_SIZE + skipbytes; / Loop for each alternative branch / orig_bracount = max_bracount = cd->bracount; for (;;) { / For a (?\| group, reset the capturing bracket count so that each branch uses the same numbers. / if (reset_bracount) cd->bracount = orig_bracount; / Handle a change of ims options at the start of the branch / if ((options & PCRE_IMS) != oldims) { code++ = OP_OPT; code++ = options & PCRE_IMS; length += 2; } / Set up dummy OP_REVERSE if lookbehind assertion / if (lookbehind) { code++ = OP_REVERSE; reverse_count = code; PUTINC(code, 0, 0); length += 1 + LINK_SIZE; } /* Now compile the branch; in the pre-compile phase its length gets added into the length. / if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte, &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length)) { ptrptr = ptr; return FALSE; } /* Keep the highest bracket count in case (?\| was used and some branch has fewer than the rest. / if (cd->bracount > max_bracount) max_bracount = cd->bracount; / In the real compile phase, there is some post-processing to be done. / if (lengthptr == NULL) { / If this is the first branch, the firstbyte and reqbyte values for the branch become the values for the regex. / if (last_branch != OP_ALT) { firstbyte = branchfirstbyte; reqbyte = branchreqbyte; } /* If this is not the first branch, the first char and reqbyte have to match the values from all the previous branches, except that if the previous value for reqbyte didn't have REQ_VARY set, it can still match, and we set REQ_VARY for the regex. 
/ else { / If we previously had a firstbyte, but it doesn't match the new branch, we have to abandon the firstbyte for the regex, but if there was previously no reqbyte, it takes on the value of the old firstbyte. / if (firstbyte >= 0 && firstbyte != branchfirstbyte) { if (reqbyte < 0) reqbyte = firstbyte; firstbyte = REQ_NONE; } / If we (now or from before) have no firstbyte, a firstbyte from the branch becomes a reqbyte if there isn't a branch reqbyte. / if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0) branchreqbyte = branchfirstbyte; / Now ensure that the reqbytes match / if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY)) reqbyte = REQ_NONE; else reqbyte \|= branchreqbyte; / To "or" REQ_VARY / } / If lookbehind, check that this branch matches a fixed-length string, and put the length into the OP_REVERSE item. Temporarily mark the end of the branch with OP_END. / if (lookbehind) { int fixed_length; code = OP_END; fixed_length = find_fixedlength(last_branch, options); DPRINTF(("fixed length = %d\n", fixed_length)); if (fixed_length < 0) { errorcodeptr = (fixed_length == -2)? ERR36 : ERR25; ptrptr = ptr; return FALSE; } PUT(reverse_count, 0, fixed_length); } } /* Reached end of expression, either ')' or end of pattern. In the real compile phase, go back through the alternative branches and reverse the chain of offsets, with the field in the BRA item now becoming an offset to the first alternative. If there are no alternatives, it points to the end of the group. The length in the terminating ket is always the length of the whole bracketed item. If any of the ims options were changed inside the group, compile a resetting op-code following, except at the very end of the pattern. Return leaving the pointer at the terminating char. 
/ if (ptr != '\|') { if (lengthptr == NULL) { int branch_length = code - last_branch; do { int prev_length = GET(last_branch, 1); PUT(last_branch, 1, branch_length); branch_length = prev_length; last_branch -= branch_length; } while (branch_length > 0); } /* Fill in the ket / code = OP_KET; PUT(code, 1, code - start_bracket); code += 1 + LINK_SIZE; /* Resetting option if needed / if ((options & PCRE_IMS) != oldims && ptr == ')') { code++ = OP_OPT; code++ = oldims; length += 2; } /* Retain the highest bracket number, in case resetting was used. / cd->bracount = max_bracount; / Set values to pass back / codeptr = code; ptrptr = ptr; firstbyteptr = firstbyte; reqbyteptr = reqbyte; if (lengthptr != NULL) { if (OFLOW_MAX - lengthptr < length) { errorcodeptr = ERR20; return FALSE; } lengthptr += length; } return TRUE; } /* Another branch follows. In the pre-compile phase, we can move the code pointer back to where it was for the start of the first branch. (That is, pretend that each branch is the only one.) In the real compile phase, insert an ALT node. Its length field points back to the previous branch while the bracket remains open. At the end the chain is reversed. It's done like this so that the start of the bracket has a zero offset until it is closed, making it possible to detect recursion. / if (lengthptr != NULL) { code = codeptr + 1 + LINK_SIZE + skipbytes; length += 1 + LINK_SIZE; } else { code = OP_ALT; PUT(code, 1, code - last_branch); bc.current = last_branch = code; code += 1 + LINK_SIZE; } ptr++; } / Control never reaches here / } /************************************************ * Check for anchored expression * ************************************************/ / Try to find out if this is an anchored regular expression. Consider each alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then it's anchored. 
However, if this is a multiline pattern, then only OP_SOD counts, since OP_CIRC can match in the middle. We can also consider a regex to be anchored if OP_SOM starts all its branches. This is the code for \G, which means "match at start of match position, taking into account the match offset". A branch is also implicitly anchored if it starts with .* and DOTALL is set, because that will try the rest of the pattern at all possible matching points, so there is no point trying again.... er .... .... except when the .* appears inside capturing parentheses, and there is a subsequent back reference to those parentheses. We haven't enough information to catch that case precisely. At first, the best we could do was to detect when .* was in capturing brackets and the highest back reference was greater than or equal to that level. However, by keeping a bitmap of the first 31 back references, we can catch some of the more common cases more precisely. Arguments: code points to start of expression (the bracket) options points to the options setting bracket_map a bitmap of which brackets we are inside while testing; this handles up to substring 31; after that we just have to take the less precise approach backref_map the back reference bitmap Returns: TRUE or FALSE / static BOOL is_anchored(register const uschar code, int options, unsigned int bracket_map, unsigned int backref_map) { do { const uschar scode = first_significant_code(code + _pcre_OP_lengths[code], options, PCRE_MULTILINE, FALSE); register int op = scode; /* Non-capturing brackets / if (op == OP_BRA) { if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; } / Capturing brackets / else if (op == OP_CBRA) { int n = GET2(scode, 1+LINK_SIZE); int new_map = bracket_map \| ((n < 32)? 
(1 << n) : 1); if (!is_anchored(scode, options, new_map, backref_map)) return FALSE; } / Other brackets / else if (op == OP_ASSERT \|\| op == OP_ONCE \|\| op == OP_COND) { if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; } / .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and it isn't in brackets that are or may be referenced. / else if ((op == OP_TYPESTAR \|\| op == OP_TYPEMINSTAR \|\| op == OP_TYPEPOSSTAR)) { if (scode[1] != OP_ALLANY \|\| (bracket_map & backref_map) != 0) return FALSE; } / Check for explicit anchoring / else if (op != OP_SOD && op != OP_SOM && ((options & PCRE_MULTILINE) != 0 \|\| op != OP_CIRC)) return FALSE; code += GET(code, 1); } while (code == OP_ALT); / Loop for each alternative / return TRUE; } /************************************************ * Check for starting with ^ or .* * ************************************************/ / This is called to find out if every branch starts with ^ or .* so that "first char" processing can be done to speed things up in multiline matching and for non-DOTALL patterns that start with .* (which must start at the beginning or after \n). As in the case of is_anchored() (see above), we have to take account of back references to capturing brackets that contain .* because in that case we can't make the assumption. 
Arguments: code points to start of expression (the bracket) bracket_map a bitmap of which brackets we are inside while testing; this handles up to substring 31; after that we just have to take the less precise approach backref_map the back reference bitmap Returns: TRUE or FALSE / static BOOL is_startline(const uschar code, unsigned int bracket_map, unsigned int backref_map) { do { const uschar scode = first_significant_code(code + _pcre_OP_lengths[code], NULL, 0, FALSE); register int op = scode; / Non-capturing brackets / if (op == OP_BRA) { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; } / Capturing brackets / else if (op == OP_CBRA) { int n = GET2(scode, 1+LINK_SIZE); int new_map = bracket_map \| ((n < 32)? (1 << n) : 1); if (!is_startline(scode, new_map, backref_map)) return FALSE; } / Other brackets / else if (op == OP_ASSERT \|\| op == OP_ONCE \|\| op == OP_COND) { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; } / .* means "start at start or after \n" if it isn't in brackets that may be referenced. / else if (op == OP_TYPESTAR \|\| op == OP_TYPEMINSTAR \|\| op == OP_TYPEPOSSTAR) { if (scode[1] != OP_ANY \|\| (bracket_map & backref_map) != 0) return FALSE; } / Check for explicit circumflex / else if (op != OP_CIRC) return FALSE; / Move on to the next alternative / code += GET(code, 1); } while (code == OP_ALT); /* Loop for each alternative / return TRUE; } /************************************************ * Check for asserted fixed first char * ************************************************/ / During compilation, the "first char" settings from forward assertions are discarded, because they can cause conflicts with actual literals that follow. However, if we end up without a first char setting for an unanchored pattern, it is worth scanning the regex to see if there is an initial asserted first char. 
If all branches start with the same asserted char, or with a bracket all of whose alternatives start with the same asserted char (recurse ad lib), then we return that char, otherwise -1. Arguments: code points to start of expression (the bracket) options pointer to the options (used to check casing changes) inassert TRUE if in an assertion Returns: -1 or the fixed first char / static int find_firstassertedchar(const uschar code, int options, BOOL inassert) { register int c = -1; do { int d; const uschar scode = first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE); register int op = scode; switch(op) { default: return -1; case OP_BRA: case OP_CBRA: case OP_ASSERT: case OP_ONCE: case OP_COND: if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0) return -1; if (c < 0) c = d; else if (c != d) return -1; break; case OP_EXACT: / Fall through / scode += 2; case OP_CHAR: case OP_CHARNC: case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: if (!inassert) return -1; if (c < 0) { c = scode[1]; if ((options & PCRE_CASELESS) != 0) c \|= REQ_CASELESS; } else if (c != scode[1]) return -1; break; } code += GET(code, 1); } while (code == OP_ALT); return c; } /************************************************ * Compile a Regular Expression * ************************************************/ / This function takes a string and returns a pointer to a block of store holding a compiled version of the expression. The original API for this function had no error code return variable; it is retained for backwards compatibility. The new function is given a new name. 
Arguments: pattern the regular expression options various option bits errorcodeptr pointer to error code variable (pcre_compile2() only) can be NULL if you don't want a code value errorptr pointer to pointer to error text erroroffset ptr offset in pattern where error was detected tables pointer to character tables or NULL Returns: pointer to compiled data block, or NULL on error, with errorptr and erroroffset set / PCRE_EXP_DEFN pcre PCRE_CALL_CONVENTION pcre_compile(const char pattern, int options, const char errorptr, int erroroffset, const unsigned char tables) { return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables); } PCRE_EXP_DEFN pcre PCRE_CALL_CONVENTION pcre_compile2(const char pattern, int options, int errorcodeptr, const char *errorptr, int erroroffset, const unsigned char tables) { real_pcre re; int length = 1; /* For final END opcode / int firstbyte, reqbyte, newline; int errorcode = 0; int skipatstart = 0; #ifdef SUPPORT_UTF8 BOOL utf8; #endif size_t size; uschar code; const uschar codestart; const uschar ptr; compile_data compile_block; compile_data cd = &compile_block; / This space is used for "compiling" into during the first phase, when we are computing the amount of memory that is needed. Compiled items are thrown away as soon as possible, so that a fairly large buffer should be sufficient for this purpose. The same space is used in the second phase for remembering where to fill in forward references to subpatterns. / uschar cworkspace[COMPILE_WORK_SIZE]; / Set this early so that early errors get offset 0. / ptr = (const uschar )pattern; /* We can't pass back an error message if errorptr is NULL; I guess the best we can do is just return NULL, but we can set a code value if there is a code pointer. 
/ if (errorptr == NULL) { if (errorcodeptr != NULL) errorcodeptr = 99; return NULL; } errorptr = NULL; if (errorcodeptr != NULL) errorcodeptr = ERR0; /* However, we can give a message for this error / if (erroroffset == NULL) { errorcode = ERR16; goto PCRE_EARLY_ERROR_RETURN2; } erroroffset = 0; /* Can't support UTF8 unless PCRE has been compiled to include the code. / #ifdef SUPPORT_UTF8 utf8 = (options & PCRE_UTF8) != 0; if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 && (erroroffset = _pcre_valid_utf8((uschar )pattern, -1)) >= 0) { errorcode = ERR44; goto PCRE_EARLY_ERROR_RETURN2; } #else if ((options & PCRE_UTF8) != 0) { errorcode = ERR32; goto PCRE_EARLY_ERROR_RETURN; } #endif if ((options & ~PUBLIC_OPTIONS) != 0) { errorcode = ERR17; goto PCRE_EARLY_ERROR_RETURN; } / Set up pointers to the individual character tables / if (tables == NULL) tables = _pcre_default_tables; cd->lcc = tables + lcc_offset; cd->fcc = tables + fcc_offset; cd->cbits = tables + cbits_offset; cd->ctypes = tables + ctypes_offset; / Check for global one-time settings at the start of the pattern, and remember the offset for later. 
/ while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '') { int newnl = 0; int newbsr = 0; if (strncmp((char )(ptr+skipatstart+2), "CR)", 3) == 0) { skipatstart += 5; newnl = PCRE_NEWLINE_CR; } else if (strncmp((char )(ptr+skipatstart+2), "LF)", 3) == 0) { skipatstart += 5; newnl = PCRE_NEWLINE_LF; } else if (strncmp((char )(ptr+skipatstart+2), "CRLF)", 5) == 0) { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; } else if (strncmp((char )(ptr+skipatstart+2), "ANY)", 4) == 0) { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; } else if (strncmp((char )(ptr+skipatstart+2), "ANYCRLF)", 8) == 0) { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; } else if (strncmp((char )(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0) { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; } else if (strncmp((char )(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0) { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; } if (newnl != 0) options = (options & ~PCRE_NEWLINE_BITS) \| newnl; else if (newbsr != 0) options = (options & ~(PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE)) \| newbsr; else break; } / Check validity of \R options. / switch (options & (PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE)) { case 0: case PCRE_BSR_ANYCRLF: case PCRE_BSR_UNICODE: break; default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN; } / Handle different types of newline. The three bits give seven cases. The current code allows for fixed one- or two-byte sequences, plus "any" and "anycrlf". 
/ switch (options & PCRE_NEWLINE_BITS) { case 0: newline = NEWLINE; break; / Build-time default / case PCRE_NEWLINE_CR: newline = '\r'; break; case PCRE_NEWLINE_LF: newline = '\n'; break; case PCRE_NEWLINE_CR+ PCRE_NEWLINE_LF: newline = ('\r' << 8) \| '\n'; break; case PCRE_NEWLINE_ANY: newline = -1; break; case PCRE_NEWLINE_ANYCRLF: newline = -2; break; default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN; } if (newline == -2) { cd->nltype = NLTYPE_ANYCRLF; } else if (newline < 0) { cd->nltype = NLTYPE_ANY; } else { cd->nltype = NLTYPE_FIXED; if (newline > 255) { cd->nllen = 2; cd->nl[0] = (newline >> 8) & 255; cd->nl[1] = newline & 255; } else { cd->nllen = 1; cd->nl[0] = newline; } } / Maximum back reference and backref bitmap. The bitmap records up to 31 back references to help in deciding whether (.) can be treated as anchored or not. / cd->top_backref = 0; cd->backref_map = 0; /* Reflect pattern for debugging output / DPRINTF(("------------------------------------------------------------------\n")); DPRINTF(("%s\n", pattern)); / Pretend to compile the pattern while actually just accumulating the length of memory required. This behaviour is triggered by passing a non-NULL final argument to compile_regex(). We pass a block of workspace (cworkspace) for it to compile parts of the pattern into; the compiled code is discarded when it is no longer needed, so hopefully this workspace will never overflow, though there is a test for its doing so. / cd->bracount = cd->final_bracount = 0; cd->names_found = 0; cd->name_entry_size = 0; cd->name_table = NULL; cd->start_workspace = cworkspace; cd->start_code = cworkspace; cd->hwm = cworkspace; cd->start_pattern = (const uschar )pattern; cd->end_pattern = (const uschar )(pattern + strlen(pattern)); cd->req_varyopt = 0; cd->external_options = options; cd->external_flags = 0; / Now do the pre-compile. On error, errorcode will be set non-zero, so we don't need to look at the result of the function here. 
The initial options have been put into the cd block so that they can be changed if an option setting is found within the regex right at the beginning. Bringing initial option settings outside can help speed up starting point checks. / ptr += skipatstart; code = cworkspace; code = OP_BRA; (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS, &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length); if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN; DPRINTF(("end pre-compile: length=%d workspace=%d\n", length, cd->hwm - cworkspace)); if (length > MAX_PATTERN_SIZE) { errorcode = ERR20; goto PCRE_EARLY_ERROR_RETURN; } /* Compute the size of data block needed and get it, either from malloc or externally provided function. Integer overflow should no longer be possible because nowadays we limit the maximum value of cd->names_found and cd->name_entry_size. / size = length + sizeof(real_pcre) + cd->names_found (cd->name_entry_size + 3); re = (real_pcre )(pcre_malloc)(size); if (re == NULL) { errorcode = ERR21; goto PCRE_EARLY_ERROR_RETURN; } / Put in the magic number, and save the sizes, initial options, internal flags, and character table pointer. NULL is used for the default character tables. The nullpad field is at the end; it's there to help in the case when a regex compiled on a system with 4-byte pointers is run on another with 8-byte pointers. / re->magic_number = MAGIC_NUMBER; re->size = size; re->options = cd->external_options; re->flags = cd->external_flags; re->dummy1 = 0; re->first_byte = 0; re->req_byte = 0; re->name_table_offset = sizeof(real_pcre); re->name_entry_size = cd->name_entry_size; re->name_count = cd->names_found; re->ref_count = 0; re->tables = (tables == _pcre_default_tables)? NULL : tables; re->nullpad = NULL; / The starting points of the name/number translation table and of the code are passed around in the compile data block. 
The start/end pattern and initial options are already set from the pre-compile phase, as is the name_entry_size field. Reset the bracket count and the names_found field. Also reset the hwm field; this time it's used for remembering forward references to subpatterns. / cd->final_bracount = cd->bracount; / Save for checking forward references / cd->bracount = 0; cd->names_found = 0; cd->name_table = (uschar )re + re->name_table_offset; codestart = cd->name_table + re->name_entry_size * re->name_count; cd->start_code = codestart; cd->hwm = cworkspace; cd->req_varyopt = 0; cd->had_accept = FALSE; /* Set up a starting, non-extracting bracket, then compile the expression. On error, errorcode will be set non-zero, so we don't need to look at the result of the function here. / ptr = (const uschar )pattern + skipatstart; code = (uschar )codestart; code = OP_BRA; (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL); re->top_bracket = cd->bracount; re->top_backref = cd->top_backref; re->flags = cd->external_flags; if (cd->had_accept) reqbyte = -1; /* Must disable after (ACCEPT) / /* If not reached end of pattern on success, there's an excess bracket. / if (errorcode == 0 && ptr != 0) errorcode = ERR22; /* Fill in the terminating state and check for disastrous overflow, but if debugging, leave the test till after things are printed out. / code++ = OP_END; #ifndef DEBUG if (code - codestart > length) errorcode = ERR23; #endif /* Fill in any forward references that are required. 
/ while (errorcode == 0 && cd->hwm > cworkspace) { int offset, recno; const uschar groupptr; cd->hwm -= LINK_SIZE; offset = GET(cd->hwm, 0); recno = GET(codestart, offset); groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno); if (groupptr == NULL) errorcode = ERR53; else PUT(((uschar )codestart), offset, groupptr - codestart); } / Give an error if there's back reference to a non-existent capturing subpattern. / if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15; / Failed to compile, or error while post-processing / if (errorcode != 0) { (pcre_free)(re); PCRE_EARLY_ERROR_RETURN: erroroffset = ptr - (const uschar )pattern; PCRE_EARLY_ERROR_RETURN2: errorptr = find_error_text(errorcode); if (errorcodeptr != NULL) errorcodeptr = errorcode; return NULL; } / If the anchored option was not passed, set the flag if we can determine that the pattern is anchored by virtue of ^ characters or \A or anything else (such as starting with .* when DOTALL is set). Otherwise, if we know what the first byte has to be, save it, because that speeds up unanchored matches no end. If not, see if we can set the PCRE_STARTLINE flag. This is helpful for multiline matches when all branches start with ^. and also when all branches start with .* for non-DOTALL matches. / if ((re->options & PCRE_ANCHORED) == 0) { int temp_options = re->options; / May get changed during these scans / if (is_anchored(codestart, &temp_options, 0, cd->backref_map)) re->options \|= PCRE_ANCHORED; else { if (firstbyte < 0) firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE); if (firstbyte >= 0) / Remove caseless flag for non-caseable chars / { int ch = firstbyte & 255; re->first_byte = ((firstbyte & REQ_CASELESS) != 0 && cd->fcc[ch] == ch)? 
ch : firstbyte; re->flags \|= PCRE_FIRSTSET; } else if (is_startline(codestart, 0, cd->backref_map)) re->flags \|= PCRE_STARTLINE; } } / For an anchored pattern, we use the "required byte" only if it follows a variable length item in the regex. Remove the caseless flag for non-caseable bytes. / if (reqbyte >= 0 && ((re->options & PCRE_ANCHORED) == 0 \|\| (reqbyte & REQ_VARY) != 0)) { int ch = reqbyte & 255; re->req_byte = ((reqbyte & REQ_CASELESS) != 0 && cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte; re->flags \|= PCRE_REQCHSET; } / Print out the compiled data if debugging is enabled. This is never the case when building a production library. / #ifdef DEBUG printf("Length = %d top_bracket = %d top_backref = %d\n", length, re->top_bracket, re->top_backref); printf("Options=%08x\n", re->options); if ((re->flags & PCRE_FIRSTSET) != 0) { int ch = re->first_byte & 255; const char caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)"; if (isprint(ch)) printf("First char = %c%s\n", ch, caseless); else printf("First char = \\x%02x%s\n", ch, caseless); } if ((re->flags & PCRE_REQCHSET) != 0) { int ch = re->req_byte & 255; const char caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)"; if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless); else printf("Req char = \\x%02x%s\n", ch, caseless); } pcre_printint(re, stdout, TRUE); / This check is done here in the debugging case so that the code that was compiled can be seen. / if (code - codestart > length) { (pcre_free)(re); errorptr = find_error_text(ERR23); erroroffset = ptr - (uschar )pattern; if (errorcodeptr != NULL) errorcodeptr = ERR23; return NULL; } #endif / DEBUG / return (pcre )re; }	pcrecomp.c	2403
pcreconf.c
Type	Function	Source	Line
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION	pcre_config(int what, void *where) PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_config(int what, void *where) { switch (what) { case PCRE_CONFIG_UTF8: #ifdef SUPPORT_UTF8 *((int *)where) = 1; #else *((int *)where) = 0; #endif break; case PCRE_CONFIG_UNICODE_PROPERTIES: #ifdef SUPPORT_UCP *((int *)where) = 1; #else *((int *)where) = 0; #endif break; case PCRE_CONFIG_NEWLINE: *((int *)where) = NEWLINE; break; case PCRE_CONFIG_BSR: #ifdef BSR_ANYCRLF *((int *)where) = 1; #else *((int *)where) = 0; #endif break; case PCRE_CONFIG_LINK_SIZE: *((int *)where) = LINK_SIZE; break; case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD: *((int *)where) = POSIX_MALLOC_THRESHOLD; break; case PCRE_CONFIG_MATCH_LIMIT: *((unsigned int *)where) = MATCH_LIMIT; break; case PCRE_CONFIG_MATCH_LIMIT_RECURSION: *((unsigned int *)where) = MATCH_LIMIT_RECURSION; break; case PCRE_CONFIG_STACKRECURSE: #ifdef NO_RECURSE *((int *)where) = 0; #else *((int *)where) = 1; #endif break; default: return PCRE_ERROR_BADOPTION; } return 0; }	pcreconf.c	65
pcredfa.c
Type	Function	Source	Line
STATIC VOID	pchars(unsigned char *p, int length, FILE *f) static void pchars(unsigned char *p, int length, FILE *f) { int c; while (length-- > 0) { if (isprint(c = *(p++))) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c); } } #endif /************************************************ * Execute a Regular Expression - DFA engine * ************************************************/ /* This internal function applies a compiled pattern to a subject string, starting at a given point, using a DFA engine. This function is called from the external one, possibly multiple times if the pattern is not anchored. The function calls itself recursively for some kinds of subpattern. Arguments: md the match_data block with fixed information this_start_code the opening bracket of this subexpression's code current_subject where we currently are in the subject string start_offset start offset in the subject string offsets vector to contain the matching string offsets offsetcount size of same workspace vector of workspace wscount size of same ims the current ims flags rlevel function call recursion level recursing regex recursive call level Returns: > 0 => number of match offset pairs placed in offsets = 0 => offsets overflowed; longest matches are present -1 => failed to match < -1 => some kind of unexpected problem The following macros are used for adding states to the two state vectors (one for the current character, one for the following character).
*/ #define ADD_ACTIVE(x,y) \ if (active_count++ < wscount) \ { \ next_active_state->offset = (x); \ next_active_state->count = (y); \ next_active_state->ims = ims; \ next_active_state++; \ DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \ } \ else return PCRE_ERROR_DFA_WSSIZE #define ADD_ACTIVE_DATA(x,y,z) \ if (active_count++ < wscount) \ { \ next_active_state->offset = (x); \ next_active_state->count = (y); \ next_active_state->ims = ims; \ next_active_state->data = (z); \ next_active_state++; \ DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \ } \ else return PCRE_ERROR_DFA_WSSIZE #define ADD_NEW(x,y) \ if (new_count++ < wscount) \ { \ next_new_state->offset = (x); \ next_new_state->count = (y); \ next_new_state->ims = ims; \ next_new_state++; \ DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \ } \ else return PCRE_ERROR_DFA_WSSIZE #define ADD_NEW_DATA(x,y,z) \ if (new_count++ < wscount) \ { \ next_new_state->offset = (x); \ next_new_state->count = (y); \ next_new_state->ims = ims; \ next_new_state->data = (z); \ next_new_state++; \ DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \ } \ else return PCRE_ERROR_DFA_WSSIZE	pcredfa.c	188
ELSE RETURN PCRE_ERROR_DFA_WSSIZE STATIC INT	internal_dfa_exec( dfa_match_data md, const uschar this_start_code, const uschar current_subject, int start_offset, int offsets, int offsetcount, int workspace, int wscount, int ims, int rlevel, int recursing) static int internal_dfa_exec( dfa_match_data md, const uschar this_start_code, const uschar current_subject, int start_offset, int offsets, int offsetcount, int workspace, int wscount, int ims, int rlevel, int recursing) { stateblock active_states, new_states, temp_states; stateblock next_active_state, next_new_state; const uschar ctypes, lcc, fcc; const uschar ptr; const uschar end_code, first_op; int active_count, new_count, match_count; / Some fields in the md block are frequently referenced, so we load them into independent variables in the hope that this will perform better. / const uschar start_subject = md->start_subject; const uschar end_subject = md->end_subject; const uschar start_code = md->start_code; #ifdef SUPPORT_UTF8 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0; #else BOOL utf8 = FALSE; #endif rlevel++; offsetcount &= (-2); wscount -= 2; wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) / (2 * INTS_PER_STATEBLOCK); DPRINTF(("\n%.s---------------------\n" "%.sCall to internal_dfa_exec f=%d r=%d\n", rlevel2-2, SP, rlevel2-2, SP, rlevel, recursing)); ctypes = md->tables + ctypes_offset; lcc = md->tables + lcc_offset; fcc = md->tables + fcc_offset; match_count = PCRE_ERROR_NOMATCH; /* A negative number / active_states = (stateblock )(workspace + 2); next_new_state = new_states = active_states + wscount; new_count = 0; first_op = this_start_code + 1 + LINK_SIZE + ((this_start_code == OP_CBRA \|\| this_start_code == OP_SCBRA)? 2:0); /* The first thing in any (sub) pattern is a bracket of some sort. Push all the alternative states onto the list, and find out where the end is. 
This makes is possible to use this function recursively, when we want to stop at a matching internal ket rather than at the end. If the first opcode in the first alternative is OP_REVERSE, we are dealing with a backward assertion. In that case, we have to find out the maximum amount to move back, and set up each alternative appropriately. / if (first_op == OP_REVERSE) { int max_back = 0; int gone_back; end_code = this_start_code; do { int back = GET(end_code, 2+LINK_SIZE); if (back > max_back) max_back = back; end_code += GET(end_code, 1); } while (end_code == OP_ALT); / If we can't go back the amount required for the longest lookbehind pattern, go back as far as we can; some alternatives may still be viable. / #ifdef SUPPORT_UTF8 / In character mode we have to step back character by character / if (utf8) { for (gone_back = 0; gone_back < max_back; gone_back++) { if (current_subject <= start_subject) break; current_subject--; while (current_subject > start_subject && (current_subject & 0xc0) == 0x80) current_subject--; } } else #endif /* In byte-mode we can do this quickly. / { gone_back = (current_subject - max_back < start_subject)? current_subject - start_subject : max_back; current_subject -= gone_back; } / Now we can process the individual branches. / end_code = this_start_code; do { int back = GET(end_code, 2+LINK_SIZE); if (back <= gone_back) { int bstate = end_code - start_code + 2 + 2LINK_SIZE; ADD_NEW_DATA(-bstate, 0, gone_back - back); } end_code += GET(end_code, 1); } while (end_code == OP_ALT); } / This is the code for a "normal" subpattern (not a backward assertion). The start of a whole pattern is always one of these. If we are at the top level, we may be asked to restart matching from the same point that we reached for a previous partial match. We still have to scan through the top-level branches to find the end state. 
/ else { end_code = this_start_code; / Restarting / if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0) { do { end_code += GET(end_code, 1); } while (end_code == OP_ALT); new_count = workspace[1]; if (!workspace[0]) memcpy(new_states, active_states, new_count * sizeof(stateblock)); } /* Not restarting / else { int length = 1 + LINK_SIZE + ((this_start_code == OP_CBRA \|\| this_start_code == OP_SCBRA)? 2:0); do { ADD_NEW(end_code - start_code + length, 0); end_code += GET(end_code, 1); length = 1 + LINK_SIZE; } while (end_code == OP_ALT); } } workspace[0] = 0; /* Bit indicating which vector is current / DPRINTF(("%.sEnd state = %d\n", rlevel2-2, SP, end_code - start_code)); / Loop for scanning the subject / ptr = current_subject; for (;;) { int i, j; int clen, dlen; unsigned int c, d; / Make the new state list into the active state list and empty the new state list. / temp_states = active_states; active_states = new_states; new_states = temp_states; active_count = new_count; new_count = 0; workspace[0] ^= 1; / Remember for the restarting feature / workspace[1] = active_count; #ifdef DEBUG printf("%.sNext character: rest of subject = \"", rlevel2-2, SP); pchars((uschar )ptr, strlen((char )ptr), stdout); printf("\"\n"); printf("%.sActive states: ", rlevel2-2, SP); for (i = 0; i < active_count; i++) printf("%d/%d ", active_states[i].offset, active_states[i].count); printf("\n"); #endif / Set the pointers for adding new states / next_active_state = active_states + active_count; next_new_state = new_states; / Load the current character from the subject outside the loop, as many different states may want to look at it, and we assume that at least one will. 
/ if (ptr < end_subject) { clen = 1; / Number of bytes in the character / #ifdef SUPPORT_UTF8 if (utf8) { GETCHARLEN(c, ptr, clen); } else #endif / SUPPORT_UTF8 / c = ptr; } else { clen = 0; /* This indicates the end of the subject / c = NOTACHAR; / This value should never actually be used / } / Scan up the active states and act on each one. The result of an action may be to add more states to the currently active list (e.g. on hitting a parenthesis) or it may be to put states on the new list, for considering when we move the character pointer on. / for (i = 0; i < active_count; i++) { stateblock current_state = active_states + i; const uschar code; int state_offset = current_state->offset; int count, codevalue; #ifdef DEBUG printf ("%.sProcessing state %d c=", rlevel2-2, SP, state_offset); if (clen == 0) printf("EOL\n"); else if (c > 32 && c < 127) printf("'%c'\n", c); else printf("0x%02x\n", c); #endif / This variable is referred to implicity in the ADD_xxx macros. / ims = current_state->ims; / A negative offset is a special case meaning "hold off going to this (negated) state until the number of characters in the data field have been skipped". / if (state_offset < 0) { if (current_state->data > 0) { DPRINTF(("%.sSkipping this character\n", rlevel2-2, SP)); ADD_NEW_DATA(state_offset, current_state->count, current_state->data - 1); continue; } else { current_state->offset = state_offset = -state_offset; } } / Check for a duplicate state with the same count, and skip if found. / for (j = 0; j < i; j++) { if (active_states[j].offset == state_offset && active_states[j].count == current_state->count) { DPRINTF(("%.sDuplicate state: skipped\n", rlevel2-2, SP)); goto NEXT_ACTIVE_STATE; } } / The state offset is the offset to the opcode / code = start_code + state_offset; codevalue = code; /* If this opcode is followed by an inline character, load it. 
It is tempting to test for the presence of a subject character here, but that is wrong, because sometimes zero repetitions of the subject are permitted. We also use this mechanism for opcodes such as OP_TYPEPLUS that take an argument that is not a data character - but is always one byte long. We have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert these ones to new opcodes. / if (coptable[codevalue] > 0) { dlen = 1; #ifdef SUPPORT_UTF8 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else #endif / SUPPORT_UTF8 / d = code[coptable[codevalue]]; if (codevalue >= OP_TYPESTAR) { switch(d) { case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM; case OP_NOTPROP: case OP_PROP: codevalue += OP_PROP_EXTRA; break; case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break; case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break; case OP_NOT_HSPACE: case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break; case OP_NOT_VSPACE: case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break; default: break; } } } else { dlen = 0; / Not strictly necessary, but compilers moan / d = NOTACHAR; / if these variables are not set. / } / Now process the individual opcodes / switch (codevalue) { / ========================================================================== / / Reached a closing bracket. If not at the end of the pattern, carry on with the next opcode. Otherwise, unless we have an empty string and PCRE_NOTEMPTY is set, save the match data, shifting up all previous matches so we always have the longest first. / case OP_KET: case OP_KETRMIN: case OP_KETRMAX: if (code != end_code) { ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0); if (codevalue != OP_KET) { ADD_ACTIVE(state_offset - GET(code, 1), 0); } } else if (ptr > current_subject \|\| (md->moptions & PCRE_NOTEMPTY) == 0) { if (match_count < 0) match_count = (offsetcount >= 2)? 
1 : 0; else if (match_count > 0 && ++match_count 2 >= offsetcount) match_count = 0; count = ((match_count == 0)? offsetcount : match_count * 2) - 2; if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int)); if (offsetcount >= 2) { offsets[0] = current_subject - start_subject; offsets[1] = ptr - start_subject; DPRINTF(("%.sSet matched string = \"%.s\"\n", rlevel2-2, SP, offsets[1] - offsets[0], current_subject)); } if ((md->moptions & PCRE_DFA_SHORTEST) != 0) { DPRINTF(("%.sEnd of internal_dfa_exec %d: returning %d\n" "%.s---------------------\n\n", rlevel2-2, SP, rlevel, match_count, rlevel2-2, SP)); return match_count; } } break; / ========================================================================== / / These opcodes add to the current list of states without looking at the current character. / /-----------------------------------------------------------------/ case OP_ALT: do { code += GET(code, 1); } while (code == OP_ALT); ADD_ACTIVE(code - start_code, 0); break; /-----------------------------------------------------------------/ case OP_BRA: case OP_SBRA: do { ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); code += GET(code, 1); } while (code == OP_ALT); break; /-----------------------------------------------------------------/ case OP_CBRA: case OP_SCBRA: ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0); code += GET(code, 1); while (code == OP_ALT) { ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); code += GET(code, 1); } break; /-----------------------------------------------------------------/ case OP_BRAZERO: case OP_BRAMINZERO: ADD_ACTIVE(state_offset + 1, 0); code += 1 + GET(code, 2); while (code == OP_ALT) code += GET(code, 1); ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); break; /-----------------------------------------------------------------/ case OP_SKIPZERO: code += 1 + GET(code, 2); while (code == OP_ALT) code += GET(code, 1); ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); break; 
/-----------------------------------------------------------------/ case OP_CIRC: if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) \|\| ((ims & PCRE_MULTILINE) != 0 && ptr != end_subject && WAS_NEWLINE(ptr))) { ADD_ACTIVE(state_offset + 1, 0); } break; /-----------------------------------------------------------------/ case OP_EOD: if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); } break; /-----------------------------------------------------------------/ case OP_OPT: ims = code[1]; ADD_ACTIVE(state_offset + 2, 0); break; /-----------------------------------------------------------------/ case OP_SOD: if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); } break; /-----------------------------------------------------------------/ case OP_SOM: if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); } break; /* ========================================================================== / / These opcodes inspect the next subject character, and sometimes the previous one as well, but do not have an argument. The variable clen contains the length of the current character and is zero if we are at the end of the subject. 
/ /-----------------------------------------------------------------/ case OP_ANY: if (clen > 0 && !IS_NEWLINE(ptr)) { ADD_NEW(state_offset + 1, 0); } break; /-----------------------------------------------------------------/ case OP_ALLANY: if (clen > 0) { ADD_NEW(state_offset + 1, 0); } break; /-----------------------------------------------------------------/ case OP_EODN: if (clen == 0 \|\| (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen)) { ADD_ACTIVE(state_offset + 1, 0); } break; /-----------------------------------------------------------------/ case OP_DOLL: if ((md->moptions & PCRE_NOTEOL) == 0) { if (clen == 0 \|\| (IS_NEWLINE(ptr) && ((ims & PCRE_MULTILINE) != 0 \|\| ptr == end_subject - md->nllen) )) { ADD_ACTIVE(state_offset + 1, 0); } } else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr)) { ADD_ACTIVE(state_offset + 1, 0); } break; /-----------------------------------------------------------------/ case OP_DIGIT: case OP_WHITESPACE: case OP_WORDCHAR: if (clen > 0 && c < 256 && ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0) { ADD_NEW(state_offset + 1, 0); } break; /-----------------------------------------------------------------/ case OP_NOT_DIGIT: case OP_NOT_WHITESPACE: case OP_NOT_WORDCHAR: if (clen > 0 && (c >= 256 \|\| ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)) { ADD_NEW(state_offset + 1, 0); } break; /-----------------------------------------------------------------/ case OP_WORD_BOUNDARY: case OP_NOT_WORD_BOUNDARY: { int left_word, right_word; if (ptr > start_subject) { const uschar temp = ptr - 1; #ifdef SUPPORT_UTF8 if (utf8) BACKCHAR(temp); #endif GETCHARTEST(d, temp); left_word = d < 256 && (ctypes[d] & ctype_word) != 0; } else left_word = 0; if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0; else right_word = 0; if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) { ADD_ACTIVE(state_offset + 1, 0); } } break; 
/-----------------------------------------------------------------/ /* Check the next character by Unicode property. We will get here only if the support is in the binary; otherwise a compile-time error occurs. / #ifdef SUPPORT_UCP case OP_PROP: case OP_NOTPROP: if (clen > 0) { BOOL OK; const ucd_record prop = GET_UCD(c); switch(code[1]) { case PT_ANY: OK = TRUE; break; case PT_LAMP: OK = prop->chartype == ucp_Lu \|\| prop->chartype == ucp_Ll \|\| prop->chartype == ucp_Lt; break; case PT_GC: OK = _pcre_ucp_gentype[prop->chartype] == code[2]; break; case PT_PC: OK = prop->chartype == code[2]; break; case PT_SC: OK = prop->script == code[2]; break; /* Should never occur, but keep compilers from grumbling. / default: OK = codevalue != OP_PROP; break; } if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); } } break; #endif / ========================================================================== / / These opcodes likewise inspect the subject character, but have an argument that is not a data character. It is one of these opcodes: OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. 
/ case OP_TYPEPLUS: case OP_TYPEMINPLUS: case OP_TYPEPOSPLUS: count = current_state->count; / Already matched / if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } if (clen > 0) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) \|\| (c < 256 && (d != OP_ANY \|\| !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { if (count > 0 && codevalue == OP_TYPEPOSPLUS) { active_count--; / Remove non-match possibility / next_active_state--; } count++; ADD_NEW(state_offset, count); } } break; /-----------------------------------------------------------------/ case OP_TYPEQUERY: case OP_TYPEMINQUERY: case OP_TYPEPOSQUERY: ADD_ACTIVE(state_offset + 2, 0); if (clen > 0) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) \|\| (c < 256 && (d != OP_ANY \|\| !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { if (codevalue == OP_TYPEPOSQUERY) { active_count--; / Remove non-match possibility / next_active_state--; } ADD_NEW(state_offset + 2, 0); } } break; /-----------------------------------------------------------------/ case OP_TYPESTAR: case OP_TYPEMINSTAR: case OP_TYPEPOSSTAR: ADD_ACTIVE(state_offset + 2, 0); if (clen > 0) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) \|\| (c < 256 && (d != OP_ANY \|\| !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { if (codevalue == OP_TYPEPOSSTAR) { active_count--; / Remove non-match possibility / next_active_state--; } ADD_NEW(state_offset, 0); } } break; /-----------------------------------------------------------------/ case OP_TYPEEXACT: count = current_state->count; / Number already matched / if (clen > 0) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) \|\| (c < 256 && (d != OP_ANY \|\| !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { if (++count >= GET2(code, 1)) { ADD_NEW(state_offset + 4, 0); } else { ADD_NEW(state_offset, count); } } } 
break; /-----------------------------------------------------------------/ case OP_TYPEUPTO: case OP_TYPEMINUPTO: case OP_TYPEPOSUPTO: ADD_ACTIVE(state_offset + 4, 0); count = current_state->count; / Number already matched / if (clen > 0) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) \|\| (c < 256 && (d != OP_ANY \|\| !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { if (codevalue == OP_TYPEPOSUPTO) { active_count--; / Remove non-match possibility / next_active_state--; } if (++count >= GET2(code, 1)) { ADD_NEW(state_offset + 4, 0); } else { ADD_NEW(state_offset, count); } } } break; / ========================================================================== / / These are virtual opcodes that are used when something like OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its argument. It keeps the code above fast for the other cases. The argument is in the d variable. / #ifdef SUPPORT_UCP case OP_PROP_EXTRA + OP_TYPEPLUS: case OP_PROP_EXTRA + OP_TYPEMINPLUS: case OP_PROP_EXTRA + OP_TYPEPOSPLUS: count = current_state->count; / Already matched / if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); } if (clen > 0) { BOOL OK; const ucd_record prop = GET_UCD(c); switch(code[2]) { case PT_ANY: OK = TRUE; break; case PT_LAMP: OK = prop->chartype == ucp_Lu \|\| prop->chartype == ucp_Ll \|\| prop->chartype == ucp_Lt; break; case PT_GC: OK = _pcre_ucp_gentype[prop->chartype] == code[3]; break; case PT_PC: OK = prop->chartype == code[3]; break; case PT_SC: OK = prop->script == code[3]; break; /* Should never occur, but keep compilers from grumbling. 
/ default: OK = codevalue != OP_PROP; break; } if (OK == (d == OP_PROP)) { if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS) { active_count--; / Remove non-match possibility / next_active_state--; } count++; ADD_NEW(state_offset, count); } } break; /-----------------------------------------------------------------/ case OP_EXTUNI_EXTRA + OP_TYPEPLUS: case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS: count = current_state->count; / Already matched / if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } if (clen > 0 && UCD_CATEGORY(c) != ucp_M) { const uschar nptr = ptr + clen; int ncount = 0; if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) { active_count--; /* Remove non-match possibility / next_active_state--; } while (nptr < end_subject) { int nd; int ndlen = 1; GETCHARLEN(nd, nptr, ndlen); if (UCD_CATEGORY(nd) != ucp_M) break; ncount++; nptr += ndlen; } count++; ADD_NEW_DATA(-state_offset, count, ncount); } break; #endif /-----------------------------------------------------------------/ case OP_ANYNL_EXTRA + OP_TYPEPLUS: case OP_ANYNL_EXTRA + OP_TYPEMINPLUS: case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS: count = current_state->count; / Already matched / if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } if (clen > 0) { int ncount = 0; switch (c) { case 0x000b: case 0x000c: case 0x0085: case 0x2028: case 0x2029: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; goto ANYNL01; case 0x000d: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; / Fall through / ANYNL01: case 0x000a: if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS) { active_count--; / Remove non-match possibility / next_active_state--; } count++; ADD_NEW_DATA(-state_offset, count, ncount); break; default: break; } } break; /-----------------------------------------------------------------/ case OP_VSPACE_EXTRA + OP_TYPEPLUS: case OP_VSPACE_EXTRA + OP_TYPEMINPLUS: case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS: count = current_state->count; / 
Already matched / if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } if (clen > 0) { BOOL OK; switch (c) { case 0x000a: case 0x000b: case 0x000c: case 0x000d: case 0x0085: case 0x2028: case 0x2029: OK = TRUE; break; default: OK = FALSE; break; } if (OK == (d == OP_VSPACE)) { if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS) { active_count--; / Remove non-match possibility / next_active_state--; } count++; ADD_NEW_DATA(-state_offset, count, 0); } } break; /-----------------------------------------------------------------/ case OP_HSPACE_EXTRA + OP_TYPEPLUS: case OP_HSPACE_EXTRA + OP_TYPEMINPLUS: case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS: count = current_state->count; / Already matched / if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } if (clen > 0) { BOOL OK; switch (c) { case 0x09: / HT / case 0x20: / SPACE / case 0xa0: / NBSP / case 0x1680: / OGHAM SPACE MARK / case 0x180e: / MONGOLIAN VOWEL SEPARATOR / case 0x2000: / EN QUAD / case 0x2001: / EM QUAD / case 0x2002: / EN SPACE / case 0x2003: / EM SPACE / case 0x2004: / THREE-PER-EM SPACE / case 0x2005: / FOUR-PER-EM SPACE / case 0x2006: / SIX-PER-EM SPACE / case 0x2007: / FIGURE SPACE / case 0x2008: / PUNCTUATION SPACE / case 0x2009: / THIN SPACE / case 0x200A: / HAIR SPACE / case 0x202f: / NARROW NO-BREAK SPACE / case 0x205f: / MEDIUM MATHEMATICAL SPACE / case 0x3000: / IDEOGRAPHIC SPACE / OK = TRUE; break; default: OK = FALSE; break; } if (OK == (d == OP_HSPACE)) { if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS) { active_count--; / Remove non-match possibility / next_active_state--; } count++; ADD_NEW_DATA(-state_offset, count, 0); } } break; /-----------------------------------------------------------------/ #ifdef SUPPORT_UCP case OP_PROP_EXTRA + OP_TYPEQUERY: case OP_PROP_EXTRA + OP_TYPEMINQUERY: case OP_PROP_EXTRA + OP_TYPEPOSQUERY: count = 4; goto QS1; case OP_PROP_EXTRA + OP_TYPESTAR: case OP_PROP_EXTRA + OP_TYPEMINSTAR: case OP_PROP_EXTRA + OP_TYPEPOSSTAR: count = 0; QS1: 
ADD_ACTIVE(state_offset + 4, 0); if (clen > 0) { BOOL OK; const ucd_record prop = GET_UCD(c); switch(code[2]) { case PT_ANY: OK = TRUE; break; case PT_LAMP: OK = prop->chartype == ucp_Lu \|\| prop->chartype == ucp_Ll \|\| prop->chartype == ucp_Lt; break; case PT_GC: OK = _pcre_ucp_gentype[prop->chartype] == code[3]; break; case PT_PC: OK = prop->chartype == code[3]; break; case PT_SC: OK = prop->script == code[3]; break; /* Should never occur, but keep compilers from grumbling. / default: OK = codevalue != OP_PROP; break; } if (OK == (d == OP_PROP)) { if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR \|\| codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY) { active_count--; / Remove non-match possibility / next_active_state--; } ADD_NEW(state_offset + count, 0); } } break; /-----------------------------------------------------------------/ case OP_EXTUNI_EXTRA + OP_TYPEQUERY: case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY: case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY: count = 2; goto QS2; case OP_EXTUNI_EXTRA + OP_TYPESTAR: case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR: case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR: count = 0; QS2: ADD_ACTIVE(state_offset + 2, 0); if (clen > 0 && UCD_CATEGORY(c) != ucp_M) { const uschar nptr = ptr + clen; int ncount = 0; if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR \|\| codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY) { active_count--; /* Remove non-match possibility / next_active_state--; } while (nptr < end_subject) { int nd; int ndlen = 1; GETCHARLEN(nd, nptr, ndlen); if (UCD_CATEGORY(nd) != ucp_M) break; ncount++; nptr += ndlen; } ADD_NEW_DATA(-(state_offset + count), 0, ncount); } break; #endif /-----------------------------------------------------------------/ case OP_ANYNL_EXTRA + OP_TYPEQUERY: case OP_ANYNL_EXTRA + OP_TYPEMINQUERY: case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY: count = 2; goto QS3; case OP_ANYNL_EXTRA + OP_TYPESTAR: case OP_ANYNL_EXTRA + OP_TYPEMINSTAR: case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR: count = 0; QS3: ADD_ACTIVE(state_offset + 2, 0); if (clen 
> 0) { int ncount = 0; switch (c) { case 0x000b: case 0x000c: case 0x0085: case 0x2028: case 0x2029: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; goto ANYNL02; case 0x000d: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; / Fall through / ANYNL02: case 0x000a: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR \|\| codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY) { active_count--; / Remove non-match possibility / next_active_state--; } ADD_NEW_DATA(-(state_offset + count), 0, ncount); break; default: break; } } break; /-----------------------------------------------------------------/ case OP_VSPACE_EXTRA + OP_TYPEQUERY: case OP_VSPACE_EXTRA + OP_TYPEMINQUERY: case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY: count = 2; goto QS4; case OP_VSPACE_EXTRA + OP_TYPESTAR: case OP_VSPACE_EXTRA + OP_TYPEMINSTAR: case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR: count = 0; QS4: ADD_ACTIVE(state_offset + 2, 0); if (clen > 0) { BOOL OK; switch (c) { case 0x000a: case 0x000b: case 0x000c: case 0x000d: case 0x0085: case 0x2028: case 0x2029: OK = TRUE; break; default: OK = FALSE; break; } if (OK == (d == OP_VSPACE)) { if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR \|\| codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY) { active_count--; / Remove non-match possibility / next_active_state--; } ADD_NEW_DATA(-(state_offset + count), 0, 0); } } break; /-----------------------------------------------------------------/ case OP_HSPACE_EXTRA + OP_TYPEQUERY: case OP_HSPACE_EXTRA + OP_TYPEMINQUERY: case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY: count = 2; goto QS5; case OP_HSPACE_EXTRA + OP_TYPESTAR: case OP_HSPACE_EXTRA + OP_TYPEMINSTAR: case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR: count = 0; QS5: ADD_ACTIVE(state_offset + 2, 0); if (clen > 0) { BOOL OK; switch (c) { case 0x09: / HT / case 0x20: / SPACE / case 0xa0: / NBSP / case 0x1680: / OGHAM SPACE MARK / case 0x180e: / MONGOLIAN VOWEL SEPARATOR / case 0x2000: / EN QUAD / case 0x2001: / EM QUAD / case 0x2002: / EN SPACE / case 0x2003: / EM SPACE / case 
0x2004: / THREE-PER-EM SPACE / case 0x2005: / FOUR-PER-EM SPACE / case 0x2006: / SIX-PER-EM SPACE / case 0x2007: / FIGURE SPACE / case 0x2008: / PUNCTUATION SPACE / case 0x2009: / THIN SPACE / case 0x200A: / HAIR SPACE / case 0x202f: / NARROW NO-BREAK SPACE / case 0x205f: / MEDIUM MATHEMATICAL SPACE / case 0x3000: / IDEOGRAPHIC SPACE / OK = TRUE; break; default: OK = FALSE; break; } if (OK == (d == OP_HSPACE)) { if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR \|\| codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY) { active_count--; / Remove non-match possibility / next_active_state--; } ADD_NEW_DATA(-(state_offset + count), 0, 0); } } break; /-----------------------------------------------------------------/ #ifdef SUPPORT_UCP case OP_PROP_EXTRA + OP_TYPEEXACT: case OP_PROP_EXTRA + OP_TYPEUPTO: case OP_PROP_EXTRA + OP_TYPEMINUPTO: case OP_PROP_EXTRA + OP_TYPEPOSUPTO: if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) { ADD_ACTIVE(state_offset + 6, 0); } count = current_state->count; / Number already matched / if (clen > 0) { BOOL OK; const ucd_record prop = GET_UCD(c); switch(code[4]) { case PT_ANY: OK = TRUE; break; case PT_LAMP: OK = prop->chartype == ucp_Lu \|\| prop->chartype == ucp_Ll \|\| prop->chartype == ucp_Lt; break; case PT_GC: OK = _pcre_ucp_gentype[prop->chartype] == code[5]; break; case PT_PC: OK = prop->chartype == code[5]; break; case PT_SC: OK = prop->script == code[5]; break; /* Should never occur, but keep compilers from grumbling. 
/ default: OK = codevalue != OP_PROP; break; } if (OK == (d == OP_PROP)) { if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO) { active_count--; / Remove non-match possibility / next_active_state--; } if (++count >= GET2(code, 1)) { ADD_NEW(state_offset + 6, 0); } else { ADD_NEW(state_offset, count); } } } break; /-----------------------------------------------------------------/ case OP_EXTUNI_EXTRA + OP_TYPEEXACT: case OP_EXTUNI_EXTRA + OP_TYPEUPTO: case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO: case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO: if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) { ADD_ACTIVE(state_offset + 4, 0); } count = current_state->count; / Number already matched / if (clen > 0 && UCD_CATEGORY(c) != ucp_M) { const uschar nptr = ptr + clen; int ncount = 0; if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) { active_count--; /* Remove non-match possibility / next_active_state--; } while (nptr < end_subject) { int nd; int ndlen = 1; GETCHARLEN(nd, nptr, ndlen); if (UCD_CATEGORY(nd) != ucp_M) break; ncount++; nptr += ndlen; } if (++count >= GET2(code, 1)) { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); } else { ADD_NEW_DATA(-state_offset, count, ncount); } } break; #endif /-----------------------------------------------------------------/ case OP_ANYNL_EXTRA + OP_TYPEEXACT: case OP_ANYNL_EXTRA + OP_TYPEUPTO: case OP_ANYNL_EXTRA + OP_TYPEMINUPTO: case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO: if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT) { ADD_ACTIVE(state_offset + 4, 0); } count = current_state->count; / Number already matched / if (clen > 0) { int ncount = 0; switch (c) { case 0x000b: case 0x000c: case 0x0085: case 0x2028: case 0x2029: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; goto ANYNL03; case 0x000d: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; / Fall through / ANYNL03: case 0x000a: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO) { active_count--; / Remove non-match possibility / next_active_state--; } if (++count >= GET2(code, 1)) { 
ADD_NEW_DATA(-(state_offset + 4), 0, ncount); } else { ADD_NEW_DATA(-state_offset, count, ncount); } break; default: break; } } break; /-----------------------------------------------------------------/ case OP_VSPACE_EXTRA + OP_TYPEEXACT: case OP_VSPACE_EXTRA + OP_TYPEUPTO: case OP_VSPACE_EXTRA + OP_TYPEMINUPTO: case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO: if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT) { ADD_ACTIVE(state_offset + 4, 0); } count = current_state->count; / Number already matched / if (clen > 0) { BOOL OK; switch (c) { case 0x000a: case 0x000b: case 0x000c: case 0x000d: case 0x0085: case 0x2028: case 0x2029: OK = TRUE; break; default: OK = FALSE; } if (OK == (d == OP_VSPACE)) { if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO) { active_count--; / Remove non-match possibility / next_active_state--; } if (++count >= GET2(code, 1)) { ADD_NEW_DATA(-(state_offset + 4), 0, 0); } else { ADD_NEW_DATA(-state_offset, count, 0); } } } break; /-----------------------------------------------------------------/ case OP_HSPACE_EXTRA + OP_TYPEEXACT: case OP_HSPACE_EXTRA + OP_TYPEUPTO: case OP_HSPACE_EXTRA + OP_TYPEMINUPTO: case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO: if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT) { ADD_ACTIVE(state_offset + 4, 0); } count = current_state->count; / Number already matched / if (clen > 0) { BOOL OK; switch (c) { case 0x09: / HT / case 0x20: / SPACE / case 0xa0: / NBSP / case 0x1680: / OGHAM SPACE MARK / case 0x180e: / MONGOLIAN VOWEL SEPARATOR / case 0x2000: / EN QUAD / case 0x2001: / EM QUAD / case 0x2002: / EN SPACE / case 0x2003: / EM SPACE / case 0x2004: / THREE-PER-EM SPACE / case 0x2005: / FOUR-PER-EM SPACE / case 0x2006: / SIX-PER-EM SPACE / case 0x2007: / FIGURE SPACE / case 0x2008: / PUNCTUATION SPACE / case 0x2009: / THIN SPACE / case 0x200A: / HAIR SPACE / case 0x202f: / NARROW NO-BREAK SPACE / case 0x205f: / MEDIUM MATHEMATICAL SPACE / case 0x3000: / IDEOGRAPHIC SPACE / OK = TRUE; break; default: OK = FALSE; break; } if (OK == (d 
== OP_HSPACE)) { if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO) { active_count--; / Remove non-match possibility / next_active_state--; } if (++count >= GET2(code, 1)) { ADD_NEW_DATA(-(state_offset + 4), 0, 0); } else { ADD_NEW_DATA(-state_offset, count, 0); } } } break; / ========================================================================== / / These opcodes are followed by a character that is usually compared to the current subject character; it is loaded into d. We still get here even if there is no subject character, because in some cases zero repetitions are permitted. / /-----------------------------------------------------------------/ case OP_CHAR: if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); } break; /-----------------------------------------------------------------/ case OP_CHARNC: if (clen == 0) break; #ifdef SUPPORT_UTF8 if (utf8) { if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else { unsigned int othercase; if (c < 128) othercase = fcc[c]; else / If we have Unicode property support, we can use it to test the other case of the character. / #ifdef SUPPORT_UCP othercase = UCD_OTHERCASE(c); #else othercase = NOTACHAR; #endif if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } } } else #endif / SUPPORT_UTF8 / / Non-UTF-8 mode / { if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); } } break; #ifdef SUPPORT_UCP /-----------------------------------------------------------------/ / This is a tricky one because it can match more than one character. Find out how many characters to skip, and then set up a negative state to wait for them to pass before continuing. 
/ case OP_EXTUNI: if (clen > 0 && UCD_CATEGORY(c) != ucp_M) { const uschar nptr = ptr + clen; int ncount = 0; while (nptr < end_subject) { int nclen = 1; GETCHARLEN(c, nptr, nclen); if (UCD_CATEGORY(c) != ucp_M) break; ncount++; nptr += nclen; } ADD_NEW_DATA(-(state_offset + 1), 0, ncount); } break; #endif /-----------------------------------------------------------------/ /* This is a tricky like EXTUNI because it too can match more than one character (when CR is followed by LF). In this case, set up a negative state to wait for one character to pass before continuing. / case OP_ANYNL: if (clen > 0) switch(c) { case 0x000b: case 0x000c: case 0x0085: case 0x2028: case 0x2029: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; case 0x000a: ADD_NEW(state_offset + 1, 0); break; case 0x000d: if (ptr + 1 < end_subject && ptr[1] == 0x0a) { ADD_NEW_DATA(-(state_offset + 1), 0, 1); } else { ADD_NEW(state_offset + 1, 0); } break; } break; /-----------------------------------------------------------------/ case OP_NOT_VSPACE: if (clen > 0) switch(c) { case 0x000a: case 0x000b: case 0x000c: case 0x000d: case 0x0085: case 0x2028: case 0x2029: break; default: ADD_NEW(state_offset + 1, 0); break; } break; /-----------------------------------------------------------------/ case OP_VSPACE: if (clen > 0) switch(c) { case 0x000a: case 0x000b: case 0x000c: case 0x000d: case 0x0085: case 0x2028: case 0x2029: ADD_NEW(state_offset + 1, 0); break; default: break; } break; /-----------------------------------------------------------------/ case OP_NOT_HSPACE: if (clen > 0) switch(c) { case 0x09: / HT / case 0x20: / SPACE / case 0xa0: / NBSP / case 0x1680: / OGHAM SPACE MARK / case 0x180e: / MONGOLIAN VOWEL SEPARATOR / case 0x2000: / EN QUAD / case 0x2001: / EM QUAD / case 0x2002: / EN SPACE / case 0x2003: / EM SPACE / case 0x2004: / THREE-PER-EM SPACE / case 0x2005: / FOUR-PER-EM SPACE / case 0x2006: / SIX-PER-EM SPACE / case 0x2007: / FIGURE SPACE / case 0x2008: / PUNCTUATION SPACE / 
case 0x2009: / THIN SPACE / case 0x200A: / HAIR SPACE / case 0x202f: / NARROW NO-BREAK SPACE / case 0x205f: / MEDIUM MATHEMATICAL SPACE / case 0x3000: / IDEOGRAPHIC SPACE / break; default: ADD_NEW(state_offset + 1, 0); break; } break; /-----------------------------------------------------------------/ case OP_HSPACE: if (clen > 0) switch(c) { case 0x09: / HT / case 0x20: / SPACE / case 0xa0: / NBSP / case 0x1680: / OGHAM SPACE MARK / case 0x180e: / MONGOLIAN VOWEL SEPARATOR / case 0x2000: / EN QUAD / case 0x2001: / EM QUAD / case 0x2002: / EN SPACE / case 0x2003: / EM SPACE / case 0x2004: / THREE-PER-EM SPACE / case 0x2005: / FOUR-PER-EM SPACE / case 0x2006: / SIX-PER-EM SPACE / case 0x2007: / FIGURE SPACE / case 0x2008: / PUNCTUATION SPACE / case 0x2009: / THIN SPACE / case 0x200A: / HAIR SPACE / case 0x202f: / NARROW NO-BREAK SPACE / case 0x205f: / MEDIUM MATHEMATICAL SPACE / case 0x3000: / IDEOGRAPHIC SPACE / ADD_NEW(state_offset + 1, 0); break; } break; /-----------------------------------------------------------------/ / Match a negated single character. This is only used for one-byte characters, that is, we know that d < 256. The character we are checking (c) can be multibyte. / case OP_NOT: if (clen > 0) { unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? 
fcc[d] : d; if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); } } break; /-----------------------------------------------------------------/ case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: case OP_NOTPLUS: case OP_NOTMINPLUS: case OP_NOTPOSPLUS: count = current_state->count; / Already matched / if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); } if (clen > 0) { unsigned int otherd = NOTACHAR; if ((ims & PCRE_CASELESS) != 0) { #ifdef SUPPORT_UTF8 if (utf8 && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); #endif / SUPPORT_UCP / } else #endif / SUPPORT_UTF8 / otherd = fcc[d]; } if ((c == d \|\| c == otherd) == (codevalue < OP_NOTSTAR)) { if (count > 0 && (codevalue == OP_POSPLUS \|\| codevalue == OP_NOTPOSPLUS)) { active_count--; / Remove non-match possibility / next_active_state--; } count++; ADD_NEW(state_offset, count); } } break; /-----------------------------------------------------------------/ case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: case OP_NOTQUERY: case OP_NOTMINQUERY: case OP_NOTPOSQUERY: ADD_ACTIVE(state_offset + dlen + 1, 0); if (clen > 0) { unsigned int otherd = NOTACHAR; if ((ims & PCRE_CASELESS) != 0) { #ifdef SUPPORT_UTF8 if (utf8 && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); #endif / SUPPORT_UCP / } else #endif / SUPPORT_UTF8 / otherd = fcc[d]; } if ((c == d \|\| c == otherd) == (codevalue < OP_NOTSTAR)) { if (codevalue == OP_POSQUERY \|\| codevalue == OP_NOTPOSQUERY) { active_count--; / Remove non-match possibility / next_active_state--; } ADD_NEW(state_offset + dlen + 1, 0); } } break; /-----------------------------------------------------------------/ case OP_STAR: case OP_MINSTAR: case OP_POSSTAR: case OP_NOTSTAR: case OP_NOTMINSTAR: case OP_NOTPOSSTAR: ADD_ACTIVE(state_offset + dlen + 1, 0); if (clen > 0) { unsigned int otherd = NOTACHAR; if ((ims & PCRE_CASELESS) != 0) { #ifdef SUPPORT_UTF8 if (utf8 && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); #endif / SUPPORT_UCP / } 
else #endif / SUPPORT_UTF8 / otherd = fcc[d]; } if ((c == d \|\| c == otherd) == (codevalue < OP_NOTSTAR)) { if (codevalue == OP_POSSTAR \|\| codevalue == OP_NOTPOSSTAR) { active_count--; / Remove non-match possibility / next_active_state--; } ADD_NEW(state_offset, 0); } } break; /-----------------------------------------------------------------/ case OP_EXACT: case OP_NOTEXACT: count = current_state->count; / Number already matched / if (clen > 0) { unsigned int otherd = NOTACHAR; if ((ims & PCRE_CASELESS) != 0) { #ifdef SUPPORT_UTF8 if (utf8 && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); #endif / SUPPORT_UCP / } else #endif / SUPPORT_UTF8 / otherd = fcc[d]; } if ((c == d \|\| c == otherd) == (codevalue < OP_NOTSTAR)) { if (++count >= GET2(code, 1)) { ADD_NEW(state_offset + dlen + 3, 0); } else { ADD_NEW(state_offset, count); } } } break; /-----------------------------------------------------------------/ case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: case OP_NOTUPTO: case OP_NOTMINUPTO: case OP_NOTPOSUPTO: ADD_ACTIVE(state_offset + dlen + 3, 0); count = current_state->count; / Number already matched / if (clen > 0) { unsigned int otherd = NOTACHAR; if ((ims & PCRE_CASELESS) != 0) { #ifdef SUPPORT_UTF8 if (utf8 && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); #endif / SUPPORT_UCP / } else #endif / SUPPORT_UTF8 / otherd = fcc[d]; } if ((c == d \|\| c == otherd) == (codevalue < OP_NOTSTAR)) { if (codevalue == OP_POSUPTO \|\| codevalue == OP_NOTPOSUPTO) { active_count--; / Remove non-match possibility / next_active_state--; } if (++count >= GET2(code, 1)) { ADD_NEW(state_offset + dlen + 3, 0); } else { ADD_NEW(state_offset, count); } } } break; / ========================================================================== / / These are the class-handling opcodes / case OP_CLASS: case OP_NCLASS: case OP_XCLASS: { BOOL isinclass = FALSE; int next_state_offset; const uschar ecode; /* For a simple class, there is always just a 32-byte table, and 
we can set isinclass from it. / if (codevalue != OP_XCLASS) { ecode = code + 33; if (clen > 0) { isinclass = (c > 255)? (codevalue == OP_NCLASS) : ((code[1 + c/8] & (1 << (c&7))) != 0); } } / An extended class may have a table or a list of single characters, ranges, or both, and it may be positive or negative. There's a function that sorts all this out. / else { ecode = code + GET(code, 1); if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE); } / At this point, isinclass is set for all kinds of class, and ecode points to the byte after the end of the class. If there is a quantifier, this is where it will be. / next_state_offset = ecode - start_code; switch (ecode) { case OP_CRSTAR: case OP_CRMINSTAR: ADD_ACTIVE(next_state_offset + 1, 0); if (isinclass) { ADD_NEW(state_offset, 0); } break; case OP_CRPLUS: case OP_CRMINPLUS: count = current_state->count; /* Already matched / if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); } if (isinclass) { count++; ADD_NEW(state_offset, count); } break; case OP_CRQUERY: case OP_CRMINQUERY: ADD_ACTIVE(next_state_offset + 1, 0); if (isinclass) { ADD_NEW(next_state_offset + 1, 0); } break; case OP_CRRANGE: case OP_CRMINRANGE: count = current_state->count; / Already matched / if (count >= GET2(ecode, 1)) { ADD_ACTIVE(next_state_offset + 5, 0); } if (isinclass) { int max = GET2(ecode, 3); if (++count >= max && max != 0) / Max 0 => no limit / { ADD_NEW(next_state_offset + 5, 0); } else { ADD_NEW(state_offset, count); } } break; default: if (isinclass) { ADD_NEW(next_state_offset, 0); } break; } } break; / ========================================================================== / / These are the opcodes for fancy brackets of various kinds. We have to use recursion in order to handle them. The "always failing" assersion (?!) is optimised when compiling to OP_FAIL, so we have to support that, though the other "backtracking verbs" are not supported. 
/ case OP_FAIL: break; case OP_ASSERT: case OP_ASSERT_NOT: case OP_ASSERTBACK: case OP_ASSERTBACK_NOT: { int rc; int local_offsets[2]; int local_workspace[1000]; const uschar endasscode = code + GET(code, 1); while (endasscode == OP_ALT) endasscode += GET(endasscode, 1); rc = internal_dfa_exec( md, / static match data / code, / this subexpression's code / ptr, / where we currently are / ptr - start_subject, / start offset / local_offsets, / offset vector / sizeof(local_offsets)/sizeof(int), / size of same / local_workspace, / workspace vector / sizeof(local_workspace)/sizeof(int), / size of same / ims, / the current ims flags / rlevel, / function recursion level / recursing); / pass on regex recursion / if ((rc >= 0) == (codevalue == OP_ASSERT \|\| codevalue == OP_ASSERTBACK)) { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); } } break; /-----------------------------------------------------------------/ case OP_COND: case OP_SCOND: { int local_offsets[1000]; int local_workspace[1000]; int condcode = code[LINK_SIZE+1]; / Back reference conditions are not supported / if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND; / The DEFINE condition is always false / if (condcode == OP_DEF) { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); } / The only supported version of OP_RREF is for the value RREF_ANY, which means "test if in any recursion". We can't test for specifically recursed groups. 
/ else if (condcode == OP_RREF) { int value = GET2(code, LINK_SIZE+2); if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND; if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); } else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); } } / Otherwise, the condition is an assertion / else { int rc; const uschar asscode = code + LINK_SIZE + 1; const uschar endasscode = asscode + GET(asscode, 1); while (endasscode == OP_ALT) endasscode += GET(endasscode, 1); rc = internal_dfa_exec( md, /* fixed match data / asscode, / this subexpression's code / ptr, / where we currently are / ptr - start_subject, / start offset / local_offsets, / offset vector / sizeof(local_offsets)/sizeof(int), / size of same / local_workspace, / workspace vector / sizeof(local_workspace)/sizeof(int), / size of same / ims, / the current ims flags / rlevel, / function recursion level / recursing); / pass on regex recursion / if ((rc >= 0) == (condcode == OP_ASSERT \|\| condcode == OP_ASSERTBACK)) { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); } else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); } } } break; /-----------------------------------------------------------------/ case OP_RECURSE: { int local_offsets[1000]; int local_workspace[1000]; int rc; DPRINTF(("%.sStarting regex recursion %d\n", rlevel2-2, SP, recursing + 1)); rc = internal_dfa_exec( md, / fixed match data / start_code + GET(code, 1), / this subexpression's code / ptr, / where we currently are / ptr - start_subject, / start offset / local_offsets, / offset vector / sizeof(local_offsets)/sizeof(int), / size of same / local_workspace, / workspace vector / sizeof(local_workspace)/sizeof(int), / size of same / ims, / the current ims flags / rlevel, / function recursion level / recursing + 1); / regex recurse level / DPRINTF(("%.sReturn from regex recursion %d: rc=%d\n", rlevel2-2, SP, recursing + 1, rc)); / Ran out of internal offsets / if (rc == 0) return PCRE_ERROR_DFA_RECURSE; / For 
each successful matched substring, set up the next state with a count of characters to skip before trying it. Note that the count is in characters, not bytes. / if (rc > 0) { for (rc = rc2 - 2; rc >= 0; rc -= 2) { const uschar p = start_subject + local_offsets[rc]; const uschar pp = start_subject + local_offsets[rc+1]; int charcount = local_offsets[rc+1] - local_offsets[rc]; while (p < pp) if ((p++ & 0xc0) == 0x80) charcount--; if (charcount > 0) { ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1)); } else { ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0); } } } else if (rc != PCRE_ERROR_NOMATCH) return rc; } break; /-----------------------------------------------------------------/ case OP_ONCE: { int local_offsets[2]; int local_workspace[1000]; int rc = internal_dfa_exec( md, / fixed match data / code, / this subexpression's code / ptr, / where we currently are / ptr - start_subject, / start offset / local_offsets, / offset vector / sizeof(local_offsets)/sizeof(int), / size of same / local_workspace, / workspace vector / sizeof(local_workspace)/sizeof(int), / size of same / ims, / the current ims flags / rlevel, / function recursion level / recursing); / pass on regex recursion / if (rc >= 0) { const uschar end_subpattern = code; int charcount = local_offsets[1] - local_offsets[0]; int next_state_offset, repeat_state_offset; do { end_subpattern += GET(end_subpattern, 1); } while (end_subpattern == OP_ALT); next_state_offset = end_subpattern - start_code + LINK_SIZE + 1; / If the end of this subpattern is KETRMAX or KETRMIN, we must arrange for the repeat state also to be added to the relevant list. Calculate the offset, or set -1 for no repeat. / repeat_state_offset = (end_subpattern == OP_KETRMAX \|\| end_subpattern == OP_KETRMIN)? end_subpattern - start_code - GET(end_subpattern, 1) : -1; / If we have matched an empty string, add the next state at the current character pointer. 
This is important so that the duplicate checking kicks in, which is what breaks infinite loops that match an empty string. / if (charcount == 0) { ADD_ACTIVE(next_state_offset, 0); } / Optimization: if there are no more active states, and there are no new states yet set up, then skip over the subject string right here, to save looping. Otherwise, set up the new state to swing into action when the end of the substring is reached. / else if (i + 1 >= active_count && new_count == 0) { ptr += charcount; clen = 0; ADD_NEW(next_state_offset, 0); / If we are adding a repeat state at the new character position, we must fudge things so that it is the only current state. Otherwise, it might be a duplicate of one we processed before, and that would cause it to be skipped. / if (repeat_state_offset >= 0) { next_active_state = active_states; active_count = 0; i = -1; ADD_ACTIVE(repeat_state_offset, 0); } } else { const uschar p = start_subject + local_offsets[0]; const uschar pp = start_subject + local_offsets[1]; while (p < pp) if ((p++ & 0xc0) == 0x80) charcount--; ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); if (repeat_state_offset >= 0) { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); } } } else if (rc != PCRE_ERROR_NOMATCH) return rc; } break; /* ========================================================================== / / Handle callouts / case OP_CALLOUT: if (pcre_callout != NULL) { int rrc; pcre_callout_block cb; cb.version = 1; / Version 1 of the callout block / cb.callout_number = code[1]; cb.offset_vector = offsets; cb.subject = (PCRE_SPTR)start_subject; cb.subject_length = end_subject - start_subject; cb.start_match = current_subject - start_subject; cb.current_position = ptr - start_subject; cb.pattern_position = GET(code, 2); cb.next_item_length = GET(code, 2 + LINK_SIZE); cb.capture_top = 1; cb.capture_last = -1; cb.callout_data = md->callout_data; if ((rrc = (pcre_callout)(&cb)) < 0) return rrc; /* Abandon / if (rrc == 0) { 
ADD_ACTIVE(state_offset + 2 + 2LINK_SIZE, 0); } } break; /* ========================================================================== / default: / Unsupported opcode / return PCRE_ERROR_DFA_UITEM; } NEXT_ACTIVE_STATE: continue; } / End of loop scanning active states / / We have finished the processing at the current subject character. If no new states have been set for the next character, we have found all the matches that we are going to find. If we are at the top level and partial matching has been requested, check for appropriate conditions. / if (new_count <= 0) { if (match_count < 0 && / No matches found / rlevel == 1 && / Top level match function / (md->moptions & PCRE_PARTIAL) != 0 && / Want partial matching / ptr >= end_subject && / Reached end of subject / ptr > current_subject) / Matched non-empty string / { if (offsetcount >= 2) { offsets[0] = current_subject - start_subject; offsets[1] = end_subject - start_subject; } match_count = PCRE_ERROR_PARTIAL; } DPRINTF(("%.sEnd of internal_dfa_exec %d: returning %d\n" "%.s---------------------\n\n", rlevel2-2, SP, rlevel, match_count, rlevel2-2, SP)); break; / In effect, "return", but see the comment below / } / One or more states are active for the next character. / ptr += clen; / Advance to next subject character / } / Loop to move along the subject string / / Control gets here from "break" a few lines above. We do it this way because if we use "return" above, we have compiler trouble. Some compilers warn if there's nothing here because they think the function doesn't return a value. On the other hand, if we put a dummy statement here, some more clever compilers complain that it can't be reached. Sigh. */ return match_count; }	pcredfa.c	282
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION	pcre_dfa_exec(const pcre argument_re, const pcre_extra extra_data, const char subject, int length, int start_offset, int options, int offsets, int offsetcount, int workspace, int wscount) PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_dfa_exec(const pcre argument_re, const pcre_extra extra_data, const char subject, int length, int start_offset, int options, int offsets, int offsetcount, int workspace, int wscount) { real_pcre re = (real_pcre )argument_re; dfa_match_data match_block; dfa_match_data md = &match_block; BOOL utf8, anchored, startline, firstline; const uschar current_subject, end_subject, lcc; pcre_study_data internal_study; const pcre_study_data study = NULL; real_pcre internal_re; const uschar req_byte_ptr; const uschar start_bits = NULL; BOOL first_byte_caseless = FALSE; BOOL req_byte_caseless = FALSE; int first_byte = -1; int req_byte = -1; int req_byte2 = -1; int newline; / Plausibility checks / if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; if (re == NULL \|\| subject == NULL \|\| workspace == NULL \|\| (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE; / We need to find the pointer to any study data before we test for byte flipping, so we scan the extra_data block first. This may set two fields in the match block, so we must initialize them beforehand. However, the other fields in the match block must not be set until after the byte flipping. 
/ md->tables = re->tables; md->callout_data = NULL; if (extra_data != NULL) { unsigned int flags = extra_data->flags; if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) study = (const pcre_study_data )extra_data->study_data; if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT; if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) return PCRE_ERROR_DFA_UMLIMIT; if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) md->callout_data = extra_data->callout_data; if ((flags & PCRE_EXTRA_TABLES) != 0) md->tables = extra_data->tables; } /* Check that the first field in the block is the magic number. If it is not, test for a regex that was compiled on a host of opposite endianness. If this is the case, flipped values are put in internal_re and internal_study if there was study data too. / if (re->magic_number != MAGIC_NUMBER) { re = _pcre_try_flipped(re, &internal_re, study, &internal_study); if (re == NULL) return PCRE_ERROR_BADMAGIC; if (study != NULL) study = &internal_study; } / Set some local values / current_subject = (const unsigned char )subject + start_offset; end_subject = (const unsigned char )subject + length; req_byte_ptr = current_subject - 1; #ifdef SUPPORT_UTF8 utf8 = (re->options & PCRE_UTF8) != 0; #else utf8 = FALSE; #endif anchored = (options & (PCRE_ANCHORED\|PCRE_DFA_RESTART)) != 0 \|\| (re->options & PCRE_ANCHORED) != 0; / The remaining fixed data for passing around. / md->start_code = (const uschar )argument_re + re->name_table_offset + re->name_count * re->name_entry_size; md->start_subject = (const unsigned char )subject; md->end_subject = end_subject; md->moptions = options; md->poptions = re->options; / If the BSR option is not set at match time, copy what was set at compile time. 
/ if ((md->moptions & (PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE)) == 0) { if ((re->options & (PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE)) != 0) md->moptions \|= re->options & (PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE); #ifdef BSR_ANYCRLF else md->moptions \|= PCRE_BSR_ANYCRLF; #endif } / Handle different types of newline. The three bits give eight cases. If nothing is set at run time, whatever was used at compile time applies. / switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) & PCRE_NEWLINE_BITS) { case 0: newline = NEWLINE; break; / Compile-time default / case PCRE_NEWLINE_CR: newline = '\r'; break; case PCRE_NEWLINE_LF: newline = '\n'; break; case PCRE_NEWLINE_CR+ PCRE_NEWLINE_LF: newline = ('\r' << 8) \| '\n'; break; case PCRE_NEWLINE_ANY: newline = -1; break; case PCRE_NEWLINE_ANYCRLF: newline = -2; break; default: return PCRE_ERROR_BADNEWLINE; } if (newline == -2) { md->nltype = NLTYPE_ANYCRLF; } else if (newline < 0) { md->nltype = NLTYPE_ANY; } else { md->nltype = NLTYPE_FIXED; if (newline > 255) { md->nllen = 2; md->nl[0] = (newline >> 8) & 255; md->nl[1] = newline & 255; } else { md->nllen = 1; md->nl[0] = newline; } } / Check a UTF-8 string if required. Unfortunately there's no way of passing back the character offset. / #ifdef SUPPORT_UTF8 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) { if (_pcre_valid_utf8((uschar )subject, length) >= 0) return PCRE_ERROR_BADUTF8; if (start_offset > 0 && start_offset < length) { int tb = ((uschar )subject)[start_offset]; if (tb > 127) { tb &= 0xc0; if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET; } } } #endif / If the exec call supplied NULL for tables, use the inbuilt ones. This is a feature that makes it possible to save compiled regex and re-use them in other programs later. / if (md->tables == NULL) md->tables = _pcre_default_tables; / The lower casing table and the "must be at the start of a line" flag are used in a loop when finding where to start. 
/ lcc = md->tables + lcc_offset; startline = (re->flags & PCRE_STARTLINE) != 0; firstline = (re->options & PCRE_FIRSTLINE) != 0; / Set up the first character to match, if available. The first_byte value is never set for an anchored regular expression, but the anchoring may be forced at run time, so we have to test for anchoring. The first char may be unset for an unanchored pattern, of course. If there's no first char and the pattern was studied, there may be a bitmap of possible first characters. / if (!anchored) { if ((re->flags & PCRE_FIRSTSET) != 0) { first_byte = re->first_byte & 255; if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) first_byte = lcc[first_byte]; } else { if (startline && study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0) start_bits = study->start_bits; } } / For anchored or unanchored matches, there may be a "last known required character" set. / if ((re->flags & PCRE_REQCHSET) != 0) { req_byte = re->req_byte & 255; req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; req_byte2 = (md->tables + fcc_offset)[req_byte]; / case flipped / } / Call the main matching function, looping for a non-anchored regex after a failed match. Unless restarting, optimize by moving to the first match character if possible, when not anchored. Then unless wanting a partial match, check for a required later character. / for (;;) { int rc; if ((options & PCRE_DFA_RESTART) == 0) { const uschar save_end_subject = end_subject; /* Advance to a unique first char if possible. If firstline is TRUE, the start of the match is constrained to the first line of a multiline string. Implement this by temporarily adjusting end_subject so that we stop scanning at a newline. If the match fails at the newline, later code breaks this loop. 
/ if (firstline) { USPTR t = current_subject; #ifdef SUPPORT_UTF8 if (utf8) { while (t < md->end_subject && !IS_NEWLINE(t)) { t++; while (t < end_subject && (t & 0xc0) == 0x80) t++; } } else #endif while (t < md->end_subject && !IS_NEWLINE(t)) t++; end_subject = t; } if (first_byte >= 0) { if (first_byte_caseless) while (current_subject < end_subject && lcc[current_subject] != first_byte) current_subject++; else while (current_subject < end_subject && current_subject != first_byte) current_subject++; } /* Or to just after a linebreak for a multiline match if possible / else if (startline) { if (current_subject > md->start_subject + start_offset) { #ifdef SUPPORT_UTF8 if (utf8) { while (current_subject < end_subject && !WAS_NEWLINE(current_subject)) { current_subject++; while(current_subject < end_subject && (current_subject & 0xc0) == 0x80) current_subject++; } } else #endif while (current_subject < end_subject && !WAS_NEWLINE(current_subject)) current_subject++; /* If we have just passed a CR and the newline option is ANY or ANYCRLF, and we are now at a LF, advance the match position by one more character. / if (current_subject[-1] == '\r' && (md->nltype == NLTYPE_ANY \|\| md->nltype == NLTYPE_ANYCRLF) && current_subject < end_subject && current_subject == '\n') current_subject++; } } /* Or to a non-unique first char after study / else if (start_bits != NULL) { while (current_subject < end_subject) { register unsigned int c = current_subject; if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++; else break; } } /* Restore fudged end_subject / end_subject = save_end_subject; } / If req_byte is set, we know that that character must appear in the subject for the match to succeed. If the first character is set, req_byte must be later in the subject; otherwise the test starts at the match point. This optimization can save a huge amount of work in patterns with nested unlimited repeats that aren't going to match. 
Writing separate code for cased/caseless versions makes it go faster, as does using an autoincrement and backing off on a match. HOWEVER: when the subject string is very, very long, searching to its end can take a long time, and give bad performance on quite ordinary patterns. This showed up when somebody was matching /^C/ on a 32-megabyte string... so we don't do this when the string is sufficiently long. ALSO: this processing is disabled when partial matching is requested. / if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX && (options & PCRE_PARTIAL) == 0) { register const uschar p = current_subject + ((first_byte >= 0)? 1 : 0); /* We don't need to repeat the search if we haven't yet reached the place we found it at last time. / if (p > req_byte_ptr) { if (req_byte_caseless) { while (p < end_subject) { register int pp = p++; if (pp == req_byte \|\| pp == req_byte2) { p--; break; } } } else { while (p < end_subject) { if (p++ == req_byte) { p--; break; } } } / If we can't find the required character, break the matching loop, which will cause a return or PCRE_ERROR_NOMATCH. / if (p >= end_subject) break; / If we have found the required character, save the point where we found it, so that we don't search again next time round the loop if the start hasn't passed this character yet. / req_byte_ptr = p; } } / OK, now we can do the business / rc = internal_dfa_exec( md, / fixed match data / md->start_code, / this subexpression's code / current_subject, / where we currently are / start_offset, / start offset in subject / offsets, / offset vector / offsetcount, / size of same / workspace, / workspace vector / wscount, / size of same / re->options & (PCRE_CASELESS\|PCRE_MULTILINE\|PCRE_DOTALL), / ims flags / 0, / function recurse level / 0); / regex recurse level / / Anything other than "no match" means we are done, always; otherwise, carry on only if not anchored. 
/ if (rc != PCRE_ERROR_NOMATCH \|\| anchored) return rc; / Advance to the next subject character unless we are at the end of a line and firstline is set. / if (firstline && IS_NEWLINE(current_subject)) break; current_subject++; if (utf8) { while (current_subject < end_subject && (current_subject & 0xc0) == 0x80) current_subject++; } if (current_subject > end_subject) break; /* If we have just passed a CR and we are now at a LF, and the pattern does not contain any explicit matches for \r or \n, and the newline option is CRLF or ANY or ANYCRLF, advance the match position by one more character. / if (current_subject[-1] == '\r' && current_subject < end_subject && current_subject == '\n' && (re->flags & PCRE_HASCRORLF) == 0 && (md->nltype == NLTYPE_ANY \|\| md->nltype == NLTYPE_ANYCRLF \|\| md->nllen == 2)) current_subject++; } /* "Bumpalong" loop */ return PCRE_ERROR_NOMATCH; }	pcredfa.c	2508
pcreexec.c
Type	Function	Source	Line
STATIC VOID	pchars(const uschar p, int length, BOOL is_subject, match_data md) static void pchars(const uschar p, int length, BOOL is_subject, match_data md) { unsigned int c; if (is_subject && length > md->end_subject - p) length = md->end_subject - p; while (length-- > 0) if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); } #endif	pcreexec.c	109
STATIC BOOL	match_ref(int offset, register USPTR eptr, int length, match_data md, unsigned long int ims) static BOOL match_ref(int offset, register USPTR eptr, int length, match_data md, unsigned long int ims) { USPTR p = md->start_subject + md->offset_vector[offset]; #ifdef DEBUG if (eptr >= md->end_subject) printf("matching subject "); else { printf("matching subject "); pchars(eptr, length, TRUE, md); } printf(" against backref "); pchars(p, length, FALSE, md); printf("\n"); #endif /* Always fail if not enough characters left / if (length > md->end_subject - eptr) return FALSE; / Separate the caseless case for speed. In UTF-8 mode we can only do this properly if Unicode properties are supported. Otherwise, we can check only ASCII characters. / if ((ims & PCRE_CASELESS) != 0) { #ifdef SUPPORT_UTF8 #ifdef SUPPORT_UCP if (md->utf8) { USPTR endptr = eptr + length; while (eptr < endptr) { int c, d; GETCHARINC(c, eptr); GETCHARINC(d, p); if (c != d && c != UCD_OTHERCASE(d)) return FALSE; } } else #endif #endif / The same code works when not in UTF-8 mode and in UTF-8 mode when there is no UCP support. / while (length-- > 0) { if (md->lcc[p++] != md->lcc[eptr++]) return FALSE; } } / In the caseful case, we can just compare the bytes, whether or not we are in UTF-8 mode. / else { while (length-- > 0) if (p++ != eptr++) return FALSE; } return TRUE; } /************************************************************************ ************************************************************************ RECURSION IN THE match() FUNCTION The match() function is highly recursive, though not every recursive call increases the recursive depth. Nevertheless, some regular expressions can cause it to recurse to a great depth. I was writing for Unix, so I just let it call itself recursively. This uses the stack for saving everything that has to be saved for a recursive call. On Unix, the stack can be large, and this works fine. 
It turns out that on some non-Unix-like systems there are problems with programs that use a lot of stack. (This despite the fact that every last chip has oodles of memory these days, and techniques for extending the stack have been known for decades.) So.... There is a fudge, triggered by defining NO_RECURSE, which avoids recursive calls by keeping local variables that need to be preserved in blocks of memory obtained from malloc() instead of on the stack. Macros are used to achieve this so that the actual code doesn't look very different to what it always used to. The original heap-recursive code used longjmp(). However, it seems that this can be very slow on some operating systems. Following a suggestion from Stan Switzer, the use of longjmp() has been abolished, at the cost of having to provide a unique number for each call to RMATCH. There is no way of generating a sequence of numbers at compile time in C. I have given them names, to make them stand out more clearly. Crude tests on x86 Linux show a small speedup of around 5-8%. However, on FreeBSD, avoiding longjmp() more than halves the time taken to run the standard tests. Furthermore, not using longjmp() means that local dynamic variables don't have indeterminate values; this has meant that the frame size can be reduced because the result can be "passed back" by straight setting of the variable instead of being passed in the frame. ************************************************************************ ************************************************************************/ / Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN below must be updated in sync. 
/ enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54 }; / These versions of the macros use the stack, as normal. There are debugging versions and production versions. Note that the "rw" argument of RMATCH isn't actually used in this definition. / #ifndef NO_RECURSE #define REGISTER register #ifdef DEBUG #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \ { \ printf("match() called in line %d\n", __LINE__); \ rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \ printf("to line %d\n", __LINE__); \ } #define RRETURN(ra) \ { \ printf("match() returned %d from line %d ", ra, __LINE__); \ return ra; \ } #else #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \ rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1) #define RRETURN(ra) return ra #endif #else / These versions of the macros manage a private stack on the heap. Note that the "rd" argument of RMATCH isn't actually used in this definition. It's the md argument of match(), which never changes. 
/ #define REGISTER #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\ {\ heapframe newframe = (pcre_stack_malloc)(sizeof(heapframe));\ frame->Xwhere = rw; \ newframe->Xeptr = ra;\ newframe->Xecode = rb;\ newframe->Xmstart = mstart;\ newframe->Xoffset_top = rc;\ newframe->Xims = re;\ newframe->Xeptrb = rf;\ newframe->Xflags = rg;\ newframe->Xrdepth = frame->Xrdepth + 1;\ newframe->Xprevframe = frame;\ frame = newframe;\ DPRINTF(("restarting from line %d\n", __LINE__));\ goto HEAP_RECURSE;\ L_##rw:\ DPRINTF(("jumped back to line %d\n", __LINE__));\ } #define RRETURN(ra)\ {\ heapframe newframe = frame;\ frame = newframe->Xprevframe;\ (pcre_stack_free)(newframe);\ if (frame != NULL)\ {\ rrc = ra;\ goto HEAP_RETURN;\ }\ return ra;\ } / Structure for remembering the local variables in a private frame / typedef struct heapframe { struct heapframe Xprevframe; /* Function arguments that may change / const uschar Xeptr; const uschar Xecode; const uschar Xmstart; int Xoffset_top; long int Xims; eptrblock Xeptrb; int Xflags; unsigned int Xrdepth; / Function local variables / const uschar Xcallpat; const uschar Xcharptr; const uschar Xdata; const uschar Xnext; const uschar Xpp; const uschar Xprev; const uschar Xsaved_eptr; recursion_info Xnew_recursive; BOOL Xcur_is_word; BOOL Xcondition; BOOL Xprev_is_word; unsigned long int Xoriginal_ims; #ifdef SUPPORT_UCP int Xprop_type; int Xprop_value; int Xprop_fail_result; int Xprop_category; int Xprop_chartype; int Xprop_script; int Xoclength; uschar Xocchars[8]; #endif int Xctype; unsigned int Xfc; int Xfi; int Xlength; int Xmax; int Xmin; int Xnumber; int Xoffset; int Xop; int Xsave_capture_last; int Xsave_offset1, Xsave_offset2, Xsave_offset3; int Xstacksave[REC_STACK_SAVE_MAX]; eptrblock Xnewptrb; /* Where to jump back to / int Xwhere; } heapframe; #endif /************************************************************************ *************************************************************************/	pcreexec.c	138
STATIC INT	match(REGISTER USPTR eptr, REGISTER const uschar ecode, const uschar mstart, int offset_top, match_data md, unsigned long int ims, eptrblock eptrb, int flags, unsigned int rdepth) static int match(REGISTER USPTR eptr, REGISTER const uschar ecode, const uschar mstart, int offset_top, match_data md, unsigned long int ims, eptrblock eptrb, int flags, unsigned int rdepth) { /* These variables do not need to be preserved over recursion in this function, so they can be ordinary variables in all cases. Mark some of them with "register" because they are used a lot in loops. / register int rrc; / Returns from recursive calls / register int i; / Used for loops not involving calls to RMATCH() / register unsigned int c; / Character values not kept over RMATCH() calls / register BOOL utf8; / Local copy of UTF-8 flag for speed / BOOL minimize, possessive; / Quantifier options / / When recursion is not being used, all "local" variables that have to be preserved over calls to RMATCH() are part of a "frame" which is obtained from heap storage. Set up the top-level frame here; others are obtained from the heap whenever RMATCH() does a "recursion". See the macro definitions above. 
/ #ifdef NO_RECURSE heapframe frame = (pcre_stack_malloc)(sizeof(heapframe)); frame->Xprevframe = NULL; /* Marks the top level / / Copy in the original argument variables / frame->Xeptr = eptr; frame->Xecode = ecode; frame->Xmstart = mstart; frame->Xoffset_top = offset_top; frame->Xims = ims; frame->Xeptrb = eptrb; frame->Xflags = flags; frame->Xrdepth = rdepth; / This is where control jumps back to to effect "recursion" / HEAP_RECURSE: / Macros make the argument variables come from the current frame / #define eptr frame->Xeptr #define ecode frame->Xecode #define mstart frame->Xmstart #define offset_top frame->Xoffset_top #define ims frame->Xims #define eptrb frame->Xeptrb #define flags frame->Xflags #define rdepth frame->Xrdepth / Ditto for the local variables / #ifdef SUPPORT_UTF8 #define charptr frame->Xcharptr #endif #define callpat frame->Xcallpat #define data frame->Xdata #define next frame->Xnext #define pp frame->Xpp #define prev frame->Xprev #define saved_eptr frame->Xsaved_eptr #define new_recursive frame->Xnew_recursive #define cur_is_word frame->Xcur_is_word #define condition frame->Xcondition #define prev_is_word frame->Xprev_is_word #define original_ims frame->Xoriginal_ims #ifdef SUPPORT_UCP #define prop_type frame->Xprop_type #define prop_value frame->Xprop_value #define prop_fail_result frame->Xprop_fail_result #define prop_category frame->Xprop_category #define prop_chartype frame->Xprop_chartype #define prop_script frame->Xprop_script #define oclength frame->Xoclength #define occhars frame->Xocchars #endif #define ctype frame->Xctype #define fc frame->Xfc #define fi frame->Xfi #define length frame->Xlength #define max frame->Xmax #define min frame->Xmin #define number frame->Xnumber #define offset frame->Xoffset #define op frame->Xop #define save_capture_last frame->Xsave_capture_last #define save_offset1 frame->Xsave_offset1 #define save_offset2 frame->Xsave_offset2 #define save_offset3 frame->Xsave_offset3 #define stacksave frame->Xstacksave 
#define newptrb frame->Xnewptrb / When recursion is being used, local variables are allocated on the stack and get preserved during recursion in the normal way. In this environment, fi and i, and fc and c, can be the same variables. / #else / NO_RECURSE not defined / #define fi i #define fc c #ifdef SUPPORT_UTF8 / Many of these variables are used only / const uschar charptr; /* in small blocks of the code. My normal / #endif / style of coding would have declared / const uschar callpat; /* them within each of those blocks. / const uschar data; /* However, in order to accommodate the / const uschar next; /* version of this code that uses an / USPTR pp; / external "stack" implemented on the / const uschar prev; /* heap, it is easier to declare them all / USPTR saved_eptr; / here, so the declarations can be cut / / out in a block. The only declarations / recursion_info new_recursive; / within blocks below are for variables / / that do not have to be preserved over / BOOL cur_is_word; / a recursive call to RMATCH(). / BOOL condition; BOOL prev_is_word; unsigned long int original_ims; #ifdef SUPPORT_UCP int prop_type; int prop_value; int prop_fail_result; int prop_category; int prop_chartype; int prop_script; int oclength; uschar occhars[8]; #endif int ctype; int length; int max; int min; int number; int offset; int op; int save_capture_last; int save_offset1, save_offset2, save_offset3; int stacksave[REC_STACK_SAVE_MAX]; eptrblock newptrb; #endif / NO_RECURSE / / These statements are here to stop the compiler complaining about unitialized variables. / #ifdef SUPPORT_UCP prop_value = 0; prop_fail_result = 0; #endif / This label is used for tail recursion, which is used in a few cases even when NO_RECURSE is not defined, in order to reduce the amount of stack that is used. Thanks to Ian Taylor for noticing this possibility and sending the original patch. / TAIL_RECURSE: / OK, now we can get on with the real code of the function. 
Recursive calls are specified by the macro RMATCH and RRETURN is used to return. When NO_RECURSE is not defined, these just turn into a recursive call to match() and a "return", respectively (possibly with some debugging if DEBUG is defined). However, RMATCH isn't like a function call because it's quite a complicated macro. It has to be used in one particular way. This shouldn't, however, impact performance when true recursion is being used. / #ifdef SUPPORT_UTF8 utf8 = md->utf8; / Local copy of the flag / #else utf8 = FALSE; #endif / First check that we haven't called match() too many times, or that we haven't exceeded the recursive call limit. / if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT); if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT); original_ims = ims; / Save for resetting on ')' / / At the start of a group with an unlimited repeat that may match an empty string, the match_cbegroup flag is set. When this is the case, add the current subject pointer to the chain of such remembered pointers, to be checked when we hit the closing ket, in order to break infinite loops that match no characters. When match() is called in other circumstances, don't add to the chain. The match_cbegroup flag must NOT be used with tail recursion, because the memory block that is used is on the stack, so a new one may be required for each match(). / if ((flags & match_cbegroup) != 0) { newptrb.epb_saved_eptr = eptr; newptrb.epb_prev = eptrb; eptrb = &newptrb; } / Now start processing the opcodes. / for (;;) { minimize = possessive = FALSE; op = ecode; /* For partial matching, remember if we ever hit the end of the subject after matching at least one subject character. 
/ if (md->partial && eptr >= md->end_subject && eptr > mstart) md->hitend = TRUE; switch(op) { case OP_FAIL: RRETURN(MATCH_NOMATCH); case OP_PRUNE: RMATCH(eptr, ecode + _pcre_OP_lengths[ecode], offset_top, md, ims, eptrb, flags, RM51); if (rrc != MATCH_NOMATCH) RRETURN(rrc); RRETURN(MATCH_PRUNE); case OP_COMMIT: RMATCH(eptr, ecode + _pcre_OP_lengths[ecode], offset_top, md, ims, eptrb, flags, RM52); if (rrc != MATCH_NOMATCH) RRETURN(rrc); RRETURN(MATCH_COMMIT); case OP_SKIP: RMATCH(eptr, ecode + _pcre_OP_lengths[ecode], offset_top, md, ims, eptrb, flags, RM53); if (rrc != MATCH_NOMATCH) RRETURN(rrc); md->start_match_ptr = eptr; /* Pass back current position / RRETURN(MATCH_SKIP); case OP_THEN: RMATCH(eptr, ecode + _pcre_OP_lengths[ecode], offset_top, md, ims, eptrb, flags, RM54); if (rrc != MATCH_NOMATCH) RRETURN(rrc); RRETURN(MATCH_THEN); /* Handle a capturing bracket. If there is space in the offset vector, save the current subject position in the working slot at the top of the vector. We mustn't change the current values of the data slot, because they may be set from a previous iteration of this group, and be referred to by a reference inside the group. If the bracket fails to match, we need to restore this value and also the values of the final offsets, in case they were set by a previous iteration of the same bracket. If there isn't enough space in the offset vector, treat this as if it were a non-capturing bracket. Don't worry about setting the flag for the error case here; that is handled in the code for KET. 
/ case OP_CBRA: case OP_SCBRA: number = GET2(ecode, 1+LINK_SIZE); offset = number << 1; #ifdef DEBUG printf("start bracket %d\n", number); printf("subject="); pchars(eptr, 16, TRUE, md); printf("\n"); #endif if (offset < md->offset_max) { save_offset1 = md->offset_vector[offset]; save_offset2 = md->offset_vector[offset+1]; save_offset3 = md->offset_vector[md->offset_end - number]; save_capture_last = md->capture_last; DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); md->offset_vector[md->offset_end - number] = eptr - md->start_subject; flags = (op == OP_SCBRA)? match_cbegroup : 0; do { RMATCH(eptr, ecode + _pcre_OP_lengths[ecode], offset_top, md, ims, eptrb, flags, RM1); if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); md->capture_last = save_capture_last; ecode += GET(ecode, 1); } while (ecode == OP_ALT); DPRINTF(("bracket %d failed\n", number)); md->offset_vector[offset] = save_offset1; md->offset_vector[offset+1] = save_offset2; md->offset_vector[md->offset_end - number] = save_offset3; RRETURN(MATCH_NOMATCH); } / FALL THROUGH ... Insufficient room for saving captured contents. Treat as a non-capturing bracket. / / VVVVVVVVVVVVVVVVVVVVVVVVV / / VVVVVVVVVVVVVVVVVVVVVVVVV / DPRINTF(("insufficient capture room: treat as non-capturing\n")); / VVVVVVVVVVVVVVVVVVVVVVVVV / / VVVVVVVVVVVVVVVVVVVVVVVVV / / Non-capturing bracket. Loop for all the alternatives. When we get to the final alternative within the brackets, we would return the result of a recursive call to match() whatever happened. We can reduce stack usage by turning this into a tail recursion, except in the case when match_cbegroup is set./ case OP_BRA: case OP_SBRA: DPRINTF(("start non-capturing bracket\n")); flags = (op >= OP_SBRA)? 
match_cbegroup : 0; for (;;) { if (ecode[GET(ecode, 1)] != OP_ALT) / Final alternative / { if (flags == 0) / Not a possibly empty group / { ecode += _pcre_OP_lengths[ecode]; DPRINTF(("bracket 0 tail recursion\n")); goto TAIL_RECURSE; } /* Possibly empty group; can't use tail recursion. / RMATCH(eptr, ecode + _pcre_OP_lengths[ecode], offset_top, md, ims, eptrb, flags, RM48); RRETURN(rrc); } /* For non-final alternatives, continue the loop for a NOMATCH result; otherwise return. / RMATCH(eptr, ecode + _pcre_OP_lengths[ecode], offset_top, md, ims, eptrb, flags, RM2); if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); ecode += GET(ecode, 1); } /* Control never reaches here. / / Conditional group: compilation checked that there are no more than two branches. If the condition is false, skipping the first branch takes us past the end if there is only one branch, but that's OK because that is exactly what going to the ket would do. As there is only one branch to be obeyed, we can use tail recursion to avoid using another stack frame. / case OP_COND: case OP_SCOND: if (ecode[LINK_SIZE+1] == OP_RREF) / Recursion test / { offset = GET2(ecode, LINK_SIZE + 2); / Recursion group number/ condition = md->recursive != NULL && (offset == RREF_ANY \|\| offset == md->recursive->group_num); ecode += condition? 3 : GET(ecode, 1); } else if (ecode[LINK_SIZE+1] == OP_CREF) / Group used test / { offset = GET2(ecode, LINK_SIZE+2) << 1; / Doubled ref number / condition = offset < offset_top && md->offset_vector[offset] >= 0; ecode += condition? 3 : GET(ecode, 1); } else if (ecode[LINK_SIZE+1] == OP_DEF) / DEFINE - always false / { condition = FALSE; ecode += GET(ecode, 1); } / The condition is an assertion. Call match() to evaluate it - setting the final argument match_condassert causes it to stop at the end of an assertion. 
/ else { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, match_condassert, RM3); if (rrc == MATCH_MATCH) { condition = TRUE; ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2); while (ecode == OP_ALT) ecode += GET(ecode, 1); } else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) { RRETURN(rrc); /* Need braces because of following else / } else { condition = FALSE; ecode += GET(ecode, 1); } } / We are now at the branch that is to be obeyed. As there is only one, we can use tail recursion to avoid using another stack frame, except when match_cbegroup is required for an unlimited repeat of a possibly empty group. If the second alternative doesn't exist, we can just plough on. / if (condition \|\| ecode == OP_ALT) { ecode += 1 + LINK_SIZE; if (op == OP_SCOND) /* Possibly empty group / { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49); RRETURN(rrc); } else / Group must match something / { flags = 0; goto TAIL_RECURSE; } } else / Condition false & no 2nd alternative / { ecode += 1 + LINK_SIZE; } break; / End of the pattern, either real or forced. If we are in a top-level recursion, we should restore the offsets appropriately and continue from after the call. / case OP_ACCEPT: case OP_END: if (md->recursive != NULL && md->recursive->group_num == 0) { recursion_info rec = md->recursive; DPRINTF(("End of pattern in a (?0) recursion\n")); md->recursive = rec->prevrec; memmove(md->offset_vector, rec->offset_save, rec->saved_max * sizeof(int)); mstart = rec->save_start; ims = original_ims; ecode = rec->after_call; break; } /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty string - backtracking will then try other alternatives, if any. 
/ if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH); md->end_match_ptr = eptr; / Record where we ended / md->end_offset_top = offset_top; / and how many extracts were taken / md->start_match_ptr = mstart; / and the start (\K can modify) / RRETURN(MATCH_MATCH); / Change option settings / case OP_OPT: ims = ecode[1]; ecode += 2; DPRINTF(("ims set to %02lx\n", ims)); break; / Assertion brackets. Check the alternative branches in turn - the matching won't pass the KET for an assertion. If any one branch matches, the assertion is true. Lookbehind assertions have an OP_REVERSE item at the start of each branch to move the current point backwards, so the code at this level is identical to the lookahead case. / case OP_ASSERT: case OP_ASSERTBACK: do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, RM4); if (rrc == MATCH_MATCH) break; if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); ecode += GET(ecode, 1); } while (ecode == OP_ALT); if (ecode == OP_KET) RRETURN(MATCH_NOMATCH); / If checking an assertion for a condition, return MATCH_MATCH. / if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); / Continue from after the assertion, updating the offsets high water mark, since extracts may have been taken during the assertion. / do ecode += GET(ecode,1); while (ecode == OP_ALT); ecode += 1 + LINK_SIZE; offset_top = md->end_offset_top; continue; /* Negative assertion: all branches must fail to match / case OP_ASSERT_NOT: case OP_ASSERTBACK_NOT: do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, RM5); if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH); if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); ecode += GET(ecode,1); } while (ecode == OP_ALT); if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); ecode += 1 + LINK_SIZE; continue; /* Move the subject pointer back. This occurs only at the start of each branch of a lookbehind assertion. 
If we are too close to the start to move back, this match function fails. When working with UTF-8 we move back a number of characters, not bytes. / case OP_REVERSE: #ifdef SUPPORT_UTF8 if (utf8) { i = GET(ecode, 1); while (i-- > 0) { eptr--; if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); BACKCHAR(eptr); } } else #endif / No UTF-8 support, or not in UTF-8 mode: count is byte count / { eptr -= GET(ecode, 1); if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); } / Skip to next op code / ecode += 1 + LINK_SIZE; break; / The callout item calls an external function, if one is provided, passing details of the match so far. This is mainly for debugging, though the function is able to force a failure. / case OP_CALLOUT: if (pcre_callout != NULL) { pcre_callout_block cb; cb.version = 1; / Version 1 of the callout block / cb.callout_number = ecode[1]; cb.offset_vector = md->offset_vector; cb.subject = (PCRE_SPTR)md->start_subject; cb.subject_length = md->end_subject - md->start_subject; cb.start_match = mstart - md->start_subject; cb.current_position = eptr - md->start_subject; cb.pattern_position = GET(ecode, 2); cb.next_item_length = GET(ecode, 2 + LINK_SIZE); cb.capture_top = offset_top/2; cb.capture_last = md->capture_last; cb.callout_data = md->callout_data; if ((rrc = (pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); if (rrc < 0) RRETURN(rrc); } ecode += 2 + 2LINK_SIZE; break; / Recursion either matches the current regex, or some subexpression. The offset data is the offset to the starting bracket from the start of the whole pattern. (This is so that it works from duplicated subpatterns.) If there are any capturing brackets started but not finished, we have to save their starting points and reinstate them after the recursion. However, we don't know how many such there are (offset_top records the completed total) so we just have to save all the potential data. 
There may be up to 65535 such values, which is too large to put on the stack, but using malloc for small numbers seems expensive. As a compromise, the stack is used when there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc is used. A problem is what to do if the malloc fails ... there is no way of returning to the top level with an error. Save the top REC_STACK_SAVE_MAX values on the stack, and accept that the rest may be wrong. There are also other values that have to be saved. We use a chained sequence of blocks that actually live on the stack. Thanks to Robin Houston for the original version of this logic. / case OP_RECURSE: { callpat = md->start_code + GET(ecode, 1); new_recursive.group_num = (callpat == md->start_code)? 0 : GET2(callpat, 1 + LINK_SIZE); / Add to "recursing stack" / new_recursive.prevrec = md->recursive; md->recursive = &new_recursive; / Find where to continue from afterwards / ecode += 1 + LINK_SIZE; new_recursive.after_call = ecode; / Now save the offset data. / new_recursive.saved_max = md->offset_end; if (new_recursive.saved_max <= REC_STACK_SAVE_MAX) new_recursive.offset_save = stacksave; else { new_recursive.offset_save = (int )(pcre_malloc)(new_recursive.saved_max * sizeof(int)); if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY); } memcpy(new_recursive.offset_save, md->offset_vector, new_recursive.saved_max * sizeof(int)); new_recursive.save_start = mstart; mstart = eptr; /* OK, now we can do the recursion. For each top-level alternative we restore the offset and recursion data. / DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); flags = (callpat >= OP_SBRA)? 
match_cbegroup : 0; do { RMATCH(eptr, callpat + _pcre_OP_lengths[callpat], offset_top, md, ims, eptrb, flags, RM6); if (rrc == MATCH_MATCH) { DPRINTF(("Recursion matched\n")); md->recursive = new_recursive.prevrec; if (new_recursive.offset_save != stacksave) (pcre_free)(new_recursive.offset_save); RRETURN(MATCH_MATCH); } else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) { DPRINTF(("Recursion gave error %d\n", rrc)); RRETURN(rrc); } md->recursive = &new_recursive; memcpy(md->offset_vector, new_recursive.offset_save, new_recursive.saved_max sizeof(int)); callpat += GET(callpat, 1); } while (callpat == OP_ALT); DPRINTF(("Recursion didn't match\n")); md->recursive = new_recursive.prevrec; if (new_recursive.offset_save != stacksave) (pcre_free)(new_recursive.offset_save); RRETURN(MATCH_NOMATCH); } / Control never reaches here / / "Once" brackets are like assertion brackets except that after a match, the point in the subject string is not moved back. Thus there can never be a move back into the brackets. Friedl calls these "atomic" subpatterns. Check the alternative branches in turn - the matching won't pass the KET for this kind of subpattern. If any one branch matches, we carry on as at the end of a normal bracket, leaving the subject pointer. / case OP_ONCE: prev = ecode; saved_eptr = eptr; do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7); if (rrc == MATCH_MATCH) break; if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); ecode += GET(ecode,1); } while (ecode == OP_ALT); /* If hit the end of the group (which could be repeated), fail / if (ecode != OP_ONCE && ecode != OP_ALT) RRETURN(MATCH_NOMATCH); / Continue as from after the assertion, updating the offsets high water mark, since extracts may have been taken. / do ecode += GET(ecode, 1); while (ecode == OP_ALT); offset_top = md->end_offset_top; eptr = md->end_match_ptr; /* For a non-repeating ket, just continue at this level. 
This also happens for a repeating ket if no characters were matched in the group. This is the forcible breaking of infinite loops as implemented in Perl 5.005. If there is an options reset, it will get obeyed in the normal course of events. / if (ecode == OP_KET \|\| eptr == saved_eptr) { ecode += 1+LINK_SIZE; break; } /* The repeating kets try the rest of the pattern or restart from the preceding bracket, in the appropriate order. The second "call" of match() uses tail recursion, to avoid using another stack frame. We need to reset any options that changed within the bracket before re-running it, so check the next opcode. / if (ecode[1+LINK_SIZE] == OP_OPT) { ims = (ims & ~PCRE_IMS) \| ecode[4]; DPRINTF(("ims set to %02lx at group repeat\n", ims)); } if (ecode == OP_KETRMIN) { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode = prev; flags = 0; goto TAIL_RECURSE; } else /* OP_KETRMAX / { RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += 1 + LINK_SIZE; flags = 0; goto TAIL_RECURSE; } / Control never gets here / / An alternation is the end of a branch; scan along to find the end of the bracketed group and go to there. / case OP_ALT: do ecode += GET(ecode,1); while (ecode == OP_ALT); break; /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group, indicating that it may occur zero times. It may repeat infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets with fixed upper repeat limits are compiled as a number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO. 
/ case OP_BRAZERO: { next = ecode+1; RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10); if (rrc != MATCH_NOMATCH) RRETURN(rrc); do next += GET(next,1); while (next == OP_ALT); ecode = next + 1 + LINK_SIZE; } break; case OP_BRAMINZERO: { next = ecode+1; do next += GET(next, 1); while (next == OP_ALT); RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode++; } break; case OP_SKIPZERO: { next = ecode+1; do next += GET(next,1); while (next == OP_ALT); ecode = next + 1 + LINK_SIZE; } break; /* End of a group, repeated or non-repeating. / case OP_KET: case OP_KETRMIN: case OP_KETRMAX: prev = ecode - GET(ecode, 1); / If this was a group that remembered the subject start, in order to break infinite repeats of empty string matches, retrieve the subject start from the chain. Otherwise, set it NULL. / if (prev >= OP_SBRA) { saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group / eptrb = eptrb->epb_prev; / Backup to previous group / } else saved_eptr = NULL; / If we are at the end of an assertion group, stop matching and return MATCH_MATCH, but record the current high water mark for use by positive assertions. Do this also for the "once" (atomic) groups. / if (prev == OP_ASSERT \|\| prev == OP_ASSERT_NOT \|\| prev == OP_ASSERTBACK \|\| prev == OP_ASSERTBACK_NOT \|\| prev == OP_ONCE) { md->end_match_ptr = eptr; /* For ONCE / md->end_offset_top = offset_top; RRETURN(MATCH_MATCH); } / For capturing groups we have to check the group number back at the start and if necessary complete handling an extraction by setting the offsets and bumping the high water mark. Note that whole-pattern recursion is coded as a recurse into group 0, so it won't be picked up here. Instead, we catch it when the OP_END is reached. Other recursion is handled here. 
/ if (prev == OP_CBRA \|\| prev == OP_SCBRA) { number = GET2(prev, 1+LINK_SIZE); offset = number << 1; #ifdef DEBUG printf("end bracket %d", number); printf("\n"); #endif md->capture_last = number; if (offset >= md->offset_max) md->offset_overflow = TRUE; else { md->offset_vector[offset] = md->offset_vector[md->offset_end - number]; md->offset_vector[offset+1] = eptr - md->start_subject; if (offset_top <= offset) offset_top = offset + 2; } / Handle a recursively called group. Restore the offsets appropriately and continue from after the call. / if (md->recursive != NULL && md->recursive->group_num == number) { recursion_info rec = md->recursive; DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); md->recursive = rec->prevrec; mstart = rec->save_start; memcpy(md->offset_vector, rec->offset_save, rec->saved_max * sizeof(int)); ecode = rec->after_call; ims = original_ims; break; } } /* For both capturing and non-capturing groups, reset the value of the ims flags, in case they got changed during the group. / ims = original_ims; DPRINTF(("ims reset to %02lx\n", ims)); / For a non-repeating ket, just continue at this level. This also happens for a repeating ket if no characters were matched in the group. This is the forcible breaking of infinite loops as implemented in Perl 5.005. If there is an options reset, it will get obeyed in the normal course of events. / if (ecode == OP_KET \|\| eptr == saved_eptr) { ecode += 1 + LINK_SIZE; break; } /* The repeating kets try the rest of the pattern or restart from the preceding bracket, in the appropriate order. In the second case, we can use tail recursion to avoid using another stack frame, unless we have an unlimited repeat of a group that can match an empty string. / flags = (prev >= OP_SBRA)? 
match_cbegroup : 0; if (ecode == OP_KETRMIN) { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (flags != 0) / Could match an empty string / { RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50); RRETURN(rrc); } ecode = prev; goto TAIL_RECURSE; } else / OP_KETRMAX / { RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += 1 + LINK_SIZE; flags = 0; goto TAIL_RECURSE; } / Control never gets here / / Start of subject unless notbol, or after internal newline if multiline / case OP_CIRC: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); if ((ims & PCRE_MULTILINE) != 0) { if (eptr != md->start_subject && (eptr == md->end_subject \|\| !WAS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); ecode++; break; } / ... else fall through / / Start of subject assertion / case OP_SOD: if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH); ecode++; break; / Start of match assertion / case OP_SOM: if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH); ecode++; break; / Reset the start of match point / case OP_SET_SOM: mstart = eptr; ecode++; break; / Assert before internal newline if multiline, or before a terminating newline unless endonly is set, else end of subject unless noteol is set. / case OP_DOLL: if ((ims & PCRE_MULTILINE) != 0) { if (eptr < md->end_subject) { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); } else { if (md->noteol) RRETURN(MATCH_NOMATCH); } ecode++; break; } else { if (md->noteol) RRETURN(MATCH_NOMATCH); if (!md->endonly) { if (eptr != md->end_subject && (!IS_NEWLINE(eptr) \|\| eptr != md->end_subject - md->nllen)) RRETURN(MATCH_NOMATCH); ecode++; break; } } / ... 
else fall through for endonly / / End of subject assertion (\z) / case OP_EOD: if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH); ecode++; break; / End of subject or ending \n assertion (\Z) / case OP_EODN: if (eptr != md->end_subject && (!IS_NEWLINE(eptr) \|\| eptr != md->end_subject - md->nllen)) RRETURN(MATCH_NOMATCH); ecode++; break; / Word boundary assertions / case OP_NOT_WORD_BOUNDARY: case OP_WORD_BOUNDARY: { / Find out if the previous and current characters are "word" characters. It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to be "non-word" characters. / #ifdef SUPPORT_UTF8 if (utf8) { if (eptr == md->start_subject) prev_is_word = FALSE; else { const uschar lastptr = eptr - 1; while((lastptr & 0xc0) == 0x80) lastptr--; GETCHAR(c, lastptr); prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; } if (eptr >= md->end_subject) cur_is_word = FALSE; else { GETCHAR(c, eptr); cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; } } else #endif / More streamlined when not in UTF-8 mode / { prev_is_word = (eptr != md->start_subject) && ((md->ctypes[eptr[-1]] & ctype_word) != 0); cur_is_word = (eptr < md->end_subject) && ((md->ctypes[eptr] & ctype_word) != 0); } /* Now see if the situation is what we want / if ((ecode++ == OP_WORD_BOUNDARY)? cur_is_word == prev_is_word : cur_is_word != prev_is_word) RRETURN(MATCH_NOMATCH); } break; /* Match a single character type; inline for speed / case OP_ANY: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); / Fall through / case OP_ALLANY: if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); if (utf8) while (eptr < md->end_subject && (eptr & 0xc0) == 0x80) eptr++; ecode++; break; /* Match a single byte, even in UTF-8 mode. This opcode really does match any byte, even newline, independent of the setting of PCRE_DOTALL. 
/ case OP_ANYBYTE: if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); ecode++; break; case OP_NOT_DIGIT: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); if ( #ifdef SUPPORT_UTF8 c < 256 && #endif (md->ctypes[c] & ctype_digit) != 0 ) RRETURN(MATCH_NOMATCH); ecode++; break; case OP_DIGIT: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); if ( #ifdef SUPPORT_UTF8 c >= 256 \|\| #endif (md->ctypes[c] & ctype_digit) == 0 ) RRETURN(MATCH_NOMATCH); ecode++; break; case OP_NOT_WHITESPACE: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); if ( #ifdef SUPPORT_UTF8 c < 256 && #endif (md->ctypes[c] & ctype_space) != 0 ) RRETURN(MATCH_NOMATCH); ecode++; break; case OP_WHITESPACE: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); if ( #ifdef SUPPORT_UTF8 c >= 256 \|\| #endif (md->ctypes[c] & ctype_space) == 0 ) RRETURN(MATCH_NOMATCH); ecode++; break; case OP_NOT_WORDCHAR: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); if ( #ifdef SUPPORT_UTF8 c < 256 && #endif (md->ctypes[c] & ctype_word) != 0 ) RRETURN(MATCH_NOMATCH); ecode++; break; case OP_WORDCHAR: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); if ( #ifdef SUPPORT_UTF8 c >= 256 \|\| #endif (md->ctypes[c] & ctype_word) == 0 ) RRETURN(MATCH_NOMATCH); ecode++; break; case OP_ANYNL: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && eptr == 0x0a) eptr++; break; case 0x000a: break; case 0x000b: case 0x000c: case 0x0085: case 0x2028: case 0x2029: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } ecode++; break; case OP_NOT_HSPACE: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); switch(c) { default: break; case 0x09: /* HT / case 0x20: / SPACE / case 0xa0: / NBSP / case 0x1680: / OGHAM SPACE MARK / case 
0x180e: / MONGOLIAN VOWEL SEPARATOR / case 0x2000: / EN QUAD / case 0x2001: / EM QUAD / case 0x2002: / EN SPACE / case 0x2003: / EM SPACE / case 0x2004: / THREE-PER-EM SPACE / case 0x2005: / FOUR-PER-EM SPACE / case 0x2006: / SIX-PER-EM SPACE / case 0x2007: / FIGURE SPACE / case 0x2008: / PUNCTUATION SPACE / case 0x2009: / THIN SPACE / case 0x200A: / HAIR SPACE / case 0x202f: / NARROW NO-BREAK SPACE / case 0x205f: / MEDIUM MATHEMATICAL SPACE / case 0x3000: / IDEOGRAPHIC SPACE / RRETURN(MATCH_NOMATCH); } ecode++; break; case OP_HSPACE: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x09: / HT / case 0x20: / SPACE / case 0xa0: / NBSP / case 0x1680: / OGHAM SPACE MARK / case 0x180e: / MONGOLIAN VOWEL SEPARATOR / case 0x2000: / EN QUAD / case 0x2001: / EM QUAD / case 0x2002: / EN SPACE / case 0x2003: / EM SPACE / case 0x2004: / THREE-PER-EM SPACE / case 0x2005: / FOUR-PER-EM SPACE / case 0x2006: / SIX-PER-EM SPACE / case 0x2007: / FIGURE SPACE / case 0x2008: / PUNCTUATION SPACE / case 0x2009: / THIN SPACE / case 0x200A: / HAIR SPACE / case 0x202f: / NARROW NO-BREAK SPACE / case 0x205f: / MEDIUM MATHEMATICAL SPACE / case 0x3000: / IDEOGRAPHIC SPACE / break; } ecode++; break; case OP_NOT_VSPACE: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); switch(c) { default: break; case 0x0a: / LF / case 0x0b: / VT / case 0x0c: / FF / case 0x0d: / CR / case 0x85: / NEL / case 0x2028: / LINE SEPARATOR / case 0x2029: / PARAGRAPH SEPARATOR / RRETURN(MATCH_NOMATCH); } ecode++; break; case OP_VSPACE: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x0a: / LF / case 0x0b: / VT / case 0x0c: / FF / case 0x0d: / CR / case 0x85: / NEL / case 0x2028: / LINE SEPARATOR / case 0x2029: / PARAGRAPH SEPARATOR / break; } ecode++; break; #ifdef SUPPORT_UCP / Check the next character by Unicode property. 
We will get here only if the support is in the binary; otherwise a compile-time error occurs. / case OP_PROP: case OP_NOTPROP: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); { const ucd_record prop = GET_UCD(c); switch(ecode[1]) { case PT_ANY: if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); break; case PT_LAMP: if ((prop->chartype == ucp_Lu \|\| prop->chartype == ucp_Ll \|\| prop->chartype == ucp_Lt) == (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); break; case PT_GC: if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; case PT_PC: if ((ecode[2] != prop->chartype) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; case PT_SC: if ((ecode[2] != prop->script) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; default: RRETURN(PCRE_ERROR_INTERNAL); } ecode += 3; } break; /* Match an extended Unicode sequence. We will get here only if the support is in the binary; otherwise a compile-time error occurs. / case OP_EXTUNI: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); { int category = UCD_CATEGORY(c); if (category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { int len = 1; if (!utf8) c = eptr; else { GETCHARLEN(c, eptr, len); } category = UCD_CATEGORY(c); if (category != ucp_M) break; eptr += len; } } ecode++; break; #endif /* Match a back reference, possibly repeatedly. Look past the end of the item to see if there is repeat information following. The code is similar to that for character classes, but repeated for efficiency. Then obey similar code to character type repeats - written out again for speed. However, if the referenced string is the empty string, always treat it as matched, any number of times (otherwise there could be infinite loops). 
/ case OP_REF: { offset = GET2(ecode, 1) << 1; / Doubled ref number / ecode += 3; / If the reference is unset, there are two possibilities: (a) In the default, Perl-compatible state, set the length to be longer than the amount of subject left; this ensures that every attempt at a match fails. We can't just fail here, because of the possibility of quantifiers with zero minima. (b) If the JavaScript compatibility flag is set, set the length to zero so that the back reference matches an empty string. Otherwise, set the length to the length of what was matched by the referenced subpattern. / if (offset >= offset_top \|\| md->offset_vector[offset] < 0) length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1; else length = md->offset_vector[offset+1] - md->offset_vector[offset]; / Set up for repetition, or handle the non-repeated case / switch (ecode) { case OP_CRSTAR: case OP_CRMINSTAR: case OP_CRPLUS: case OP_CRMINPLUS: case OP_CRQUERY: case OP_CRMINQUERY: c = ecode++ - OP_CRSTAR; minimize = (c & 1) != 0; min = rep_min[c]; / Pick up values from tables; / max = rep_max[c]; / zero for max => infinity / if (max == 0) max = INT_MAX; break; case OP_CRRANGE: case OP_CRMINRANGE: minimize = (ecode == OP_CRMINRANGE); min = GET2(ecode, 1); max = GET2(ecode, 3); if (max == 0) max = INT_MAX; ecode += 5; break; default: /* No repeat follows / if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); eptr += length; continue; / With the main loop / } / If the length of the reference is zero, just continue with the main loop. / if (length == 0) continue; / First, ensure the minimum number of matches are present. We get back the length of the reference string explicitly rather than passing the address of eptr, so that eptr can be a register variable. / for (i = 1; i <= min; i++) { if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); eptr += length; } / If min = max, continue at the same level without recursion. They are not both allowed to be zero. 
/ if (min == max) continue; / If minimizing, keep trying and advancing the pointer / if (minimize) { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| !match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); eptr += length; } / Control never gets here / } / If maximizing, find the longest string and work backwards / else { pp = eptr; for (i = min; i < max; i++) { if (!match_ref(offset, eptr, length, md, ims)) break; eptr += length; } while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr -= length; } RRETURN(MATCH_NOMATCH); } } / Control never gets here / / Match a bit-mapped character class, possibly repeatedly. This op code is used when all the characters in the class have values in the range 0-255, and either the matching is caseful, or the characters are in the range 0-127 when UTF-8 processing is enabled. The only difference between OP_CLASS and OP_NCLASS occurs when a data character outside the range is encountered. First, look past the end of the item to see if there is repeat information following. Then obey similar code to character type repeats - written out again for speed. / case OP_NCLASS: case OP_CLASS: { data = ecode + 1; / Save for matching / ecode += 33; / Advance past the item / switch (ecode) { case OP_CRSTAR: case OP_CRMINSTAR: case OP_CRPLUS: case OP_CRMINPLUS: case OP_CRQUERY: case OP_CRMINQUERY: c = ecode++ - OP_CRSTAR; minimize = (c & 1) != 0; min = rep_min[c]; / Pick up values from tables; / max = rep_max[c]; / zero for max => infinity / if (max == 0) max = INT_MAX; break; case OP_CRRANGE: case OP_CRMINRANGE: minimize = (ecode == OP_CRMINRANGE); min = GET2(ecode, 1); max = GET2(ecode, 3); if (max == 0) max = INT_MAX; ecode += 5; break; default: /* No repeat follows / min = max = 1; break; } / First, ensure the minimum number of matches are present. 
/ #ifdef SUPPORT_UTF8 / UTF-8 mode / if (utf8) { for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); if (c > 255) { if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); } else { if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); } } } else #endif / Not UTF-8 mode / { for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); c = eptr++; if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); } } /* If max == min we can continue with the main loop without the need to recurse. / if (min == max) continue; / If minimizing, keep testing the rest of the expression and advancing the pointer while it matches the class. / if (minimize) { #ifdef SUPPORT_UTF8 / UTF-8 mode / if (utf8) { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); if (c > 255) { if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); } else { if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); } } } else #endif / Not UTF-8 mode / { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); c = eptr++; if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); } } /* Control never gets here / } / If maximizing, find the longest possible run, then work backwards. 
/ else { pp = eptr; #ifdef SUPPORT_UTF8 / UTF-8 mode / if (utf8) { for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (c > 255) { if (op == OP_CLASS) break; } else { if ((data[c/8] & (1 << (c&7))) == 0) break; } eptr += len; } for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; / Stop if tried at original pos / BACKCHAR(eptr); } } else #endif / Not UTF-8 mode / { for (i = min; i < max; i++) { if (eptr >= md->end_subject) break; c = eptr; if ((data[c/8] & (1 << (c&7))) == 0) break; eptr++; } while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; } } RRETURN(MATCH_NOMATCH); } } /* Control never gets here / / Match an extended character class. This opcode is encountered only in UTF-8 mode, because that's the only time it is compiled. / #ifdef SUPPORT_UTF8 case OP_XCLASS: { data = ecode + 1 + LINK_SIZE; / Save for matching / ecode += GET(ecode, 1); / Advance past the item / switch (ecode) { case OP_CRSTAR: case OP_CRMINSTAR: case OP_CRPLUS: case OP_CRMINPLUS: case OP_CRQUERY: case OP_CRMINQUERY: c = ecode++ - OP_CRSTAR; minimize = (c & 1) != 0; min = rep_min[c]; / Pick up values from tables; / max = rep_max[c]; / zero for max => infinity / if (max == 0) max = INT_MAX; break; case OP_CRRANGE: case OP_CRMINRANGE: minimize = (ecode == OP_CRMINRANGE); min = GET2(ecode, 1); max = GET2(ecode, 3); if (max == 0) max = INT_MAX; ecode += 5; break; default: /* No repeat follows / min = max = 1; break; } / First, ensure the minimum number of matches are present. / for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); } / If max == min we can continue with the main loop without the need to recurse. 
/ if (min == max) continue; / If minimizing, keep testing the rest of the expression and advancing the pointer while it matches the class. / if (minimize) { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); } / Control never gets here / } / If maximizing, find the longest possible run, then work backwards. / else { pp = eptr; for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (!_pcre_xclass(c, data)) break; eptr += len; } for(;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; / Stop if tried at original pos / if (utf8) BACKCHAR(eptr); } RRETURN(MATCH_NOMATCH); } / Control never gets here / } #endif / End of XCLASS / / Match a single character, casefully / case OP_CHAR: #ifdef SUPPORT_UTF8 if (utf8) { length = 1; ecode++; GETCHARLEN(fc, ecode, length); if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); while (length-- > 0) if (ecode++ != eptr++) RRETURN(MATCH_NOMATCH); } else #endif / Non-UTF-8 mode / { if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); if (ecode[1] != eptr++) RRETURN(MATCH_NOMATCH); ecode += 2; } break; /* Match a single character, caselessly / case OP_CHARNC: #ifdef SUPPORT_UTF8 if (utf8) { length = 1; ecode++; GETCHARLEN(fc, ecode, length); if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); / If the pattern character's value is < 128, we have only one byte, and can use the fast lookup table. 
/ if (fc < 128) { if (md->lcc[ecode++] != md->lcc[eptr++]) RRETURN(MATCH_NOMATCH); } / Otherwise we must pick up the subject character / else { unsigned int dc; GETCHARINC(dc, eptr); ecode += length; / If we have Unicode property support, we can use it to test the other case of the character, if there is one. / if (fc != dc) { #ifdef SUPPORT_UCP if (dc != UCD_OTHERCASE(fc)) #endif RRETURN(MATCH_NOMATCH); } } } else #endif / SUPPORT_UTF8 / / Non-UTF-8 mode / { if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); if (md->lcc[ecode[1]] != md->lcc[eptr++]) RRETURN(MATCH_NOMATCH); ecode += 2; } break; /* Match a single character repeatedly. / case OP_EXACT: min = max = GET2(ecode, 1); ecode += 3; goto REPEATCHAR; case OP_POSUPTO: possessive = TRUE; / Fall through / case OP_UPTO: case OP_MINUPTO: min = 0; max = GET2(ecode, 1); minimize = ecode == OP_MINUPTO; ecode += 3; goto REPEATCHAR; case OP_POSSTAR: possessive = TRUE; min = 0; max = INT_MAX; ecode++; goto REPEATCHAR; case OP_POSPLUS: possessive = TRUE; min = 1; max = INT_MAX; ecode++; goto REPEATCHAR; case OP_POSQUERY: possessive = TRUE; min = 0; max = 1; ecode++; goto REPEATCHAR; case OP_STAR: case OP_MINSTAR: case OP_PLUS: case OP_MINPLUS: case OP_QUERY: case OP_MINQUERY: c = ecode++ - OP_STAR; minimize = (c & 1) != 0; min = rep_min[c]; / Pick up values from tables; / max = rep_max[c]; / zero for max => infinity / if (max == 0) max = INT_MAX; / Common code for all repeated single-character matches. We can give up quickly if there are fewer than the minimum number of characters left in the subject. / REPEATCHAR: #ifdef SUPPORT_UTF8 if (utf8) { length = 1; charptr = ecode; GETCHARLEN(fc, ecode, length); if (min length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); ecode += length; /* Handle multibyte character matching specially here. There is support for caseless matching if UCP support is present. 
/ if (length > 1) { #ifdef SUPPORT_UCP unsigned int othercase; if ((ims & PCRE_CASELESS) != 0 && (othercase = UCD_OTHERCASE(fc)) != fc) oclength = _pcre_ord2utf8(othercase, occhars); else oclength = 0; #endif / SUPPORT_UCP / for (i = 1; i <= min; i++) { if (memcmp(eptr, charptr, length) == 0) eptr += length; #ifdef SUPPORT_UCP / Need braces because of following else / else if (oclength == 0) { RRETURN(MATCH_NOMATCH); } else { if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH); eptr += oclength; } #else / without SUPPORT_UCP / else { RRETURN(MATCH_NOMATCH); } #endif / SUPPORT_UCP / } if (min == max) continue; if (minimize) { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); if (memcmp(eptr, charptr, length) == 0) eptr += length; #ifdef SUPPORT_UCP / Need braces because of following else / else if (oclength == 0) { RRETURN(MATCH_NOMATCH); } else { if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH); eptr += oclength; } #else / without SUPPORT_UCP / else { RRETURN (MATCH_NOMATCH); } #endif / SUPPORT_UCP / } / Control never gets here / } else / Maximize / { pp = eptr; for (i = min; i < max; i++) { if (eptr > md->end_subject - length) break; if (memcmp(eptr, charptr, length) == 0) eptr += length; #ifdef SUPPORT_UCP else if (oclength == 0) break; else { if (memcmp(eptr, occhars, oclength) != 0) break; eptr += oclength; } #else / without SUPPORT_UCP / else break; #endif / SUPPORT_UCP / } if (possessive) continue; for(;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr == pp) RRETURN(MATCH_NOMATCH); #ifdef SUPPORT_UCP eptr--; BACKCHAR(eptr); #else / without SUPPORT_UCP / eptr -= length; #endif / SUPPORT_UCP / } } / Control never gets here / } / If the length of a UTF-8 character is 1, we fall through here, and obey the code as for non-UTF-8 
characters below, though in this case the value of fc will always be < 128. / } else #endif / SUPPORT_UTF8 / / When not in UTF-8 mode, load a single-byte character. / { if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); fc = ecode++; } /* The value of fc at this point is always less than 256, though we may or may not be in UTF-8 mode. The code is duplicated for the caseless and caseful cases, for speed, since matching characters is likely to be quite common. First, ensure the minimum number of matches are present. If min = max, continue at the same level without recursing. Otherwise, if minimizing, keep trying the rest of the expression and advancing one matching character if failing, up to the maximum. Alternatively, if maximizing, find the maximum number of characters and work backwards. / DPRINTF(("matching %c{%d,%d} against subject %.s\n", fc, min, max, max, eptr)); if ((ims & PCRE_CASELESS) != 0) { fc = md->lcc[fc]; for (i = 1; i <= min; i++) if (fc != md->lcc[eptr++]) RRETURN(MATCH_NOMATCH); if (min == max) continue; if (minimize) { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| eptr >= md->end_subject \|\| fc != md->lcc[eptr++]) RRETURN(MATCH_NOMATCH); } /* Control never gets here / } else / Maximize / { pp = eptr; for (i = min; i < max; i++) { if (eptr >= md->end_subject \|\| fc != md->lcc[eptr]) break; eptr++; } if (possessive) continue; while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25); eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); } RRETURN(MATCH_NOMATCH); } /* Control never gets here / } / Caseful comparisons (includes all multi-byte characters) / else { for (i = 1; i <= min; i++) if (fc != eptr++) RRETURN(MATCH_NOMATCH); if (min == max) continue; if (minimize) { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| eptr >= md->end_subject 
\|\| fc != eptr++) RRETURN(MATCH_NOMATCH); } / Control never gets here / } else / Maximize / { pp = eptr; for (i = min; i < max; i++) { if (eptr >= md->end_subject \|\| fc != eptr) break; eptr++; } if (possessive) continue; while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27); eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); } RRETURN(MATCH_NOMATCH); } } /* Control never gets here / / Match a negated single one-byte character. The character we are checking can be multibyte. / case OP_NOT: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); ecode++; GETCHARINCTEST(c, eptr); if ((ims & PCRE_CASELESS) != 0) { #ifdef SUPPORT_UTF8 if (c < 256) #endif c = md->lcc[c]; if (md->lcc[ecode++] == c) RRETURN(MATCH_NOMATCH); } else { if (ecode++ == c) RRETURN(MATCH_NOMATCH); } break; / Match a negated single one-byte character repeatedly. This is almost a repeat of the code for a repeated single character, but I haven't found a nice way of commoning these up that doesn't require a test of the positive/negative option for each character match. Maybe that wouldn't add very much to the time taken, but character matching is what this is all about... 
/ case OP_NOTEXACT: min = max = GET2(ecode, 1); ecode += 3; goto REPEATNOTCHAR; case OP_NOTUPTO: case OP_NOTMINUPTO: min = 0; max = GET2(ecode, 1); minimize = ecode == OP_NOTMINUPTO; ecode += 3; goto REPEATNOTCHAR; case OP_NOTPOSSTAR: possessive = TRUE; min = 0; max = INT_MAX; ecode++; goto REPEATNOTCHAR; case OP_NOTPOSPLUS: possessive = TRUE; min = 1; max = INT_MAX; ecode++; goto REPEATNOTCHAR; case OP_NOTPOSQUERY: possessive = TRUE; min = 0; max = 1; ecode++; goto REPEATNOTCHAR; case OP_NOTPOSUPTO: possessive = TRUE; min = 0; max = GET2(ecode, 1); ecode += 3; goto REPEATNOTCHAR; case OP_NOTSTAR: case OP_NOTMINSTAR: case OP_NOTPLUS: case OP_NOTMINPLUS: case OP_NOTQUERY: case OP_NOTMINQUERY: c = ecode++ - OP_NOTSTAR; minimize = (c & 1) != 0; min = rep_min[c]; / Pick up values from tables; / max = rep_max[c]; / zero for max => infinity / if (max == 0) max = INT_MAX; / Common code for all repeated single-byte matches. We can give up quickly if there are fewer than the minimum number of bytes left in the subject. / REPEATNOTCHAR: if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); fc = ecode++; /* The code is duplicated for the caseless and caseful cases, for speed, since matching characters is likely to be quite common. First, ensure the minimum number of matches are present. If min = max, continue at the same level without recursing. Otherwise, if minimizing, keep trying the rest of the expression and advancing one matching character if failing, up to the maximum. Alternatively, if maximizing, find the maximum number of characters and work backwards. 
/ DPRINTF(("negative matching %c{%d,%d} against subject %.s\n", fc, min, max, max, eptr)); if ((ims & PCRE_CASELESS) != 0) { fc = md->lcc[fc]; #ifdef SUPPORT_UTF8 /* UTF-8 mode / if (utf8) { register unsigned int d; for (i = 1; i <= min; i++) { GETCHARINC(d, eptr); if (d < 256) d = md->lcc[d]; if (fc == d) RRETURN(MATCH_NOMATCH); } } else #endif / Not UTF-8 mode / { for (i = 1; i <= min; i++) if (fc == md->lcc[eptr++]) RRETURN(MATCH_NOMATCH); } if (min == max) continue; if (minimize) { #ifdef SUPPORT_UTF8 /* UTF-8 mode / if (utf8) { register unsigned int d; for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(d, eptr); if (d < 256) d = md->lcc[d]; if (fc == d) RRETURN(MATCH_NOMATCH); } } else #endif / Not UTF-8 mode / { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| eptr >= md->end_subject \|\| fc == md->lcc[eptr++]) RRETURN(MATCH_NOMATCH); } } /* Control never gets here / } / Maximize case / else { pp = eptr; #ifdef SUPPORT_UTF8 / UTF-8 mode / if (utf8) { register unsigned int d; for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(d, eptr, len); if (d < 256) d = md->lcc[d]; if (fc == d) break; eptr += len; } if (possessive) continue; for(;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; / Stop if tried at original pos / BACKCHAR(eptr); } } else #endif / Not UTF-8 mode / { for (i = min; i < max; i++) { if (eptr >= md->end_subject \|\| fc == md->lcc[eptr]) break; eptr++; } if (possessive) continue; while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; } } RRETURN(MATCH_NOMATCH); } /* Control never gets here / } / Caseful comparisons / else { 
#ifdef SUPPORT_UTF8 / UTF-8 mode / if (utf8) { register unsigned int d; for (i = 1; i <= min; i++) { GETCHARINC(d, eptr); if (fc == d) RRETURN(MATCH_NOMATCH); } } else #endif / Not UTF-8 mode / { for (i = 1; i <= min; i++) if (fc == eptr++) RRETURN(MATCH_NOMATCH); } if (min == max) continue; if (minimize) { #ifdef SUPPORT_UTF8 /* UTF-8 mode / if (utf8) { register unsigned int d; for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(d, eptr); if (fc == d) RRETURN(MATCH_NOMATCH); } } else #endif / Not UTF-8 mode / { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| eptr >= md->end_subject \|\| fc == eptr++) RRETURN(MATCH_NOMATCH); } } /* Control never gets here / } / Maximize case / else { pp = eptr; #ifdef SUPPORT_UTF8 / UTF-8 mode / if (utf8) { register unsigned int d; for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(d, eptr, len); if (fc == d) break; eptr += len; } if (possessive) continue; for(;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; / Stop if tried at original pos / BACKCHAR(eptr); } } else #endif / Not UTF-8 mode / { for (i = min; i < max; i++) { if (eptr >= md->end_subject \|\| fc == eptr) break; eptr++; } if (possessive) continue; while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; } } RRETURN(MATCH_NOMATCH); } } /* Control never gets here / / Match a single character type repeatedly; several different opcodes share code. This is very similar to the code for single characters, but we repeat it in the interests of efficiency. 
/ case OP_TYPEEXACT: min = max = GET2(ecode, 1); minimize = TRUE; ecode += 3; goto REPEATTYPE; case OP_TYPEUPTO: case OP_TYPEMINUPTO: min = 0; max = GET2(ecode, 1); minimize = ecode == OP_TYPEMINUPTO; ecode += 3; goto REPEATTYPE; case OP_TYPEPOSSTAR: possessive = TRUE; min = 0; max = INT_MAX; ecode++; goto REPEATTYPE; case OP_TYPEPOSPLUS: possessive = TRUE; min = 1; max = INT_MAX; ecode++; goto REPEATTYPE; case OP_TYPEPOSQUERY: possessive = TRUE; min = 0; max = 1; ecode++; goto REPEATTYPE; case OP_TYPEPOSUPTO: possessive = TRUE; min = 0; max = GET2(ecode, 1); ecode += 3; goto REPEATTYPE; case OP_TYPESTAR: case OP_TYPEMINSTAR: case OP_TYPEPLUS: case OP_TYPEMINPLUS: case OP_TYPEQUERY: case OP_TYPEMINQUERY: c = ecode++ - OP_TYPESTAR; minimize = (c & 1) != 0; min = rep_min[c]; / Pick up values from tables; / max = rep_max[c]; / zero for max => infinity / if (max == 0) max = INT_MAX; / Common code for all repeated single character type matches. Note that in UTF-8 mode, '.' matches a character of any length, but for the other character types, the valid characters are all one-byte long. / REPEATTYPE: ctype = ecode++; /* Code for the character type / #ifdef SUPPORT_UCP if (ctype == OP_PROP \|\| ctype == OP_NOTPROP) { prop_fail_result = ctype == OP_NOTPROP; prop_type = ecode++; prop_value = ecode++; } else prop_type = -1; #endif / First, ensure the minimum number of matches are present. Use inline code for maximizing the speed, and do the type test once at the start (i.e. keep it out of the loop). Also we can test that there are at least the minimum number of bytes before we start. This isn't as effective in UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that is tidier. Also separate the UCP code, which can be the same for both UTF-8 and single-bytes. 
/ if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); if (min > 0) { #ifdef SUPPORT_UCP if (prop_type >= 0) { switch(prop_type) { case PT_ANY: if (prop_fail_result) RRETURN(MATCH_NOMATCH); for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); } break; case PT_LAMP: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu \|\| prop_chartype == ucp_Ll \|\| prop_chartype == ucp_Lt) == prop_fail_result) RRETURN(MATCH_NOMATCH); } break; case PT_GC: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } break; case PT_PC: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } break; case PT_SC: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } break; default: RRETURN(PCRE_ERROR_INTERNAL); } } / Match extended Unicode sequences. We will get here only if the support is in the binary; otherwise a compile-time error occurs. 
/ else if (ctype == OP_EXTUNI) { for (i = 1; i <= min; i++) { GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { int len = 1; if (!utf8) c = eptr; else { GETCHARLEN(c, eptr, len); } prop_category = UCD_CATEGORY(c); if (prop_category != ucp_M) break; eptr += len; } } } else #endif /* SUPPORT_UCP / / Handle all other cases when the coding is UTF-8 / #ifdef SUPPORT_UTF8 if (utf8) switch(ctype) { case OP_ANY: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject \|\| IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); eptr++; while (eptr < md->end_subject && (eptr & 0xc0) == 0x80) eptr++; } break; case OP_ALLANY: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); eptr++; while (eptr < md->end_subject && (eptr & 0xc0) == 0x80) eptr++; } break; case OP_ANYBYTE: eptr += min; break; case OP_ANYNL: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && eptr == 0x0a) eptr++; break; case 0x000a: break; case 0x000b: case 0x000c: case 0x0085: case 0x2028: case 0x2029: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } } break; case OP_NOT_HSPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); switch(c) { default: break; case 0x09: /* HT / case 0x20: / SPACE / case 0xa0: / NBSP / case 0x1680: / OGHAM SPACE MARK / case 0x180e: / MONGOLIAN VOWEL SEPARATOR / case 0x2000: / EN QUAD / case 0x2001: / EM QUAD / case 0x2002: / EN SPACE / case 0x2003: / EM SPACE / case 0x2004: / THREE-PER-EM SPACE / case 0x2005: / FOUR-PER-EM SPACE / case 0x2006: / SIX-PER-EM SPACE / case 0x2007: / FIGURE SPACE / case 0x2008: / PUNCTUATION SPACE / case 0x2009: / THIN SPACE / case 0x200A: / HAIR SPACE / case 0x202f: / NARROW NO-BREAK SPACE / case 0x205f: / MEDIUM MATHEMATICAL SPACE / case 0x3000: / 
IDEOGRAPHIC SPACE / RRETURN(MATCH_NOMATCH); } } break; case OP_HSPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x09: / HT / case 0x20: / SPACE / case 0xa0: / NBSP / case 0x1680: / OGHAM SPACE MARK / case 0x180e: / MONGOLIAN VOWEL SEPARATOR / case 0x2000: / EN QUAD / case 0x2001: / EM QUAD / case 0x2002: / EN SPACE / case 0x2003: / EM SPACE / case 0x2004: / THREE-PER-EM SPACE / case 0x2005: / FOUR-PER-EM SPACE / case 0x2006: / SIX-PER-EM SPACE / case 0x2007: / FIGURE SPACE / case 0x2008: / PUNCTUATION SPACE / case 0x2009: / THIN SPACE / case 0x200A: / HAIR SPACE / case 0x202f: / NARROW NO-BREAK SPACE / case 0x205f: / MEDIUM MATHEMATICAL SPACE / case 0x3000: / IDEOGRAPHIC SPACE / break; } } break; case OP_NOT_VSPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); switch(c) { default: break; case 0x0a: / LF / case 0x0b: / VT / case 0x0c: / FF / case 0x0d: / CR / case 0x85: / NEL / case 0x2028: / LINE SEPARATOR / case 0x2029: / PARAGRAPH SEPARATOR / RRETURN(MATCH_NOMATCH); } } break; case OP_VSPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x0a: / LF / case 0x0b: / VT / case 0x0c: / FF / case 0x0d: / CR / case 0x85: / NEL / case 0x2028: / LINE SEPARATOR / case 0x2029: / PARAGRAPH SEPARATOR / break; } } break; case OP_NOT_DIGIT: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); if (c < 128 && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); } break; case OP_DIGIT: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject \|\| eptr >= 128 \|\| (md->ctypes[eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); / No need to skip more bytes - we know it's a 1-byte character / } break; case OP_NOT_WHITESPACE: for (i = 1; i <= min; i++) { 
if (eptr >= md->end_subject \|\| (eptr < 128 && (md->ctypes[eptr] & ctype_space) != 0)) RRETURN(MATCH_NOMATCH); while (++eptr < md->end_subject && (eptr & 0xc0) == 0x80); } break; case OP_WHITESPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject \|\| eptr >= 128 \|\| (md->ctypes[eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); /* No need to skip more bytes - we know it's a 1-byte character / } break; case OP_NOT_WORDCHAR: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject \|\| (eptr < 128 && (md->ctypes[eptr] & ctype_word) != 0)) RRETURN(MATCH_NOMATCH); while (++eptr < md->end_subject && (eptr & 0xc0) == 0x80); } break; case OP_WORDCHAR: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject \|\| eptr >= 128 \|\| (md->ctypes[eptr++] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); /* No need to skip more bytes - we know it's a 1-byte character / } break; default: RRETURN(PCRE_ERROR_INTERNAL); } / End switch(ctype) / else #endif / SUPPORT_UTF8 / / Code for the non-UTF-8 case for minimum matching of operators other than OP_PROP and OP_NOTPROP. We can assume that there are the minimum number of bytes present, as this was tested above. / switch(ctype) { case OP_ANY: for (i = 1; i <= min; i++) { if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); eptr++; } break; case OP_ALLANY: eptr += min; break; case OP_ANYBYTE: eptr += min; break; / Because of the CRLF case, we can't assume the minimum number of bytes are present in this case. 
/ case OP_ANYNL: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); switch(eptr++) { default: RRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && eptr == 0x0a) eptr++; break; case 0x000a: break; case 0x000b: case 0x000c: case 0x0085: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } } break; case OP_NOT_HSPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); switch(eptr++) { default: break; case 0x09: /* HT / case 0x20: / SPACE / case 0xa0: / NBSP / RRETURN(MATCH_NOMATCH); } } break; case OP_HSPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); switch(eptr++) { default: RRETURN(MATCH_NOMATCH); case 0x09: /* HT / case 0x20: / SPACE / case 0xa0: / NBSP / break; } } break; case OP_NOT_VSPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); switch(eptr++) { default: break; case 0x0a: /* LF / case 0x0b: / VT / case 0x0c: / FF / case 0x0d: / CR / case 0x85: / NEL / RRETURN(MATCH_NOMATCH); } } break; case OP_VSPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); switch(eptr++) { default: RRETURN(MATCH_NOMATCH); case 0x0a: /* LF / case 0x0b: / VT / case 0x0c: / FF / case 0x0d: / CR / case 0x85: / NEL / break; } } break; case OP_NOT_DIGIT: for (i = 1; i <= min; i++) if ((md->ctypes[eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); break; case OP_DIGIT: for (i = 1; i <= min; i++) if ((md->ctypes[eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); break; case OP_NOT_WHITESPACE: for (i = 1; i <= min; i++) if ((md->ctypes[eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); break; case OP_WHITESPACE: for (i = 1; i <= min; i++) if ((md->ctypes[eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); break; case OP_NOT_WORDCHAR: for (i = 1; i <= min; i++) if ((md->ctypes[eptr++] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); break; case OP_WORDCHAR: for (i = 1; i <= min; i++) if 
((md->ctypes[eptr++] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); break; default: RRETURN(PCRE_ERROR_INTERNAL); } } / If min = max, continue at the same level without recursing / if (min == max) continue; / If minimizing, we have to test the rest of the pattern before each subsequent match. Again, separate the UTF-8 case for speed, and also separate the UCP cases. / if (minimize) { #ifdef SUPPORT_UCP if (prop_type >= 0) { switch(prop_type) { case PT_ANY: for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); if (prop_fail_result) RRETURN(MATCH_NOMATCH); } / Control never gets here / case PT_LAMP: for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu \|\| prop_chartype == ucp_Ll \|\| prop_chartype == ucp_Lt) == prop_fail_result) RRETURN(MATCH_NOMATCH); } / Control never gets here / case PT_GC: for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } / Control never gets here / case PT_PC: for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } / Control never gets here / case PT_SC: for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40); if (rrc 
!= MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } / Control never gets here / default: RRETURN(PCRE_ERROR_INTERNAL); } } / Match extended Unicode sequences. We will get here only if the support is in the binary; otherwise a compile-time error occurs. / else if (ctype == OP_EXTUNI) { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { int len = 1; if (!utf8) c = eptr; else { GETCHARLEN(c, eptr, len); } prop_category = UCD_CATEGORY(c); if (prop_category != ucp_M) break; eptr += len; } } } else #endif /* SUPPORT_UCP / #ifdef SUPPORT_UTF8 / UTF-8 mode / if (utf8) { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| eptr >= md->end_subject \|\| (ctype == OP_ANY && IS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); switch(ctype) { case OP_ANY: / This is the non-NL case / case OP_ALLANY: case OP_ANYBYTE: break; case OP_ANYNL: switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && eptr == 0x0a) eptr++; break; case 0x000a: break; case 0x000b: case 0x000c: case 0x0085: case 0x2028: case 0x2029: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } break; case OP_NOT_HSPACE: switch(c) { default: break; case 0x09: /* HT / case 0x20: / SPACE / case 0xa0: / NBSP / case 0x1680: / OGHAM SPACE MARK / case 0x180e: / MONGOLIAN VOWEL SEPARATOR / case 0x2000: / EN QUAD / case 0x2001: / EM QUAD / case 0x2002: / EN SPACE / case 0x2003: / EM SPACE / case 0x2004: / THREE-PER-EM SPACE / case 0x2005: / 
FOUR-PER-EM SPACE / case 0x2006: / SIX-PER-EM SPACE / case 0x2007: / FIGURE SPACE / case 0x2008: / PUNCTUATION SPACE / case 0x2009: / THIN SPACE / case 0x200A: / HAIR SPACE / case 0x202f: / NARROW NO-BREAK SPACE / case 0x205f: / MEDIUM MATHEMATICAL SPACE / case 0x3000: / IDEOGRAPHIC SPACE / RRETURN(MATCH_NOMATCH); } break; case OP_HSPACE: switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x09: / HT / case 0x20: / SPACE / case 0xa0: / NBSP / case 0x1680: / OGHAM SPACE MARK / case 0x180e: / MONGOLIAN VOWEL SEPARATOR / case 0x2000: / EN QUAD / case 0x2001: / EM QUAD / case 0x2002: / EN SPACE / case 0x2003: / EM SPACE / case 0x2004: / THREE-PER-EM SPACE / case 0x2005: / FOUR-PER-EM SPACE / case 0x2006: / SIX-PER-EM SPACE / case 0x2007: / FIGURE SPACE / case 0x2008: / PUNCTUATION SPACE / case 0x2009: / THIN SPACE / case 0x200A: / HAIR SPACE / case 0x202f: / NARROW NO-BREAK SPACE / case 0x205f: / MEDIUM MATHEMATICAL SPACE / case 0x3000: / IDEOGRAPHIC SPACE / break; } break; case OP_NOT_VSPACE: switch(c) { default: break; case 0x0a: / LF / case 0x0b: / VT / case 0x0c: / FF / case 0x0d: / CR / case 0x85: / NEL / case 0x2028: / LINE SEPARATOR / case 0x2029: / PARAGRAPH SEPARATOR / RRETURN(MATCH_NOMATCH); } break; case OP_VSPACE: switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x0a: / LF / case 0x0b: / VT / case 0x0c: / FF / case 0x0d: / CR / case 0x85: / NEL / case 0x2028: / LINE SEPARATOR / case 0x2029: / PARAGRAPH SEPARATOR / break; } break; case OP_NOT_DIGIT: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); break; case OP_DIGIT: if (c >= 256 \|\| (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); break; case OP_NOT_WHITESPACE: if (c < 256 && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); break; case OP_WHITESPACE: if (c >= 256 \|\| (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); break; case OP_NOT_WORDCHAR: if (c < 256 && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); break; case OP_WORDCHAR: if (c 
>= 256 \|\| (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); break; default: RRETURN(PCRE_ERROR_INTERNAL); } } } else #endif / Not UTF-8 mode / { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max \|\| eptr >= md->end_subject \|\| (ctype == OP_ANY && IS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); c = eptr++; switch(ctype) { case OP_ANY: /* This is the non-NL case / case OP_ALLANY: case OP_ANYBYTE: break; case OP_ANYNL: switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && eptr == 0x0a) eptr++; break; case 0x000a: break; case 0x000b: case 0x000c: case 0x0085: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } break; case OP_NOT_HSPACE: switch(c) { default: break; case 0x09: /* HT / case 0x20: / SPACE / case 0xa0: / NBSP / RRETURN(MATCH_NOMATCH); } break; case OP_HSPACE: switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x09: / HT / case 0x20: / SPACE / case 0xa0: / NBSP / break; } break; case OP_NOT_VSPACE: switch(c) { default: break; case 0x0a: / LF / case 0x0b: / VT / case 0x0c: / FF / case 0x0d: / CR / case 0x85: / NEL / RRETURN(MATCH_NOMATCH); } break; case OP_VSPACE: switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x0a: / LF / case 0x0b: / VT / case 0x0c: / FF / case 0x0d: / CR / case 0x85: / NEL / break; } break; case OP_NOT_DIGIT: if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); break; case OP_DIGIT: if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); break; case OP_NOT_WHITESPACE: if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); break; case OP_WHITESPACE: if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); break; case OP_NOT_WORDCHAR: if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); break; case OP_WORDCHAR: if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); break; default: RRETURN(PCRE_ERROR_INTERNAL); } } } / Control never gets here / } / If 
maximizing, it is worth using inline code for speed, doing the type test once at the start (i.e. keep it out of the loop). Again, keep the UTF-8 and UCP stuff separate. / else { pp = eptr; / Remember where we started / #ifdef SUPPORT_UCP if (prop_type >= 0) { switch(prop_type) { case PT_ANY: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (prop_fail_result) break; eptr+= len; } break; case PT_LAMP: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu \|\| prop_chartype == ucp_Ll \|\| prop_chartype == ucp_Lt) == prop_fail_result) break; eptr+= len; } break; case PT_GC: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) break; eptr+= len; } break; case PT_PC: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) break; eptr+= len; } break; case PT_SC: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) break; eptr+= len; } break; } / eptr is now past the end of the maximum run / if (possessive) continue; for(;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; / Stop if tried at original pos / if (utf8) BACKCHAR(eptr); } } / Match extended Unicode sequences. We will get here only if the support is in the binary; otherwise a compile-time error occurs. 
/ else if (ctype == OP_EXTUNI) { for (i = min; i < max; i++) { if (eptr >= md->end_subject) break; GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); if (prop_category == ucp_M) break; while (eptr < md->end_subject) { int len = 1; if (!utf8) c = eptr; else { GETCHARLEN(c, eptr, len); } prop_category = UCD_CATEGORY(c); if (prop_category != ucp_M) break; eptr += len; } } /* eptr is now past the end of the maximum run / if (possessive) continue; for(;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; / Stop if tried at original pos / for (;;) / Move back over one extended / { int len = 1; if (!utf8) c = eptr; else { BACKCHAR(eptr); GETCHARLEN(c, eptr, len); } prop_category = UCD_CATEGORY(c); if (prop_category != ucp_M) break; eptr--; } } } else #endif /* SUPPORT_UCP / #ifdef SUPPORT_UTF8 / UTF-8 mode / if (utf8) { switch(ctype) { case OP_ANY: if (max < INT_MAX) { for (i = min; i < max; i++) { if (eptr >= md->end_subject \|\| IS_NEWLINE(eptr)) break; eptr++; while (eptr < md->end_subject && (eptr & 0xc0) == 0x80) eptr++; } } /* Handle unlimited UTF-8 repeat / else { for (i = min; i < max; i++) { if (eptr >= md->end_subject \|\| IS_NEWLINE(eptr)) break; eptr++; while (eptr < md->end_subject && (eptr & 0xc0) == 0x80) eptr++; } } break; case OP_ALLANY: if (max < INT_MAX) { for (i = min; i < max; i++) { if (eptr >= md->end_subject) break; eptr++; while (eptr < md->end_subject && (eptr & 0xc0) == 0x80) eptr++; } } else eptr = md->end_subject; / Unlimited UTF-8 repeat / break; / The byte case is the same as non-UTF8 / case OP_ANYBYTE: c = max - min; if (c > (unsigned int)(md->end_subject - eptr)) c = md->end_subject - eptr; eptr += c; break; case OP_ANYNL: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (c == 0x000d) { if (++eptr >= md->end_subject) break; if (eptr == 0x000a) eptr++; } else { if (c != 0x000a && (md->bsr_anycrlf \|\| 
(c != 0x000b && c != 0x000c && c != 0x0085 && c != 0x2028 && c != 0x2029))) break; eptr += len; } } break; case OP_NOT_HSPACE: case OP_HSPACE: for (i = min; i < max; i++) { BOOL gotspace; int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); switch(c) { default: gotspace = FALSE; break; case 0x09: /* HT / case 0x20: / SPACE / case 0xa0: / NBSP / case 0x1680: / OGHAM SPACE MARK / case 0x180e: / MONGOLIAN VOWEL SEPARATOR / case 0x2000: / EN QUAD / case 0x2001: / EM QUAD / case 0x2002: / EN SPACE / case 0x2003: / EM SPACE / case 0x2004: / THREE-PER-EM SPACE / case 0x2005: / FOUR-PER-EM SPACE / case 0x2006: / SIX-PER-EM SPACE / case 0x2007: / FIGURE SPACE / case 0x2008: / PUNCTUATION SPACE / case 0x2009: / THIN SPACE / case 0x200A: / HAIR SPACE / case 0x202f: / NARROW NO-BREAK SPACE / case 0x205f: / MEDIUM MATHEMATICAL SPACE / case 0x3000: / IDEOGRAPHIC SPACE / gotspace = TRUE; break; } if (gotspace == (ctype == OP_NOT_HSPACE)) break; eptr += len; } break; case OP_NOT_VSPACE: case OP_VSPACE: for (i = min; i < max; i++) { BOOL gotspace; int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); switch(c) { default: gotspace = FALSE; break; case 0x0a: / LF / case 0x0b: / VT / case 0x0c: / FF / case 0x0d: / CR / case 0x85: / NEL / case 0x2028: / LINE SEPARATOR / case 0x2029: / PARAGRAPH SEPARATOR / gotspace = TRUE; break; } if (gotspace == (ctype == OP_NOT_VSPACE)) break; eptr += len; } break; case OP_NOT_DIGIT: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break; eptr+= len; } break; case OP_DIGIT: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (c >= 256 \|\|(md->ctypes[c] & ctype_digit) == 0) break; eptr+= len; } break; case OP_NOT_WHITESPACE: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (c < 256 && 
(md->ctypes[c] & ctype_space) != 0) break; eptr+= len; } break; case OP_WHITESPACE: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (c >= 256 \|\|(md->ctypes[c] & ctype_space) == 0) break; eptr+= len; } break; case OP_NOT_WORDCHAR: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break; eptr+= len; } break; case OP_WORDCHAR: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (c >= 256 \|\| (md->ctypes[c] & ctype_word) == 0) break; eptr+= len; } break; default: RRETURN(PCRE_ERROR_INTERNAL); } / eptr is now past the end of the maximum run / if (possessive) continue; for(;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; / Stop if tried at original pos / BACKCHAR(eptr); } } else #endif / SUPPORT_UTF8 / / Not UTF-8 mode / { switch(ctype) { case OP_ANY: for (i = min; i < max; i++) { if (eptr >= md->end_subject \|\| IS_NEWLINE(eptr)) break; eptr++; } break; case OP_ALLANY: case OP_ANYBYTE: c = max - min; if (c > (unsigned int)(md->end_subject - eptr)) c = md->end_subject - eptr; eptr += c; break; case OP_ANYNL: for (i = min; i < max; i++) { if (eptr >= md->end_subject) break; c = eptr; if (c == 0x000d) { if (++eptr >= md->end_subject) break; if (eptr == 0x000a) eptr++; } else { if (c != 0x000a && (md->bsr_anycrlf \|\| (c != 0x000b && c != 0x000c && c != 0x0085))) break; eptr++; } } break; case OP_NOT_HSPACE: for (i = min; i < max; i++) { if (eptr >= md->end_subject) break; c = eptr; if (c == 0x09 \|\| c == 0x20 \|\| c == 0xa0) break; eptr++; } break; case OP_HSPACE: for (i = min; i < max; i++) { if (eptr >= md->end_subject) break; c = eptr; if (c != 0x09 && c != 0x20 && c != 0xa0) break; eptr++; } break; case OP_NOT_VSPACE: for (i = min; i < max; i++) { if (eptr >= 
md->end_subject) break; c = eptr; if (c == 0x0a \|\| c == 0x0b \|\| c == 0x0c \|\| c == 0x0d \|\| c == 0x85) break; eptr++; } break; case OP_VSPACE: for (i = min; i < max; i++) { if (eptr >= md->end_subject) break; c = eptr; if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85) break; eptr++; } break; case OP_NOT_DIGIT: for (i = min; i < max; i++) { if (eptr >= md->end_subject \|\| (md->ctypes[eptr] & ctype_digit) != 0) break; eptr++; } break; case OP_DIGIT: for (i = min; i < max; i++) { if (eptr >= md->end_subject \|\| (md->ctypes[eptr] & ctype_digit) == 0) break; eptr++; } break; case OP_NOT_WHITESPACE: for (i = min; i < max; i++) { if (eptr >= md->end_subject \|\| (md->ctypes[eptr] & ctype_space) != 0) break; eptr++; } break; case OP_WHITESPACE: for (i = min; i < max; i++) { if (eptr >= md->end_subject \|\| (md->ctypes[eptr] & ctype_space) == 0) break; eptr++; } break; case OP_NOT_WORDCHAR: for (i = min; i < max; i++) { if (eptr >= md->end_subject \|\| (md->ctypes[eptr] & ctype_word) != 0) break; eptr++; } break; case OP_WORDCHAR: for (i = min; i < max; i++) { if (eptr >= md->end_subject \|\| (md->ctypes[eptr] & ctype_word) == 0) break; eptr++; } break; default: RRETURN(PCRE_ERROR_INTERNAL); } / eptr is now past the end of the maximum run / if (possessive) continue; while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47); eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); } } / Get here if we can't make it match with any permitted repetitions / RRETURN(MATCH_NOMATCH); } / Control never gets here / / There's been some horrible disaster. Arrival here can only mean there is something seriously wrong in the code above or the OP_xxx definitions. / default: DPRINTF(("Unknown opcode %d\n", ecode)); RRETURN(PCRE_ERROR_UNKNOWN_OPCODE); } /* Do not stick any code in here without much thought; it is assumed that "continue" in the code above comes out to here to repeat the main loop. 
/ } / End of main loop / / Control never reaches here / / When compiling to use the heap rather than the stack for recursive calls to match(), the RRETURN() macro jumps here. The number that is saved in frame->Xwhere indicates which label we actually want to return to. / #ifdef NO_RECURSE #define LBL(val) case val: goto L_RM##val; HEAP_RETURN: switch (frame->Xwhere) { LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17) LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33) LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52) LBL(53) LBL(54) #ifdef SUPPORT_UTF8 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30) LBL(32) LBL(34) LBL(42) LBL(46) #ifdef SUPPORT_UCP LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) #endif / SUPPORT_UCP / #endif / SUPPORT_UTF8 / default: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere)); return PCRE_ERROR_INTERNAL; } #undef LBL #endif / NO_RECURSE / } /************************************************************************ ************************************************************************** RECURSION IN THE match() FUNCTION Undefine all the macros that were defined above to handle this. 
/ #ifdef NO_RECURSE #undef eptr #undef ecode #undef mstart #undef offset_top #undef ims #undef eptrb #undef flags #undef callpat #undef charptr #undef data #undef next #undef pp #undef prev #undef saved_eptr #undef new_recursive #undef cur_is_word #undef condition #undef prev_is_word #undef original_ims #undef ctype #undef length #undef max #undef min #undef number #undef offset #undef op #undef save_capture_last #undef save_offset1 #undef save_offset2 #undef save_offset3 #undef stacksave #undef newptrb #endif / These two are defined as macros in both cases / #undef fc #undef fi /************************************************************************ *************************************************************************/	pcreexec.c	427
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION	pcre_exec(const pcre argument_re, const pcre_extra extra_data, PCRE_SPTR subject, int length, int start_offset, int options, int offsets, int offsetcount) PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_exec(const pcre argument_re, const pcre_extra extra_data, PCRE_SPTR subject, int length, int start_offset, int options, int offsets, int offsetcount) { int rc, resetcount, ocount; int first_byte = -1; int req_byte = -1; int req_byte2 = -1; int newline; unsigned long int ims; BOOL using_temporary_offsets = FALSE; BOOL anchored; BOOL startline; BOOL firstline; BOOL first_byte_caseless = FALSE; BOOL req_byte_caseless = FALSE; BOOL utf8; match_data match_block; match_data md = &match_block; const uschar tables; const uschar start_bits = NULL; USPTR start_match = (USPTR)subject + start_offset; USPTR end_subject; USPTR req_byte_ptr = start_match - 1; pcre_study_data internal_study; const pcre_study_data study; real_pcre internal_re; const real_pcre external_re = (const real_pcre )argument_re; const real_pcre re = external_re; / Plausibility checks / if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; if (re == NULL \|\| subject == NULL \|\| (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; / Fish out the optional data from the extra_data structure, first setting the default values. / study = NULL; md->match_limit = MATCH_LIMIT; md->match_limit_recursion = MATCH_LIMIT_RECURSION; md->callout_data = NULL; / The table pointer is always in native byte order. 
/ tables = external_re->tables; if (extra_data != NULL) { register unsigned int flags = extra_data->flags; if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) study = (const pcre_study_data )extra_data->study_data; if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) md->match_limit = extra_data->match_limit; if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) md->match_limit_recursion = extra_data->match_limit_recursion; if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) md->callout_data = extra_data->callout_data; if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; } /* If the exec call supplied NULL for tables, use the inbuilt ones. This is a feature that makes it possible to save compiled regex and re-use them in other programs later. / if (tables == NULL) tables = _pcre_default_tables; / Check that the first field in the block is the magic number. If it is not, test for a regex that was compiled on a host of opposite endianness. If this is the case, flipped values are put in internal_re and internal_study if there was study data too. / if (re->magic_number != MAGIC_NUMBER) { re = _pcre_try_flipped(re, &internal_re, study, &internal_study); if (re == NULL) return PCRE_ERROR_BADMAGIC; if (study != NULL) study = &internal_study; } / Set up other data / anchored = ((re->options \| options) & PCRE_ANCHORED) != 0; startline = (re->flags & PCRE_STARTLINE) != 0; firstline = (re->options & PCRE_FIRSTLINE) != 0; / The code starts after the real_pcre block and the capture name table. 
/ md->start_code = (const uschar )external_re + re->name_table_offset + re->name_count * re->name_entry_size; md->start_subject = (USPTR)subject; md->start_offset = start_offset; md->end_subject = md->start_subject + length; end_subject = md->end_subject; md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0; md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; md->notbol = (options & PCRE_NOTBOL) != 0; md->noteol = (options & PCRE_NOTEOL) != 0; md->notempty = (options & PCRE_NOTEMPTY) != 0; md->partial = (options & PCRE_PARTIAL) != 0; md->hitend = FALSE; md->recursive = NULL; /* No recursion at top level / md->lcc = tables + lcc_offset; md->ctypes = tables + ctypes_offset; / Handle different \R options. / switch (options & (PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE)) { case 0: if ((re->options & (PCRE_BSR_ANYCRLF\|PCRE_BSR_UNICODE)) != 0) md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0; else #ifdef BSR_ANYCRLF md->bsr_anycrlf = TRUE; #else md->bsr_anycrlf = FALSE; #endif break; case PCRE_BSR_ANYCRLF: md->bsr_anycrlf = TRUE; break; case PCRE_BSR_UNICODE: md->bsr_anycrlf = FALSE; break; default: return PCRE_ERROR_BADNEWLINE; } / Handle different types of newline. The three bits give eight cases. If nothing is set at run time, whatever was used at compile time applies. / switch ((((options & PCRE_NEWLINE_BITS) == 0)? 
re->options : (pcre_uint32)options) & PCRE_NEWLINE_BITS) { case 0: newline = NEWLINE; break; / Compile-time default / case PCRE_NEWLINE_CR: newline = '\r'; break; case PCRE_NEWLINE_LF: newline = '\n'; break; case PCRE_NEWLINE_CR+ PCRE_NEWLINE_LF: newline = ('\r' << 8) \| '\n'; break; case PCRE_NEWLINE_ANY: newline = -1; break; case PCRE_NEWLINE_ANYCRLF: newline = -2; break; default: return PCRE_ERROR_BADNEWLINE; } if (newline == -2) { md->nltype = NLTYPE_ANYCRLF; } else if (newline < 0) { md->nltype = NLTYPE_ANY; } else { md->nltype = NLTYPE_FIXED; if (newline > 255) { md->nllen = 2; md->nl[0] = (newline >> 8) & 255; md->nl[1] = newline & 255; } else { md->nllen = 1; md->nl[0] = newline; } } / Partial matching is supported only for a restricted set of regexes at the moment. / if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0) return PCRE_ERROR_BADPARTIAL; / Check a UTF-8 string if required. Unfortunately there's no way of passing back the character offset. / #ifdef SUPPORT_UTF8 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) { if (_pcre_valid_utf8((uschar )subject, length) >= 0) return PCRE_ERROR_BADUTF8; if (start_offset > 0 && start_offset < length) { int tb = ((uschar )subject)[start_offset]; if (tb > 127) { tb &= 0xc0; if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET; } } } #endif / The ims options can vary during the matching as a result of the presence of (?ims) items in the pattern. They are kept in a local variable so that restoring at the exit of a group is easy. / ims = re->options & (PCRE_CASELESS\|PCRE_MULTILINE\|PCRE_DOTALL); / If the expression has got more back references than the offsets supplied can hold, we get a temporary chunk of working store to use during the matching. Otherwise, we can use the vector supplied, rounding down its size to a multiple of 3. 
/ ocount = offsetcount - (offsetcount % 3); if (re->top_backref > 0 && re->top_backref >= ocount/3) { ocount = re->top_backref 3 + 3; md->offset_vector = (int )(pcre_malloc)(ocount sizeof(int)); if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY; using_temporary_offsets = TRUE; DPRINTF(("Got memory to hold back references\n")); } else md->offset_vector = offsets; md->offset_end = ocount; md->offset_max = (2ocount)/3; md->offset_overflow = FALSE; md->capture_last = -1; / Compute the minimum number of offsets that we need to reset each time. Doing this makes a huge difference to execution time when there aren't many brackets in the pattern. / resetcount = 2 + re->top_bracket 2; if (resetcount > offsetcount) resetcount = ocount; /* Reset the working variable associated with each extraction. These should never be used unless previously set, but they get saved and restored, and so we initialize them to avoid reading uninitialized locations. / if (md->offset_vector != NULL) { register int iptr = md->offset_vector + ocount; register int iend = iptr - resetcount/2 + 1; while (--iptr >= iend) iptr = -1; } /* Set up the first character to match, if available. The first_byte value is never set for an anchored regular expression, but the anchoring may be forced at run time, so we have to test for anchoring. The first char may be unset for an unanchored pattern, of course. If there's no first char and the pattern was studied, there may be a bitmap of possible first characters. / if (!anchored) { if ((re->flags & PCRE_FIRSTSET) != 0) { first_byte = re->first_byte & 255; if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) first_byte = md->lcc[first_byte]; } else if (!startline && study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0) start_bits = study->start_bits; } / For anchored or unanchored matches, there may be a "last known required character" set. 
/ if ((re->flags & PCRE_REQCHSET) != 0) { req_byte = re->req_byte & 255; req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; req_byte2 = (tables + fcc_offset)[req_byte]; / case flipped / } / ==========================================================================/ / Loop for handling unanchored repeated matching attempts; for anchored regexs the loop runs just once. / for(;;) { USPTR save_end_subject = end_subject; USPTR new_start_match; / Reset the maximum number of extractions we might see. / if (md->offset_vector != NULL) { register int iptr = md->offset_vector; register int iend = iptr + resetcount; while (iptr < iend) iptr++ = -1; } /* Advance to a unique first char if possible. If firstline is TRUE, the start of the match is constrained to the first line of a multiline string. That is, the match must be before or at the first newline. Implement this by temporarily adjusting end_subject so that we stop scanning at a newline. If the match fails at the newline, later code breaks this loop. / if (firstline) { USPTR t = start_match; #ifdef SUPPORT_UTF8 if (utf8) { while (t < md->end_subject && !IS_NEWLINE(t)) { t++; while (t < end_subject && (t & 0xc0) == 0x80) t++; } } else #endif while (t < md->end_subject && !IS_NEWLINE(t)) t++; end_subject = t; } /* Now advance to a unique first byte if there is one. 
/ if (first_byte >= 0) { if (first_byte_caseless) while (start_match < end_subject && md->lcc[start_match] != first_byte) start_match++; else while (start_match < end_subject && start_match != first_byte) start_match++; } / Or to just after a linebreak for a multiline match / else if (startline) { if (start_match > md->start_subject + start_offset) { #ifdef SUPPORT_UTF8 if (utf8) { while (start_match < end_subject && !WAS_NEWLINE(start_match)) { start_match++; while(start_match < end_subject && (start_match & 0xc0) == 0x80) start_match++; } } else #endif while (start_match < end_subject && !WAS_NEWLINE(start_match)) start_match++; /* If we have just passed a CR and the newline option is ANY or ANYCRLF, and we are now at a LF, advance the match position by one more character. / if (start_match[-1] == '\r' && (md->nltype == NLTYPE_ANY \|\| md->nltype == NLTYPE_ANYCRLF) && start_match < end_subject && start_match == '\n') start_match++; } } /* Or to a non-unique first byte after study / else if (start_bits != NULL) { while (start_match < end_subject) { register unsigned int c = start_match; if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break; } } /* Restore fudged end_subject / end_subject = save_end_subject; #ifdef DEBUG / Sigh. Some compilers never learn. / printf(">>>> Match against: "); pchars(start_match, end_subject - start_match, TRUE, md); printf("\n"); #endif / If req_byte is set, we know that that character must appear in the subject for the match to succeed. If the first character is set, req_byte must be later in the subject; otherwise the test starts at the match point. This optimization can save a huge amount of backtracking in patterns with nested unlimited repeats that aren't going to match. Writing separate code for cased/caseless versions makes it go faster, as does using an autoincrement and backing off on a match. 
HOWEVER: when the subject string is very, very long, searching to its end can take a long time, and give bad performance on quite ordinary patterns. This showed up when somebody was matching something like /^\d+C/ on a 32-megabyte string... so we don't do this when the string is sufficiently long. ALSO: this processing is disabled when partial matching is requested. / if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX && !md->partial) { register USPTR p = start_match + ((first_byte >= 0)? 1 : 0); / We don't need to repeat the search if we haven't yet reached the place we found it at last time. / if (p > req_byte_ptr) { if (req_byte_caseless) { while (p < end_subject) { register int pp = p++; if (pp == req_byte \|\| pp == req_byte2) { p--; break; } } } else { while (p < end_subject) { if (p++ == req_byte) { p--; break; } } } / If we can't find the required character, break the matching loop, forcing a match failure. / if (p >= end_subject) { rc = MATCH_NOMATCH; break; } / If we have found the required character, save the point where we found it, so that we don't search again next time round the loop if the start hasn't passed this character yet. / req_byte_ptr = p; } } / OK, we can now run the match. / md->start_match_ptr = start_match; md->match_call_count = 0; rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0); switch(rc) { / NOMATCH and PRUNE advance by one character. THEN at this level acts exactly like PRUNE. / case MATCH_NOMATCH: case MATCH_PRUNE: case MATCH_THEN: new_start_match = start_match + 1; #ifdef SUPPORT_UTF8 if (utf8) while(new_start_match < end_subject && (new_start_match & 0xc0) == 0x80) new_start_match++; #endif break; /* SKIP passes back the next starting point explicitly. / case MATCH_SKIP: new_start_match = md->start_match_ptr; break; / COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. / case MATCH_COMMIT: rc = MATCH_NOMATCH; goto ENDLOOP; / Any other return is some kind of error. 
/ default: goto ENDLOOP; } / Control reaches here for the various types of "no match at this point" result. Reset the code to MATCH_NOMATCH for subsequent checking. / rc = MATCH_NOMATCH; / If PCRE_FIRSTLINE is set, the match must happen before or at the first newline in the subject (though it may continue over the newline). Therefore, if we have just failed to match, starting at a newline, do not continue. / if (firstline && IS_NEWLINE(start_match)) break; / Advance to new matching position / start_match = new_start_match; / Break the loop if the pattern is anchored or if we have passed the end of the subject. / if (anchored \|\| start_match > end_subject) break; / If we have just passed a CR and we are now at a LF, and the pattern does not contain any explicit matches for \r or \n, and the newline option is CRLF or ANY or ANYCRLF, advance the match position by one more character. / if (start_match[-1] == '\r' && start_match < end_subject && start_match == '\n' && (re->flags & PCRE_HASCRORLF) == 0 && (md->nltype == NLTYPE_ANY \|\| md->nltype == NLTYPE_ANYCRLF \|\| md->nllen == 2)) start_match++; } /* End of for(;;) "bumpalong" loop / / ==========================================================================/ / We reach here when rc is not MATCH_NOMATCH, or if one of the stopping conditions is true: (1) The pattern is anchored or the match was failed by (COMMIT); (2) We are past the end of the subject; (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because this option requests that a match occur at or before the first newline in the subject. When we have a match and the offset vector is big enough to deal with any backreferences, captured substring offsets will already be set up. In the case where we had to get some local store to hold offsets for backreference processing, copy those that we can. 
In this case there need not be overflow if certain parts of the pattern were not used, even though there are more capturing parentheses than vector slots. / ENDLOOP: if (rc == MATCH_MATCH) { if (using_temporary_offsets) { if (offsetcount >= 4) { memcpy(offsets + 2, md->offset_vector + 2, (offsetcount - 2) * sizeof(int)); DPRINTF(("Copied offsets from temporary memory\n")); } if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE; DPRINTF(("Freeing temporary memory\n")); (pcre_free)(md->offset_vector); } /* Set the return code to the number of captured strings, or 0 if there are too many to fit into the vector. / rc = md->offset_overflow? 0 : md->end_offset_top/2; / If there is space, set up the whole thing as substring 0. The value of md->start_match_ptr might be modified if \K was encountered on the success matching path. / if (offsetcount < 2) rc = 0; else { offsets[0] = md->start_match_ptr - md->start_subject; offsets[1] = md->end_match_ptr - md->start_subject; } DPRINTF((">>>> returning %d\n", rc)); return rc; } / Control gets here if there has been an error, or if the overall match attempt has failed at all permitted starting positions. */ if (using_temporary_offsets) { DPRINTF(("Freeing temporary memory\n")); (pcre_free)(md->offset_vector); } if (rc != MATCH_NOMATCH) { DPRINTF((">>>> error: returning %d\n", rc)); return rc; } else if (md->partial && md->hitend) { DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); return PCRE_ERROR_PARTIAL; } else { DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); return PCRE_ERROR_NOMATCH; } }	pcreexec.c	4388
pcrefinf.c
Type	Function	Source	Line
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION	pcre_fullinfo(const pcre argument_re, const pcre_extra extra_data, int what, void where) PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_fullinfo(const pcre argument_re, const pcre_extra extra_data, int what, void where) { real_pcre internal_re; pcre_study_data internal_study; const real_pcre re = (const real_pcre )argument_re; const pcre_study_data study = NULL; if (re == NULL \|\| where == NULL) return PCRE_ERROR_NULL; if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0) study = (const pcre_study_data )extra_data->study_data; if (re->magic_number != MAGIC_NUMBER) { re = _pcre_try_flipped(re, &internal_re, study, &internal_study); if (re == NULL) return PCRE_ERROR_BADMAGIC; if (study != NULL) study = &internal_study; } switch (what) { case PCRE_INFO_OPTIONS: ((unsigned long int )where) = re->options & PUBLIC_OPTIONS; break; case PCRE_INFO_SIZE: ((size_t )where) = re->size; break; case PCRE_INFO_STUDYSIZE: ((size_t )where) = (study == NULL)? 0 : study->size; break; case PCRE_INFO_CAPTURECOUNT: ((int )where) = re->top_bracket; break; case PCRE_INFO_BACKREFMAX: ((int )where) = re->top_backref; break; case PCRE_INFO_FIRSTBYTE: ((int )where) = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte : ((re->flags & PCRE_STARTLINE) != 0)? -1 : -2; break; /* Make sure we pass back the pointer to the bit vector in the external block, not the internal copy (with flipped integer fields). / case PCRE_INFO_FIRSTTABLE: ((const uschar *)where) = (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)? ((const pcre_study_data )extra_data->study_data)->start_bits : NULL; break; case PCRE_INFO_LASTLITERAL: ((int )where) = ((re->flags & PCRE_REQCHSET) != 0)? 
re->req_byte : -1; break; case PCRE_INFO_NAMEENTRYSIZE: ((int )where) = re->name_entry_size; break; case PCRE_INFO_NAMECOUNT: ((int )where) = re->name_count; break; case PCRE_INFO_NAMETABLE: ((const uschar )where) = (const uschar )re + re->name_table_offset; break; case PCRE_INFO_DEFAULT_TABLES: ((const uschar )where) = (const uschar )(_pcre_default_tables); break; case PCRE_INFO_OKPARTIAL: ((int )where) = (re->flags & PCRE_NOPARTIAL) == 0; break; case PCRE_INFO_JCHANGED: ((int )where) = (re->flags & PCRE_JCHANGED) != 0; break; case PCRE_INFO_HASCRORLF: ((int )where) = (re->flags & PCRE_HASCRORLF) != 0; break; default: return PCRE_ERROR_BADOPTION; } return 0; }	pcrefinf.c	68
pcreget.c
Type	Function	Source	Line
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION	pcre_get_stringnumber(const pcre code, const char stringname) PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_get_stringnumber(const pcre code, const char stringname) { int rc; int entrysize; int top, bot; uschar nametable; if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0) return rc; if (top <= 0) return PCRE_ERROR_NOSUBSTRING; if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0) return rc; if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0) return rc; bot = 0; while (top > bot) { int mid = (top + bot) / 2; uschar entry = nametable + entrysizemid; int c = strcmp(stringname, (char )(entry + 2)); if (c == 0) return (entry[0] << 8) + entry[1]; if (c > 0) bot = mid + 1; else top = mid; } return PCRE_ERROR_NOSUBSTRING; }	pcreget.c	68
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION	pcre_get_stringtable_entries(const pcre code, const char stringname, char firstptr, char lastptr) PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_get_stringtable_entries(const pcre code, const char stringname, char firstptr, char lastptr) { int rc; int entrysize; int top, bot; uschar nametable, lastentry; if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0) return rc; if (top <= 0) return PCRE_ERROR_NOSUBSTRING; if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0) return rc; if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0) return rc; lastentry = nametable + entrysize * (top - 1); bot = 0; while (top > bot) { int mid = (top + bot) / 2; uschar entry = nametable + entrysizemid; int c = strcmp(stringname, (char )(entry + 2)); if (c == 0) { uschar first = entry; uschar last = entry; while (first > nametable) { if (strcmp(stringname, (char )(first - entrysize + 2)) != 0) break; first -= entrysize; } while (last < lastentry) { if (strcmp(stringname, (char )(last + entrysize + 2)) != 0) break; last += entrysize; } firstptr = (char )first; lastptr = (char *)last; return entrysize; } if (c > 0) bot = mid + 1; else top = mid; } return PCRE_ERROR_NOSUBSTRING; }	pcreget.c	117
STATIC INT	get_first_set(const pcre code, const char stringname, int ovector) static int get_first_set(const pcre code, const char stringname, int ovector) { const real_pcre re = (const real_pcre )code; int entrysize; char first, last; uschar entry; if ((re->options & PCRE_DUPNAMES) == 0 && (re->flags & PCRE_JCHANGED) == 0) return pcre_get_stringnumber(code, stringname); entrysize = pcre_get_stringtable_entries(code, stringname, &first, &last); if (entrysize <= 0) return entrysize; for (entry = (uschar )first; entry <= (uschar )last; entry += entrysize) { int n = (entry[0] << 8) + entry[1]; if (ovector[n2] >= 0) return n; } return (first[0] << 8) + first[1]; }	pcreget.c	185
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION	pcre_copy_substring(const char subject, int ovector, int stringcount, int stringnumber, char buffer, int size) PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_copy_substring(const char subject, int ovector, int stringcount, int stringnumber, char buffer, int size) { int yield; if (stringnumber < 0 \|\| stringnumber >= stringcount) return PCRE_ERROR_NOSUBSTRING; stringnumber *= 2; yield = ovector[stringnumber+1] - ovector[stringnumber]; if (size < yield + 1) return PCRE_ERROR_NOMEMORY; memcpy(buffer, subject + ovector[stringnumber], yield); buffer[yield] = 0; return yield; }	pcreget.c	234
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION	pcre_copy_named_substring(const pcre code, const char subject, int ovector, int stringcount, const char stringname, char buffer, int size) PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_copy_named_substring(const pcre code, const char subject, int ovector, int stringcount, const char stringname, char buffer, int size) { int n = get_first_set(code, stringname, ovector); if (n <= 0) return n; return pcre_copy_substring(subject, ovector, stringcount, n, buffer, size); }	pcreget.c	279
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION	pcre_get_substring_list(const char subject, int ovector, int stringcount, const char **listptr) PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_get_substring_list(const char subject, int ovector, int stringcount, const char *listptr) { int i; int size = sizeof(char ); int double_count = stringcount * 2; char *stringlist; char p; for (i = 0; i < double_count; i += 2) size += sizeof(char ) + ovector[i+1] - ovector[i] + 1; stringlist = (char )(pcre_malloc)(size); if (stringlist == NULL) return PCRE_ERROR_NOMEMORY; listptr = (const char *)stringlist; p = (char )(stringlist + stringcount + 1); for (i = 0; i < double_count; i += 2) { int len = ovector[i+1] - ovector[i]; memcpy(p, subject + ovector[i], len); stringlist++ = p; p += len; p++ = 0; } *stringlist = NULL; return 0; }	pcreget.c	311
PCRE_EXP_DEFN VOID PCRE_CALL_CONVENTION	pcre_free_substring_list(const char pointer) PCRE_EXP_DEFN void PCRE_CALL_CONVENTION pcre_free_substring_list(const char pointer) { (pcre_free)((void *)pointer); }	pcreget.c	356
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION	pcre_get_substring(const char subject, int ovector, int stringcount, int stringnumber, const char *stringptr) PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_get_substring(const char subject, int ovector, int stringcount, int stringnumber, const char stringptr) { int yield; char substring; if (stringnumber < 0 \|\| stringnumber >= stringcount) return PCRE_ERROR_NOSUBSTRING; stringnumber = 2; yield = ovector[stringnumber+1] - ovector[stringnumber]; substring = (char )(pcre_malloc)(yield + 1); if (substring == NULL) return PCRE_ERROR_NOMEMORY; memcpy(substring, subject + ovector[stringnumber], yield); substring[yield] = 0; *stringptr = substring; return yield; }	pcreget.c	389
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION	pcre_get_named_substring(const pcre code, const char subject, int ovector, int stringcount, const char stringname, const char *stringptr) PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_get_named_substring(const pcre code, const char subject, int ovector, int stringcount, const char stringname, const char *stringptr) { int n = get_first_set(code, stringname, ovector); if (n <= 0) return n; return pcre_get_substring(subject, ovector, stringcount, n, stringptr); }	pcreget.c	436
PCRE_EXP_DEFN VOID PCRE_CALL_CONVENTION	pcre_free_substring(const char pointer) PCRE_EXP_DEFN void PCRE_CALL_CONVENTION pcre_free_substring(const char pointer) { (pcre_free)((void *)pointer); }	pcreget.c	459
pcreinfo.c
Type	Function	Source	Line
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION	pcre_info(const pcre argument_re, int optptr, int first_byte) PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_info(const pcre argument_re, int optptr, int first_byte) { real_pcre internal_re; const real_pcre re = (const real_pcre )argument_re; if (re == NULL) return PCRE_ERROR_NULL; if (re->magic_number != MAGIC_NUMBER) { re = _pcre_try_flipped(re, &internal_re, NULL, NULL); if (re == NULL) return PCRE_ERROR_BADMAGIC; } if (optptr != NULL) optptr = (int)(re->options & PUBLIC_OPTIONS); if (first_byte != NULL) first_byte = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte : ((re->flags & PCRE_STARTLINE) != 0)? -1 : -2; return re->top_bracket; }	pcreinfo.c	75
pcremktb.c
Type	Function	Source	Line
CONST UNSIGNED CHAR *	pcre_maketables(void) const unsigned char * pcre_maketables(void) { unsigned char yield, p; int i; #ifndef DFTABLES yield = (unsigned char)(pcre_malloc)(tables_length); #else yield = (unsigned char)malloc(tables_length); #endif if (yield == NULL) return NULL; p = yield; /* First comes the lower casing table / for (i = 0; i < 256; i++) p++ = tolower(i); /* Next the case-flipping table / for (i = 0; i < 256; i++) p++ = islower(i)? toupper(i) : tolower(i); /* Then the character class tables. Don't try to be clever and save effort on exclusive ones - in some locales things may be different. Note that the table for "space" includes everything "isspace" gives, including VT in the default locale. This makes it work for the POSIX class [:space:]. Note also that it is possible for a character to be alnum or alpha without being lower or upper, such as "male and female ordinals" (\xAA and \xBA) in the fr_FR locale (at least under Debian Linux's locales as of 12/2005). So we must test for alnum specially. / memset(p, 0, cbit_length); for (i = 0; i < 256; i++) { if (isdigit(i)) p[cbit_digit + i/8] \|= 1 << (i&7); if (isupper(i)) p[cbit_upper + i/8] \|= 1 << (i&7); if (islower(i)) p[cbit_lower + i/8] \|= 1 << (i&7); if (isalnum(i)) p[cbit_word + i/8] \|= 1 << (i&7); if (i == '_') p[cbit_word + i/8] \|= 1 << (i&7); if (isspace(i)) p[cbit_space + i/8] \|= 1 << (i&7); if (isxdigit(i))p[cbit_xdigit + i/8] \|= 1 << (i&7); if (isgraph(i)) p[cbit_graph + i/8] \|= 1 << (i&7); if (isprint(i)) p[cbit_print + i/8] \|= 1 << (i&7); if (ispunct(i)) p[cbit_punct + i/8] \|= 1 << (i&7); if (iscntrl(i)) p[cbit_cntrl + i/8] \|= 1 << (i&7); } p += cbit_length; / Finally, the character type table. In this, we exclude VT from the white space chars, because Perl doesn't recognize it as such for \s and for comments within regexes. 
/ for (i = 0; i < 256; i++) { int x = 0; if (i != 0x0b && isspace(i)) x += ctype_space; if (isalpha(i)) x += ctype_letter; if (isdigit(i)) x += ctype_digit; if (isxdigit(i)) x += ctype_xdigit; if (isalnum(i) \|\| i == '_') x += ctype_word; / Note: strchr includes the terminating zero in the characters it considers. In this instance, that is ok because we want binary zero to be flagged as a meta-character, which in this sense is any character that terminates a run of data characters. / if (strchr("\\+?{^.$\|()[", i) != 0) x += ctype_meta; *p++ = x; } return yield; }	pcremktb.c	69
pcrenewl.c
Type	Function	Source	Line
BOOL	_pcre_is_newline(const uschar *ptr, int type, const uschar *endptr, int *lenptr, BOOL utf8) BOOL _pcre_is_newline(const uschar *ptr, int type, const uschar *endptr, int *lenptr, BOOL utf8) { int c; if (utf8) { GETCHAR(c, ptr); } else c = *ptr; if (type == NLTYPE_ANYCRLF) switch(c) { case 0x000a: *lenptr = 1; return TRUE; /* LF */ case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1; return TRUE; /* CR */ default: return FALSE; } /* NLTYPE_ANY */ else switch(c) { case 0x000a: /* LF */ case 0x000b: /* VT */ case 0x000c: *lenptr = 1; return TRUE; /* FF */ case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1; return TRUE; /* CR */ case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */ case 0x2028: /* LS */ case 0x2029: *lenptr = 3; return TRUE; /* PS */ default: return FALSE; } }	pcrenewl.c	75
BOOL	_pcre_was_newline(const uschar *ptr, int type, const uschar *startptr, int *lenptr, BOOL utf8) BOOL _pcre_was_newline(const uschar *ptr, int type, const uschar *startptr, int *lenptr, BOOL utf8) { int c; ptr--; #ifdef SUPPORT_UTF8 if (utf8) { BACKCHAR(ptr); GETCHAR(c, ptr); } else c = *ptr; #else /* no UTF-8 support */ c = *ptr; #endif /* SUPPORT_UTF8 */ if (type == NLTYPE_ANYCRLF) switch(c) { case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1; return TRUE; /* LF */ case 0x000d: *lenptr = 1; return TRUE; /* CR */ default: return FALSE; } else switch(c) { case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1; return TRUE; /* LF */ case 0x000b: /* VT */ case 0x000c: /* FF */ case 0x000d: *lenptr = 1; return TRUE; /* CR */ case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */ case 0x2028: /* LS */ case 0x2029: *lenptr = 3; return TRUE; /* PS */ default: return FALSE; } }	pcrenewl.c	125
pcreoutf.c
Type	Function	Source	Line
INT	_pcre_ord2utf8(int cvalue, uschar *buffer) int _pcre_ord2utf8(int cvalue, uschar *buffer) { #ifdef SUPPORT_UTF8 register int i, j; for (i = 0; i < _pcre_utf8_table1_size; i++) if (cvalue <= _pcre_utf8_table1[i]) break; buffer += i; for (j = i; j > 0; j--) { *buffer-- = 0x80 \| (cvalue & 0x3f); cvalue >>= 6; } *buffer = _pcre_utf8_table2[i] \| cvalue; return i + 1; #else (void)(cvalue); /* Keep compiler happy; this function won't ever be */ (void)(buffer); /* called when SUPPORT_UTF8 is not defined. */ return 0; #endif }	pcreoutf.c	65
pcrerefc.c
Type	Function	Source	Line
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION	pcre_refcount(pcre *argument_re, int adjust) PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_refcount(pcre *argument_re, int adjust) { real_pcre *re = (real_pcre *)argument_re; if (re == NULL) return PCRE_ERROR_NULL; re->ref_count = (-adjust > re->ref_count)? 0 : (adjust + re->ref_count > 65535)? 65535 : re->ref_count + adjust; return re->ref_count; }	pcrerefc.c	71
pcrestud.c
Type	Function	Source	Line
STATIC VOID	set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd) static void set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd) { start_bits[c/8] \|= (1 << (c&7)); if (caseless && (cd->ctypes[c] & ctype_letter) != 0) start_bits[cd->fcc[c]/8] \|= (1 << (cd->fcc[c]&7)); }	pcrestud.c	73
STATIC INT	set_start_bits(const uschar code, uschar start_bits, BOOL caseless, BOOL utf8, compile_data cd) static int set_start_bits(const uschar code, uschar start_bits, BOOL caseless, BOOL utf8, compile_data cd) { register int c; int yield = SSB_DONE; #if 0 /* ========================================================================= / / The following comment and code was inserted in January 1999. In May 2006, when it was observed to cause compiler warnings about unused values, I took it out again. If anybody is still using OS/2, they will have to put it back manually. / / This next statement and the later reference to dummy are here in order to trick the optimizer of the IBM C compiler for OS/2 into generating correct code. Apparently IBM isn't going to fix the problem, and we would rather not disable optimization (in this module it actually makes a big difference, and the pcre module can use all the optimization it can get). / volatile int dummy; / ========================================================================= / #endif do { const uschar tcode = code + (((int)code == OP_CBRA)? 3:1) + LINK_SIZE; BOOL try_next = TRUE; while (try_next) / Loop for items in this branch / { int rc; switch(tcode) { /* Fail if we reach something we don't understand / default: return SSB_FAIL; / If we hit a bracket or a positive lookahead assertion, recurse to set bits from within the subpattern. If it can't find anything, we have to give up. If it finds some mandatory character(s), we are done for this branch. Otherwise, carry on scanning after the subpattern. 
/ case OP_BRA: case OP_SBRA: case OP_CBRA: case OP_SCBRA: case OP_ONCE: case OP_ASSERT: rc = set_start_bits(tcode, start_bits, caseless, utf8, cd); if (rc == SSB_FAIL) return SSB_FAIL; if (rc == SSB_DONE) try_next = FALSE; else { do tcode += GET(tcode, 1); while (tcode == OP_ALT); tcode += 1 + LINK_SIZE; } break; /* If we hit ALT or KET, it means we haven't found anything mandatory in this branch, though we might have found something optional. For ALT, we continue with the next alternative, but we have to arrange that the final result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET, return SSB_CONTINUE: if this is the top level, that indicates failure, but after a nested subpattern, it causes scanning to continue. / case OP_ALT: yield = SSB_CONTINUE; try_next = FALSE; break; case OP_KET: case OP_KETRMAX: case OP_KETRMIN: return SSB_CONTINUE; / Skip over callout / case OP_CALLOUT: tcode += 2 + 2LINK_SIZE; break; /* Skip over lookbehind and negative lookahead assertions / case OP_ASSERT_NOT: case OP_ASSERTBACK: case OP_ASSERTBACK_NOT: do tcode += GET(tcode, 1); while (tcode == OP_ALT); tcode += 1 + LINK_SIZE; break; /* Skip over an option setting, changing the caseless flag / case OP_OPT: caseless = (tcode[1] & PCRE_CASELESS) != 0; tcode += 2; break; / BRAZERO does the bracket, but carries on. / case OP_BRAZERO: case OP_BRAMINZERO: if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL) return SSB_FAIL; / ========================================================================= See the comment at the head of this function concerning the next line, which was an old fudge for the benefit of OS/2. dummy = 1; ========================================================================= / do tcode += GET(tcode,1); while (tcode == OP_ALT); tcode += 1 + LINK_SIZE; break; /* SKIPZERO skips the bracket. / case OP_SKIPZERO: tcode++; do tcode += GET(tcode,1); while (tcode == OP_ALT); tcode += 1 + LINK_SIZE; break; /* Single-char * or ? 
sets the bit and tries the next item / case OP_STAR: case OP_MINSTAR: case OP_POSSTAR: case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: set_bit(start_bits, tcode[1], caseless, cd); tcode += 2; #ifdef SUPPORT_UTF8 if (utf8 && tcode[-1] >= 0xc0) tcode += _pcre_utf8_table4[tcode[-1] & 0x3f]; #endif break; / Single-char upto sets the bit and tries the next / case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: set_bit(start_bits, tcode[3], caseless, cd); tcode += 4; #ifdef SUPPORT_UTF8 if (utf8 && tcode[-1] >= 0xc0) tcode += _pcre_utf8_table4[tcode[-1] & 0x3f]; #endif break; / At least one single char sets the bit and stops / case OP_EXACT: / Fall through / tcode += 2; case OP_CHAR: case OP_CHARNC: case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: set_bit(start_bits, tcode[1], caseless, cd); try_next = FALSE; break; / Single character type sets the bits and stops / case OP_NOT_DIGIT: for (c = 0; c < 32; c++) start_bits[c] \|= ~cd->cbits[c+cbit_digit]; try_next = FALSE; break; case OP_DIGIT: for (c = 0; c < 32; c++) start_bits[c] \|= cd->cbits[c+cbit_digit]; try_next = FALSE; break; / The cbit_space table has vertical tab as whitespace; we have to discard it. / case OP_NOT_WHITESPACE: for (c = 0; c < 32; c++) { int d = cd->cbits[c+cbit_space]; if (c == 1) d &= ~0x08; start_bits[c] \|= ~d; } try_next = FALSE; break; / The cbit_space table has vertical tab as whitespace; we have to discard it. / case OP_WHITESPACE: for (c = 0; c < 32; c++) { int d = cd->cbits[c+cbit_space]; if (c == 1) d &= ~0x08; start_bits[c] \|= d; } try_next = FALSE; break; case OP_NOT_WORDCHAR: for (c = 0; c < 32; c++) start_bits[c] \|= ~cd->cbits[c+cbit_word]; try_next = FALSE; break; case OP_WORDCHAR: for (c = 0; c < 32; c++) start_bits[c] \|= cd->cbits[c+cbit_word]; try_next = FALSE; break; / One or more character type fudges the pointer and restarts, knowing it will hit a single character type and stop there. 
/ case OP_TYPEPLUS: case OP_TYPEMINPLUS: tcode++; break; case OP_TYPEEXACT: tcode += 3; break; / Zero or more repeats of character types set the bits and then try again. / case OP_TYPEUPTO: case OP_TYPEMINUPTO: case OP_TYPEPOSUPTO: tcode += 2; / Fall through / case OP_TYPESTAR: case OP_TYPEMINSTAR: case OP_TYPEPOSSTAR: case OP_TYPEQUERY: case OP_TYPEMINQUERY: case OP_TYPEPOSQUERY: switch(tcode[1]) { case OP_ANY: case OP_ALLANY: return SSB_FAIL; case OP_NOT_DIGIT: for (c = 0; c < 32; c++) start_bits[c] \|= ~cd->cbits[c+cbit_digit]; break; case OP_DIGIT: for (c = 0; c < 32; c++) start_bits[c] \|= cd->cbits[c+cbit_digit]; break; / The cbit_space table has vertical tab as whitespace; we have to discard it. / case OP_NOT_WHITESPACE: for (c = 0; c < 32; c++) { int d = cd->cbits[c+cbit_space]; if (c == 1) d &= ~0x08; start_bits[c] \|= ~d; } break; / The cbit_space table has vertical tab as whitespace; we have to discard it. / case OP_WHITESPACE: for (c = 0; c < 32; c++) { int d = cd->cbits[c+cbit_space]; if (c == 1) d &= ~0x08; start_bits[c] \|= d; } break; case OP_NOT_WORDCHAR: for (c = 0; c < 32; c++) start_bits[c] \|= ~cd->cbits[c+cbit_word]; break; case OP_WORDCHAR: for (c = 0; c < 32; c++) start_bits[c] \|= cd->cbits[c+cbit_word]; break; } tcode += 2; break; / Character class where all the information is in a bit map: set the bits and either carry on or not, according to the repeat count. If it was a negative class, and we are operating with UTF-8 characters, any byte with a value >= 0xc4 is a potentially valid starter because it starts a character with a value > 255. / case OP_NCLASS: #ifdef SUPPORT_UTF8 if (utf8) { start_bits[24] \|= 0xf0; / Bits for 0xc4 - 0xc8 / memset(start_bits+25, 0xff, 7); / Bits for 0xc9 - 0xff / } #endif / Fall through / case OP_CLASS: { tcode++; / In UTF-8 mode, the bits in a bit map correspond to character values, not to byte values. However, the bit map we are constructing is for byte values. 
So we have to do a conversion for characters whose value is > 127. In fact, there are only two possible starting bytes for characters in the range 128 - 255. / #ifdef SUPPORT_UTF8 if (utf8) { for (c = 0; c < 16; c++) start_bits[c] \|= tcode[c]; for (c = 128; c < 256; c++) { if ((tcode[c/8] && (1 << (c&7))) != 0) { int d = (c >> 6) \| 0xc0; / Set bit for this starter / start_bits[d/8] \|= (1 << (d&7)); / and then skip on to the / c = (c & 0xc0) + 0x40 - 1; / next relevant character. / } } } / In non-UTF-8 mode, the two bit maps are completely compatible. / else #endif { for (c = 0; c < 32; c++) start_bits[c] \|= tcode[c]; } / Advance past the bit map, and act on what follows / tcode += 32; switch (tcode) { case OP_CRSTAR: case OP_CRMINSTAR: case OP_CRQUERY: case OP_CRMINQUERY: tcode++; break; case OP_CRRANGE: case OP_CRMINRANGE: if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5; else try_next = FALSE; break; default: try_next = FALSE; break; } } break; /* End of bitmap class handling / } / End of switch / } / End of try_next loop / code += GET(code, 1); / Advance to next branch / } while (code == OP_ALT); return yield; }	pcrestud.c	107
PCRE_EXP_DEFN PCRE_EXTRA * PCRE_CALL_CONVENTION	pcre_study(const pcre external_re, int options, const char errorptr) PCRE_EXP_DEFN pcre_extra PCRE_CALL_CONVENTION pcre_study(const pcre external_re, int options, const char errorptr) { uschar start_bits[32]; pcre_extra extra; pcre_study_data study; const uschar tables; uschar code; compile_data compile_block; const real_pcre re = (const real_pcre )external_re; errorptr = NULL; if (re == NULL \|\| re->magic_number != MAGIC_NUMBER) { errorptr = "argument is not a compiled regular expression"; return NULL; } if ((options & ~PUBLIC_STUDY_OPTIONS) != 0) { errorptr = "unknown or incorrect option bit(s) set"; return NULL; } code = (uschar )re + re->name_table_offset + (re->name_count re->name_entry_size); /* For an anchored pattern, or an unanchored pattern that has a first char, or a multiline pattern that matches only at "line starts", no further processing at present. / if ((re->options & PCRE_ANCHORED) != 0 \|\| (re->flags & (PCRE_FIRSTSET\|PCRE_STARTLINE)) != 0) return NULL; / Set the character tables in the block that is passed around / tables = re->tables; if (tables == NULL) (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, (void )(&tables)); compile_block.lcc = tables + lcc_offset; compile_block.fcc = tables + fcc_offset; compile_block.cbits = tables + cbits_offset; compile_block.ctypes = tables + ctypes_offset; /* See if we can find a fixed set of initial characters for the pattern. / memset(start_bits, 0, 32 sizeof(uschar)); if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL; /* Get a pcre_extra block and a pcre_study_data block. The study data is put in the latter, which is pointed to by the former, which may also get additional data set later by the calling program. At the moment, the size of pcre_study_data is fixed. 
We nevertheless save it in a field for returning via the pcre_fullinfo() function so that if it becomes variable in the future, we don't have to change that code. / extra = (pcre_extra )(pcre_malloc) (sizeof(pcre_extra) + sizeof(pcre_study_data)); if (extra == NULL) { errorptr = "failed to get memory"; return NULL; } study = (pcre_study_data )((char *)extra + sizeof(pcre_extra)); extra->flags = PCRE_EXTRA_STUDY_DATA; extra->study_data = study; study->size = sizeof(pcre_study_data); study->options = PCRE_STUDY_MAPPED; memcpy(study->start_bits, start_bits, sizeof(start_bits)); return extra; }	pcrestud.c	507
pcretryf.c
Type	Function	Source	Line
STATIC UNSIGNED LONG INT	byteflip(unsigned long int value, int n) static unsigned long int byteflip(unsigned long int value, int n) { if (n == 2) return ((value & 0x00ff) << 8) \| ((value & 0xff00) >> 8); return ((value & 0x000000ff) << 24) \| ((value & 0x0000ff00) << 8) \| ((value & 0x00ff0000) >> 8) \| ((value & 0xff000000) >> 24); }	pcretryf.c	69
REAL_PCRE *	_pcre_try_flipped(const real_pcre *re, real_pcre *internal_re, const pcre_study_data *study, pcre_study_data *internal_study) real_pcre * _pcre_try_flipped(const real_pcre *re, real_pcre *internal_re, const pcre_study_data *study, pcre_study_data *internal_study) { if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER) return NULL; *internal_re = *re; /* To copy other fields */ internal_re->size = byteflip(re->size, sizeof(re->size)); internal_re->options = byteflip(re->options, sizeof(re->options)); internal_re->flags = (pcre_uint16)byteflip(re->flags, sizeof(re->flags)); internal_re->top_bracket = (pcre_uint16)byteflip(re->top_bracket, sizeof(re->top_bracket)); internal_re->top_backref = (pcre_uint16)byteflip(re->top_backref, sizeof(re->top_backref)); internal_re->first_byte = (pcre_uint16)byteflip(re->first_byte, sizeof(re->first_byte)); internal_re->req_byte = (pcre_uint16)byteflip(re->req_byte, sizeof(re->req_byte)); internal_re->name_table_offset = (pcre_uint16)byteflip(re->name_table_offset, sizeof(re->name_table_offset)); internal_re->name_entry_size = (pcre_uint16)byteflip(re->name_entry_size, sizeof(re->name_entry_size)); internal_re->name_count = (pcre_uint16)byteflip(re->name_count, sizeof(re->name_count)); if (study != NULL) { *internal_study = *study; /* To copy other fields */ internal_study->size = byteflip(study->size, sizeof(study->size)); internal_study->options = byteflip(study->options, sizeof(study->options)); } return internal_re; }	pcretryf.c	101
pcrever.c
Type	Function	Source	Line
PCRE_EXP_DEFN CONST CHAR * PCRE_CALL_CONVENTION	pcre_version(void) PCRE_EXP_DEFN const char * PCRE_CALL_CONVENTION pcre_version(void) { return (XSTRING(Z PCRE_PRERELEASE)[1] == 0)? XSTRING(PCRE_MAJOR.PCRE_MINOR PCRE_DATE) : XSTRING(PCRE_MAJOR.PCRE_MINOR) XSTRING(PCRE_PRERELEASE PCRE_DATE); }	pcrever.c	82
pcrevutf.c
Type	Function	Source	Line
INT	_pcre_valid_utf8(const uschar string, int length) int _pcre_valid_utf8(const uschar string, int length) { #ifdef SUPPORT_UTF8 register const uschar p; if (length < 0) { for (p = string; p != 0; p++); length = p - string; } for (p = string; length-- > 0; p++) { register int ab; register int c = p; if (c < 128) continue; if (c < 0xc0) return p - string; ab = _pcre_utf8_table4[c & 0x3f]; / Number of additional bytes / if (length < ab \|\| ab > 3) return p - string; length -= ab; / Check top bits in the second byte / if (((++p) & 0xc0) != 0x80) return p - string; /* Check for overlong sequences for each different length, and for the excluded range 0xd000 to 0xdfff. / switch (ab) { / Check for xx00 000x (overlong sequence) / case 1: if ((c & 0x3e) == 0) return p - string; continue; / We know there aren't any more bytes to check / / Check for 1110 0000, xx0x xxxx (overlong sequence) or 1110 1101, 1010 xxxx (0xd000 - 0xdfff) / case 2: if ((c == 0xe0 && (p & 0x20) == 0) \|\| (c == 0xed && p >= 0xa0)) return p - string; break; / Check for 1111 0000, xx00 xxxx (overlong sequence) or greater than 0x0010ffff (f4 8f bf bf) / case 3: if ((c == 0xf0 && (p & 0x30) == 0) \|\| (c > 0xf4 ) \|\| (c == 0xf4 && p > 0x8f)) return p - string; break; #if 0 / These cases can no longer occur, as we restrict to a maximum of four bytes nowadays. Leave the code here in case we ever want to add an option for longer sequences. / / Check for 1111 1000, xx00 0xxx / case 4: if (c == 0xf8 && (p & 0x38) == 0) return p - string; break; /* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx / case 5: if (c == 0xfe \|\| c == 0xff \|\| (c == 0xfc && (p & 0x3c) == 0)) return p - string; break; #endif } /* Check for valid bytes after the 2nd, if any; all must start 10 / while (--ab > 0) { if (((++p) & 0xc0) != 0x80) return p - string; } } #else (void)(string); /* Keep picky compilers happy */ (void)(length); #endif return -1; }	pcrevutf.c	77
pcrexcls.c
Type	Function	Source	Line
BOOL	_pcre_xclass(int c, const uschar data) BOOL _pcre_xclass(int c, const uschar data) { int t; BOOL negated = (data & XCL_NOT) != 0; / Character values < 256 are matched against a bitmap, if one is present. If not, we still carry on, because there may be ranges that start below 256 in the additional data. / if (c < 256) { if ((data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0) return !negated; /* char found / } / First skip the bit map if present. Then match against the list of Unicode properties or large chars or ranges that end with a large char. We won't ever encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. / if ((data++ & XCL_MAP) != 0) data += 32; while ((t = data++) != XCL_END) { int x, y; if (t == XCL_SINGLE) { GETCHARINC(x, data); if (c == x) return !negated; } else if (t == XCL_RANGE) { GETCHARINC(x, data); GETCHARINC(y, data); if (c >= x && c <= y) return !negated; } #ifdef SUPPORT_UCP else / XCL_PROP & XCL_NOTPROP / { const ucd_record prop = GET_UCD(c); switch(data) { case PT_ANY: if (t == XCL_PROP) return !negated; break; case PT_LAMP: if ((prop->chartype == ucp_Lu \|\| prop->chartype == ucp_Ll \|\| prop->chartype == ucp_Lt) == (t == XCL_PROP)) return !negated; break; case PT_GC: if ((data[1] == _pcre_ucp_gentype[prop->chartype]) == (t == XCL_PROP)) return !negated; break; case PT_PC: if ((data[1] == prop->chartype) == (t == XCL_PROP)) return !negated; break; case PT_SC: if ((data[1] == prop->script) == (t == XCL_PROP)) return !negated; break; / This should never occur, but compilers may mutter if there is no default. / default: return FALSE; } data += 2; } #endif / SUPPORT_UCP / } return negated; / char did not match */ }	pcrexcls.c	67
_hbpcreg.c
Type	Function	Source	Line
STATIC VOID *	hb_pcre_grab( size_t size ) static void * hb_pcre_grab( size_t size ) { return hb_xgrab( size ); } #if 1 #include "_hbconf.h" #endif #include "pcreinal.h" #ifndef VPCOMPAT HB_EXTERN_BEGIN PCRE_EXP_DATA_DEFN void *(*pcre_malloc)(size_t) = hb_pcre_grab; PCRE_EXP_DATA_DEFN void (*pcre_free)(void *) = hb_xfree; PCRE_EXP_DATA_DEFN void *(*pcre_stack_malloc)(size_t) = hb_pcre_grab; PCRE_EXP_DATA_DEFN void (*pcre_stack_free)(void *) = hb_xfree; PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL; HB_EXTERN_END	_hbpcreg.c	58

Page url: http://www.yourdomain.com/help/index.html?hbpcre.htm