pcrecomp.c | |||
Type | Function | Source | Line |
---|---|---|---|
STATIC CONST CHAR * | find_error_text(int n)
static const char * find_error_text(int n) { const char *s = error_texts; for (; n > 0; n--) while (*s++ != 0) {}; return s; } | pcrecomp.c | 454 |
STATIC INT | check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount, int options, BOOL isclass)
static int check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount, int options, BOOL isclass) { BOOL utf8 = (options & PCRE_UTF8) != 0; const uschar *ptr = *ptrptr + 1; int c, i; GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */ ptr--; /* Set pointer back to the last byte */ /* If backslash is at the end of the pattern, it's an error. */ if (c == 0) *errorcodeptr = ERR1; /* Non-alphanumerics are literals. For digits or letters, do an initial lookup in a table. A non-zero result is something that can be returned immediately. Otherwise further processing may be required. */ #ifndef EBCDIC /* ASCII coding */ else if (c < '0' || c > 'z') {} /* Not alphanumeric */ else if ((i = escapes[c - '0']) != 0) c = i; #else /* EBCDIC coding */ else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */ else if ((i = escapes[c - 0x48]) != 0) c = i; #endif /* Escapes that need further processing, or are illegal. */ else { const uschar *oldptr; BOOL braced, negated; switch (c) { /* A number of Perl escapes are not handled by PCRE. We give an explicit error. */ case 'l': case 'L': case 'N': case 'u': case 'U': *errorcodeptr = ERR37; break; /* \g must be followed by one of a number of specific things: (1) A number, either plain or braced. If positive, it is an absolute backreference. If negative, it is a relative backreference. This is a Perl 5.10 feature. (2) Perl 5.10 also supports \g{name} as a reference to a named group. This is part of Perl's movement towards a unified syntax for back references. As this is synonymous with \k{name}, we fudge it up by pretending it really was \k. (3) For Oniguruma compatibility we also support \g followed by a name or a number either in angle brackets or in single quotes. However, these are (possibly recursive) subroutine calls, _not_ backreferences. Just return the -ESC_g code (cf \k). */ case 'g': if (ptr[1] == '<' || ptr[1] == '\'') { c = -ESC_g; break; } /* Handle the Perl-compatible cases */ if (ptr[1] == '{') { const uschar *p; for (p = ptr+2; *p != 0 && *p != '}'; p++) if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break; if (*p != 0 && *p != '}') { c = -ESC_k; break; } braced = TRUE; ptr++; } else braced = FALSE; if (ptr[1] == '-') { negated = TRUE; ptr++; } else negated = FALSE; c = 0; while ((digitab[ptr[1]] & ctype_digit) != 0) c = c * 10 + *(++ptr) - '0'; if (c < 0) /* Integer overflow */ { *errorcodeptr = ERR61; break; } if (braced && *(++ptr) != '}') { *errorcodeptr = ERR57; break; } if (c == 0) { *errorcodeptr = ERR58; break; } if (negated) { if (c > bracount) { *errorcodeptr = ERR15; break; } c = bracount - (c - 1); } c = -(ESC_REF + c); break; /* The handling of escape sequences consisting of a string of digits starting with one that is not zero is not straightforward. By experiment, the way Perl works seems to be as follows: Outside a character class, the digits are read as a decimal number. If the number is less than 10, or if there are that many previous extracting left brackets, then it is a back reference. Otherwise, up to three octal digits are read to form an escaped byte. Thus \123 is likely to be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal value is greater than 377, the least significant 8 bits are taken. Inside a character class, \ followed by a digit is always an octal number. */ case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if (!isclass) { oldptr = ptr; c -= '0'; while ((digitab[ptr[1]] & ctype_digit) != 0) c = c * 10 + *(++ptr) - '0'; if (c < 0) /* Integer overflow */ { *errorcodeptr = ERR61; break; } if (c < 10 || c <= bracount) { c = -(ESC_REF + c); break; } ptr = oldptr; /* Put the pointer back and fall through */ } /* Handle an octal number following \. If the first digit is 8 or 9, Perl generates a binary zero byte and treats the digit as a following literal. Thus we have to pull back the pointer by one. */ if ((c = *ptr) >= '8') { ptr--; c = 0; break; } /* \0 always starts an octal number, but we may drop through to here with a larger first octal digit. The original code used just to take the least significant 8 bits of octal numbers (I think this is what early Perls used to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more than 3 octal digits. */ case '0': c -= '0'; while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7') c = c * 8 + *(++ptr) - '0'; if (!utf8 && c > 255) *errorcodeptr = ERR51; break; /* \x is complicated. \x{ddd} is a character number which can be greater than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is treated as a data character. */ case 'x': if (ptr[1] == '{') { const uschar *pt = ptr + 2; int count = 0; c = 0; while ((digitab[*pt] & ctype_xdigit) != 0) { register int cc = *pt++; if (c == 0 && cc == '0') continue; /* Leading zeroes */ count++; #ifndef EBCDIC /* ASCII coding */ if (cc >= 'a') cc -= 32; /* Convert to upper case */ c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10)); #else /* EBCDIC coding */ if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */ c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10)); #endif } if (*pt == '}') { if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34; ptr = pt; break; } /* If the sequence of hex digits does not end with '}', then we don't recognize this construct; fall through to the normal \x handling. */ } /* Read just a single-byte hex-defined char */ c = 0; while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0) { int cc; /* Some compilers don't like ++ */ cc = *(++ptr); /* in initializers */ #ifndef EBCDIC /* ASCII coding */ if (cc >= 'a') cc -= 32; /* Convert to upper case */ c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10)); #else /* EBCDIC coding */ if (cc <= 'z') cc += 64; /* Convert to upper case */ c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10)); #endif } break; /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped. This coding is ASCII-specific, but then the whole concept of \cx is ASCII-specific. (However, an EBCDIC equivalent has now been added.) */ case 'c': c = *(++ptr); if (c == 0) { *errorcodeptr = ERR2; break; } #ifndef EBCDIC /* ASCII coding */ if (c >= 'a' && c <= 'z') c -= 32; c ^= 0x40; #else /* EBCDIC coding */ if (c >= 'a' && c <= 'z') c += 64; c ^= 0xC0; #endif break; /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any other alphanumeric following \ is an error if PCRE_EXTRA was set; otherwise, for Perl compatibility, it is a literal. This code looks a bit odd, but there used to be some cases other than the default, and there may be again in future, so I haven't "optimized" it. */ default: if ((options & PCRE_EXTRA) != 0) switch(c) { default: *errorcodeptr = ERR3; break; } break; } } *ptrptr = ptr; return c; } #ifdef SUPPORT_UCP | pcrecomp.c | 487 |
STATIC INT | get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
static int get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr) { int c, i, bot, top; const uschar *ptr = *ptrptr; char name[32]; c = *(++ptr); if (c == 0) goto ERROR_RETURN; *negptr = FALSE; /* \P or \p can be followed by a name in {}, optionally preceded by ^ for negation. */ if (c == '{') { if (ptr[1] == '^') { *negptr = TRUE; ptr++; } for (i = 0; i < (int)sizeof(name) - 1; i++) { c = *(++ptr); if (c == 0) goto ERROR_RETURN; if (c == '}') break; name[i] = c; } if (c !='}') goto ERROR_RETURN; name[i] = 0; } /* Otherwise there is just one following character */ else { name[0] = c; name[1] = 0; } *ptrptr = ptr; /* Search for a recognized property name using binary chop */ bot = 0; top = _pcre_utt_size; while (bot < top) { i = (bot + top) >> 1; c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset); if (c == 0) { *dptr = _pcre_utt[i].value; return _pcre_utt[i].type; } if (c > 0) bot = i + 1; else top = i; } *errorcodeptr = ERR47; *ptrptr = ptr; return -1; ERROR_RETURN: *errorcodeptr = ERR46; *ptrptr = ptr; return -1; } #endif | pcrecomp.c | 792 |
STATIC BOOL | is_counted_repeat(const uschar *p)
static BOOL is_counted_repeat(const uschar *p) { if ((digitab[*p++] & ctype_digit) == 0) return FALSE; while ((digitab[*p] & ctype_digit) != 0) p++; if (*p == '}') return TRUE; if (*p++ != ',') return FALSE; if (*p == '}') return TRUE; if ((digitab[*p++] & ctype_digit) == 0) return FALSE; while ((digitab[*p] & ctype_digit) != 0) p++; return (*p == '}'); } | pcrecomp.c | 881 |
STATIC CONST USCHAR * | read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
static const uschar * read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr) { int min = 0; int max = -1; /* Read the minimum value and do a paranoid check: a negative value indicates an integer overflow. */ while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0'; if (min < 0 || min > 65535) { *errorcodeptr = ERR5; return p; } /* Read the maximum value if there is one, and again do a paranoid on its size. Also, max must not be less than min. */ if (*p == '}') max = min; else { if (*(++p) != '}') { max = 0; while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0'; if (max < 0 || max > 65535) { *errorcodeptr = ERR5; return p; } if (max < min) { *errorcodeptr = ERR4; return p; } } } /* Fill in the required variables, and pass back the pointer to the terminating '}'. */ *minp = min; *maxp = max; return p; } | pcrecomp.c | 918 |
STATIC INT | find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn, BOOL xmode)
static int find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn, BOOL xmode) { const uschar *thisname; int count = cd->bracount; for (; *ptr != 0; ptr++) { int term; /* Skip over backslashed characters and also entire \Q...\E */ if (*ptr == '\\') { if (*(++ptr) == 0) return -1; if (*ptr == 'Q') for (;;) { while (*(++ptr) != 0 && *ptr != '\\') {}; if (*ptr == 0) return -1; if (*(++ptr) == 'E') break; } continue; } /* Skip over character classes; this logic must be similar to the way they are handled for real. If the first character is '^', skip it. Also, if the first few characters (either before or after ^) are \Q\E or \E we skip them too. This makes for compatibility with Perl. */ if (*ptr == '[') { BOOL negate_class = FALSE; for (;;) { int c = *(++ptr); if (c == '\\') { if (ptr[1] == 'E') ptr++; else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3; else break; } else if (!negate_class && c == '^') negate_class = TRUE; else break; } /* If the next character is ']', it is a data character that must be skipped, except in JavaScript compatibility mode. */ if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0) ptr++; while (*(++ptr) != ']') { if (*ptr == 0) return -1; if (*ptr == '\\') { if (*(++ptr) == 0) return -1; if (*ptr == 'Q') for (;;) { while (*(++ptr) != 0 && *ptr != '\\') {}; if (*ptr == 0) return -1; if (*(++ptr) == 'E') break; } continue; } } continue; } /* Skip comments in /x mode */ if (xmode && *ptr == '#') { while (*(++ptr) != 0 && *ptr != '\n') {}; if (*ptr == 0) return -1; continue; } /* An opening parens must now be a real metacharacter */ if (*ptr != '(') continue; if (ptr[1] != '?' && ptr[1] != '*') { count++; if (name == NULL && count == lorn) return count; continue; } ptr += 2; if (*ptr == 'P') ptr++; /* Allow optional P */ /* We have to disambiguate (? */ if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') && *ptr != '\'') continue; count++; if (name == NULL && count == lorn) return count; term = *ptr++; if (term == '<') term = '>'; thisname = ptr; while (*ptr != term) ptr++; if (name != NULL && lorn == ptr - thisname && strncmp((const char *)name, (const char *)thisname, lorn) == 0) return count; } return -1; } | pcrecomp.c | 987 |
STATIC CONST USCHAR* | first_significant_code(const uschar *code, int *options, int optbit, BOOL skipassert)
static const uschar* first_significant_code(const uschar *code, int *options, int optbit, BOOL skipassert) { for (;;) { switch ((int)*code) { case OP_OPT: if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit)) *options = (int)code[1]; code += 2; break; case OP_ASSERT_NOT: case OP_ASSERTBACK: case OP_ASSERTBACK_NOT: if (!skipassert) return code; do code += GET(code, 1); while (*code == OP_ALT); code += _pcre_OP_lengths[*code]; break; case OP_WORD_BOUNDARY: case OP_NOT_WORD_BOUNDARY: if (!skipassert) return code; /* Fall through */ case OP_CALLOUT: case OP_CREF: case OP_RREF: case OP_DEF: code += _pcre_OP_lengths[*code]; break; default: return code; } } /* Control never reaches here */ } | pcrecomp.c | 1123 |
STATIC INT | find_fixedlength(uschar *code, int options)
static int find_fixedlength(uschar *code, int options) { int length = -1; register int branchlength = 0; register uschar *cc = code + 1 + LINK_SIZE; /* Scan along the opcodes for this branch. If we get to the end of the branch, check the length against that of the other branches. */ for (;;) { int d; register int op = *cc; switch (op) { case OP_CBRA: case OP_BRA: case OP_ONCE: case OP_COND: d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options); if (d < 0) return d; branchlength += d; do cc += GET(cc, 1); while (*cc == OP_ALT); cc += 1 + LINK_SIZE; break; /* Reached end of a branch; if it's a ket it is the end of a nested call. If it's ALT it is an alternation in a nested call. If it is END it's the end of the outer call. All can be handled by the same code. */ case OP_ALT: case OP_KET: case OP_KETRMAX: case OP_KETRMIN: case OP_END: if (length < 0) length = branchlength; else if (length != branchlength) return -1; if (*cc != OP_ALT) return length; cc += 1 + LINK_SIZE; branchlength = 0; break; /* Skip over assertive subpatterns */ case OP_ASSERT: case OP_ASSERT_NOT: case OP_ASSERTBACK: case OP_ASSERTBACK_NOT: do cc += GET(cc, 1); while (*cc == OP_ALT); /* Fall through */ /* Skip over things that don't match chars */ case OP_REVERSE: case OP_CREF: case OP_RREF: case OP_DEF: case OP_OPT: case OP_CALLOUT: case OP_SOD: case OP_SOM: case OP_EOD: case OP_EODN: case OP_CIRC: case OP_DOLL: case OP_NOT_WORD_BOUNDARY: case OP_WORD_BOUNDARY: cc += _pcre_OP_lengths[*cc]; break; /* Handle literal characters */ case OP_CHAR: case OP_CHARNC: case OP_NOT: branchlength++; cc += 2; #ifdef SUPPORT_UTF8 if ((options & PCRE_UTF8) != 0) { while ((*cc & 0xc0) == 0x80) cc++; } #endif break; /* Handle exact repetitions. The count is already in characters, but we need to skip over a multibyte character in UTF8 mode. */ case OP_EXACT: branchlength += GET2(cc,1); cc += 4; #ifdef SUPPORT_UTF8 if ((options & PCRE_UTF8) != 0) { while((*cc & 0x80) == 0x80) cc++; } #endif break; case OP_TYPEEXACT: branchlength += GET2(cc,1); if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2; cc += 4; break; /* Handle single-char matchers */ case OP_PROP: case OP_NOTPROP: cc += 2; /* Fall through */ case OP_NOT_DIGIT: case OP_DIGIT: case OP_NOT_WHITESPACE: case OP_WHITESPACE: case OP_NOT_WORDCHAR: case OP_WORDCHAR: case OP_ANY: case OP_ALLANY: branchlength++; cc++; break; /* The single-byte matcher isn't allowed */ case OP_ANYBYTE: return -2; /* Check a class for variable quantification */ #ifdef SUPPORT_UTF8 case OP_XCLASS: cc += GET(cc, 1) - 33; /* Fall through */ #endif case OP_CLASS: case OP_NCLASS: cc += 33; switch (*cc) { case OP_CRSTAR: case OP_CRMINSTAR: case OP_CRQUERY: case OP_CRMINQUERY: return -1; case OP_CRRANGE: case OP_CRMINRANGE: if (GET2(cc,1) != GET2(cc,3)) return -1; branchlength += GET2(cc,1); cc += 5; break; default: branchlength++; } break; /* Anything else is variable length */ default: return -1; } } /* Control never gets here */ } | pcrecomp.c | 1183 |
STATIC CONST USCHAR * | find_bracket(const uschar *code, BOOL utf8, int number)
static const uschar * find_bracket(const uschar *code, BOOL utf8, int number) { for (;;) { register int c = *code; if (c == OP_END) return NULL; /* XCLASS is used for classes that cannot be represented just by a bit map. This includes negated single high-valued characters. The length in the table is zero; the actual length is stored in the compiled code. */ if (c == OP_XCLASS) code += GET(code, 1); /* Handle capturing bracket */ else if (c == OP_CBRA) { int n = GET2(code, 1+LINK_SIZE); if (n == number) return (uschar *)code; code += _pcre_OP_lengths[c]; } /* Otherwise, we can get the item's length from the table, except that for repeated character types, we have to test for \p and \P, which have an extra two bytes of parameters. */ else { switch(c) { case OP_TYPESTAR: case OP_TYPEMINSTAR: case OP_TYPEPLUS: case OP_TYPEMINPLUS: case OP_TYPEQUERY: case OP_TYPEMINQUERY: case OP_TYPEPOSSTAR: case OP_TYPEPOSPLUS: case OP_TYPEPOSQUERY: if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; break; case OP_TYPEUPTO: case OP_TYPEMINUPTO: case OP_TYPEEXACT: case OP_TYPEPOSUPTO: if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; break; } /* Add in the fixed length from the table */ code += _pcre_OP_lengths[c]; /* In UTF-8 mode, opcodes that are followed by a character may be followed by a multi-byte character. The length in the table is a minimum, so we have to arrange to skip the extra bytes. */ #ifdef SUPPORT_UTF8 if (utf8) switch(c) { case OP_CHAR: case OP_CHARNC: case OP_EXACT: case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: case OP_STAR: case OP_MINSTAR: case OP_POSSTAR: case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; break; } #else (void)(utf8); /* Keep compiler happy by referencing function argument */ #endif } } } | pcrecomp.c | 1373 |
STATIC CONST USCHAR * | find_recurse(const uschar *code, BOOL utf8)
static const uschar * find_recurse(const uschar *code, BOOL utf8) { for (;;) { register int c = *code; if (c == OP_END) return NULL; if (c == OP_RECURSE) return code; /* XCLASS is used for classes that cannot be represented just by a bit map. This includes negated single high-valued characters. The length in the table is zero; the actual length is stored in the compiled code. */ if (c == OP_XCLASS) code += GET(code, 1); /* Otherwise, we can get the item's length from the table, except that for repeated character types, we have to test for \p and \P, which have an extra two bytes of parameters. */ else { switch(c) { case OP_TYPESTAR: case OP_TYPEMINSTAR: case OP_TYPEPLUS: case OP_TYPEMINPLUS: case OP_TYPEQUERY: case OP_TYPEMINQUERY: case OP_TYPEPOSSTAR: case OP_TYPEPOSPLUS: case OP_TYPEPOSQUERY: if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; break; case OP_TYPEPOSUPTO: case OP_TYPEUPTO: case OP_TYPEMINUPTO: case OP_TYPEEXACT: if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; break; } /* Add in the fixed length from the table */ code += _pcre_OP_lengths[c]; /* In UTF-8 mode, opcodes that are followed by a character may be followed by a multi-byte character. The length in the table is a minimum, so we have to arrange to skip the extra bytes. */ #ifdef SUPPORT_UTF8 if (utf8) switch(c) { case OP_CHAR: case OP_CHARNC: case OP_EXACT: case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: case OP_STAR: case OP_MINSTAR: case OP_POSSTAR: case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f]; break; } #else (void)(utf8); /* Keep compiler happy by referencing function argument */ #endif } } } | pcrecomp.c | 1476 |
STATIC BOOL | could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
static BOOL could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8) { register int c; for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE); code < endcode; code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE)) { const uschar *ccode; c = *code; /* Skip over forward assertions; the other assertions are skipped by first_significant_code() with a TRUE final argument. */ if (c == OP_ASSERT) { do code += GET(code, 1); while (*code == OP_ALT); c = *code; continue; } /* Groups with zero repeats can of course be empty; skip them. */ if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO) { code += _pcre_OP_lengths[c]; do code += GET(code, 1); while (*code == OP_ALT); c = *code; continue; } /* For other groups, scan the branches. */ if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND) { BOOL empty_branch; if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */ /* Scan a closed bracket */ empty_branch = FALSE; do { if (!empty_branch && could_be_empty_branch(code, endcode, utf8)) empty_branch = TRUE; code += GET(code, 1); } while (*code == OP_ALT); if (!empty_branch) return FALSE; /* All branches are non-empty */ c = *code; continue; } /* Handle the other opcodes */ switch (c) { /* Check for quantifiers after a class. XCLASS is used for classes that cannot be represented just by a bit map. This includes negated single high-valued characters. The length in _pcre_OP_lengths[] is zero; the actual length is stored in the compiled code, so we must update "code" here. */ #ifdef SUPPORT_UTF8 case OP_XCLASS: ccode = code += GET(code, 1); goto CHECK_CLASS_REPEAT; #endif case OP_CLASS: case OP_NCLASS: ccode = code + 33; #ifdef SUPPORT_UTF8 CHECK_CLASS_REPEAT: #endif switch (*ccode) { case OP_CRSTAR: /* These could be empty; continue */ case OP_CRMINSTAR: case OP_CRQUERY: case OP_CRMINQUERY: break; default: /* Non-repeat => class must match */ case OP_CRPLUS: /* These repeats aren't empty */ case OP_CRMINPLUS: return FALSE; case OP_CRRANGE: case OP_CRMINRANGE: if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */ break; } break; /* Opcodes that must match a character */ case OP_PROP: case OP_NOTPROP: case OP_EXTUNI: case OP_NOT_DIGIT: case OP_DIGIT: case OP_NOT_WHITESPACE: case OP_WHITESPACE: case OP_NOT_WORDCHAR: case OP_WORDCHAR: case OP_ANY: case OP_ALLANY: case OP_ANYBYTE: case OP_CHAR: case OP_CHARNC: case OP_NOT: case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: case OP_EXACT: case OP_NOTPLUS: case OP_NOTMINPLUS: case OP_NOTPOSPLUS: case OP_NOTEXACT: case OP_TYPEPLUS: case OP_TYPEMINPLUS: case OP_TYPEPOSPLUS: case OP_TYPEEXACT: return FALSE; /* These are going to continue, as they may be empty, but we have to fudge the length for the \p and \P cases. */ case OP_TYPESTAR: case OP_TYPEMINSTAR: case OP_TYPEPOSSTAR: case OP_TYPEQUERY: case OP_TYPEMINQUERY: case OP_TYPEPOSQUERY: if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2; break; /* Same for these */ case OP_TYPEUPTO: case OP_TYPEMINUPTO: case OP_TYPEPOSUPTO: if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2; break; /* End of branch */ case OP_KET: case OP_KETRMAX: case OP_KETRMIN: case OP_ALT: return TRUE; /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO, MINUPTO, and POSUPTO may be followed by a multibyte character */ #ifdef SUPPORT_UTF8 case OP_STAR: case OP_MINSTAR: case OP_POSSTAR: case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: if (utf8) while ((code[2] & 0xc0) == 0x80) code++; break; #endif } } return TRUE; } | pcrecomp.c | 1577 |
STATIC BOOL | could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr, BOOL utf8)
static BOOL could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr, BOOL utf8) { while (bcptr != NULL && bcptr->current >= code) { if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE; bcptr = bcptr->outer; } return TRUE; } | pcrecomp.c | 1776 |
STATIC BOOL | check_posix_syntax(const uschar *ptr, const uschar **endptr)
static BOOL check_posix_syntax(const uschar *ptr, const uschar **endptr) { int terminator; /* Don't combine these lines; the Solaris cc */ terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */ for (++ptr; *ptr != 0; ptr++) { if (*ptr == '\\' && ptr[1] == ']') ptr++; else { if (*ptr == ']') return FALSE; if (*ptr == terminator && ptr[1] == ']') { *endptr = ptr; return TRUE; } } } return FALSE; } | pcrecomp.c | 1821 |
STATIC INT | check_posix_name(const uschar *ptr, int len)
static int check_posix_name(const uschar *ptr, int len) { const char *pn = posix_names; register int yield = 0; while (posix_name_lengths[yield] != 0) { if (len == posix_name_lengths[yield] && strncmp((const char *)ptr, pn, len) == 0) return yield; pn += posix_name_lengths[yield] + 1; yield++; } return -1; } | pcrecomp.c | 1858 |
STATIC VOID | adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd, uschar *save_hwm)
static void adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd, uschar *save_hwm) { uschar *ptr = group; while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL) { int offset; uschar *hc; /* See if this recursion is on the forward reference list. If so, adjust the reference. */ for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE) { offset = GET(hc, 0); if (cd->start_code + offset == ptr + 1) { PUT(hc, 0, offset + adjust); break; } } /* Otherwise, adjust the recursion offset if it's after the start of this group. */ if (hc >= cd->hwm) { offset = GET(ptr, 1); if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust); } ptr += 1 + LINK_SIZE; } } | pcrecomp.c | 1905 |
STATIC USCHAR * | auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
static uschar * auto_callout(uschar *code, const uschar *ptr, compile_data *cd) { *code++ = OP_CALLOUT; *code++ = 255; PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */ PUT(code, LINK_SIZE, 0); /* Default length */ return code + 2*LINK_SIZE; } | pcrecomp.c | 1959 |
STATIC VOID | complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
static void complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd) { int length = ptr - cd->start_pattern - GET(previous_callout, 2); PUT(previous_callout, 2 + LINK_SIZE, length); } #ifdef SUPPORT_UCP | pcrecomp.c | 1987 |
STATIC BOOL | get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr, unsigned int *odptr)
static BOOL get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr, unsigned int *odptr) { unsigned int c, othercase, next; for (c = *cptr; c <= d; c++) { if ((othercase = UCD_OTHERCASE(c)) != c) break; } if (c > d) return FALSE; *ocptr = othercase; next = othercase + 1; for (++c; c <= d; c++) { if (UCD_OTHERCASE(c) != next) break; next++; } *odptr = next - 1; *cptr = c; return TRUE; } #endif /* SUPPORT_UCP */ | pcrecomp.c | 2015 |
STATIC BOOL | check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char, const uschar *ptr, int options, compile_data *cd)
static BOOL check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char, const uschar *ptr, int options, compile_data *cd) { int next; /* Skip whitespace and comments in extended mode */ if ((options & PCRE_EXTENDED) != 0) { for (;;) { while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; if (*ptr == '#') { while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } } else break; } } /* If the next item is one that we can handle, get its value. A non-negative value is a character, a negative value is an escape value. */ if (*ptr == '\\') { int temperrorcode = 0; next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE); if (temperrorcode != 0) return FALSE; ptr++; /* Point after the escape sequence */ } else if ((cd->ctypes[*ptr] & ctype_meta) == 0) { #ifdef SUPPORT_UTF8 if (utf8) { GETCHARINC(next, ptr); } else #endif next = *ptr++; } else return FALSE; /* Skip whitespace and comments in extended mode */ if ((options & PCRE_EXTENDED) != 0) { for (;;) { while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++; if (*ptr == '#') { while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; } } else break; } } /* If the next thing is itself optional, we have to give up. */ if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0) return FALSE; /* Now compare the next item with the previous opcode. If the previous is a positive single character match, "item" either contains the character or, if "item" is greater than 127 in utf8 mode, the character's bytes are in utf8_char. */ /* Handle cases when the next item is a character. */ if (next >= 0) switch(op_code) { case OP_CHAR: #ifdef SUPPORT_UTF8 if (utf8 && item > 127) { GETCHAR(item, utf8_char); } #else (void)(utf8_char); /* Keep compiler happy by referencing function argument */ #endif return item != next; /* For CHARNC (caseless character) we must check the other case. If we have Unicode property support, we can use it to test the other case of high-valued characters. */ case OP_CHARNC: #ifdef SUPPORT_UTF8 if (utf8 && item > 127) { GETCHAR(item, utf8_char); } #endif if (item == next) return FALSE; #ifdef SUPPORT_UTF8 if (utf8) { unsigned int othercase; if (next < 128) othercase = cd->fcc[next]; else #ifdef SUPPORT_UCP othercase = UCD_OTHERCASE((unsigned int)next); #else othercase = NOTACHAR; #endif return (unsigned int)item != othercase; } else #endif /* SUPPORT_UTF8 */ return (item != cd->fcc[next]); /* Non-UTF-8 mode */ /* For OP_NOT, "item" must be a single-byte character. */ case OP_NOT: if (item == next) return TRUE; if ((options & PCRE_CASELESS) == 0) return FALSE; #ifdef SUPPORT_UTF8 if (utf8) { unsigned int othercase; if (next < 128) othercase = cd->fcc[next]; else #ifdef SUPPORT_UCP othercase = UCD_OTHERCASE(next); #else othercase = NOTACHAR; #endif return (unsigned int)item == othercase; } else #endif /* SUPPORT_UTF8 */ return (item == cd->fcc[next]); /* Non-UTF-8 mode */ case OP_DIGIT: return next > 127 || (cd->ctypes[next] & ctype_digit) == 0; case OP_NOT_DIGIT: return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0; case OP_WHITESPACE: return next > 127 || (cd->ctypes[next] & ctype_space) == 0; case OP_NOT_WHITESPACE: return next <= 127 && (cd->ctypes[next] & ctype_space) != 0; case OP_WORDCHAR: return next > 127 || (cd->ctypes[next] & ctype_word) == 0; case OP_NOT_WORDCHAR: return next <= 127 && (cd->ctypes[next] & ctype_word) != 0; case OP_HSPACE: case OP_NOT_HSPACE: switch(next) { case 0x09: case 0x20: case 0xa0: case 0x1680: case 0x180e: case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004: case 0x2005: case 0x2006: case 0x2007: case 0x2008: case 0x2009: case 0x200A: case 0x202f: case 0x205f: case 0x3000: return op_code != OP_HSPACE; default: return op_code == OP_HSPACE; } case OP_VSPACE: case OP_NOT_VSPACE: switch(next) { case 0x0a: case 0x0b: case 0x0c: case 0x0d: case 0x85: case 0x2028: case 0x2029: return op_code != OP_VSPACE; default: return op_code == OP_VSPACE; } default: return FALSE; } /* Handle the case when the next item is \d, \s, etc. */ switch(op_code) { case OP_CHAR: case OP_CHARNC: #ifdef SUPPORT_UTF8 if (utf8 && item > 127) { GETCHAR(item, utf8_char); } #endif switch(-next) { case ESC_d: return item > 127 || (cd->ctypes[item] & ctype_digit) == 0; case ESC_D: return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0; case ESC_s: return item > 127 || (cd->ctypes[item] & ctype_space) == 0; case ESC_S: return item <= 127 && (cd->ctypes[item] & ctype_space) != 0; case ESC_w: return item > 127 || (cd->ctypes[item] & ctype_word) == 0; case ESC_W: return item <= 127 && (cd->ctypes[item] & ctype_word) != 0; case ESC_h: case ESC_H: switch(item) { case 0x09: case 0x20: case 0xa0: case 0x1680: case 0x180e: case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004: case 0x2005: case 0x2006: case 0x2007: case 0x2008: case 0x2009: case 0x200A: case 0x202f: case 0x205f: case 0x3000: return -next != ESC_h; default: return -next == ESC_h; } case ESC_v: case ESC_V: switch(item) { case 0x0a: case 0x0b: case 0x0c: case 0x0d: case 0x85: case 0x2028: case 0x2029: return -next != ESC_v; default: return -next == ESC_v; } default: return FALSE; } case OP_DIGIT: return next == -ESC_D || next == -ESC_s || next == -ESC_W || next == -ESC_h || next == -ESC_v; case OP_NOT_DIGIT: return next == -ESC_d; case OP_WHITESPACE: return next == -ESC_S || next == -ESC_d || next == -ESC_w; case OP_NOT_WHITESPACE: return next == -ESC_s || next == -ESC_h || next == -ESC_v; case OP_HSPACE: return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w; case OP_NOT_HSPACE: return next == -ESC_h; /* Can't have \S in here because VT matches \S (Perl anomaly) */ case OP_VSPACE: return next == -ESC_V || next == -ESC_d || next == -ESC_w; case OP_NOT_VSPACE: return next == -ESC_v; case OP_WORDCHAR: return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v; case OP_NOT_WORDCHAR: return next == -ESC_w || next == -ESC_d; default: return FALSE; } /* Control does not reach here */ } | pcrecomp.c | 2064 |
STATIC BOOL | compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
static BOOL compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr) { int repeat_type, op_type; int repeat_min = 0, repeat_max = 0; /* To please picky compilers */ int bravalue = 0; int greedy_default, greedy_non_default; int firstbyte, reqbyte; int zeroreqbyte, zerofirstbyte; int req_caseopt, reqvary, tempreqvary; int options = *optionsptr; int after_manual_callout = 0; int length_prevgroup = 0; register int c; register uschar *code = *codeptr; uschar *last_code = code; uschar *orig_code = code; uschar *tempcode; BOOL inescq = FALSE; BOOL groupsetfirstbyte = FALSE; const uschar *ptr = *ptrptr; const uschar *tempptr; uschar *previous = NULL; uschar *previous_callout = NULL; uschar *save_hwm = NULL; uschar classbits[32]; #ifdef SUPPORT_UTF8 BOOL class_utf8; BOOL utf8 = (options & PCRE_UTF8) != 0; uschar *class_utf8data; uschar *class_utf8data_base; uschar utf8_char[6]; #else BOOL utf8 = FALSE; uschar *utf8_char = NULL; #endif #ifdef DEBUG if (lengthptr != NULL) DPRINTF((">> start branch\n")); #endif /* Set up the default and non-default settings for greediness */ greedy_default = ((options & PCRE_UNGREEDY) != 0); greedy_non_default = greedy_default ^ 1; /* Initialize no first byte, no required byte. REQ_UNSET means "no char matching encountered yet". It gets changed to REQ_NONE if we hit something that matches a non-fixed char first char; reqbyte just remains unset if we never find one. When we hit a repeat whose minimum is zero, we may have to adjust these values to take the zero repeat into account. This is implemented by setting them to zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual item types that can be repeated set these backoff variables appropriately. */ firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET; /* The variable req_caseopt contains either the REQ_CASELESS value or zero, according to the current setting of the caseless flag. REQ_CASELESS is a bit value > 255. It is added into the firstbyte or reqbyte variables to record the case status of the value. This is used only for ASCII characters. */ req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0; /* Switch on next character until the end of the branch */ for (;; ptr++) { BOOL negate_class; BOOL should_flip_negation; BOOL possessive_quantifier; BOOL is_quantifier; BOOL is_recurse; BOOL reset_bracount; int class_charcount; int class_lastchar; int newoptions; int recno; int refsign; int skipbytes; int subreqbyte; int subfirstbyte; int terminator; int mclength; uschar mcbuffer[8]; /* Get next byte in the pattern */ c = *ptr; /* If we are in the pre-compile phase, accumulate the length used for the previous cycle of this loop. */ if (lengthptr != NULL) { #ifdef DEBUG if (code > cd->hwm) cd->hwm = code; /* High water info */ #endif if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */ { *errorcodeptr = ERR52; goto FAILED; } /* There is at least one situation where code goes backwards: this is the case of a zero quantifier after a class (e.g. [ab]{0}). At compile time, the class is simply eliminated. However, it is created first, so we have to allow memory for it. Therefore, don't ever reduce the length at this point. */ if (code < last_code) code = last_code; /* Paranoid check for integer overflow */ if (OFLOW_MAX - *lengthptr < code - last_code) { *errorcodeptr = ERR20; goto FAILED; } *lengthptr += code - last_code; DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c)); /* If "previous" is set and it is not at the start of the work space, move it back to there, in order to avoid filling up the work space. Otherwise, if "previous" is NULL, reset the current code pointer to the start. */ if (previous != NULL) { if (previous > orig_code) { memmove(orig_code, previous, code - previous); code -= previous - orig_code; previous = orig_code; } } else code = orig_code; /* Remember where this code item starts so we can pick up the length next time round. */ last_code = code; } /* In the real compile phase, just check the workspace used by the forward reference list. */ else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE) { *errorcodeptr = ERR52; goto FAILED; } /* If in \Q...\E, check for the end; if not, we have a literal */ if (inescq && c != 0) { if (c == '\\' && ptr[1] == 'E') { inescq = FALSE; ptr++; continue; } else { if (previous_callout != NULL) { if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ complete_callout(previous_callout, ptr, cd); previous_callout = NULL; } if ((options & PCRE_AUTO_CALLOUT) != 0) { previous_callout = code; code = auto_callout(code, ptr, cd); } goto NORMAL_CHAR; } } /* Fill in length of a previous callout, except when the next thing is a quantifier. */ is_quantifier = c == '*' || c == '+' || c == '?' || (c == '{' && is_counted_repeat(ptr+1)); if (!is_quantifier && previous_callout != NULL && after_manual_callout-- <= 0) { if (lengthptr == NULL) /* Don't attempt in pre-compile phase */ complete_callout(previous_callout, ptr, cd); previous_callout = NULL; } /* In extended mode, skip white space and comments */ if ((options & PCRE_EXTENDED) != 0) { if ((cd->ctypes[c] & ctype_space) != 0) continue; if (c == '#') { while (*(++ptr) != 0) { if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; } } if (*ptr != 0) continue; /* Else fall through to handle end of string */ c = 0; } } /* No auto callout for quantifiers. */ if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier) { previous_callout = code; code = auto_callout(code, ptr, cd); } switch(c) { /* ===================================================================*/ case 0: /* The branch terminates at string end */ case '|': /* or | or ) */ case ')': *firstbyteptr = firstbyte; *reqbyteptr = reqbyte; *codeptr = code; *ptrptr = ptr; if (lengthptr != NULL) { if (OFLOW_MAX - *lengthptr < code - last_code) { *errorcodeptr = ERR20; goto FAILED; } *lengthptr += code - last_code; /* To include callout length */ DPRINTF((">> end branch\n")); } return TRUE; /* ===================================================================*/ /* Handle single-character metacharacters. In multiline mode, ^ disables the setting of any following char as a first character. */ case '^': if ((options & PCRE_MULTILINE) != 0) { if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; } previous = NULL; *code++ = OP_CIRC; break; case '$': previous = NULL; *code++ = OP_DOLL; break; /* There can never be a first char if '.' is first, whatever happens about repeats. The value of reqbyte doesn't change either. */ case '.': if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; zerofirstbyte = firstbyte; zeroreqbyte = reqbyte; previous = code; *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY; break; /* ===================================================================*/ /* Character classes. If the included characters are all < 256, we build a 32-byte bitmap of the permitted characters, except in the special case where there is only one such character. For negated classes, we build the map as usual, then invert it at the end. However, we use a different opcode so that data characters > 255 can be handled correctly. If the class contains characters outside the 0-255 range, a different opcode is compiled. It may optionally have a bit map for characters < 256, but those above are are explicitly listed afterwards. A flag byte tells whether the bitmap is present, and whether this is a negated class or not. In JavaScript compatibility mode, an isolated ']' causes an error. In default (Perl) mode, it is treated as a data character. */ case ']': if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) { *errorcodeptr = ERR64; goto FAILED; } goto NORMAL_CHAR; case '[': previous = code; /* PCRE supports POSIX class stuff inside a class. Perl gives an error if they are encountered at the top level, so we'll do that too. */ if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && check_posix_syntax(ptr, &tempptr)) { *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31; goto FAILED; } /* If the first character is '^', set the negation flag and skip it. Also, if the first few characters (either before or after ^) are \Q\E or \E we skip them too. This makes for compatibility with Perl. */ negate_class = FALSE; for (;;) { c = *(++ptr); if (c == '\\') { if (ptr[1] == 'E') ptr++; else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3; else break; } else if (!negate_class && c == '^') negate_class = TRUE; else break; } /* Empty classes are allowed in JavaScript compatibility mode. Otherwise, an initial ']' is taken as a data character -- the code below handles that. In JS mode, [] must always fail, so generate OP_FAIL, whereas [^] must match any character, so generate OP_ALLANY. */ if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0) { *code++ = negate_class? OP_ALLANY : OP_FAIL; if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; zerofirstbyte = firstbyte; break; } /* If a class contains a negative special such as \S, we need to flip the negation flag at the end, so that support for characters > 255 works correctly (they are all included in the class). */ should_flip_negation = FALSE; /* Keep a count of chars with values < 256 so that we can optimize the case of just a single character (as long as it's < 256). However, For higher valued UTF-8 characters, we don't yet do any optimization. */ class_charcount = 0; class_lastchar = -1; /* Initialize the 32-char bit map to all zeros. We build the map in a temporary bit of memory, in case the class contains only 1 character (less than 256), because in that case the compiled code doesn't use the bit map. */ memset(classbits, 0, 32 * sizeof(uschar)); #ifdef SUPPORT_UTF8 class_utf8 = FALSE; /* No chars >= 256 */ class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */ class_utf8data_base = class_utf8data; /* For resetting in pass 1 */ #endif /* Process characters until ] is reached. By writing this as a "do" it means that an initial ] is taken as a data character. At the start of the loop, c contains the first byte of the character. */ if (c != 0) do { const uschar *oldptr; #ifdef SUPPORT_UTF8 if (utf8 && c > 127) { /* Braces are required because the */ GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */ } /* In the pre-compile phase, accumulate the length of any UTF-8 extra data and reset the pointer. This is so that very large classes that contain a zillion UTF-8 characters no longer overwrite the work space (which is on the stack). */ if (lengthptr != NULL) { *lengthptr += class_utf8data - class_utf8data_base; class_utf8data = class_utf8data_base; } #endif /* Inside \Q...\E everything is literal except \E */ if (inescq) { if (c == '\\' && ptr[1] == 'E') /* If we are at \E */ { inescq = FALSE; /* Reset literal state */ ptr++; /* Skip the 'E' */ continue; /* Carry on with next */ } goto CHECK_RANGE; /* Could be range if \E follows */ } /* Handle POSIX class names. Perl allows a negation extension of the form [:^name:]. A square bracket that doesn't match the syntax is treated as a literal. We also recognize the POSIX constructions [.ch.] and [=ch=] ("collating elements") and fault them, as Perl 5.6 and 5.8 do. */ if (c == '[' && (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') && check_posix_syntax(ptr, &tempptr)) { BOOL local_negate = FALSE; int posix_class, taboffset, tabopt; register const uschar *cbits = cd->cbits; uschar pbits[32]; if (ptr[1] != ':') { *errorcodeptr = ERR31; goto FAILED; } ptr += 2; if (*ptr == '^') { local_negate = TRUE; should_flip_negation = TRUE; /* Note negative special */ ptr++; } posix_class = check_posix_name(ptr, tempptr - ptr); if (posix_class < 0) { *errorcodeptr = ERR30; goto FAILED; } /* If matching is caseless, upper and lower are converted to alpha. This relies on the fact that the class table starts with alpha, lower, upper as the first 3 entries. */ if ((options & PCRE_CASELESS) != 0 && posix_class <= 2) posix_class = 0; /* We build the bit map for the POSIX class in a chunk of local store because we may be adding and subtracting from it, and we don't want to subtract bits that may be in the main map already. At the end we or the result into the bit map that is being built. */ posix_class *= 3; /* Copy in the first table (always present) */ memcpy(pbits, cbits + posix_class_maps[posix_class], 32 * sizeof(uschar)); /* If there is a second table, add or remove it as required. */ taboffset = posix_class_maps[posix_class + 1]; tabopt = posix_class_maps[posix_class + 2]; if (taboffset >= 0) { if (tabopt >= 0) for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset]; else for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset]; } /* Not see if we need to remove any special characters. An option value of 1 removes vertical space and 2 removes underscore. */ if (tabopt < 0) tabopt = -tabopt; if (tabopt == 1) pbits[1] &= ~0x3c; else if (tabopt == 2) pbits[11] &= 0x7f; /* Add the POSIX table or its complement into the main table that is being built and we are done. */ if (local_negate) for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c]; else for (c = 0; c < 32; c++) classbits[c] |= pbits[c]; ptr = tempptr + 1; class_charcount = 10; /* Set > 1; assumes more than 1 per class */ continue; /* End of POSIX syntax handling */ } /* Backslash may introduce a single character, or it may introduce one of the specials, which just set a flag. The sequence \b is a special case. Inside a class (and only there) it is treated as backspace. Elsewhere it marks a word boundary. Other escapes have preset maps ready to 'or' into the one we are building. We assume they have more than one character in them, so set class_charcount bigger than one. */ if (c == '\\') { c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); if (*errorcodeptr != 0) goto FAILED; if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */ else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */ else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */ else if (-c == ESC_Q) /* Handle start of quoted string */ { if (ptr[1] == '\\' && ptr[2] == 'E') { ptr += 2; /* avoid empty string */ } else inescq = TRUE; continue; } else if (-c == ESC_E) continue; /* Ignore orphan \E */ if (c < 0) { register const uschar *cbits = cd->cbits; class_charcount += 2; /* Greater than 1 is what matters */ /* Save time by not doing this in the pre-compile phase. */ if (lengthptr == NULL) switch (-c) { case ESC_d: for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit]; continue; case ESC_D: should_flip_negation = TRUE; for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit]; continue; case ESC_w: for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word]; continue; case ESC_W: should_flip_negation = TRUE; for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word]; continue; case ESC_s: for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space]; classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */ continue; case ESC_S: should_flip_negation = TRUE; for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space]; classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */ continue; default: /* Not recognized; fall through */ break; /* Need "default" setting to stop compiler warning. */ } /* In the pre-compile phase, just do the recognition. */ else if (c == -ESC_d || c == -ESC_D || c == -ESC_w || c == -ESC_W || c == -ESC_s || c == -ESC_S) continue; /* We need to deal with \H, \h, \V, and \v in both phases because they use extra memory. */ if (-c == ESC_h) { SETBIT(classbits, 0x09); /* VT */ SETBIT(classbits, 0x20); /* SPACE */ SETBIT(classbits, 0xa0); /* NSBP */ #ifdef SUPPORT_UTF8 if (utf8) { class_utf8 = TRUE; *class_utf8data++ = XCL_SINGLE; class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data); *class_utf8data++ = XCL_SINGLE; class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data); *class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data); class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data); *class_utf8data++ = XCL_SINGLE; class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data); *class_utf8data++ = XCL_SINGLE; class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data); *class_utf8data++ = XCL_SINGLE; class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data); } #endif continue; } if (-c == ESC_H) { for (c = 0; c < 32; c++) { int x = 0xff; switch (c) { case 0x09/8: x ^= 1 << (0x09%8); break; case 0x20/8: x ^= 1 << (0x20%8); break; case 0xa0/8: x ^= 1 << (0xa0%8); break; default: break; } classbits[c] |= x; } #ifdef SUPPORT_UTF8 if (utf8) { class_utf8 = TRUE; *class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data); class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data); *class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data); class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data); *class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data); class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data); *class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data); class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data); *class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data); class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data); *class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data); class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data); *class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data); class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data); } #endif continue; } if (-c == ESC_v) { SETBIT(classbits, 0x0a); /* LF */ SETBIT(classbits, 0x0b); /* VT */ SETBIT(classbits, 0x0c); /* FF */ SETBIT(classbits, 0x0d); /* CR */ SETBIT(classbits, 0x85); /* NEL */ #ifdef SUPPORT_UTF8 if (utf8) { class_utf8 = TRUE; *class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data); class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data); } #endif continue; } if (-c == ESC_V) { for (c = 0; c < 32; c++) { int x = 0xff; switch (c) { case 0x0a/8: x ^= 1 << (0x0a%8); x ^= 1 << (0x0b%8); x ^= 1 << (0x0c%8); x ^= 1 << (0x0d%8); break; case 0x85/8: x ^= 1 << (0x85%8); break; default: break; } classbits[c] |= x; } #ifdef SUPPORT_UTF8 if (utf8) { class_utf8 = TRUE; *class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data); class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data); *class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data); class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data); } #endif continue; } /* We need to deal with \P and \p in both phases. */ #ifdef SUPPORT_UCP if (-c == ESC_p || -c == ESC_P) { BOOL negated; int pdata; int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr); if (ptype < 0) goto FAILED; class_utf8 = TRUE; *class_utf8data++ = ((-c == ESC_p) != negated)? XCL_PROP : XCL_NOTPROP; *class_utf8data++ = ptype; *class_utf8data++ = pdata; class_charcount -= 2; /* Not a < 256 character */ continue; } #endif /* Unrecognized escapes are faulted if PCRE is running in its strict mode. By default, for compatibility with Perl, they are treated as literals. */ if ((options & PCRE_EXTRA) != 0) { *errorcodeptr = ERR7; goto FAILED; } class_charcount -= 2; /* Undo the default count from above */ c = *ptr; /* Get the final character and fall through */ } /* Fall through if we have a single character (c >= 0). This may be greater than 256 in UTF-8 mode. */ } /* End of backslash handling */ /* A single character may be followed by '-' to form a range. However, Perl does not permit ']' to be the end of the range. A '-' character at the end is treated as a literal. Perl ignores orphaned \E sequences entirely. The code for handling \Q and \E is messy. */ CHECK_RANGE: while (ptr[1] == '\\' && ptr[2] == 'E') { inescq = FALSE; ptr += 2; } oldptr = ptr; /* Remember \r or \n */ if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF; /* Check for range */ if (!inescq && ptr[1] == '-') { int d; ptr += 2; while (*ptr == '\\' && ptr[1] == 'E') ptr += 2; /* If we hit \Q (not followed by \E) at this point, go into escaped mode. */ while (*ptr == '\\' && ptr[1] == 'Q') { ptr += 2; if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; } inescq = TRUE; break; } if (*ptr == 0 || (!inescq && *ptr == ']')) { ptr = oldptr; goto LONE_SINGLE_CHARACTER; } #ifdef SUPPORT_UTF8 if (utf8) { /* Braces are required because the */ GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */ } else #endif d = *ptr; /* Not UTF-8 mode */ /* The second part of a range can be a single-character escape, but not any of the other escapes. Perl 5.6 treats a hyphen as a literal in such circumstances. */ if (!inescq && d == '\\') { d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE); if (*errorcodeptr != 0) goto FAILED; /* \b is backspace; \X is literal X; \R is literal R; any other special means the '-' was literal */ if (d < 0) { if (d == -ESC_b) d = '\b'; else if (d == -ESC_X) d = 'X'; else if (d == -ESC_R) d = 'R'; else { ptr = oldptr; goto LONE_SINGLE_CHARACTER; /* A few lines below */ } } } /* Check that the two values are in the correct order. Optimize one-character ranges */ if (d < c) { *errorcodeptr = ERR8; goto FAILED; } if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */ /* Remember \r or \n */ if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF; /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless matching, we have to use an XCLASS with extra data items. Caseless matching for characters > 127 is available only if UCP support is available. */ #ifdef SUPPORT_UTF8 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127))) { class_utf8 = TRUE; /* With UCP support, we can find the other case equivalents of the relevant characters. There may be several ranges. Optimize how they fit with the basic range. */ #ifdef SUPPORT_UCP if ((options & PCRE_CASELESS) != 0) { unsigned int occ, ocd; unsigned int cc = c; unsigned int origd = d; while (get_othercase_range(&cc, origd, &occ, &ocd)) { if (occ >= (unsigned int)c && ocd <= (unsigned int)d) continue; /* Skip embedded ranges */ if (occ < (unsigned int)c && ocd >= (unsigned int)c - 1) /* Extend the basic range */ { /* if there is overlap, */ c = occ; /* noting that if occ < c */ continue; /* we can't have ocd > d */ } /* because a subrange is */ if (ocd > (unsigned int)d && occ <= (unsigned int)d + 1) /* always shorter than */ { /* the basic range. */ d = ocd; continue; } if (occ == ocd) { *class_utf8data++ = XCL_SINGLE; } else { *class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(occ, class_utf8data); } class_utf8data += _pcre_ord2utf8(ocd, class_utf8data); } } #endif /* SUPPORT_UCP */ /* Now record the original range, possibly modified for UCP caseless overlapping ranges. */ *class_utf8data++ = XCL_RANGE; class_utf8data += _pcre_ord2utf8(c, class_utf8data); class_utf8data += _pcre_ord2utf8(d, class_utf8data); /* With UCP support, we are done. Without UCP support, there is no caseless matching for UTF-8 characters > 127; we can use the bit map for the smaller ones. */ #ifdef SUPPORT_UCP continue; /* With next character in the class */ #else if ((options & PCRE_CASELESS) == 0 || c > 127) continue; /* Adjust upper limit and fall through to set up the map */ d = 127; #endif /* SUPPORT_UCP */ } #endif /* SUPPORT_UTF8 */ /* We use the bit map for all cases when not in UTF-8 mode; else ranges that lie entirely within 0-127 when there is UCP support; else for partial ranges without UCP support. */ class_charcount += d - c + 1; class_lastchar = d; /* We can save a bit of time by skipping this in the pre-compile. */ if (lengthptr == NULL) for (; c <= d; c++) { classbits[c/8] |= (1 << (c&7)); if ((options & PCRE_CASELESS) != 0) { int uc = cd->fcc[c]; /* flip case */ classbits[uc/8] |= (1 << (uc&7)); } } continue; /* Go get the next char in the class */ } /* Handle a lone single character - we can get here for a normal non-escape char, or after \ that introduces a single character or for an apparent range that isn't. */ LONE_SINGLE_CHARACTER: /* Handle a character that cannot go in the bit map */ #ifdef SUPPORT_UTF8 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127))) { class_utf8 = TRUE; *class_utf8data++ = XCL_SINGLE; class_utf8data += _pcre_ord2utf8(c, class_utf8data); #ifdef SUPPORT_UCP if ((options & PCRE_CASELESS) != 0) { unsigned int othercase; if ((othercase = UCD_OTHERCASE(c)) != c) { *class_utf8data++ = XCL_SINGLE; class_utf8data += _pcre_ord2utf8(othercase, class_utf8data); } } #endif /* SUPPORT_UCP */ } else #endif /* SUPPORT_UTF8 */ /* Handle a single-byte character */ { classbits[c/8] |= (1 << (c&7)); if ((options & PCRE_CASELESS) != 0) { c = cd->fcc[c]; /* flip case */ classbits[c/8] |= (1 << (c&7)); } class_charcount++; class_lastchar = c; } } /* Loop until ']' reached. This "while" is the end of the "do" above. */ while ((c = *(++ptr)) != 0 && (c != ']' || inescq)); if (c == 0) /* Missing terminating ']' */ { *errorcodeptr = ERR6; goto FAILED; } /* This code has been disabled because it would mean that \s counts as an explicit \r or \n reference, and that's not really what is wanted. Now we set the flag only if there is a literal "\r" or "\n" in the class. */ #if 0 /* Remember whether \r or \n are in this class */ if (negate_class) { if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF; } else { if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF; } #endif /* If class_charcount is 1, we saw precisely one character whose value is less than 256. As long as there were no characters >= 128 and there was no use of \p or \P, in other words, no use of any XCLASS features, we can optimize. In UTF-8 mode, we can optimize the negative case only if there were no characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR operate on single-bytes only. This is an historical hangover. Maybe one day we can tidy these opcodes to handle multi-byte characters. The optimization throws away the bit map. We turn the item into a 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note that OP_NOT does not support multibyte characters. In the positive case, it can cause firstbyte to be set. Otherwise, there can be no first char if this item is first, whatever repeat count may follow. In the case of reqbyte, save the previous value for reinstating. */ #ifdef SUPPORT_UTF8 if (class_charcount == 1 && !class_utf8 && (!utf8 || !negate_class || class_lastchar < 128)) #else if (class_charcount == 1) #endif { zeroreqbyte = reqbyte; /* The OP_NOT opcode works on one-byte characters only. */ if (negate_class) { if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; zerofirstbyte = firstbyte; *code++ = OP_NOT; *code++ = class_lastchar; break; } /* For a single, positive character, get the value into mcbuffer, and then we can handle this with the normal one-character code. */ #ifdef SUPPORT_UTF8 if (utf8 && class_lastchar > 127) mclength = _pcre_ord2utf8(class_lastchar, mcbuffer); else #endif { mcbuffer[0] = class_lastchar; mclength = 1; } goto ONE_CHAR; } /* End of 1-char optimization */ /* The general case - not the one-char optimization. If this is the first thing in the branch, there can be no first char setting, whatever the repeat count. Any reqbyte setting must remain unchanged after any kind of repeat. */ if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE; zerofirstbyte = firstbyte; zeroreqbyte = reqbyte; /* If there are characters with values > 255, we have to compile an extended class, with its own opcode, unless there was a negated special such as \S in the class, because in that case all characters > 255 are in the class, so any that were explicitly given as well can be ignored. If (when there are explicit characters > 255 that must be listed) there are no characters < 256, we can omit the bitmap in the actual compiled code. */ #ifdef SUPPORT_UTF8 if (class_utf8 && !should_flip_negation) { *class_utf8data++ = XCL_END; /* Marks the end of extra data */ *code++ = OP_XCLASS; code += LINK_SIZE; *code = negate_class? XCL_NOT : 0; /* If the map is required, move up the extra data to make room for it; otherwise just move the code pointer to the end of the extra data. */ if (class_charcount > 0) { *code++ |= XCL_MAP; memmove(code + 32, code, class_utf8data - code); memcpy(code, classbits, 32); code = class_utf8data + 32; } else code = class_utf8data; /* Now fill in the complete length of the item */ PUT(previous, 1, code - previous); break; /* End of class handling */ } #endif /* If there are no characters > 255, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the whole class was negated and whether there were negative specials such as \S in the class. Then copy the 32-byte map into the code vector, negating it if necessary. */ *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS; if (negate_class) { if (lengthptr == NULL) /* Save time in the pre-compile phase */ for (c = 0; c < 32; c++) code[c] = ~classbits[c]; } else { memcpy(code, classbits, 32); } code += 32; break; /* ===================================================================*/ /* Various kinds of repeat; '{' is not necessarily a quantifier, but this has been tested above. */ case '{': if (!is_quantifier) goto NORMAL_CHAR; ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr); if (*errorcodeptr != 0) goto FAILED; goto REPEAT; case '*': repeat_min = 0; repeat_max = -1; goto REPEAT; case '+': repeat_min = 1; repeat_max = -1; goto REPEAT; case '?': repeat_min = 0; repeat_max = 1; REPEAT: if (previous == NULL) { *errorcodeptr = ERR9; goto FAILED; } if (repeat_min == 0) { firstbyte = zerofirstbyte; /* Adjust for zero repeat */ reqbyte = zeroreqbyte; /* Ditto */ } /* Remember whether this is a variable length repeat */ reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY; op_type = 0; /* Default single-char op codes */ possessive_quantifier = FALSE; /* Default not possessive quantifier */ /* Save start of previous item, in case we have to move it up to make space for an inserted OP_ONCE for the additional '+' extension. */ tempcode = previous; /* If the next character is '+', we have a possessive quantifier. This implies greediness, whatever the setting of the PCRE_UNGREEDY option. If the next character is '?' this is a minimizing repeat, by default, but if PCRE_UNGREEDY is set, it works the other way round. We change the repeat type to the non-default. */ if (ptr[1] == '+') { repeat_type = 0; /* Force greedy */ possessive_quantifier = TRUE; ptr++; } else if (ptr[1] == '?') { repeat_type = greedy_non_default; ptr++; } else repeat_type = greedy_default; /* If previous was a character match, abolish the item and generate a repeat item instead. If a char item has a minumum of more than one, ensure that it is set in reqbyte - it might not be if a sequence such as x{3} is the first thing in a branch because the x will have gone into firstbyte instead. */ if (*previous == OP_CHAR || *previous == OP_CHARNC) { /* Deal with UTF-8 characters that take up more than one byte. It's easier to write this out separately than try to macrify it. Use c to hold the length of the character in bytes, plus 0x80 to flag that it's a length rather than a small character. */ #ifdef SUPPORT_UTF8 if (utf8 && (code[-1] & 0x80) != 0) { uschar *lastchar = code - 1; while((*lastchar & 0xc0) == 0x80) lastchar--; c = code - lastchar; /* Length of UTF-8 character */ memcpy(utf8_char, lastchar, c); /* Save the char */ c |= 0x80; /* Flag c as a length */ } else #endif /* Handle the case of a single byte - either with no UTF8 support, or with UTF-8 disabled, or for a UTF-8 character < 128. */ { c = code[-1]; if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt; } /* If the repetition is unlimited, it pays to see if the next thing on the line is something that cannot possibly match this character. If so, automatically possessifying this item gains some performance in the case where the match fails. */ if (!possessive_quantifier && repeat_max < 0 && check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1, options, cd)) { repeat_type = 0; /* Force greedy */ possessive_quantifier = TRUE; } goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */ } /* If previous was a single negated character ([^a] or similar), we use one of the special opcodes, replacing it. The code is shared with single- character repeats by setting opt_type to add a suitable offset into repeat_type. We can also test for auto-possessification. OP_NOT is currently used only for single-byte chars. */ else if (*previous == OP_NOT) { op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */ c = previous[1]; if (!possessive_quantifier && repeat_max < 0 && check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd)) { repeat_type = 0; /* Force greedy */ possessive_quantifier = TRUE; } goto OUTPUT_SINGLE_REPEAT; } /* If previous was a character type match (\d or similar), abolish it and create a suitable repeat item. The code is shared with single-character repeats by setting op_type to add a suitable offset into repeat_type. Note the the Unicode property types will be present only when SUPPORT_UCP is defined, but we don't wrap the little bits of code here because it just makes it horribly messy. */ else if (*previous < OP_EODN) { uschar *oldcode; int prop_type, prop_value; op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */ c = *previous; if (!possessive_quantifier && repeat_max < 0 && check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd)) { repeat_type = 0; /* Force greedy */ possessive_quantifier = TRUE; } OUTPUT_SINGLE_REPEAT: if (*previous == OP_PROP || *previous == OP_NOTPROP) { prop_type = previous[1]; prop_value = previous[2]; } else prop_type = prop_value = -1; oldcode = code; code = previous; /* Usually overwrite previous item */ /* If the maximum is zero then the minimum must also be zero; Perl allows this case, so we do too - by simply omitting the item altogether. */ if (repeat_max == 0) goto END_REPEAT; /* All real repeats make it impossible to handle partial matching (maybe one day we will be able to remove this restriction). */ if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; /* Combine the op_type with the repeat_type */ repeat_type += op_type; /* A minimum of zero is handled either as the special case * or ?, or as an UPTO, with the maximum given. */ if (repeat_min == 0) { if (repeat_max == -1) *code++ = OP_STAR + repeat_type; else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type; else { *code++ = OP_UPTO + repeat_type; PUT2INC(code, 0, repeat_max); } } /* A repeat minimum of 1 is optimized into some special cases. If the maximum is unlimited, we use OP_PLUS. Otherwise, the original item is left in place and, if the maximum is greater than 1, we use OP_UPTO with one less than the maximum. */ else if (repeat_min == 1) { if (repeat_max == -1) *code++ = OP_PLUS + repeat_type; else { code = oldcode; /* leave previous item in place */ if (repeat_max == 1) goto END_REPEAT; *code++ = OP_UPTO + repeat_type; PUT2INC(code, 0, repeat_max - 1); } } /* The case {n,n} is just an EXACT, while the general case {n,m} is handled as an EXACT followed by an UPTO. */ else { *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */ PUT2INC(code, 0, repeat_min); /* If the maximum is unlimited, insert an OP_STAR. Before doing so, we have to insert the character for the previous code. For a repeated Unicode property match, there are two extra bytes that define the required property. In UTF-8 mode, long characters have their length in c, with the 0x80 bit as a flag. */ if (repeat_max < 0) { #ifdef SUPPORT_UTF8 if (utf8 && c >= 128) { memcpy(code, utf8_char, c & 7); code += c & 7; } else #endif { *code++ = c; if (prop_type >= 0) { *code++ = prop_type; *code++ = prop_value; } } *code++ = OP_STAR + repeat_type; } /* Else insert an UPTO if the max is greater than the min, again preceded by the character, for the previously inserted code. If the UPTO is just for 1 instance, we can use QUERY instead. */ else if (repeat_max != repeat_min) { #ifdef SUPPORT_UTF8 if (utf8 && c >= 128) { memcpy(code, utf8_char, c & 7); code += c & 7; } else #endif *code++ = c; if (prop_type >= 0) { *code++ = prop_type; *code++ = prop_value; } repeat_max -= repeat_min; if (repeat_max == 1) { *code++ = OP_QUERY + repeat_type; } else { *code++ = OP_UPTO + repeat_type; PUT2INC(code, 0, repeat_max); } } } /* The character or character type itself comes last in all cases. */ #ifdef SUPPORT_UTF8 if (utf8 && c >= 128) { memcpy(code, utf8_char, c & 7); code += c & 7; } else #endif *code++ = c; /* For a repeated Unicode property match, there are two extra bytes that define the required property. */ #ifdef SUPPORT_UCP if (prop_type >= 0) { *code++ = prop_type; *code++ = prop_value; } #endif } /* If previous was a character class or a back reference, we put the repeat stuff after it, but just skip the item if the repeat was {0,0}. */ else if (*previous == OP_CLASS || *previous == OP_NCLASS || #ifdef SUPPORT_UTF8 *previous == OP_XCLASS || #endif *previous == OP_REF) { if (repeat_max == 0) { code = previous; goto END_REPEAT; } /* All real repeats make it impossible to handle partial matching (maybe one day we will be able to remove this restriction). */ if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; if (repeat_min == 0 && repeat_max == -1) *code++ = OP_CRSTAR + repeat_type; else if (repeat_min == 1 && repeat_max == -1) *code++ = OP_CRPLUS + repeat_type; else if (repeat_min == 0 && repeat_max == 1) *code++ = OP_CRQUERY + repeat_type; else { *code++ = OP_CRRANGE + repeat_type; PUT2INC(code, 0, repeat_min); if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */ PUT2INC(code, 0, repeat_max); } } /* If previous was a bracket group, we may have to replicate it in certain cases. */ else if (*previous == OP_BRA || *previous == OP_CBRA || *previous == OP_ONCE || *previous == OP_COND) { register int i; int ketoffset = 0; int len = code - previous; uschar *bralink = NULL; /* Repeating a DEFINE group is pointless */ if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF) { *errorcodeptr = ERR55; goto FAILED; } /* If the maximum repeat count is unlimited, find the end of the bracket by scanning through from the start, and compute the offset back to it from the current code pointer. There may be an OP_OPT setting following the final KET, so we can't find the end just by going back from the code pointer. */ if (repeat_max == -1) { register uschar *ket = previous; do ket += GET(ket, 1); while (*ket != OP_KET); ketoffset = code - ket; } /* The case of a zero minimum is special because of the need to stick OP_BRAZERO in front of it, and because the group appears once in the data, whereas in other cases it appears the minimum number of times. For this reason, it is simplest to treat this case separately, as otherwise the code gets far too messy. There are several special subcases when the minimum is zero. */ if (repeat_min == 0) { /* If the maximum is also zero, we used to just omit the group from the output altogether, like this: ** if (repeat_max == 0) ** { ** code = previous; ** goto END_REPEAT; ** } However, that fails when a group is referenced as a subroutine from elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it so that it is skipped on execution. As we don't have a list of which groups are referenced, we cannot do this selectively. If the maximum is 1 or unlimited, we just have to stick in the BRAZERO and do no more at this point. However, we do need to adjust any OP_RECURSE calls inside the group that refer to the group itself or any internal or forward referenced group, because the offset is from the start of the whole regex. Temporarily terminate the pattern while doing this. */ if (repeat_max <= 1) /* Covers 0, 1, and unlimited */ { *code = OP_END; adjust_recurse(previous, 1, utf8, cd, save_hwm); memmove(previous+1, previous, len); code++; if (repeat_max == 0) { *previous++ = OP_SKIPZERO; goto END_REPEAT; } *previous++ = OP_BRAZERO + repeat_type; } /* If the maximum is greater than 1 and limited, we have to replicate in a nested fashion, sticking OP_BRAZERO before each set of brackets. The first one has to be handled carefully because it's the original copy, which has to be moved up. The remainder can be handled by code that is common with the non-zero minimum case below. We have to adjust the value or repeat_max, since one less copy is required. Once again, we may have to adjust any OP_RECURSE calls inside the group. */ else { int offset; *code = OP_END; adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm); memmove(previous + 2 + LINK_SIZE, previous, len); code += 2 + LINK_SIZE; *previous++ = OP_BRAZERO + repeat_type; *previous++ = OP_BRA; /* We chain together the bracket offset fields that have to be filled in later when the ends of the brackets are reached. */ offset = (bralink == NULL)? 0 : previous - bralink; bralink = previous; PUTINC(previous, 0, offset); } repeat_max--; } /* If the minimum is greater than zero, replicate the group as many times as necessary, and adjust the maximum to the number of subsequent copies that we need. If we set a first char from the group, and didn't set a required char, copy the latter from the former. If there are any forward reference subroutine calls in the group, there will be entries on the workspace list; replicate these with an appropriate increment. */ else { if (repeat_min > 1) { /* In the pre-compile phase, we don't actually do the replication. We just adjust the length as if we had. Do some paranoid checks for potential integer overflow. */ if (lengthptr != NULL) { int delta = (repeat_min - 1)*length_prevgroup; if ((double)(repeat_min - 1)*(double)length_prevgroup > (double)INT_MAX || OFLOW_MAX - *lengthptr < delta) { *errorcodeptr = ERR20; goto FAILED; } *lengthptr += delta; } /* This is compiling for real */ else { if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte; for (i = 1; i < repeat_min; i++) { uschar *hc; uschar *this_hwm = cd->hwm; memcpy(code, previous, len); for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) { PUT(cd->hwm, 0, GET(hc, 0) + len); cd->hwm += LINK_SIZE; } save_hwm = this_hwm; code += len; } } } if (repeat_max > 0) repeat_max -= repeat_min; } /* This code is common to both the zero and non-zero minimum cases. If the maximum is limited, it replicates the group in a nested fashion, remembering the bracket starts on a stack. In the case of a zero minimum, the first one was set up above. In all cases the repeat_max now specifies the number of additional copies needed. Again, we must remember to replicate entries on the forward reference list. */ if (repeat_max >= 0) { /* In the pre-compile phase, we don't actually do the replication. We just adjust the length as if we had. For each repetition we must add 1 to the length for BRAZERO and for all but the last repetition we must add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some paranoid checks to avoid integer overflow. */ if (lengthptr != NULL && repeat_max > 0) { int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) - 2 - 2*LINK_SIZE; /* Last one doesn't nest */ if ((double)repeat_max * (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE) > (double)INT_MAX || OFLOW_MAX - *lengthptr < delta) { *errorcodeptr = ERR20; goto FAILED; } *lengthptr += delta; } /* This is compiling for real */ else for (i = repeat_max - 1; i >= 0; i--) { uschar *hc; uschar *this_hwm = cd->hwm; *code++ = OP_BRAZERO + repeat_type; /* All but the final copy start a new nesting, maintaining the chain of brackets outstanding. */ if (i != 0) { int offset; *code++ = OP_BRA; offset = (bralink == NULL)? 0 : code - bralink; bralink = code; PUTINC(code, 0, offset); } memcpy(code, previous, len); for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE) { PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1)); cd->hwm += LINK_SIZE; } save_hwm = this_hwm; code += len; } /* Now chain through the pending brackets, and fill in their length fields (which are holding the chain links pro tem). */ while (bralink != NULL) { int oldlinkoffset; int offset = code - bralink + 1; uschar *bra = code - offset; oldlinkoffset = GET(bra, 1); bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset; *code++ = OP_KET; PUTINC(code, 0, offset); PUT(bra, 1, offset); } } /* If the maximum is unlimited, set a repeater in the final copy. We can't just offset backwards from the current code point, because we don't know if there's been an options resetting after the ket. The correct offset was computed above. Then, when we are doing the actual compile phase, check to see whether this group is a non-atomic one that could match an empty string. If so, convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so that runtime checking can be done. [This check is also applied to atomic groups at runtime, but in a different way.] */ else { uschar *ketcode = code - ketoffset; uschar *bracode = ketcode - GET(ketcode, 1); *ketcode = OP_KETRMAX + repeat_type; if (lengthptr == NULL && *bracode != OP_ONCE) { uschar *scode = bracode; do { if (could_be_empty_branch(scode, ketcode, utf8)) { *bracode += OP_SBRA - OP_BRA; break; } scode += GET(scode, 1); } while (*scode == OP_ALT); } } } /* If previous is OP_FAIL, it was generated by an empty class [] in JavaScript mode. The other ways in which OP_FAIL can be generated, that is by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat" error above. We can just ignore the repeat in JS case. */ else if (*previous == OP_FAIL) goto END_REPEAT; /* Else there's some kind of shambles */ else { *errorcodeptr = ERR11; goto FAILED; } /* If the character following a repeat is '+', or if certain optimization tests above succeeded, possessive_quantifier is TRUE. For some of the simpler opcodes, there is an special alternative opcode for this. For anything else, we wrap the entire repeated item inside OP_ONCE brackets. The '+' notation is just syntactic sugar, taken from Sun's Java package, but the special opcodes can optimize it a bit. The repeated item starts at tempcode, not at previous, which might be the first part of a string whose (former) last char we repeated. Possessifying an 'exact' quantifier has no effect, so we can ignore it. But an 'upto' may follow. We skip over an 'exact' item, and then test the length of what remains before proceeding. */ if (possessive_quantifier) { int len; if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT || *tempcode == OP_NOTEXACT) tempcode += _pcre_OP_lengths[*tempcode] + ((*tempcode == OP_TYPEEXACT && (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0); len = code - tempcode; if (len > 0) switch (*tempcode) { case OP_STAR: *tempcode = OP_POSSTAR; break; case OP_PLUS: *tempcode = OP_POSPLUS; break; case OP_QUERY: *tempcode = OP_POSQUERY; break; case OP_UPTO: *tempcode = OP_POSUPTO; break; case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break; case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break; case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break; case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break; case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break; case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break; case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break; case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break; default: memmove(tempcode + 1+LINK_SIZE, tempcode, len); code += 1 + LINK_SIZE; len += 1 + LINK_SIZE; tempcode[0] = OP_ONCE; *code++ = OP_KET; PUTINC(code, 0, len); PUT(tempcode, 1, len); break; } } /* In all case we no longer have a previous item. We also set the "follows varying string" flag for subsequently encountered reqbytes if it isn't already set and we have just passed a varying length item. */ END_REPEAT: previous = NULL; cd->req_varyopt |= reqvary; break; /* ===================================================================*/ /* Start of nested parenthesized sub-expression, or comment or lookahead or lookbehind or option setting or condition or all the other extended parenthesis forms. */ case '(': newoptions = options; skipbytes = 0; bravalue = OP_CBRA; save_hwm = cd->hwm; reset_bracount = FALSE; /* First deal with various "verbs" that can be introduced by '*'. */ if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0) { int i, namelen; const char *vn = verbnames; const uschar *name = ++ptr; previous = NULL; while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {}; if (*ptr == ':') { *errorcodeptr = ERR59; /* Not supported */ goto FAILED; } if (*ptr != ')') { *errorcodeptr = ERR60; goto FAILED; } namelen = ptr - name; for (i = 0; i < verbcount; i++) { if (namelen == verbs[i].len && strncmp((char *)name, vn, namelen) == 0) { *code = verbs[i].op; if (*code++ == OP_ACCEPT) cd->had_accept = TRUE; break; } vn += verbs[i].len + 1; } if (i < verbcount) continue; *errorcodeptr = ERR60; goto FAILED; } /* Deal with the extended parentheses; all are introduced by '?', and the appearance of any of them means that this is not a capturing group. */ else if (*ptr == '?') { int i, set, unset, namelen; int *optset; const uschar *name; uschar *slot; switch (*(++ptr)) { case '#': /* Comment; skip to ket */ ptr++; while (*ptr != 0 && *ptr != ')') ptr++; if (*ptr == 0) { *errorcodeptr = ERR18; goto FAILED; } continue; /* ------------------------------------------------------------ */ case '|': /* Reset capture count for each branch */ reset_bracount = TRUE; /* Fall through */ /* ------------------------------------------------------------ */ case ':': /* Non-capturing bracket */ bravalue = OP_BRA; ptr++; break; /* ------------------------------------------------------------ */ case '(': bravalue = OP_COND; /* Conditional group */ /* A condition can be an assertion, a number (referring to a numbered group), a name (referring to a named group), or 'R', referring to recursion. R | pcrecomp.c | 2403 |
pcreconf.c | |||
Type | Function | Source | Line |
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION | pcre_config(int what, void *where)
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_config(int what, void *where) { switch (what) { case PCRE_CONFIG_UTF8: #ifdef SUPPORT_UTF8 *((int *)where) = 1; #else *((int *)where) = 0; #endif break; case PCRE_CONFIG_UNICODE_PROPERTIES: #ifdef SUPPORT_UCP *((int *)where) = 1; #else *((int *)where) = 0; #endif break; case PCRE_CONFIG_NEWLINE: *((int *)where) = NEWLINE; break; case PCRE_CONFIG_BSR: #ifdef BSR_ANYCRLF *((int *)where) = 1; #else *((int *)where) = 0; #endif break; case PCRE_CONFIG_LINK_SIZE: *((int *)where) = LINK_SIZE; break; case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD: *((int *)where) = POSIX_MALLOC_THRESHOLD; break; case PCRE_CONFIG_MATCH_LIMIT: *((unsigned int *)where) = MATCH_LIMIT; break; case PCRE_CONFIG_MATCH_LIMIT_RECURSION: *((unsigned int *)where) = MATCH_LIMIT_RECURSION; break; case PCRE_CONFIG_STACKRECURSE: #ifdef NO_RECURSE *((int *)where) = 0; #else *((int *)where) = 1; #endif break; default: return PCRE_ERROR_BADOPTION; } return 0; } | pcreconf.c | 65 |
pcredfa.c | |||
Type | Function | Source | Line |
STATIC VOID | pchars(unsigned char *p, int length, FILE *f)
static void pchars(unsigned char *p, int length, FILE *f) { int c; while (length-- > 0) { if (isprint(c = *(p++))) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c); } } #endif /************************************************* * Execute a Regular Expression - DFA engine * *************************************************/ /* This internal function applies a compiled pattern to a subject string, starting at a given point, using a DFA engine. This function is called from the external one, possibly multiple times if the pattern is not anchored. The function calls itself recursively for some kinds of subpattern. Arguments: md the match_data block with fixed information this_start_code the opening bracket of this subexpression's code current_subject where we currently are in the subject string start_offset start offset in the subject string offsets vector to contain the matching string offsets offsetcount size of same workspace vector of workspace wscount size of same ims the current ims flags rlevel function call recursion level recursing regex recursive call level Returns: > 0 => number of match offset pairs placed in offsets = 0 => offsets overflowed; longest matches are present -1 => failed to match < -1 => some kind of unexpected problem The following macros are used for adding states to the two state vectors (one for the current character, one for the following character). */ #define ADD_ACTIVE(x,y) \ if (active_count++ < wscount) \ { \ next_active_state->offset = (x); \ next_active_state->count = (y); \ next_active_state->ims = ims; \ next_active_state++; \ DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \ } \ else return PCRE_ERROR_DFA_WSSIZE #define ADD_ACTIVE_DATA(x,y,z) \ if (active_count++ < wscount) \ { \ next_active_state->offset = (x); \ next_active_state->count = (y); \ next_active_state->ims = ims; \ next_active_state->data = (z); \ next_active_state++; \ DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \ } \ else return PCRE_ERROR_DFA_WSSIZE #define ADD_NEW(x,y) \ if (new_count++ < wscount) \ { \ next_new_state->offset = (x); \ next_new_state->count = (y); \ next_new_state->ims = ims; \ next_new_state++; \ DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \ } \ else return PCRE_ERROR_DFA_WSSIZE #define ADD_NEW_DATA(x,y,z) \ if (new_count++ < wscount) \ { \ next_new_state->offset = (x); \ next_new_state->count = (y); \ next_new_state->ims = ims; \ next_new_state->data = (z); \ next_new_state++; \ DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \ } \ else return PCRE_ERROR_DFA_WSSIZE | pcredfa.c | 188 |
ELSE RETURN PCRE_ERROR_DFA_WSSIZE STATIC INT | internal_dfa_exec( dfa_match_data *md, const uschar *this_start_code, const uschar *current_subject, int start_offset, int *offsets, int offsetcount, int *workspace, int wscount, int ims, int rlevel, int recursing)
static int internal_dfa_exec( dfa_match_data *md, const uschar *this_start_code, const uschar *current_subject, int start_offset, int *offsets, int offsetcount, int *workspace, int wscount, int ims, int rlevel, int recursing) { stateblock *active_states, *new_states, *temp_states; stateblock *next_active_state, *next_new_state; const uschar *ctypes, *lcc, *fcc; const uschar *ptr; const uschar *end_code, *first_op; int active_count, new_count, match_count; /* Some fields in the md block are frequently referenced, so we load them into independent variables in the hope that this will perform better. */ const uschar *start_subject = md->start_subject; const uschar *end_subject = md->end_subject; const uschar *start_code = md->start_code; #ifdef SUPPORT_UTF8 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0; #else BOOL utf8 = FALSE; #endif rlevel++; offsetcount &= (-2); wscount -= 2; wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) / (2 * INTS_PER_STATEBLOCK); DPRINTF(("\n%.*s---------------------\n" "%.*sCall to internal_dfa_exec f=%d r=%d\n", rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing)); ctypes = md->tables + ctypes_offset; lcc = md->tables + lcc_offset; fcc = md->tables + fcc_offset; match_count = PCRE_ERROR_NOMATCH; /* A negative number */ active_states = (stateblock *)(workspace + 2); next_new_state = new_states = active_states + wscount; new_count = 0; first_op = this_start_code + 1 + LINK_SIZE + ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0); /* The first thing in any (sub) pattern is a bracket of some sort. Push all the alternative states onto the list, and find out where the end is. This makes is possible to use this function recursively, when we want to stop at a matching internal ket rather than at the end. If the first opcode in the first alternative is OP_REVERSE, we are dealing with a backward assertion. In that case, we have to find out the maximum amount to move back, and set up each alternative appropriately. */ if (*first_op == OP_REVERSE) { int max_back = 0; int gone_back; end_code = this_start_code; do { int back = GET(end_code, 2+LINK_SIZE); if (back > max_back) max_back = back; end_code += GET(end_code, 1); } while (*end_code == OP_ALT); /* If we can't go back the amount required for the longest lookbehind pattern, go back as far as we can; some alternatives may still be viable. */ #ifdef SUPPORT_UTF8 /* In character mode we have to step back character by character */ if (utf8) { for (gone_back = 0; gone_back < max_back; gone_back++) { if (current_subject <= start_subject) break; current_subject--; while (current_subject > start_subject && (*current_subject & 0xc0) == 0x80) current_subject--; } } else #endif /* In byte-mode we can do this quickly. */ { gone_back = (current_subject - max_back < start_subject)? current_subject - start_subject : max_back; current_subject -= gone_back; } /* Now we can process the individual branches. */ end_code = this_start_code; do { int back = GET(end_code, 2+LINK_SIZE); if (back <= gone_back) { int bstate = end_code - start_code + 2 + 2*LINK_SIZE; ADD_NEW_DATA(-bstate, 0, gone_back - back); } end_code += GET(end_code, 1); } while (*end_code == OP_ALT); } /* This is the code for a "normal" subpattern (not a backward assertion). The start of a whole pattern is always one of these. If we are at the top level, we may be asked to restart matching from the same point that we reached for a previous partial match. We still have to scan through the top-level branches to find the end state. */ else { end_code = this_start_code; /* Restarting */ if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0) { do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT); new_count = workspace[1]; if (!workspace[0]) memcpy(new_states, active_states, new_count * sizeof(stateblock)); } /* Not restarting */ else { int length = 1 + LINK_SIZE + ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0); do { ADD_NEW(end_code - start_code + length, 0); end_code += GET(end_code, 1); length = 1 + LINK_SIZE; } while (*end_code == OP_ALT); } } workspace[0] = 0; /* Bit indicating which vector is current */ DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code)); /* Loop for scanning the subject */ ptr = current_subject; for (;;) { int i, j; int clen, dlen; unsigned int c, d; /* Make the new state list into the active state list and empty the new state list. */ temp_states = active_states; active_states = new_states; new_states = temp_states; active_count = new_count; new_count = 0; workspace[0] ^= 1; /* Remember for the restarting feature */ workspace[1] = active_count; #ifdef DEBUG printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP); pchars((uschar *)ptr, strlen((char *)ptr), stdout); printf("\"\n"); printf("%.*sActive states: ", rlevel*2-2, SP); for (i = 0; i < active_count; i++) printf("%d/%d ", active_states[i].offset, active_states[i].count); printf("\n"); #endif /* Set the pointers for adding new states */ next_active_state = active_states + active_count; next_new_state = new_states; /* Load the current character from the subject outside the loop, as many different states may want to look at it, and we assume that at least one will. */ if (ptr < end_subject) { clen = 1; /* Number of bytes in the character */ #ifdef SUPPORT_UTF8 if (utf8) { GETCHARLEN(c, ptr, clen); } else #endif /* SUPPORT_UTF8 */ c = *ptr; } else { clen = 0; /* This indicates the end of the subject */ c = NOTACHAR; /* This value should never actually be used */ } /* Scan up the active states and act on each one. The result of an action may be to add more states to the currently active list (e.g. on hitting a parenthesis) or it may be to put states on the new list, for considering when we move the character pointer on. */ for (i = 0; i < active_count; i++) { stateblock *current_state = active_states + i; const uschar *code; int state_offset = current_state->offset; int count, codevalue; #ifdef DEBUG printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); if (clen == 0) printf("EOL\n"); else if (c > 32 && c < 127) printf("'%c'\n", c); else printf("0x%02x\n", c); #endif /* This variable is referred to implicity in the ADD_xxx macros. */ ims = current_state->ims; /* A negative offset is a special case meaning "hold off going to this (negated) state until the number of characters in the data field have been skipped". */ if (state_offset < 0) { if (current_state->data > 0) { DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP)); ADD_NEW_DATA(state_offset, current_state->count, current_state->data - 1); continue; } else { current_state->offset = state_offset = -state_offset; } } /* Check for a duplicate state with the same count, and skip if found. */ for (j = 0; j < i; j++) { if (active_states[j].offset == state_offset && active_states[j].count == current_state->count) { DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP)); goto NEXT_ACTIVE_STATE; } } /* The state offset is the offset to the opcode */ code = start_code + state_offset; codevalue = *code; /* If this opcode is followed by an inline character, load it. It is tempting to test for the presence of a subject character here, but that is wrong, because sometimes zero repetitions of the subject are permitted. We also use this mechanism for opcodes such as OP_TYPEPLUS that take an argument that is not a data character - but is always one byte long. We have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert these ones to new opcodes. */ if (coptable[codevalue] > 0) { dlen = 1; #ifdef SUPPORT_UTF8 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else #endif /* SUPPORT_UTF8 */ d = code[coptable[codevalue]]; if (codevalue >= OP_TYPESTAR) { switch(d) { case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM; case OP_NOTPROP: case OP_PROP: codevalue += OP_PROP_EXTRA; break; case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break; case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break; case OP_NOT_HSPACE: case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break; case OP_NOT_VSPACE: case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break; default: break; } } } else { dlen = 0; /* Not strictly necessary, but compilers moan */ d = NOTACHAR; /* if these variables are not set. */ } /* Now process the individual opcodes */ switch (codevalue) { /* ========================================================================== */ /* Reached a closing bracket. If not at the end of the pattern, carry on with the next opcode. Otherwise, unless we have an empty string and PCRE_NOTEMPTY is set, save the match data, shifting up all previous matches so we always have the longest first. */ case OP_KET: case OP_KETRMIN: case OP_KETRMAX: if (code != end_code) { ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0); if (codevalue != OP_KET) { ADD_ACTIVE(state_offset - GET(code, 1), 0); } } else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0) { if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0; else if (match_count > 0 && ++match_count * 2 >= offsetcount) match_count = 0; count = ((match_count == 0)? offsetcount : match_count * 2) - 2; if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int)); if (offsetcount >= 2) { offsets[0] = current_subject - start_subject; offsets[1] = ptr - start_subject; DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP, offsets[1] - offsets[0], current_subject)); } if ((md->moptions & PCRE_DFA_SHORTEST) != 0) { DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, rlevel*2-2, SP)); return match_count; } } break; /* ========================================================================== */ /* These opcodes add to the current list of states without looking at the current character. */ /*-----------------------------------------------------------------*/ case OP_ALT: do { code += GET(code, 1); } while (*code == OP_ALT); ADD_ACTIVE(code - start_code, 0); break; /*-----------------------------------------------------------------*/ case OP_BRA: case OP_SBRA: do { ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); code += GET(code, 1); } while (*code == OP_ALT); break; /*-----------------------------------------------------------------*/ case OP_CBRA: case OP_SCBRA: ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0); code += GET(code, 1); while (*code == OP_ALT) { ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); code += GET(code, 1); } break; /*-----------------------------------------------------------------*/ case OP_BRAZERO: case OP_BRAMINZERO: ADD_ACTIVE(state_offset + 1, 0); code += 1 + GET(code, 2); while (*code == OP_ALT) code += GET(code, 1); ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); break; /*-----------------------------------------------------------------*/ case OP_SKIPZERO: code += 1 + GET(code, 2); while (*code == OP_ALT) code += GET(code, 1); ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); break; /*-----------------------------------------------------------------*/ case OP_CIRC: if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) || ((ims & PCRE_MULTILINE) != 0 && ptr != end_subject && WAS_NEWLINE(ptr))) { ADD_ACTIVE(state_offset + 1, 0); } break; /*-----------------------------------------------------------------*/ case OP_EOD: if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); } break; /*-----------------------------------------------------------------*/ case OP_OPT: ims = code[1]; ADD_ACTIVE(state_offset + 2, 0); break; /*-----------------------------------------------------------------*/ case OP_SOD: if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); } break; /*-----------------------------------------------------------------*/ case OP_SOM: if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); } break; /* ========================================================================== */ /* These opcodes inspect the next subject character, and sometimes the previous one as well, but do not have an argument. The variable clen contains the length of the current character and is zero if we are at the end of the subject. */ /*-----------------------------------------------------------------*/ case OP_ANY: if (clen > 0 && !IS_NEWLINE(ptr)) { ADD_NEW(state_offset + 1, 0); } break; /*-----------------------------------------------------------------*/ case OP_ALLANY: if (clen > 0) { ADD_NEW(state_offset + 1, 0); } break; /*-----------------------------------------------------------------*/ case OP_EODN: if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen)) { ADD_ACTIVE(state_offset + 1, 0); } break; /*-----------------------------------------------------------------*/ case OP_DOLL: if ((md->moptions & PCRE_NOTEOL) == 0) { if (clen == 0 || (IS_NEWLINE(ptr) && ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen) )) { ADD_ACTIVE(state_offset + 1, 0); } } else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr)) { ADD_ACTIVE(state_offset + 1, 0); } break; /*-----------------------------------------------------------------*/ case OP_DIGIT: case OP_WHITESPACE: case OP_WORDCHAR: if (clen > 0 && c < 256 && ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0) { ADD_NEW(state_offset + 1, 0); } break; /*-----------------------------------------------------------------*/ case OP_NOT_DIGIT: case OP_NOT_WHITESPACE: case OP_NOT_WORDCHAR: if (clen > 0 && (c >= 256 || ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)) { ADD_NEW(state_offset + 1, 0); } break; /*-----------------------------------------------------------------*/ case OP_WORD_BOUNDARY: case OP_NOT_WORD_BOUNDARY: { int left_word, right_word; if (ptr > start_subject) { const uschar *temp = ptr - 1; #ifdef SUPPORT_UTF8 if (utf8) BACKCHAR(temp); #endif GETCHARTEST(d, temp); left_word = d < 256 && (ctypes[d] & ctype_word) != 0; } else left_word = 0; if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0; else right_word = 0; if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) { ADD_ACTIVE(state_offset + 1, 0); } } break; /*-----------------------------------------------------------------*/ /* Check the next character by Unicode property. We will get here only if the support is in the binary; otherwise a compile-time error occurs. */ #ifdef SUPPORT_UCP case OP_PROP: case OP_NOTPROP: if (clen > 0) { BOOL OK; const ucd_record * prop = GET_UCD(c); switch(code[1]) { case PT_ANY: OK = TRUE; break; case PT_LAMP: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt; break; case PT_GC: OK = _pcre_ucp_gentype[prop->chartype] == code[2]; break; case PT_PC: OK = prop->chartype == code[2]; break; case PT_SC: OK = prop->script == code[2]; break; /* Should never occur, but keep compilers from grumbling. */ default: OK = codevalue != OP_PROP; break; } if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); } } break; #endif /* ========================================================================== */ /* These opcodes likewise inspect the subject character, but have an argument that is not a data character. It is one of these opcodes: OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */ case OP_TYPEPLUS: case OP_TYPEMINPLUS: case OP_TYPEPOSPLUS: count = current_state->count; /* Already matched */ if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } if (clen > 0) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && (d != OP_ANY || !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { if (count > 0 && codevalue == OP_TYPEPOSPLUS) { active_count--; /* Remove non-match possibility */ next_active_state--; } count++; ADD_NEW(state_offset, count); } } break; /*-----------------------------------------------------------------*/ case OP_TYPEQUERY: case OP_TYPEMINQUERY: case OP_TYPEPOSQUERY: ADD_ACTIVE(state_offset + 2, 0); if (clen > 0) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && (d != OP_ANY || !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { if (codevalue == OP_TYPEPOSQUERY) { active_count--; /* Remove non-match possibility */ next_active_state--; } ADD_NEW(state_offset + 2, 0); } } break; /*-----------------------------------------------------------------*/ case OP_TYPESTAR: case OP_TYPEMINSTAR: case OP_TYPEPOSSTAR: ADD_ACTIVE(state_offset + 2, 0); if (clen > 0) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && (d != OP_ANY || !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { if (codevalue == OP_TYPEPOSSTAR) { active_count--; /* Remove non-match possibility */ next_active_state--; } ADD_NEW(state_offset, 0); } } break; /*-----------------------------------------------------------------*/ case OP_TYPEEXACT: count = current_state->count; /* Number already matched */ if (clen > 0) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && (d != OP_ANY || !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { if (++count >= GET2(code, 1)) { ADD_NEW(state_offset + 4, 0); } else { ADD_NEW(state_offset, count); } } } break; /*-----------------------------------------------------------------*/ case OP_TYPEUPTO: case OP_TYPEMINUPTO: case OP_TYPEPOSUPTO: ADD_ACTIVE(state_offset + 4, 0); count = current_state->count; /* Number already matched */ if (clen > 0) { if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || (c < 256 && (d != OP_ANY || !IS_NEWLINE(ptr)) && ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) { if (codevalue == OP_TYPEPOSUPTO) { active_count--; /* Remove non-match possibility */ next_active_state--; } if (++count >= GET2(code, 1)) { ADD_NEW(state_offset + 4, 0); } else { ADD_NEW(state_offset, count); } } } break; /* ========================================================================== */ /* These are virtual opcodes that are used when something like OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its argument. It keeps the code above fast for the other cases. The argument is in the d variable. */ #ifdef SUPPORT_UCP case OP_PROP_EXTRA + OP_TYPEPLUS: case OP_PROP_EXTRA + OP_TYPEMINPLUS: case OP_PROP_EXTRA + OP_TYPEPOSPLUS: count = current_state->count; /* Already matched */ if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); } if (clen > 0) { BOOL OK; const ucd_record * prop = GET_UCD(c); switch(code[2]) { case PT_ANY: OK = TRUE; break; case PT_LAMP: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt; break; case PT_GC: OK = _pcre_ucp_gentype[prop->chartype] == code[3]; break; case PT_PC: OK = prop->chartype == code[3]; break; case PT_SC: OK = prop->script == code[3]; break; /* Should never occur, but keep compilers from grumbling. */ default: OK = codevalue != OP_PROP; break; } if (OK == (d == OP_PROP)) { if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS) { active_count--; /* Remove non-match possibility */ next_active_state--; } count++; ADD_NEW(state_offset, count); } } break; /*-----------------------------------------------------------------*/ case OP_EXTUNI_EXTRA + OP_TYPEPLUS: case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS: case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS: count = current_state->count; /* Already matched */ if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } if (clen > 0 && UCD_CATEGORY(c) != ucp_M) { const uschar *nptr = ptr + clen; int ncount = 0; if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS) { active_count--; /* Remove non-match possibility */ next_active_state--; } while (nptr < end_subject) { int nd; int ndlen = 1; GETCHARLEN(nd, nptr, ndlen); if (UCD_CATEGORY(nd) != ucp_M) break; ncount++; nptr += ndlen; } count++; ADD_NEW_DATA(-state_offset, count, ncount); } break; #endif /*-----------------------------------------------------------------*/ case OP_ANYNL_EXTRA + OP_TYPEPLUS: case OP_ANYNL_EXTRA + OP_TYPEMINPLUS: case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS: count = current_state->count; /* Already matched */ if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } if (clen > 0) { int ncount = 0; switch (c) { case 0x000b: case 0x000c: case 0x0085: case 0x2028: case 0x2029: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; goto ANYNL01; case 0x000d: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; /* Fall through */ ANYNL01: case 0x000a: if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS) { active_count--; /* Remove non-match possibility */ next_active_state--; } count++; ADD_NEW_DATA(-state_offset, count, ncount); break; default: break; } } break; /*-----------------------------------------------------------------*/ case OP_VSPACE_EXTRA + OP_TYPEPLUS: case OP_VSPACE_EXTRA + OP_TYPEMINPLUS: case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS: count = current_state->count; /* Already matched */ if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } if (clen > 0) { BOOL OK; switch (c) { case 0x000a: case 0x000b: case 0x000c: case 0x000d: case 0x0085: case 0x2028: case 0x2029: OK = TRUE; break; default: OK = FALSE; break; } if (OK == (d == OP_VSPACE)) { if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS) { active_count--; /* Remove non-match possibility */ next_active_state--; } count++; ADD_NEW_DATA(-state_offset, count, 0); } } break; /*-----------------------------------------------------------------*/ case OP_HSPACE_EXTRA + OP_TYPEPLUS: case OP_HSPACE_EXTRA + OP_TYPEMINPLUS: case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS: count = current_state->count; /* Already matched */ if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); } if (clen > 0) { BOOL OK; switch (c) { case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ case 0x2000: /* EN QUAD */ case 0x2001: /* EM QUAD */ case 0x2002: /* EN SPACE */ case 0x2003: /* EM SPACE */ case 0x2004: /* THREE-PER-EM SPACE */ case 0x2005: /* FOUR-PER-EM SPACE */ case 0x2006: /* SIX-PER-EM SPACE */ case 0x2007: /* FIGURE SPACE */ case 0x2008: /* PUNCTUATION SPACE */ case 0x2009: /* THIN SPACE */ case 0x200A: /* HAIR SPACE */ case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ OK = TRUE; break; default: OK = FALSE; break; } if (OK == (d == OP_HSPACE)) { if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS) { active_count--; /* Remove non-match possibility */ next_active_state--; } count++; ADD_NEW_DATA(-state_offset, count, 0); } } break; /*-----------------------------------------------------------------*/ #ifdef SUPPORT_UCP case OP_PROP_EXTRA + OP_TYPEQUERY: case OP_PROP_EXTRA + OP_TYPEMINQUERY: case OP_PROP_EXTRA + OP_TYPEPOSQUERY: count = 4; goto QS1; case OP_PROP_EXTRA + OP_TYPESTAR: case OP_PROP_EXTRA + OP_TYPEMINSTAR: case OP_PROP_EXTRA + OP_TYPEPOSSTAR: count = 0; QS1: ADD_ACTIVE(state_offset + 4, 0); if (clen > 0) { BOOL OK; const ucd_record * prop = GET_UCD(c); switch(code[2]) { case PT_ANY: OK = TRUE; break; case PT_LAMP: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt; break; case PT_GC: OK = _pcre_ucp_gentype[prop->chartype] == code[3]; break; case PT_PC: OK = prop->chartype == code[3]; break; case PT_SC: OK = prop->script == code[3]; break; /* Should never occur, but keep compilers from grumbling. */ default: OK = codevalue != OP_PROP; break; } if (OK == (d == OP_PROP)) { if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR || codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY) { active_count--; /* Remove non-match possibility */ next_active_state--; } ADD_NEW(state_offset + count, 0); } } break; /*-----------------------------------------------------------------*/ case OP_EXTUNI_EXTRA + OP_TYPEQUERY: case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY: case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY: count = 2; goto QS2; case OP_EXTUNI_EXTRA + OP_TYPESTAR: case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR: case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR: count = 0; QS2: ADD_ACTIVE(state_offset + 2, 0); if (clen > 0 && UCD_CATEGORY(c) != ucp_M) { const uschar *nptr = ptr + clen; int ncount = 0; if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR || codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY) { active_count--; /* Remove non-match possibility */ next_active_state--; } while (nptr < end_subject) { int nd; int ndlen = 1; GETCHARLEN(nd, nptr, ndlen); if (UCD_CATEGORY(nd) != ucp_M) break; ncount++; nptr += ndlen; } ADD_NEW_DATA(-(state_offset + count), 0, ncount); } break; #endif /*-----------------------------------------------------------------*/ case OP_ANYNL_EXTRA + OP_TYPEQUERY: case OP_ANYNL_EXTRA + OP_TYPEMINQUERY: case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY: count = 2; goto QS3; case OP_ANYNL_EXTRA + OP_TYPESTAR: case OP_ANYNL_EXTRA + OP_TYPEMINSTAR: case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR: count = 0; QS3: ADD_ACTIVE(state_offset + 2, 0); if (clen > 0) { int ncount = 0; switch (c) { case 0x000b: case 0x000c: case 0x0085: case 0x2028: case 0x2029: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; goto ANYNL02; case 0x000d: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; /* Fall through */ ANYNL02: case 0x000a: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR || codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY) { active_count--; /* Remove non-match possibility */ next_active_state--; } ADD_NEW_DATA(-(state_offset + count), 0, ncount); break; default: break; } } break; /*-----------------------------------------------------------------*/ case OP_VSPACE_EXTRA + OP_TYPEQUERY: case OP_VSPACE_EXTRA + OP_TYPEMINQUERY: case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY: count = 2; goto QS4; case OP_VSPACE_EXTRA + OP_TYPESTAR: case OP_VSPACE_EXTRA + OP_TYPEMINSTAR: case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR: count = 0; QS4: ADD_ACTIVE(state_offset + 2, 0); if (clen > 0) { BOOL OK; switch (c) { case 0x000a: case 0x000b: case 0x000c: case 0x000d: case 0x0085: case 0x2028: case 0x2029: OK = TRUE; break; default: OK = FALSE; break; } if (OK == (d == OP_VSPACE)) { if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR || codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY) { active_count--; /* Remove non-match possibility */ next_active_state--; } ADD_NEW_DATA(-(state_offset + count), 0, 0); } } break; /*-----------------------------------------------------------------*/ case OP_HSPACE_EXTRA + OP_TYPEQUERY: case OP_HSPACE_EXTRA + OP_TYPEMINQUERY: case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY: count = 2; goto QS5; case OP_HSPACE_EXTRA + OP_TYPESTAR: case OP_HSPACE_EXTRA + OP_TYPEMINSTAR: case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR: count = 0; QS5: ADD_ACTIVE(state_offset + 2, 0); if (clen > 0) { BOOL OK; switch (c) { case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ case 0x2000: /* EN QUAD */ case 0x2001: /* EM QUAD */ case 0x2002: /* EN SPACE */ case 0x2003: /* EM SPACE */ case 0x2004: /* THREE-PER-EM SPACE */ case 0x2005: /* FOUR-PER-EM SPACE */ case 0x2006: /* SIX-PER-EM SPACE */ case 0x2007: /* FIGURE SPACE */ case 0x2008: /* PUNCTUATION SPACE */ case 0x2009: /* THIN SPACE */ case 0x200A: /* HAIR SPACE */ case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ OK = TRUE; break; default: OK = FALSE; break; } if (OK == (d == OP_HSPACE)) { if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR || codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY) { active_count--; /* Remove non-match possibility */ next_active_state--; } ADD_NEW_DATA(-(state_offset + count), 0, 0); } } break; /*-----------------------------------------------------------------*/ #ifdef SUPPORT_UCP case OP_PROP_EXTRA + OP_TYPEEXACT: case OP_PROP_EXTRA + OP_TYPEUPTO: case OP_PROP_EXTRA + OP_TYPEMINUPTO: case OP_PROP_EXTRA + OP_TYPEPOSUPTO: if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT) { ADD_ACTIVE(state_offset + 6, 0); } count = current_state->count; /* Number already matched */ if (clen > 0) { BOOL OK; const ucd_record * prop = GET_UCD(c); switch(code[4]) { case PT_ANY: OK = TRUE; break; case PT_LAMP: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt; break; case PT_GC: OK = _pcre_ucp_gentype[prop->chartype] == code[5]; break; case PT_PC: OK = prop->chartype == code[5]; break; case PT_SC: OK = prop->script == code[5]; break; /* Should never occur, but keep compilers from grumbling. */ default: OK = codevalue != OP_PROP; break; } if (OK == (d == OP_PROP)) { if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO) { active_count--; /* Remove non-match possibility */ next_active_state--; } if (++count >= GET2(code, 1)) { ADD_NEW(state_offset + 6, 0); } else { ADD_NEW(state_offset, count); } } } break; /*-----------------------------------------------------------------*/ case OP_EXTUNI_EXTRA + OP_TYPEEXACT: case OP_EXTUNI_EXTRA + OP_TYPEUPTO: case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO: case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO: if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT) { ADD_ACTIVE(state_offset + 4, 0); } count = current_state->count; /* Number already matched */ if (clen > 0 && UCD_CATEGORY(c) != ucp_M) { const uschar *nptr = ptr + clen; int ncount = 0; if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO) { active_count--; /* Remove non-match possibility */ next_active_state--; } while (nptr < end_subject) { int nd; int ndlen = 1; GETCHARLEN(nd, nptr, ndlen); if (UCD_CATEGORY(nd) != ucp_M) break; ncount++; nptr += ndlen; } if (++count >= GET2(code, 1)) { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); } else { ADD_NEW_DATA(-state_offset, count, ncount); } } break; #endif /*-----------------------------------------------------------------*/ case OP_ANYNL_EXTRA + OP_TYPEEXACT: case OP_ANYNL_EXTRA + OP_TYPEUPTO: case OP_ANYNL_EXTRA + OP_TYPEMINUPTO: case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO: if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT) { ADD_ACTIVE(state_offset + 4, 0); } count = current_state->count; /* Number already matched */ if (clen > 0) { int ncount = 0; switch (c) { case 0x000b: case 0x000c: case 0x0085: case 0x2028: case 0x2029: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; goto ANYNL03; case 0x000d: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1; /* Fall through */ ANYNL03: case 0x000a: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO) { active_count--; /* Remove non-match possibility */ next_active_state--; } if (++count >= GET2(code, 1)) { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); } else { ADD_NEW_DATA(-state_offset, count, ncount); } break; default: break; } } break; /*-----------------------------------------------------------------*/ case OP_VSPACE_EXTRA + OP_TYPEEXACT: case OP_VSPACE_EXTRA + OP_TYPEUPTO: case OP_VSPACE_EXTRA + OP_TYPEMINUPTO: case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO: if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT) { ADD_ACTIVE(state_offset + 4, 0); } count = current_state->count; /* Number already matched */ if (clen > 0) { BOOL OK; switch (c) { case 0x000a: case 0x000b: case 0x000c: case 0x000d: case 0x0085: case 0x2028: case 0x2029: OK = TRUE; break; default: OK = FALSE; } if (OK == (d == OP_VSPACE)) { if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO) { active_count--; /* Remove non-match possibility */ next_active_state--; } if (++count >= GET2(code, 1)) { ADD_NEW_DATA(-(state_offset + 4), 0, 0); } else { ADD_NEW_DATA(-state_offset, count, 0); } } } break; /*-----------------------------------------------------------------*/ case OP_HSPACE_EXTRA + OP_TYPEEXACT: case OP_HSPACE_EXTRA + OP_TYPEUPTO: case OP_HSPACE_EXTRA + OP_TYPEMINUPTO: case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO: if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT) { ADD_ACTIVE(state_offset + 4, 0); } count = current_state->count; /* Number already matched */ if (clen > 0) { BOOL OK; switch (c) { case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ case 0x2000: /* EN QUAD */ case 0x2001: /* EM QUAD */ case 0x2002: /* EN SPACE */ case 0x2003: /* EM SPACE */ case 0x2004: /* THREE-PER-EM SPACE */ case 0x2005: /* FOUR-PER-EM SPACE */ case 0x2006: /* SIX-PER-EM SPACE */ case 0x2007: /* FIGURE SPACE */ case 0x2008: /* PUNCTUATION SPACE */ case 0x2009: /* THIN SPACE */ case 0x200A: /* HAIR SPACE */ case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ OK = TRUE; break; default: OK = FALSE; break; } if (OK == (d == OP_HSPACE)) { if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO) { active_count--; /* Remove non-match possibility */ next_active_state--; } if (++count >= GET2(code, 1)) { ADD_NEW_DATA(-(state_offset + 4), 0, 0); } else { ADD_NEW_DATA(-state_offset, count, 0); } } } break; /* ========================================================================== */ /* These opcodes are followed by a character that is usually compared to the current subject character; it is loaded into d. We still get here even if there is no subject character, because in some cases zero repetitions are permitted. */ /*-----------------------------------------------------------------*/ case OP_CHAR: if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); } break; /*-----------------------------------------------------------------*/ case OP_CHARNC: if (clen == 0) break; #ifdef SUPPORT_UTF8 if (utf8) { if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else { unsigned int othercase; if (c < 128) othercase = fcc[c]; else /* If we have Unicode property support, we can use it to test the other case of the character. */ #ifdef SUPPORT_UCP othercase = UCD_OTHERCASE(c); #else othercase = NOTACHAR; #endif if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); } } } else #endif /* SUPPORT_UTF8 */ /* Non-UTF-8 mode */ { if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); } } break; #ifdef SUPPORT_UCP /*-----------------------------------------------------------------*/ /* This is a tricky one because it can match more than one character. Find out how many characters to skip, and then set up a negative state to wait for them to pass before continuing. */ case OP_EXTUNI: if (clen > 0 && UCD_CATEGORY(c) != ucp_M) { const uschar *nptr = ptr + clen; int ncount = 0; while (nptr < end_subject) { int nclen = 1; GETCHARLEN(c, nptr, nclen); if (UCD_CATEGORY(c) != ucp_M) break; ncount++; nptr += nclen; } ADD_NEW_DATA(-(state_offset + 1), 0, ncount); } break; #endif /*-----------------------------------------------------------------*/ /* This is a tricky like EXTUNI because it too can match more than one character (when CR is followed by LF). In this case, set up a negative state to wait for one character to pass before continuing. */ case OP_ANYNL: if (clen > 0) switch(c) { case 0x000b: case 0x000c: case 0x0085: case 0x2028: case 0x2029: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break; case 0x000a: ADD_NEW(state_offset + 1, 0); break; case 0x000d: if (ptr + 1 < end_subject && ptr[1] == 0x0a) { ADD_NEW_DATA(-(state_offset + 1), 0, 1); } else { ADD_NEW(state_offset + 1, 0); } break; } break; /*-----------------------------------------------------------------*/ case OP_NOT_VSPACE: if (clen > 0) switch(c) { case 0x000a: case 0x000b: case 0x000c: case 0x000d: case 0x0085: case 0x2028: case 0x2029: break; default: ADD_NEW(state_offset + 1, 0); break; } break; /*-----------------------------------------------------------------*/ case OP_VSPACE: if (clen > 0) switch(c) { case 0x000a: case 0x000b: case 0x000c: case 0x000d: case 0x0085: case 0x2028: case 0x2029: ADD_NEW(state_offset + 1, 0); break; default: break; } break; /*-----------------------------------------------------------------*/ case OP_NOT_HSPACE: if (clen > 0) switch(c) { case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ case 0x2000: /* EN QUAD */ case 0x2001: /* EM QUAD */ case 0x2002: /* EN SPACE */ case 0x2003: /* EM SPACE */ case 0x2004: /* THREE-PER-EM SPACE */ case 0x2005: /* FOUR-PER-EM SPACE */ case 0x2006: /* SIX-PER-EM SPACE */ case 0x2007: /* FIGURE SPACE */ case 0x2008: /* PUNCTUATION SPACE */ case 0x2009: /* THIN SPACE */ case 0x200A: /* HAIR SPACE */ case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ break; default: ADD_NEW(state_offset + 1, 0); break; } break; /*-----------------------------------------------------------------*/ case OP_HSPACE: if (clen > 0) switch(c) { case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ case 0x2000: /* EN QUAD */ case 0x2001: /* EM QUAD */ case 0x2002: /* EN SPACE */ case 0x2003: /* EM SPACE */ case 0x2004: /* THREE-PER-EM SPACE */ case 0x2005: /* FOUR-PER-EM SPACE */ case 0x2006: /* SIX-PER-EM SPACE */ case 0x2007: /* FIGURE SPACE */ case 0x2008: /* PUNCTUATION SPACE */ case 0x2009: /* THIN SPACE */ case 0x200A: /* HAIR SPACE */ case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ ADD_NEW(state_offset + 1, 0); break; } break; /*-----------------------------------------------------------------*/ /* Match a negated single character. This is only used for one-byte characters, that is, we know that d < 256. The character we are checking (c) can be multibyte. */ case OP_NOT: if (clen > 0) { unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d; if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); } } break; /*-----------------------------------------------------------------*/ case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: case OP_NOTPLUS: case OP_NOTMINPLUS: case OP_NOTPOSPLUS: count = current_state->count; /* Already matched */ if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); } if (clen > 0) { unsigned int otherd = NOTACHAR; if ((ims & PCRE_CASELESS) != 0) { #ifdef SUPPORT_UTF8 if (utf8 && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); #endif /* SUPPORT_UCP */ } else #endif /* SUPPORT_UTF8 */ otherd = fcc[d]; } if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) { if (count > 0 && (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS)) { active_count--; /* Remove non-match possibility */ next_active_state--; } count++; ADD_NEW(state_offset, count); } } break; /*-----------------------------------------------------------------*/ case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: case OP_NOTQUERY: case OP_NOTMINQUERY: case OP_NOTPOSQUERY: ADD_ACTIVE(state_offset + dlen + 1, 0); if (clen > 0) { unsigned int otherd = NOTACHAR; if ((ims & PCRE_CASELESS) != 0) { #ifdef SUPPORT_UTF8 if (utf8 && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); #endif /* SUPPORT_UCP */ } else #endif /* SUPPORT_UTF8 */ otherd = fcc[d]; } if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) { if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY) { active_count--; /* Remove non-match possibility */ next_active_state--; } ADD_NEW(state_offset + dlen + 1, 0); } } break; /*-----------------------------------------------------------------*/ case OP_STAR: case OP_MINSTAR: case OP_POSSTAR: case OP_NOTSTAR: case OP_NOTMINSTAR: case OP_NOTPOSSTAR: ADD_ACTIVE(state_offset + dlen + 1, 0); if (clen > 0) { unsigned int otherd = NOTACHAR; if ((ims & PCRE_CASELESS) != 0) { #ifdef SUPPORT_UTF8 if (utf8 && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); #endif /* SUPPORT_UCP */ } else #endif /* SUPPORT_UTF8 */ otherd = fcc[d]; } if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) { if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR) { active_count--; /* Remove non-match possibility */ next_active_state--; } ADD_NEW(state_offset, 0); } } break; /*-----------------------------------------------------------------*/ case OP_EXACT: case OP_NOTEXACT: count = current_state->count; /* Number already matched */ if (clen > 0) { unsigned int otherd = NOTACHAR; if ((ims & PCRE_CASELESS) != 0) { #ifdef SUPPORT_UTF8 if (utf8 && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); #endif /* SUPPORT_UCP */ } else #endif /* SUPPORT_UTF8 */ otherd = fcc[d]; } if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) { if (++count >= GET2(code, 1)) { ADD_NEW(state_offset + dlen + 3, 0); } else { ADD_NEW(state_offset, count); } } } break; /*-----------------------------------------------------------------*/ case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: case OP_NOTUPTO: case OP_NOTMINUPTO: case OP_NOTPOSUPTO: ADD_ACTIVE(state_offset + dlen + 3, 0); count = current_state->count; /* Number already matched */ if (clen > 0) { unsigned int otherd = NOTACHAR; if ((ims & PCRE_CASELESS) != 0) { #ifdef SUPPORT_UTF8 if (utf8 && d >= 128) { #ifdef SUPPORT_UCP otherd = UCD_OTHERCASE(d); #endif /* SUPPORT_UCP */ } else #endif /* SUPPORT_UTF8 */ otherd = fcc[d]; } if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR)) { if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO) { active_count--; /* Remove non-match possibility */ next_active_state--; } if (++count >= GET2(code, 1)) { ADD_NEW(state_offset + dlen + 3, 0); } else { ADD_NEW(state_offset, count); } } } break; /* ========================================================================== */ /* These are the class-handling opcodes */ case OP_CLASS: case OP_NCLASS: case OP_XCLASS: { BOOL isinclass = FALSE; int next_state_offset; const uschar *ecode; /* For a simple class, there is always just a 32-byte table, and we can set isinclass from it. */ if (codevalue != OP_XCLASS) { ecode = code + 33; if (clen > 0) { isinclass = (c > 255)? (codevalue == OP_NCLASS) : ((code[1 + c/8] & (1 << (c&7))) != 0); } } /* An extended class may have a table or a list of single characters, ranges, or both, and it may be positive or negative. There's a function that sorts all this out. */ else { ecode = code + GET(code, 1); if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE); } /* At this point, isinclass is set for all kinds of class, and ecode points to the byte after the end of the class. If there is a quantifier, this is where it will be. */ next_state_offset = ecode - start_code; switch (*ecode) { case OP_CRSTAR: case OP_CRMINSTAR: ADD_ACTIVE(next_state_offset + 1, 0); if (isinclass) { ADD_NEW(state_offset, 0); } break; case OP_CRPLUS: case OP_CRMINPLUS: count = current_state->count; /* Already matched */ if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); } if (isinclass) { count++; ADD_NEW(state_offset, count); } break; case OP_CRQUERY: case OP_CRMINQUERY: ADD_ACTIVE(next_state_offset + 1, 0); if (isinclass) { ADD_NEW(next_state_offset + 1, 0); } break; case OP_CRRANGE: case OP_CRMINRANGE: count = current_state->count; /* Already matched */ if (count >= GET2(ecode, 1)) { ADD_ACTIVE(next_state_offset + 5, 0); } if (isinclass) { int max = GET2(ecode, 3); if (++count >= max && max != 0) /* Max 0 => no limit */ { ADD_NEW(next_state_offset + 5, 0); } else { ADD_NEW(state_offset, count); } } break; default: if (isinclass) { ADD_NEW(next_state_offset, 0); } break; } } break; /* ========================================================================== */ /* These are the opcodes for fancy brackets of various kinds. We have to use recursion in order to handle them. The "always failing" assersion (?!) is optimised when compiling to OP_FAIL, so we have to support that, though the other "backtracking verbs" are not supported. */ case OP_FAIL: break; case OP_ASSERT: case OP_ASSERT_NOT: case OP_ASSERTBACK: case OP_ASSERTBACK_NOT: { int rc; int local_offsets[2]; int local_workspace[1000]; const uschar *endasscode = code + GET(code, 1); while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); rc = internal_dfa_exec( md, /* static match data */ code, /* this subexpression's code */ ptr, /* where we currently are */ ptr - start_subject, /* start offset */ local_offsets, /* offset vector */ sizeof(local_offsets)/sizeof(int), /* size of same */ local_workspace, /* workspace vector */ sizeof(local_workspace)/sizeof(int), /* size of same */ ims, /* the current ims flags */ rlevel, /* function recursion level */ recursing); /* pass on regex recursion */ if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK)) { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); } } break; /*-----------------------------------------------------------------*/ case OP_COND: case OP_SCOND: { int local_offsets[1000]; int local_workspace[1000]; int condcode = code[LINK_SIZE+1]; /* Back reference conditions are not supported */ if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND; /* The DEFINE condition is always false */ if (condcode == OP_DEF) { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); } /* The only supported version of OP_RREF is for the value RREF_ANY, which means "test if in any recursion". We can't test for specifically recursed groups. */ else if (condcode == OP_RREF) { int value = GET2(code, LINK_SIZE+2); if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND; if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); } else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); } } /* Otherwise, the condition is an assertion */ else { int rc; const uschar *asscode = code + LINK_SIZE + 1; const uschar *endasscode = asscode + GET(asscode, 1); while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1); rc = internal_dfa_exec( md, /* fixed match data */ asscode, /* this subexpression's code */ ptr, /* where we currently are */ ptr - start_subject, /* start offset */ local_offsets, /* offset vector */ sizeof(local_offsets)/sizeof(int), /* size of same */ local_workspace, /* workspace vector */ sizeof(local_workspace)/sizeof(int), /* size of same */ ims, /* the current ims flags */ rlevel, /* function recursion level */ recursing); /* pass on regex recursion */ if ((rc >= 0) == (condcode == OP_ASSERT || condcode == OP_ASSERTBACK)) { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); } else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); } } } break; /*-----------------------------------------------------------------*/ case OP_RECURSE: { int local_offsets[1000]; int local_workspace[1000]; int rc; DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP, recursing + 1)); rc = internal_dfa_exec( md, /* fixed match data */ start_code + GET(code, 1), /* this subexpression's code */ ptr, /* where we currently are */ ptr - start_subject, /* start offset */ local_offsets, /* offset vector */ sizeof(local_offsets)/sizeof(int), /* size of same */ local_workspace, /* workspace vector */ sizeof(local_workspace)/sizeof(int), /* size of same */ ims, /* the current ims flags */ rlevel, /* function recursion level */ recursing + 1); /* regex recurse level */ DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP, recursing + 1, rc)); /* Ran out of internal offsets */ if (rc == 0) return PCRE_ERROR_DFA_RECURSE; /* For each successful matched substring, set up the next state with a count of characters to skip before trying it. Note that the count is in characters, not bytes. */ if (rc > 0) { for (rc = rc*2 - 2; rc >= 0; rc -= 2) { const uschar *p = start_subject + local_offsets[rc]; const uschar *pp = start_subject + local_offsets[rc+1]; int charcount = local_offsets[rc+1] - local_offsets[rc]; while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--; if (charcount > 0) { ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1)); } else { ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0); } } } else if (rc != PCRE_ERROR_NOMATCH) return rc; } break; /*-----------------------------------------------------------------*/ case OP_ONCE: { int local_offsets[2]; int local_workspace[1000]; int rc = internal_dfa_exec( md, /* fixed match data */ code, /* this subexpression's code */ ptr, /* where we currently are */ ptr - start_subject, /* start offset */ local_offsets, /* offset vector */ sizeof(local_offsets)/sizeof(int), /* size of same */ local_workspace, /* workspace vector */ sizeof(local_workspace)/sizeof(int), /* size of same */ ims, /* the current ims flags */ rlevel, /* function recursion level */ recursing); /* pass on regex recursion */ if (rc >= 0) { const uschar *end_subpattern = code; int charcount = local_offsets[1] - local_offsets[0]; int next_state_offset, repeat_state_offset; do { end_subpattern += GET(end_subpattern, 1); } while (*end_subpattern == OP_ALT); next_state_offset = end_subpattern - start_code + LINK_SIZE + 1; /* If the end of this subpattern is KETRMAX or KETRMIN, we must arrange for the repeat state also to be added to the relevant list. Calculate the offset, or set -1 for no repeat. */ repeat_state_offset = (*end_subpattern == OP_KETRMAX || *end_subpattern == OP_KETRMIN)? end_subpattern - start_code - GET(end_subpattern, 1) : -1; /* If we have matched an empty string, add the next state at the current character pointer. This is important so that the duplicate checking kicks in, which is what breaks infinite loops that match an empty string. */ if (charcount == 0) { ADD_ACTIVE(next_state_offset, 0); } /* Optimization: if there are no more active states, and there are no new states yet set up, then skip over the subject string right here, to save looping. Otherwise, set up the new state to swing into action when the end of the substring is reached. */ else if (i + 1 >= active_count && new_count == 0) { ptr += charcount; clen = 0; ADD_NEW(next_state_offset, 0); /* If we are adding a repeat state at the new character position, we must fudge things so that it is the only current state. Otherwise, it might be a duplicate of one we processed before, and that would cause it to be skipped. */ if (repeat_state_offset >= 0) { next_active_state = active_states; active_count = 0; i = -1; ADD_ACTIVE(repeat_state_offset, 0); } } else { const uschar *p = start_subject + local_offsets[0]; const uschar *pp = start_subject + local_offsets[1]; while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--; ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1)); if (repeat_state_offset >= 0) { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); } } } else if (rc != PCRE_ERROR_NOMATCH) return rc; } break; /* ========================================================================== */ /* Handle callouts */ case OP_CALLOUT: if (pcre_callout != NULL) { int rrc; pcre_callout_block cb; cb.version = 1; /* Version 1 of the callout block */ cb.callout_number = code[1]; cb.offset_vector = offsets; cb.subject = (PCRE_SPTR)start_subject; cb.subject_length = end_subject - start_subject; cb.start_match = current_subject - start_subject; cb.current_position = ptr - start_subject; cb.pattern_position = GET(code, 2); cb.next_item_length = GET(code, 2 + LINK_SIZE); cb.capture_top = 1; cb.capture_last = -1; cb.callout_data = md->callout_data; if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */ if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); } } break; /* ========================================================================== */ default: /* Unsupported opcode */ return PCRE_ERROR_DFA_UITEM; } NEXT_ACTIVE_STATE: continue; } /* End of loop scanning active states */ /* We have finished the processing at the current subject character. If no new states have been set for the next character, we have found all the matches that we are going to find. If we are at the top level and partial matching has been requested, check for appropriate conditions. */ if (new_count <= 0) { if (match_count < 0 && /* No matches found */ rlevel == 1 && /* Top level match function */ (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */ ptr >= end_subject && /* Reached end of subject */ ptr > current_subject) /* Matched non-empty string */ { if (offsetcount >= 2) { offsets[0] = current_subject - start_subject; offsets[1] = end_subject - start_subject; } match_count = PCRE_ERROR_PARTIAL; } DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count, rlevel*2-2, SP)); break; /* In effect, "return", but see the comment below */ } /* One or more states are active for the next character. */ ptr += clen; /* Advance to next subject character */ } /* Loop to move along the subject string */ /* Control gets here from "break" a few lines above. We do it this way because if we use "return" above, we have compiler trouble. Some compilers warn if there's nothing here because they think the function doesn't return a value. On the other hand, if we put a dummy statement here, some more clever compilers complain that it can't be reached. Sigh. */ return match_count; } | pcredfa.c | 282 |
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION | pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, const char *subject, int length, int start_offset, int options, int *offsets, int offsetcount, int *workspace, int wscount)
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data, const char *subject, int length, int start_offset, int options, int *offsets, int offsetcount, int *workspace, int wscount) { real_pcre *re = (real_pcre *)argument_re; dfa_match_data match_block; dfa_match_data *md = &match_block; BOOL utf8, anchored, startline, firstline; const uschar *current_subject, *end_subject, *lcc; pcre_study_data internal_study; const pcre_study_data *study = NULL; real_pcre internal_re; const uschar *req_byte_ptr; const uschar *start_bits = NULL; BOOL first_byte_caseless = FALSE; BOOL req_byte_caseless = FALSE; int first_byte = -1; int req_byte = -1; int req_byte2 = -1; int newline; /* Plausibility checks */ if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; if (re == NULL || subject == NULL || workspace == NULL || (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE; /* We need to find the pointer to any study data before we test for byte flipping, so we scan the extra_data block first. This may set two fields in the match block, so we must initialize them beforehand. However, the other fields in the match block must not be set until after the byte flipping. */ md->tables = re->tables; md->callout_data = NULL; if (extra_data != NULL) { unsigned int flags = extra_data->flags; if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) study = (const pcre_study_data *)extra_data->study_data; if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT; if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) return PCRE_ERROR_DFA_UMLIMIT; if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) md->callout_data = extra_data->callout_data; if ((flags & PCRE_EXTRA_TABLES) != 0) md->tables = extra_data->tables; } /* Check that the first field in the block is the magic number. If it is not, test for a regex that was compiled on a host of opposite endianness. If this is the case, flipped values are put in internal_re and internal_study if there was study data too. */ if (re->magic_number != MAGIC_NUMBER) { re = _pcre_try_flipped(re, &internal_re, study, &internal_study); if (re == NULL) return PCRE_ERROR_BADMAGIC; if (study != NULL) study = &internal_study; } /* Set some local values */ current_subject = (const unsigned char *)subject + start_offset; end_subject = (const unsigned char *)subject + length; req_byte_ptr = current_subject - 1; #ifdef SUPPORT_UTF8 utf8 = (re->options & PCRE_UTF8) != 0; #else utf8 = FALSE; #endif anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || (re->options & PCRE_ANCHORED) != 0; /* The remaining fixed data for passing around. */ md->start_code = (const uschar *)argument_re + re->name_table_offset + re->name_count * re->name_entry_size; md->start_subject = (const unsigned char *)subject; md->end_subject = end_subject; md->moptions = options; md->poptions = re->options; /* If the BSR option is not set at match time, copy what was set at compile time. */ if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0) { if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0) md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE); #ifdef BSR_ANYCRLF else md->moptions |= PCRE_BSR_ANYCRLF; #endif } /* Handle different types of newline. The three bits give eight cases. If nothing is set at run time, whatever was used at compile time applies. */ switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) & PCRE_NEWLINE_BITS) { case 0: newline = NEWLINE; break; /* Compile-time default */ case PCRE_NEWLINE_CR: newline = '\r'; break; case PCRE_NEWLINE_LF: newline = '\n'; break; case PCRE_NEWLINE_CR+ PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; case PCRE_NEWLINE_ANY: newline = -1; break; case PCRE_NEWLINE_ANYCRLF: newline = -2; break; default: return PCRE_ERROR_BADNEWLINE; } if (newline == -2) { md->nltype = NLTYPE_ANYCRLF; } else if (newline < 0) { md->nltype = NLTYPE_ANY; } else { md->nltype = NLTYPE_FIXED; if (newline > 255) { md->nllen = 2; md->nl[0] = (newline >> 8) & 255; md->nl[1] = newline & 255; } else { md->nllen = 1; md->nl[0] = newline; } } /* Check a UTF-8 string if required. Unfortunately there's no way of passing back the character offset. */ #ifdef SUPPORT_UTF8 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) { if (_pcre_valid_utf8((uschar *)subject, length) >= 0) return PCRE_ERROR_BADUTF8; if (start_offset > 0 && start_offset < length) { int tb = ((uschar *)subject)[start_offset]; if (tb > 127) { tb &= 0xc0; if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET; } } } #endif /* If the exec call supplied NULL for tables, use the inbuilt ones. This is a feature that makes it possible to save compiled regex and re-use them in other programs later. */ if (md->tables == NULL) md->tables = _pcre_default_tables; /* The lower casing table and the "must be at the start of a line" flag are used in a loop when finding where to start. */ lcc = md->tables + lcc_offset; startline = (re->flags & PCRE_STARTLINE) != 0; firstline = (re->options & PCRE_FIRSTLINE) != 0; /* Set up the first character to match, if available. The first_byte value is never set for an anchored regular expression, but the anchoring may be forced at run time, so we have to test for anchoring. The first char may be unset for an unanchored pattern, of course. If there's no first char and the pattern was studied, there may be a bitmap of possible first characters. */ if (!anchored) { if ((re->flags & PCRE_FIRSTSET) != 0) { first_byte = re->first_byte & 255; if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) first_byte = lcc[first_byte]; } else { if (startline && study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0) start_bits = study->start_bits; } } /* For anchored or unanchored matches, there may be a "last known required character" set. */ if ((re->flags & PCRE_REQCHSET) != 0) { req_byte = re->req_byte & 255; req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */ } /* Call the main matching function, looping for a non-anchored regex after a failed match. Unless restarting, optimize by moving to the first match character if possible, when not anchored. Then unless wanting a partial match, check for a required later character. */ for (;;) { int rc; if ((options & PCRE_DFA_RESTART) == 0) { const uschar *save_end_subject = end_subject; /* Advance to a unique first char if possible. If firstline is TRUE, the start of the match is constrained to the first line of a multiline string. Implement this by temporarily adjusting end_subject so that we stop scanning at a newline. If the match fails at the newline, later code breaks this loop. */ if (firstline) { USPTR t = current_subject; #ifdef SUPPORT_UTF8 if (utf8) { while (t < md->end_subject && !IS_NEWLINE(t)) { t++; while (t < end_subject && (*t & 0xc0) == 0x80) t++; } } else #endif while (t < md->end_subject && !IS_NEWLINE(t)) t++; end_subject = t; } if (first_byte >= 0) { if (first_byte_caseless) while (current_subject < end_subject && lcc[*current_subject] != first_byte) current_subject++; else while (current_subject < end_subject && *current_subject != first_byte) current_subject++; } /* Or to just after a linebreak for a multiline match if possible */ else if (startline) { if (current_subject > md->start_subject + start_offset) { #ifdef SUPPORT_UTF8 if (utf8) { while (current_subject < end_subject && !WAS_NEWLINE(current_subject)) { current_subject++; while(current_subject < end_subject && (*current_subject & 0xc0) == 0x80) current_subject++; } } else #endif while (current_subject < end_subject && !WAS_NEWLINE(current_subject)) current_subject++; /* If we have just passed a CR and the newline option is ANY or ANYCRLF, and we are now at a LF, advance the match position by one more character. */ if (current_subject[-1] == '\r' && (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && current_subject < end_subject && *current_subject == '\n') current_subject++; } } /* Or to a non-unique first char after study */ else if (start_bits != NULL) { while (current_subject < end_subject) { register unsigned int c = *current_subject; if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++; else break; } } /* Restore fudged end_subject */ end_subject = save_end_subject; } /* If req_byte is set, we know that that character must appear in the subject for the match to succeed. If the first character is set, req_byte must be later in the subject; otherwise the test starts at the match point. This optimization can save a huge amount of work in patterns with nested unlimited repeats that aren't going to match. Writing separate code for cased/caseless versions makes it go faster, as does using an autoincrement and backing off on a match. HOWEVER: when the subject string is very, very long, searching to its end can take a long time, and give bad performance on quite ordinary patterns. This showed up when somebody was matching /^C/ on a 32-megabyte string... so we don't do this when the string is sufficiently long. ALSO: this processing is disabled when partial matching is requested. */ if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX && (options & PCRE_PARTIAL) == 0) { register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0); /* We don't need to repeat the search if we haven't yet reached the place we found it at last time. */ if (p > req_byte_ptr) { if (req_byte_caseless) { while (p < end_subject) { register int pp = *p++; if (pp == req_byte || pp == req_byte2) { p--; break; } } } else { while (p < end_subject) { if (*p++ == req_byte) { p--; break; } } } /* If we can't find the required character, break the matching loop, which will cause a return or PCRE_ERROR_NOMATCH. */ if (p >= end_subject) break; /* If we have found the required character, save the point where we found it, so that we don't search again next time round the loop if the start hasn't passed this character yet. */ req_byte_ptr = p; } } /* OK, now we can do the business */ rc = internal_dfa_exec( md, /* fixed match data */ md->start_code, /* this subexpression's code */ current_subject, /* where we currently are */ start_offset, /* start offset in subject */ offsets, /* offset vector */ offsetcount, /* size of same */ workspace, /* workspace vector */ wscount, /* size of same */ re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */ 0, /* function recurse level */ 0); /* regex recurse level */ /* Anything other than "no match" means we are done, always; otherwise, carry on only if not anchored. */ if (rc != PCRE_ERROR_NOMATCH || anchored) return rc; /* Advance to the next subject character unless we are at the end of a line and firstline is set. */ if (firstline && IS_NEWLINE(current_subject)) break; current_subject++; if (utf8) { while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80) current_subject++; } if (current_subject > end_subject) break; /* If we have just passed a CR and we are now at a LF, and the pattern does not contain any explicit matches for \r or \n, and the newline option is CRLF or ANY or ANYCRLF, advance the match position by one more character. */ if (current_subject[-1] == '\r' && current_subject < end_subject && *current_subject == '\n' && (re->flags & PCRE_HASCRORLF) == 0 && (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF || md->nllen == 2)) current_subject++; } /* "Bumpalong" loop */ return PCRE_ERROR_NOMATCH; } | pcredfa.c | 2508 |
pcreexec.c | |||
Type | Function | Source | Line |
STATIC VOID | pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
static void pchars(const uschar *p, int length, BOOL is_subject, match_data *md) { unsigned int c; if (is_subject && length > md->end_subject - p) length = md->end_subject - p; while (length-- > 0) if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); } #endif | pcreexec.c | 109 |
STATIC BOOL | match_ref(int offset, register USPTR eptr, int length, match_data *md, unsigned long int ims)
static BOOL match_ref(int offset, register USPTR eptr, int length, match_data *md, unsigned long int ims) { USPTR p = md->start_subject + md->offset_vector[offset]; #ifdef DEBUG if (eptr >= md->end_subject) printf("matching subject | pcreexec.c | 138 |
STATIC INT | match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart, int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, int flags, unsigned int rdepth)
static int match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart, int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, int flags, unsigned int rdepth) { /* These variables do not need to be preserved over recursion in this function, so they can be ordinary variables in all cases. Mark some of them with "register" because they are used a lot in loops. */ register int rrc; /* Returns from recursive calls */ register int i; /* Used for loops not involving calls to RMATCH() */ register unsigned int c; /* Character values not kept over RMATCH() calls */ register BOOL utf8; /* Local copy of UTF-8 flag for speed */ BOOL minimize, possessive; /* Quantifier options */ /* When recursion is not being used, all "local" variables that have to be preserved over calls to RMATCH() are part of a "frame" which is obtained from heap storage. Set up the top-level frame here; others are obtained from the heap whenever RMATCH() does a "recursion". See the macro definitions above. */ #ifdef NO_RECURSE heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe)); frame->Xprevframe = NULL; /* Marks the top level */ /* Copy in the original argument variables */ frame->Xeptr = eptr; frame->Xecode = ecode; frame->Xmstart = mstart; frame->Xoffset_top = offset_top; frame->Xims = ims; frame->Xeptrb = eptrb; frame->Xflags = flags; frame->Xrdepth = rdepth; /* This is where control jumps back to to effect "recursion" */ HEAP_RECURSE: /* Macros make the argument variables come from the current frame */ #define eptr frame->Xeptr #define ecode frame->Xecode #define mstart frame->Xmstart #define offset_top frame->Xoffset_top #define ims frame->Xims #define eptrb frame->Xeptrb #define flags frame->Xflags #define rdepth frame->Xrdepth /* Ditto for the local variables */ #ifdef SUPPORT_UTF8 #define charptr frame->Xcharptr #endif #define callpat frame->Xcallpat #define data frame->Xdata #define next frame->Xnext #define pp frame->Xpp #define prev frame->Xprev #define saved_eptr frame->Xsaved_eptr #define new_recursive frame->Xnew_recursive #define cur_is_word frame->Xcur_is_word #define condition frame->Xcondition #define prev_is_word frame->Xprev_is_word #define original_ims frame->Xoriginal_ims #ifdef SUPPORT_UCP #define prop_type frame->Xprop_type #define prop_value frame->Xprop_value #define prop_fail_result frame->Xprop_fail_result #define prop_category frame->Xprop_category #define prop_chartype frame->Xprop_chartype #define prop_script frame->Xprop_script #define oclength frame->Xoclength #define occhars frame->Xocchars #endif #define ctype frame->Xctype #define fc frame->Xfc #define fi frame->Xfi #define length frame->Xlength #define max frame->Xmax #define min frame->Xmin #define number frame->Xnumber #define offset frame->Xoffset #define op frame->Xop #define save_capture_last frame->Xsave_capture_last #define save_offset1 frame->Xsave_offset1 #define save_offset2 frame->Xsave_offset2 #define save_offset3 frame->Xsave_offset3 #define stacksave frame->Xstacksave #define newptrb frame->Xnewptrb /* When recursion is being used, local variables are allocated on the stack and get preserved during recursion in the normal way. In this environment, fi and i, and fc and c, can be the same variables. */ #else /* NO_RECURSE not defined */ #define fi i #define fc c #ifdef SUPPORT_UTF8 /* Many of these variables are used only */ const uschar *charptr; /* in small blocks of the code. My normal */ #endif /* style of coding would have declared */ const uschar *callpat; /* them within each of those blocks. */ const uschar *data; /* However, in order to accommodate the */ const uschar *next; /* version of this code that uses an */ USPTR pp; /* external "stack" implemented on the */ const uschar *prev; /* heap, it is easier to declare them all */ USPTR saved_eptr; /* here, so the declarations can be cut */ /* out in a block. The only declarations */ recursion_info new_recursive; /* within blocks below are for variables */ /* that do not have to be preserved over */ BOOL cur_is_word; /* a recursive call to RMATCH(). */ BOOL condition; BOOL prev_is_word; unsigned long int original_ims; #ifdef SUPPORT_UCP int prop_type; int prop_value; int prop_fail_result; int prop_category; int prop_chartype; int prop_script; int oclength; uschar occhars[8]; #endif int ctype; int length; int max; int min; int number; int offset; int op; int save_capture_last; int save_offset1, save_offset2, save_offset3; int stacksave[REC_STACK_SAVE_MAX]; eptrblock newptrb; #endif /* NO_RECURSE */ /* These statements are here to stop the compiler complaining about unitialized variables. */ #ifdef SUPPORT_UCP prop_value = 0; prop_fail_result = 0; #endif /* This label is used for tail recursion, which is used in a few cases even when NO_RECURSE is not defined, in order to reduce the amount of stack that is used. Thanks to Ian Taylor for noticing this possibility and sending the original patch. */ TAIL_RECURSE: /* OK, now we can get on with the real code of the function. Recursive calls are specified by the macro RMATCH and RRETURN is used to return. When NO_RECURSE is *not* defined, these just turn into a recursive call to match() and a "return", respectively (possibly with some debugging if DEBUG is defined). However, RMATCH isn't like a function call because it's quite a complicated macro. It has to be used in one particular way. This shouldn't, however, impact performance when true recursion is being used. */ #ifdef SUPPORT_UTF8 utf8 = md->utf8; /* Local copy of the flag */ #else utf8 = FALSE; #endif /* First check that we haven't called match() too many times, or that we haven't exceeded the recursive call limit. */ if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT); if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT); original_ims = ims; /* Save for resetting on ')' */ /* At the start of a group with an unlimited repeat that may match an empty string, the match_cbegroup flag is set. When this is the case, add the current subject pointer to the chain of such remembered pointers, to be checked when we hit the closing ket, in order to break infinite loops that match no characters. When match() is called in other circumstances, don't add to the chain. The match_cbegroup flag must NOT be used with tail recursion, because the memory block that is used is on the stack, so a new one may be required for each match(). */ if ((flags & match_cbegroup) != 0) { newptrb.epb_saved_eptr = eptr; newptrb.epb_prev = eptrb; eptrb = &newptrb; } /* Now start processing the opcodes. */ for (;;) { minimize = possessive = FALSE; op = *ecode; /* For partial matching, remember if we ever hit the end of the subject after matching at least one subject character. */ if (md->partial && eptr >= md->end_subject && eptr > mstart) md->hitend = TRUE; switch(op) { case OP_FAIL: RRETURN(MATCH_NOMATCH); case OP_PRUNE: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM51); if (rrc != MATCH_NOMATCH) RRETURN(rrc); RRETURN(MATCH_PRUNE); case OP_COMMIT: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM52); if (rrc != MATCH_NOMATCH) RRETURN(rrc); RRETURN(MATCH_COMMIT); case OP_SKIP: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM53); if (rrc != MATCH_NOMATCH) RRETURN(rrc); md->start_match_ptr = eptr; /* Pass back current position */ RRETURN(MATCH_SKIP); case OP_THEN: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM54); if (rrc != MATCH_NOMATCH) RRETURN(rrc); RRETURN(MATCH_THEN); /* Handle a capturing bracket. If there is space in the offset vector, save the current subject position in the working slot at the top of the vector. We mustn't change the current values of the data slot, because they may be set from a previous iteration of this group, and be referred to by a reference inside the group. If the bracket fails to match, we need to restore this value and also the values of the final offsets, in case they were set by a previous iteration of the same bracket. If there isn't enough space in the offset vector, treat this as if it were a non-capturing bracket. Don't worry about setting the flag for the error case here; that is handled in the code for KET. */ case OP_CBRA: case OP_SCBRA: number = GET2(ecode, 1+LINK_SIZE); offset = number << 1; #ifdef DEBUG printf("start bracket %d\n", number); printf("subject="); pchars(eptr, 16, TRUE, md); printf("\n"); #endif if (offset < md->offset_max) { save_offset1 = md->offset_vector[offset]; save_offset2 = md->offset_vector[offset+1]; save_offset3 = md->offset_vector[md->offset_end - number]; save_capture_last = md->capture_last; DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); md->offset_vector[md->offset_end - number] = eptr - md->start_subject; flags = (op == OP_SCBRA)? match_cbegroup : 0; do { RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM1); if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); md->capture_last = save_capture_last; ecode += GET(ecode, 1); } while (*ecode == OP_ALT); DPRINTF(("bracket %d failed\n", number)); md->offset_vector[offset] = save_offset1; md->offset_vector[offset+1] = save_offset2; md->offset_vector[md->offset_end - number] = save_offset3; RRETURN(MATCH_NOMATCH); } /* FALL THROUGH ... Insufficient room for saving captured contents. Treat as a non-capturing bracket. */ /* VVVVVVVVVVVVVVVVVVVVVVVVV */ /* VVVVVVVVVVVVVVVVVVVVVVVVV */ DPRINTF(("insufficient capture room: treat as non-capturing\n")); /* VVVVVVVVVVVVVVVVVVVVVVVVV */ /* VVVVVVVVVVVVVVVVVVVVVVVVV */ /* Non-capturing bracket. Loop for all the alternatives. When we get to the final alternative within the brackets, we would return the result of a recursive call to match() whatever happened. We can reduce stack usage by turning this into a tail recursion, except in the case when match_cbegroup is set.*/ case OP_BRA: case OP_SBRA: DPRINTF(("start non-capturing bracket\n")); flags = (op >= OP_SBRA)? match_cbegroup : 0; for (;;) { if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */ { if (flags == 0) /* Not a possibly empty group */ { ecode += _pcre_OP_lengths[*ecode]; DPRINTF(("bracket 0 tail recursion\n")); goto TAIL_RECURSE; } /* Possibly empty group; can't use tail recursion. */ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM48); RRETURN(rrc); } /* For non-final alternatives, continue the loop for a NOMATCH result; otherwise return. */ RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, eptrb, flags, RM2); if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); ecode += GET(ecode, 1); } /* Control never reaches here. */ /* Conditional group: compilation checked that there are no more than two branches. If the condition is false, skipping the first branch takes us past the end if there is only one branch, but that's OK because that is exactly what going to the ket would do. As there is only one branch to be obeyed, we can use tail recursion to avoid using another stack frame. */ case OP_COND: case OP_SCOND: if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */ { offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ condition = md->recursive != NULL && (offset == RREF_ANY || offset == md->recursive->group_num); ecode += condition? 3 : GET(ecode, 1); } else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */ { offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ condition = offset < offset_top && md->offset_vector[offset] >= 0; ecode += condition? 3 : GET(ecode, 1); } else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */ { condition = FALSE; ecode += GET(ecode, 1); } /* The condition is an assertion. Call match() to evaluate it - setting the final argument match_condassert causes it to stop at the end of an assertion. */ else { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, match_condassert, RM3); if (rrc == MATCH_MATCH) { condition = TRUE; ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2); while (*ecode == OP_ALT) ecode += GET(ecode, 1); } else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) { RRETURN(rrc); /* Need braces because of following else */ } else { condition = FALSE; ecode += GET(ecode, 1); } } /* We are now at the branch that is to be obeyed. As there is only one, we can use tail recursion to avoid using another stack frame, except when match_cbegroup is required for an unlimited repeat of a possibly empty group. If the second alternative doesn't exist, we can just plough on. */ if (condition || *ecode == OP_ALT) { ecode += 1 + LINK_SIZE; if (op == OP_SCOND) /* Possibly empty group */ { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49); RRETURN(rrc); } else /* Group must match something */ { flags = 0; goto TAIL_RECURSE; } } else /* Condition false & no 2nd alternative */ { ecode += 1 + LINK_SIZE; } break; /* End of the pattern, either real or forced. If we are in a top-level recursion, we should restore the offsets appropriately and continue from after the call. */ case OP_ACCEPT: case OP_END: if (md->recursive != NULL && md->recursive->group_num == 0) { recursion_info *rec = md->recursive; DPRINTF(("End of pattern in a (?0) recursion\n")); md->recursive = rec->prevrec; memmove(md->offset_vector, rec->offset_save, rec->saved_max * sizeof(int)); mstart = rec->save_start; ims = original_ims; ecode = rec->after_call; break; } /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty string - backtracking will then try other alternatives, if any. */ if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH); md->end_match_ptr = eptr; /* Record where we ended */ md->end_offset_top = offset_top; /* and how many extracts were taken */ md->start_match_ptr = mstart; /* and the start (\K can modify) */ RRETURN(MATCH_MATCH); /* Change option settings */ case OP_OPT: ims = ecode[1]; ecode += 2; DPRINTF(("ims set to %02lx\n", ims)); break; /* Assertion brackets. Check the alternative branches in turn - the matching won't pass the KET for an assertion. If any one branch matches, the assertion is true. Lookbehind assertions have an OP_REVERSE item at the start of each branch to move the current point backwards, so the code at this level is identical to the lookahead case. */ case OP_ASSERT: case OP_ASSERTBACK: do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, RM4); if (rrc == MATCH_MATCH) break; if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); ecode += GET(ecode, 1); } while (*ecode == OP_ALT); if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); /* If checking an assertion for a condition, return MATCH_MATCH. */ if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); /* Continue from after the assertion, updating the offsets high water mark, since extracts may have been taken during the assertion. */ do ecode += GET(ecode,1); while (*ecode == OP_ALT); ecode += 1 + LINK_SIZE; offset_top = md->end_offset_top; continue; /* Negative assertion: all branches must fail to match */ case OP_ASSERT_NOT: case OP_ASSERTBACK_NOT: do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, RM5); if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH); if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); ecode += GET(ecode,1); } while (*ecode == OP_ALT); if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); ecode += 1 + LINK_SIZE; continue; /* Move the subject pointer back. This occurs only at the start of each branch of a lookbehind assertion. If we are too close to the start to move back, this match function fails. When working with UTF-8 we move back a number of characters, not bytes. */ case OP_REVERSE: #ifdef SUPPORT_UTF8 if (utf8) { i = GET(ecode, 1); while (i-- > 0) { eptr--; if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); BACKCHAR(eptr); } } else #endif /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ { eptr -= GET(ecode, 1); if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); } /* Skip to next op code */ ecode += 1 + LINK_SIZE; break; /* The callout item calls an external function, if one is provided, passing details of the match so far. This is mainly for debugging, though the function is able to force a failure. */ case OP_CALLOUT: if (pcre_callout != NULL) { pcre_callout_block cb; cb.version = 1; /* Version 1 of the callout block */ cb.callout_number = ecode[1]; cb.offset_vector = md->offset_vector; cb.subject = (PCRE_SPTR)md->start_subject; cb.subject_length = md->end_subject - md->start_subject; cb.start_match = mstart - md->start_subject; cb.current_position = eptr - md->start_subject; cb.pattern_position = GET(ecode, 2); cb.next_item_length = GET(ecode, 2 + LINK_SIZE); cb.capture_top = offset_top/2; cb.capture_last = md->capture_last; cb.callout_data = md->callout_data; if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); if (rrc < 0) RRETURN(rrc); } ecode += 2 + 2*LINK_SIZE; break; /* Recursion either matches the current regex, or some subexpression. The offset data is the offset to the starting bracket from the start of the whole pattern. (This is so that it works from duplicated subpatterns.) If there are any capturing brackets started but not finished, we have to save their starting points and reinstate them after the recursion. However, we don't know how many such there are (offset_top records the completed total) so we just have to save all the potential data. There may be up to 65535 such values, which is too large to put on the stack, but using malloc for small numbers seems expensive. As a compromise, the stack is used when there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc is used. A problem is what to do if the malloc fails ... there is no way of returning to the top level with an error. Save the top REC_STACK_SAVE_MAX values on the stack, and accept that the rest may be wrong. There are also other values that have to be saved. We use a chained sequence of blocks that actually live on the stack. Thanks to Robin Houston for the original version of this logic. */ case OP_RECURSE: { callpat = md->start_code + GET(ecode, 1); new_recursive.group_num = (callpat == md->start_code)? 0 : GET2(callpat, 1 + LINK_SIZE); /* Add to "recursing stack" */ new_recursive.prevrec = md->recursive; md->recursive = &new_recursive; /* Find where to continue from afterwards */ ecode += 1 + LINK_SIZE; new_recursive.after_call = ecode; /* Now save the offset data. */ new_recursive.saved_max = md->offset_end; if (new_recursive.saved_max <= REC_STACK_SAVE_MAX) new_recursive.offset_save = stacksave; else { new_recursive.offset_save = (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int)); if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY); } memcpy(new_recursive.offset_save, md->offset_vector, new_recursive.saved_max * sizeof(int)); new_recursive.save_start = mstart; mstart = eptr; /* OK, now we can do the recursion. For each top-level alternative we restore the offset and recursion data. */ DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); flags = (*callpat >= OP_SBRA)? match_cbegroup : 0; do { RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top, md, ims, eptrb, flags, RM6); if (rrc == MATCH_MATCH) { DPRINTF(("Recursion matched\n")); md->recursive = new_recursive.prevrec; if (new_recursive.offset_save != stacksave) (pcre_free)(new_recursive.offset_save); RRETURN(MATCH_MATCH); } else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) { DPRINTF(("Recursion gave error %d\n", rrc)); RRETURN(rrc); } md->recursive = &new_recursive; memcpy(md->offset_vector, new_recursive.offset_save, new_recursive.saved_max * sizeof(int)); callpat += GET(callpat, 1); } while (*callpat == OP_ALT); DPRINTF(("Recursion didn't match\n")); md->recursive = new_recursive.prevrec; if (new_recursive.offset_save != stacksave) (pcre_free)(new_recursive.offset_save); RRETURN(MATCH_NOMATCH); } /* Control never reaches here */ /* "Once" brackets are like assertion brackets except that after a match, the point in the subject string is not moved back. Thus there can never be a move back into the brackets. Friedl calls these "atomic" subpatterns. Check the alternative branches in turn - the matching won't pass the KET for this kind of subpattern. If any one branch matches, we carry on as at the end of a normal bracket, leaving the subject pointer. */ case OP_ONCE: prev = ecode; saved_eptr = eptr; do { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7); if (rrc == MATCH_MATCH) break; if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); ecode += GET(ecode,1); } while (*ecode == OP_ALT); /* If hit the end of the group (which could be repeated), fail */ if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH); /* Continue as from after the assertion, updating the offsets high water mark, since extracts may have been taken. */ do ecode += GET(ecode, 1); while (*ecode == OP_ALT); offset_top = md->end_offset_top; eptr = md->end_match_ptr; /* For a non-repeating ket, just continue at this level. This also happens for a repeating ket if no characters were matched in the group. This is the forcible breaking of infinite loops as implemented in Perl 5.005. If there is an options reset, it will get obeyed in the normal course of events. */ if (*ecode == OP_KET || eptr == saved_eptr) { ecode += 1+LINK_SIZE; break; } /* The repeating kets try the rest of the pattern or restart from the preceding bracket, in the appropriate order. The second "call" of match() uses tail recursion, to avoid using another stack frame. We need to reset any options that changed within the bracket before re-running it, so check the next opcode. */ if (ecode[1+LINK_SIZE] == OP_OPT) { ims = (ims & ~PCRE_IMS) | ecode[4]; DPRINTF(("ims set to %02lx at group repeat\n", ims)); } if (*ecode == OP_KETRMIN) { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode = prev; flags = 0; goto TAIL_RECURSE; } else /* OP_KETRMAX */ { RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += 1 + LINK_SIZE; flags = 0; goto TAIL_RECURSE; } /* Control never gets here */ /* An alternation is the end of a branch; scan along to find the end of the bracketed group and go to there. */ case OP_ALT: do ecode += GET(ecode,1); while (*ecode == OP_ALT); break; /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group, indicating that it may occur zero times. It may repeat infinitely, or not at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets with fixed upper repeat limits are compiled as a number of copies, with the optional ones preceded by BRAZERO or BRAMINZERO. */ case OP_BRAZERO: { next = ecode+1; RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10); if (rrc != MATCH_NOMATCH) RRETURN(rrc); do next += GET(next,1); while (*next == OP_ALT); ecode = next + 1 + LINK_SIZE; } break; case OP_BRAMINZERO: { next = ecode+1; do next += GET(next, 1); while (*next == OP_ALT); RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode++; } break; case OP_SKIPZERO: { next = ecode+1; do next += GET(next,1); while (*next == OP_ALT); ecode = next + 1 + LINK_SIZE; } break; /* End of a group, repeated or non-repeating. */ case OP_KET: case OP_KETRMIN: case OP_KETRMAX: prev = ecode - GET(ecode, 1); /* If this was a group that remembered the subject start, in order to break infinite repeats of empty string matches, retrieve the subject start from the chain. Otherwise, set it NULL. */ if (*prev >= OP_SBRA) { saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */ eptrb = eptrb->epb_prev; /* Backup to previous group */ } else saved_eptr = NULL; /* If we are at the end of an assertion group, stop matching and return MATCH_MATCH, but record the current high water mark for use by positive assertions. Do this also for the "once" (atomic) groups. */ if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT || *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT || *prev == OP_ONCE) { md->end_match_ptr = eptr; /* For ONCE */ md->end_offset_top = offset_top; RRETURN(MATCH_MATCH); } /* For capturing groups we have to check the group number back at the start and if necessary complete handling an extraction by setting the offsets and bumping the high water mark. Note that whole-pattern recursion is coded as a recurse into group 0, so it won't be picked up here. Instead, we catch it when the OP_END is reached. Other recursion is handled here. */ if (*prev == OP_CBRA || *prev == OP_SCBRA) { number = GET2(prev, 1+LINK_SIZE); offset = number << 1; #ifdef DEBUG printf("end bracket %d", number); printf("\n"); #endif md->capture_last = number; if (offset >= md->offset_max) md->offset_overflow = TRUE; else { md->offset_vector[offset] = md->offset_vector[md->offset_end - number]; md->offset_vector[offset+1] = eptr - md->start_subject; if (offset_top <= offset) offset_top = offset + 2; } /* Handle a recursively called group. Restore the offsets appropriately and continue from after the call. */ if (md->recursive != NULL && md->recursive->group_num == number) { recursion_info *rec = md->recursive; DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); md->recursive = rec->prevrec; mstart = rec->save_start; memcpy(md->offset_vector, rec->offset_save, rec->saved_max * sizeof(int)); ecode = rec->after_call; ims = original_ims; break; } } /* For both capturing and non-capturing groups, reset the value of the ims flags, in case they got changed during the group. */ ims = original_ims; DPRINTF(("ims reset to %02lx\n", ims)); /* For a non-repeating ket, just continue at this level. This also happens for a repeating ket if no characters were matched in the group. This is the forcible breaking of infinite loops as implemented in Perl 5.005. If there is an options reset, it will get obeyed in the normal course of events. */ if (*ecode == OP_KET || eptr == saved_eptr) { ecode += 1 + LINK_SIZE; break; } /* The repeating kets try the rest of the pattern or restart from the preceding bracket, in the appropriate order. In the second case, we can use tail recursion to avoid using another stack frame, unless we have an unlimited repeat of a group that can match an empty string. */ flags = (*prev >= OP_SBRA)? match_cbegroup : 0; if (*ecode == OP_KETRMIN) { RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (flags != 0) /* Could match an empty string */ { RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50); RRETURN(rrc); } ecode = prev; goto TAIL_RECURSE; } else /* OP_KETRMAX */ { RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13); if (rrc != MATCH_NOMATCH) RRETURN(rrc); ecode += 1 + LINK_SIZE; flags = 0; goto TAIL_RECURSE; } /* Control never gets here */ /* Start of subject unless notbol, or after internal newline if multiline */ case OP_CIRC: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH); if ((ims & PCRE_MULTILINE) != 0) { if (eptr != md->start_subject && (eptr == md->end_subject || !WAS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); ecode++; break; } /* ... else fall through */ /* Start of subject assertion */ case OP_SOD: if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH); ecode++; break; /* Start of match assertion */ case OP_SOM: if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH); ecode++; break; /* Reset the start of match point */ case OP_SET_SOM: mstart = eptr; ecode++; break; /* Assert before internal newline if multiline, or before a terminating newline unless endonly is set, else end of subject unless noteol is set. */ case OP_DOLL: if ((ims & PCRE_MULTILINE) != 0) { if (eptr < md->end_subject) { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); } else { if (md->noteol) RRETURN(MATCH_NOMATCH); } ecode++; break; } else { if (md->noteol) RRETURN(MATCH_NOMATCH); if (!md->endonly) { if (eptr != md->end_subject && (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) RRETURN(MATCH_NOMATCH); ecode++; break; } } /* ... else fall through for endonly */ /* End of subject assertion (\z) */ case OP_EOD: if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH); ecode++; break; /* End of subject or ending \n assertion (\Z) */ case OP_EODN: if (eptr != md->end_subject && (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen)) RRETURN(MATCH_NOMATCH); ecode++; break; /* Word boundary assertions */ case OP_NOT_WORD_BOUNDARY: case OP_WORD_BOUNDARY: { /* Find out if the previous and current characters are "word" characters. It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to be "non-word" characters. */ #ifdef SUPPORT_UTF8 if (utf8) { if (eptr == md->start_subject) prev_is_word = FALSE; else { const uschar *lastptr = eptr - 1; while((*lastptr & 0xc0) == 0x80) lastptr--; GETCHAR(c, lastptr); prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; } if (eptr >= md->end_subject) cur_is_word = FALSE; else { GETCHAR(c, eptr); cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0; } } else #endif /* More streamlined when not in UTF-8 mode */ { prev_is_word = (eptr != md->start_subject) && ((md->ctypes[eptr[-1]] & ctype_word) != 0); cur_is_word = (eptr < md->end_subject) && ((md->ctypes[*eptr] & ctype_word) != 0); } /* Now see if the situation is what we want */ if ((*ecode++ == OP_WORD_BOUNDARY)? cur_is_word == prev_is_word : cur_is_word != prev_is_word) RRETURN(MATCH_NOMATCH); } break; /* Match a single character type; inline for speed */ case OP_ANY: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); /* Fall through */ case OP_ALLANY: if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; ecode++; break; /* Match a single byte, even in UTF-8 mode. This opcode really does match any byte, even newline, independent of the setting of PCRE_DOTALL. */ case OP_ANYBYTE: if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); ecode++; break; case OP_NOT_DIGIT: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); if ( #ifdef SUPPORT_UTF8 c < 256 && #endif (md->ctypes[c] & ctype_digit) != 0 ) RRETURN(MATCH_NOMATCH); ecode++; break; case OP_DIGIT: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); if ( #ifdef SUPPORT_UTF8 c >= 256 || #endif (md->ctypes[c] & ctype_digit) == 0 ) RRETURN(MATCH_NOMATCH); ecode++; break; case OP_NOT_WHITESPACE: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); if ( #ifdef SUPPORT_UTF8 c < 256 && #endif (md->ctypes[c] & ctype_space) != 0 ) RRETURN(MATCH_NOMATCH); ecode++; break; case OP_WHITESPACE: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); if ( #ifdef SUPPORT_UTF8 c >= 256 || #endif (md->ctypes[c] & ctype_space) == 0 ) RRETURN(MATCH_NOMATCH); ecode++; break; case OP_NOT_WORDCHAR: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); if ( #ifdef SUPPORT_UTF8 c < 256 && #endif (md->ctypes[c] & ctype_word) != 0 ) RRETURN(MATCH_NOMATCH); ecode++; break; case OP_WORDCHAR: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); if ( #ifdef SUPPORT_UTF8 c >= 256 || #endif (md->ctypes[c] & ctype_word) == 0 ) RRETURN(MATCH_NOMATCH); ecode++; break; case OP_ANYNL: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; case 0x000a: break; case 0x000b: case 0x000c: case 0x0085: case 0x2028: case 0x2029: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } ecode++; break; case OP_NOT_HSPACE: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); switch(c) { default: break; case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ case 0x2000: /* EN QUAD */ case 0x2001: /* EM QUAD */ case 0x2002: /* EN SPACE */ case 0x2003: /* EM SPACE */ case 0x2004: /* THREE-PER-EM SPACE */ case 0x2005: /* FOUR-PER-EM SPACE */ case 0x2006: /* SIX-PER-EM SPACE */ case 0x2007: /* FIGURE SPACE */ case 0x2008: /* PUNCTUATION SPACE */ case 0x2009: /* THIN SPACE */ case 0x200A: /* HAIR SPACE */ case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ RRETURN(MATCH_NOMATCH); } ecode++; break; case OP_HSPACE: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ case 0x2000: /* EN QUAD */ case 0x2001: /* EM QUAD */ case 0x2002: /* EN SPACE */ case 0x2003: /* EM SPACE */ case 0x2004: /* THREE-PER-EM SPACE */ case 0x2005: /* FOUR-PER-EM SPACE */ case 0x2006: /* SIX-PER-EM SPACE */ case 0x2007: /* FIGURE SPACE */ case 0x2008: /* PUNCTUATION SPACE */ case 0x2009: /* THIN SPACE */ case 0x200A: /* HAIR SPACE */ case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ break; } ecode++; break; case OP_NOT_VSPACE: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); switch(c) { default: break; case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ RRETURN(MATCH_NOMATCH); } ecode++; break; case OP_VSPACE: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ break; } ecode++; break; #ifdef SUPPORT_UCP /* Check the next character by Unicode property. We will get here only if the support is in the binary; otherwise a compile-time error occurs. */ case OP_PROP: case OP_NOTPROP: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); { const ucd_record * prop = GET_UCD(c); switch(ecode[1]) { case PT_ANY: if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH); break; case PT_LAMP: if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt) == (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH); break; case PT_GC: if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; case PT_PC: if ((ecode[2] != prop->chartype) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; case PT_SC: if ((ecode[2] != prop->script) == (op == OP_PROP)) RRETURN(MATCH_NOMATCH); break; default: RRETURN(PCRE_ERROR_INTERNAL); } ecode += 3; } break; /* Match an extended Unicode sequence. We will get here only if the support is in the binary; otherwise a compile-time error occurs. */ case OP_EXTUNI: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); { int category = UCD_CATEGORY(c); if (category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { int len = 1; if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); } category = UCD_CATEGORY(c); if (category != ucp_M) break; eptr += len; } } ecode++; break; #endif /* Match a back reference, possibly repeatedly. Look past the end of the item to see if there is repeat information following. The code is similar to that for character classes, but repeated for efficiency. Then obey similar code to character type repeats - written out again for speed. However, if the referenced string is the empty string, always treat it as matched, any number of times (otherwise there could be infinite loops). */ case OP_REF: { offset = GET2(ecode, 1) << 1; /* Doubled ref number */ ecode += 3; /* If the reference is unset, there are two possibilities: (a) In the default, Perl-compatible state, set the length to be longer than the amount of subject left; this ensures that every attempt at a match fails. We can't just fail here, because of the possibility of quantifiers with zero minima. (b) If the JavaScript compatibility flag is set, set the length to zero so that the back reference matches an empty string. Otherwise, set the length to the length of what was matched by the referenced subpattern. */ if (offset >= offset_top || md->offset_vector[offset] < 0) length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1; else length = md->offset_vector[offset+1] - md->offset_vector[offset]; /* Set up for repetition, or handle the non-repeated case */ switch (*ecode) { case OP_CRSTAR: case OP_CRMINSTAR: case OP_CRPLUS: case OP_CRMINPLUS: case OP_CRQUERY: case OP_CRMINQUERY: c = *ecode++ - OP_CRSTAR; minimize = (c & 1) != 0; min = rep_min[c]; /* Pick up values from tables; */ max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; break; case OP_CRRANGE: case OP_CRMINRANGE: minimize = (*ecode == OP_CRMINRANGE); min = GET2(ecode, 1); max = GET2(ecode, 3); if (max == 0) max = INT_MAX; ecode += 5; break; default: /* No repeat follows */ if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); eptr += length; continue; /* With the main loop */ } /* If the length of the reference is zero, just continue with the main loop. */ if (length == 0) continue; /* First, ensure the minimum number of matches are present. We get back the length of the reference string explicitly rather than passing the address of eptr, so that eptr can be a register variable. */ for (i = 1; i <= min; i++) { if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); eptr += length; } /* If min = max, continue at the same level without recursion. They are not both allowed to be zero. */ if (min == max) continue; /* If minimizing, keep trying and advancing the pointer */ if (minimize) { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || !match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH); eptr += length; } /* Control never gets here */ } /* If maximizing, find the longest string and work backwards */ else { pp = eptr; for (i = min; i < max; i++) { if (!match_ref(offset, eptr, length, md, ims)) break; eptr += length; } while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr -= length; } RRETURN(MATCH_NOMATCH); } } /* Control never gets here */ /* Match a bit-mapped character class, possibly repeatedly. This op code is used when all the characters in the class have values in the range 0-255, and either the matching is caseful, or the characters are in the range 0-127 when UTF-8 processing is enabled. The only difference between OP_CLASS and OP_NCLASS occurs when a data character outside the range is encountered. First, look past the end of the item to see if there is repeat information following. Then obey similar code to character type repeats - written out again for speed. */ case OP_NCLASS: case OP_CLASS: { data = ecode + 1; /* Save for matching */ ecode += 33; /* Advance past the item */ switch (*ecode) { case OP_CRSTAR: case OP_CRMINSTAR: case OP_CRPLUS: case OP_CRMINPLUS: case OP_CRQUERY: case OP_CRMINQUERY: c = *ecode++ - OP_CRSTAR; minimize = (c & 1) != 0; min = rep_min[c]; /* Pick up values from tables; */ max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; break; case OP_CRRANGE: case OP_CRMINRANGE: minimize = (*ecode == OP_CRMINRANGE); min = GET2(ecode, 1); max = GET2(ecode, 3); if (max == 0) max = INT_MAX; ecode += 5; break; default: /* No repeat follows */ min = max = 1; break; } /* First, ensure the minimum number of matches are present. */ #ifdef SUPPORT_UTF8 /* UTF-8 mode */ if (utf8) { for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); if (c > 255) { if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); } else { if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); } } } else #endif /* Not UTF-8 mode */ { for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); c = *eptr++; if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); } } /* If max == min we can continue with the main loop without the need to recurse. */ if (min == max) continue; /* If minimizing, keep testing the rest of the expression and advancing the pointer while it matches the class. */ if (minimize) { #ifdef SUPPORT_UTF8 /* UTF-8 mode */ if (utf8) { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); if (c > 255) { if (op == OP_CLASS) RRETURN(MATCH_NOMATCH); } else { if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); } } } else #endif /* Not UTF-8 mode */ { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); c = *eptr++; if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH); } } /* Control never gets here */ } /* If maximizing, find the longest possible run, then work backwards. */ else { pp = eptr; #ifdef SUPPORT_UTF8 /* UTF-8 mode */ if (utf8) { for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (c > 255) { if (op == OP_CLASS) break; } else { if ((data[c/8] & (1 << (c&7))) == 0) break; } eptr += len; } for (;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; /* Stop if tried at original pos */ BACKCHAR(eptr); } } else #endif /* Not UTF-8 mode */ { for (i = min; i < max; i++) { if (eptr >= md->end_subject) break; c = *eptr; if ((data[c/8] & (1 << (c&7))) == 0) break; eptr++; } while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; } } RRETURN(MATCH_NOMATCH); } } /* Control never gets here */ /* Match an extended character class. This opcode is encountered only in UTF-8 mode, because that's the only time it is compiled. */ #ifdef SUPPORT_UTF8 case OP_XCLASS: { data = ecode + 1 + LINK_SIZE; /* Save for matching */ ecode += GET(ecode, 1); /* Advance past the item */ switch (*ecode) { case OP_CRSTAR: case OP_CRMINSTAR: case OP_CRPLUS: case OP_CRMINPLUS: case OP_CRQUERY: case OP_CRMINQUERY: c = *ecode++ - OP_CRSTAR; minimize = (c & 1) != 0; min = rep_min[c]; /* Pick up values from tables; */ max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; break; case OP_CRRANGE: case OP_CRMINRANGE: minimize = (*ecode == OP_CRMINRANGE); min = GET2(ecode, 1); max = GET2(ecode, 3); if (max == 0) max = INT_MAX; ecode += 5; break; default: /* No repeat follows */ min = max = 1; break; } /* First, ensure the minimum number of matches are present. */ for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); } /* If max == min we can continue with the main loop without the need to recurse. */ if (min == max) continue; /* If minimizing, keep testing the rest of the expression and advancing the pointer while it matches the class. */ if (minimize) { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH); } /* Control never gets here */ } /* If maximizing, find the longest possible run, then work backwards. */ else { pp = eptr; for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (!_pcre_xclass(c, data)) break; eptr += len; } for(;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; /* Stop if tried at original pos */ if (utf8) BACKCHAR(eptr); } RRETURN(MATCH_NOMATCH); } /* Control never gets here */ } #endif /* End of XCLASS */ /* Match a single character, casefully */ case OP_CHAR: #ifdef SUPPORT_UTF8 if (utf8) { length = 1; ecode++; GETCHARLEN(fc, ecode, length); if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH); } else #endif /* Non-UTF-8 mode */ { if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH); ecode += 2; } break; /* Match a single character, caselessly */ case OP_CHARNC: #ifdef SUPPORT_UTF8 if (utf8) { length = 1; ecode++; GETCHARLEN(fc, ecode, length); if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); /* If the pattern character's value is < 128, we have only one byte, and can use the fast lookup table. */ if (fc < 128) { if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); } /* Otherwise we must pick up the subject character */ else { unsigned int dc; GETCHARINC(dc, eptr); ecode += length; /* If we have Unicode property support, we can use it to test the other case of the character, if there is one. */ if (fc != dc) { #ifdef SUPPORT_UCP if (dc != UCD_OTHERCASE(fc)) #endif RRETURN(MATCH_NOMATCH); } } } else #endif /* SUPPORT_UTF8 */ /* Non-UTF-8 mode */ { if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH); if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); ecode += 2; } break; /* Match a single character repeatedly. */ case OP_EXACT: min = max = GET2(ecode, 1); ecode += 3; goto REPEATCHAR; case OP_POSUPTO: possessive = TRUE; /* Fall through */ case OP_UPTO: case OP_MINUPTO: min = 0; max = GET2(ecode, 1); minimize = *ecode == OP_MINUPTO; ecode += 3; goto REPEATCHAR; case OP_POSSTAR: possessive = TRUE; min = 0; max = INT_MAX; ecode++; goto REPEATCHAR; case OP_POSPLUS: possessive = TRUE; min = 1; max = INT_MAX; ecode++; goto REPEATCHAR; case OP_POSQUERY: possessive = TRUE; min = 0; max = 1; ecode++; goto REPEATCHAR; case OP_STAR: case OP_MINSTAR: case OP_PLUS: case OP_MINPLUS: case OP_QUERY: case OP_MINQUERY: c = *ecode++ - OP_STAR; minimize = (c & 1) != 0; min = rep_min[c]; /* Pick up values from tables; */ max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; /* Common code for all repeated single-character matches. We can give up quickly if there are fewer than the minimum number of characters left in the subject. */ REPEATCHAR: #ifdef SUPPORT_UTF8 if (utf8) { length = 1; charptr = ecode; GETCHARLEN(fc, ecode, length); if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); ecode += length; /* Handle multibyte character matching specially here. There is support for caseless matching if UCP support is present. */ if (length > 1) { #ifdef SUPPORT_UCP unsigned int othercase; if ((ims & PCRE_CASELESS) != 0 && (othercase = UCD_OTHERCASE(fc)) != fc) oclength = _pcre_ord2utf8(othercase, occhars); else oclength = 0; #endif /* SUPPORT_UCP */ for (i = 1; i <= min; i++) { if (memcmp(eptr, charptr, length) == 0) eptr += length; #ifdef SUPPORT_UCP /* Need braces because of following else */ else if (oclength == 0) { RRETURN(MATCH_NOMATCH); } else { if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH); eptr += oclength; } #else /* without SUPPORT_UCP */ else { RRETURN(MATCH_NOMATCH); } #endif /* SUPPORT_UCP */ } if (min == max) continue; if (minimize) { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); if (memcmp(eptr, charptr, length) == 0) eptr += length; #ifdef SUPPORT_UCP /* Need braces because of following else */ else if (oclength == 0) { RRETURN(MATCH_NOMATCH); } else { if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH); eptr += oclength; } #else /* without SUPPORT_UCP */ else { RRETURN (MATCH_NOMATCH); } #endif /* SUPPORT_UCP */ } /* Control never gets here */ } else /* Maximize */ { pp = eptr; for (i = min; i < max; i++) { if (eptr > md->end_subject - length) break; if (memcmp(eptr, charptr, length) == 0) eptr += length; #ifdef SUPPORT_UCP else if (oclength == 0) break; else { if (memcmp(eptr, occhars, oclength) != 0) break; eptr += oclength; } #else /* without SUPPORT_UCP */ else break; #endif /* SUPPORT_UCP */ } if (possessive) continue; for(;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr == pp) RRETURN(MATCH_NOMATCH); #ifdef SUPPORT_UCP eptr--; BACKCHAR(eptr); #else /* without SUPPORT_UCP */ eptr -= length; #endif /* SUPPORT_UCP */ } } /* Control never gets here */ } /* If the length of a UTF-8 character is 1, we fall through here, and obey the code as for non-UTF-8 characters below, though in this case the value of fc will always be < 128. */ } else #endif /* SUPPORT_UTF8 */ /* When not in UTF-8 mode, load a single-byte character. */ { if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); fc = *ecode++; } /* The value of fc at this point is always less than 256, though we may or may not be in UTF-8 mode. The code is duplicated for the caseless and caseful cases, for speed, since matching characters is likely to be quite common. First, ensure the minimum number of matches are present. If min = max, continue at the same level without recursing. Otherwise, if minimizing, keep trying the rest of the expression and advancing one matching character if failing, up to the maximum. Alternatively, if maximizing, find the maximum number of characters and work backwards. */ DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max, max, eptr)); if ((ims & PCRE_CASELESS) != 0) { fc = md->lcc[fc]; for (i = 1; i <= min; i++) if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); if (min == max) continue; if (minimize) { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject || fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); } /* Control never gets here */ } else /* Maximize */ { pp = eptr; for (i = min; i < max; i++) { if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break; eptr++; } if (possessive) continue; while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25); eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); } RRETURN(MATCH_NOMATCH); } /* Control never gets here */ } /* Caseful comparisons (includes all multi-byte characters) */ else { for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH); if (min == max) continue; if (minimize) { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject || fc != *eptr++) RRETURN(MATCH_NOMATCH); } /* Control never gets here */ } else /* Maximize */ { pp = eptr; for (i = min; i < max; i++) { if (eptr >= md->end_subject || fc != *eptr) break; eptr++; } if (possessive) continue; while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27); eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); } RRETURN(MATCH_NOMATCH); } } /* Control never gets here */ /* Match a negated single one-byte character. The character we are checking can be multibyte. */ case OP_NOT: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); ecode++; GETCHARINCTEST(c, eptr); if ((ims & PCRE_CASELESS) != 0) { #ifdef SUPPORT_UTF8 if (c < 256) #endif c = md->lcc[c]; if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH); } else { if (*ecode++ == c) RRETURN(MATCH_NOMATCH); } break; /* Match a negated single one-byte character repeatedly. This is almost a repeat of the code for a repeated single character, but I haven't found a nice way of commoning these up that doesn't require a test of the positive/negative option for each character match. Maybe that wouldn't add very much to the time taken, but character matching *is* what this is all about... */ case OP_NOTEXACT: min = max = GET2(ecode, 1); ecode += 3; goto REPEATNOTCHAR; case OP_NOTUPTO: case OP_NOTMINUPTO: min = 0; max = GET2(ecode, 1); minimize = *ecode == OP_NOTMINUPTO; ecode += 3; goto REPEATNOTCHAR; case OP_NOTPOSSTAR: possessive = TRUE; min = 0; max = INT_MAX; ecode++; goto REPEATNOTCHAR; case OP_NOTPOSPLUS: possessive = TRUE; min = 1; max = INT_MAX; ecode++; goto REPEATNOTCHAR; case OP_NOTPOSQUERY: possessive = TRUE; min = 0; max = 1; ecode++; goto REPEATNOTCHAR; case OP_NOTPOSUPTO: possessive = TRUE; min = 0; max = GET2(ecode, 1); ecode += 3; goto REPEATNOTCHAR; case OP_NOTSTAR: case OP_NOTMINSTAR: case OP_NOTPLUS: case OP_NOTMINPLUS: case OP_NOTQUERY: case OP_NOTMINQUERY: c = *ecode++ - OP_NOTSTAR; minimize = (c & 1) != 0; min = rep_min[c]; /* Pick up values from tables; */ max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; /* Common code for all repeated single-byte matches. We can give up quickly if there are fewer than the minimum number of bytes left in the subject. */ REPEATNOTCHAR: if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); fc = *ecode++; /* The code is duplicated for the caseless and caseful cases, for speed, since matching characters is likely to be quite common. First, ensure the minimum number of matches are present. If min = max, continue at the same level without recursing. Otherwise, if minimizing, keep trying the rest of the expression and advancing one matching character if failing, up to the maximum. Alternatively, if maximizing, find the maximum number of characters and work backwards. */ DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max, max, eptr)); if ((ims & PCRE_CASELESS) != 0) { fc = md->lcc[fc]; #ifdef SUPPORT_UTF8 /* UTF-8 mode */ if (utf8) { register unsigned int d; for (i = 1; i <= min; i++) { GETCHARINC(d, eptr); if (d < 256) d = md->lcc[d]; if (fc == d) RRETURN(MATCH_NOMATCH); } } else #endif /* Not UTF-8 mode */ { for (i = 1; i <= min; i++) if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); } if (min == max) continue; if (minimize) { #ifdef SUPPORT_UTF8 /* UTF-8 mode */ if (utf8) { register unsigned int d; for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(d, eptr); if (d < 256) d = md->lcc[d]; if (fc == d) RRETURN(MATCH_NOMATCH); } } else #endif /* Not UTF-8 mode */ { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH); } } /* Control never gets here */ } /* Maximize case */ else { pp = eptr; #ifdef SUPPORT_UTF8 /* UTF-8 mode */ if (utf8) { register unsigned int d; for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(d, eptr, len); if (d < 256) d = md->lcc[d]; if (fc == d) break; eptr += len; } if (possessive) continue; for(;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; /* Stop if tried at original pos */ BACKCHAR(eptr); } } else #endif /* Not UTF-8 mode */ { for (i = min; i < max; i++) { if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break; eptr++; } if (possessive) continue; while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; } } RRETURN(MATCH_NOMATCH); } /* Control never gets here */ } /* Caseful comparisons */ else { #ifdef SUPPORT_UTF8 /* UTF-8 mode */ if (utf8) { register unsigned int d; for (i = 1; i <= min; i++) { GETCHARINC(d, eptr); if (fc == d) RRETURN(MATCH_NOMATCH); } } else #endif /* Not UTF-8 mode */ { for (i = 1; i <= min; i++) if (fc == *eptr++) RRETURN(MATCH_NOMATCH); } if (min == max) continue; if (minimize) { #ifdef SUPPORT_UTF8 /* UTF-8 mode */ if (utf8) { register unsigned int d; for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(d, eptr); if (fc == d) RRETURN(MATCH_NOMATCH); } } else #endif /* Not UTF-8 mode */ { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject || fc == *eptr++) RRETURN(MATCH_NOMATCH); } } /* Control never gets here */ } /* Maximize case */ else { pp = eptr; #ifdef SUPPORT_UTF8 /* UTF-8 mode */ if (utf8) { register unsigned int d; for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(d, eptr, len); if (fc == d) break; eptr += len; } if (possessive) continue; for(;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; /* Stop if tried at original pos */ BACKCHAR(eptr); } } else #endif /* Not UTF-8 mode */ { for (i = min; i < max; i++) { if (eptr >= md->end_subject || fc == *eptr) break; eptr++; } if (possessive) continue; while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35); if (rrc != MATCH_NOMATCH) RRETURN(rrc); eptr--; } } RRETURN(MATCH_NOMATCH); } } /* Control never gets here */ /* Match a single character type repeatedly; several different opcodes share code. This is very similar to the code for single characters, but we repeat it in the interests of efficiency. */ case OP_TYPEEXACT: min = max = GET2(ecode, 1); minimize = TRUE; ecode += 3; goto REPEATTYPE; case OP_TYPEUPTO: case OP_TYPEMINUPTO: min = 0; max = GET2(ecode, 1); minimize = *ecode == OP_TYPEMINUPTO; ecode += 3; goto REPEATTYPE; case OP_TYPEPOSSTAR: possessive = TRUE; min = 0; max = INT_MAX; ecode++; goto REPEATTYPE; case OP_TYPEPOSPLUS: possessive = TRUE; min = 1; max = INT_MAX; ecode++; goto REPEATTYPE; case OP_TYPEPOSQUERY: possessive = TRUE; min = 0; max = 1; ecode++; goto REPEATTYPE; case OP_TYPEPOSUPTO: possessive = TRUE; min = 0; max = GET2(ecode, 1); ecode += 3; goto REPEATTYPE; case OP_TYPESTAR: case OP_TYPEMINSTAR: case OP_TYPEPLUS: case OP_TYPEMINPLUS: case OP_TYPEQUERY: case OP_TYPEMINQUERY: c = *ecode++ - OP_TYPESTAR; minimize = (c & 1) != 0; min = rep_min[c]; /* Pick up values from tables; */ max = rep_max[c]; /* zero for max => infinity */ if (max == 0) max = INT_MAX; /* Common code for all repeated single character type matches. Note that in UTF-8 mode, '.' matches a character of any length, but for the other character types, the valid characters are all one-byte long. */ REPEATTYPE: ctype = *ecode++; /* Code for the character type */ #ifdef SUPPORT_UCP if (ctype == OP_PROP || ctype == OP_NOTPROP) { prop_fail_result = ctype == OP_NOTPROP; prop_type = *ecode++; prop_value = *ecode++; } else prop_type = -1; #endif /* First, ensure the minimum number of matches are present. Use inline code for maximizing the speed, and do the type test once at the start (i.e. keep it out of the loop). Also we can test that there are at least the minimum number of bytes before we start. This isn't as effective in UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that is tidier. Also separate the UCP code, which can be the same for both UTF-8 and single-bytes. */ if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH); if (min > 0) { #ifdef SUPPORT_UCP if (prop_type >= 0) { switch(prop_type) { case PT_ANY: if (prop_fail_result) RRETURN(MATCH_NOMATCH); for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); } break; case PT_LAMP: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu || prop_chartype == ucp_Ll || prop_chartype == ucp_Lt) == prop_fail_result) RRETURN(MATCH_NOMATCH); } break; case PT_GC: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } break; case PT_PC: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } break; case PT_SC: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } break; default: RRETURN(PCRE_ERROR_INTERNAL); } } /* Match extended Unicode sequences. We will get here only if the support is in the binary; otherwise a compile-time error occurs. */ else if (ctype == OP_EXTUNI) { for (i = 1; i <= min; i++) { GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { int len = 1; if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); } prop_category = UCD_CATEGORY(c); if (prop_category != ucp_M) break; eptr += len; } } } else #endif /* SUPPORT_UCP */ /* Handle all other cases when the coding is UTF-8 */ #ifdef SUPPORT_UTF8 if (utf8) switch(ctype) { case OP_ANY: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject || IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } break; case OP_ALLANY: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } break; case OP_ANYBYTE: eptr += min; break; case OP_ANYNL: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; case 0x000a: break; case 0x000b: case 0x000c: case 0x0085: case 0x2028: case 0x2029: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } } break; case OP_NOT_HSPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); switch(c) { default: break; case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ case 0x2000: /* EN QUAD */ case 0x2001: /* EM QUAD */ case 0x2002: /* EN SPACE */ case 0x2003: /* EM SPACE */ case 0x2004: /* THREE-PER-EM SPACE */ case 0x2005: /* FOUR-PER-EM SPACE */ case 0x2006: /* SIX-PER-EM SPACE */ case 0x2007: /* FIGURE SPACE */ case 0x2008: /* PUNCTUATION SPACE */ case 0x2009: /* THIN SPACE */ case 0x200A: /* HAIR SPACE */ case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ RRETURN(MATCH_NOMATCH); } } break; case OP_HSPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ case 0x2000: /* EN QUAD */ case 0x2001: /* EM QUAD */ case 0x2002: /* EN SPACE */ case 0x2003: /* EM SPACE */ case 0x2004: /* THREE-PER-EM SPACE */ case 0x2005: /* FOUR-PER-EM SPACE */ case 0x2006: /* SIX-PER-EM SPACE */ case 0x2007: /* FIGURE SPACE */ case 0x2008: /* PUNCTUATION SPACE */ case 0x2009: /* THIN SPACE */ case 0x200A: /* HAIR SPACE */ case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ break; } } break; case OP_NOT_VSPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); switch(c) { default: break; case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ RRETURN(MATCH_NOMATCH); } } break; case OP_VSPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ break; } } break; case OP_NOT_DIGIT: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); if (c < 128 && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); } break; case OP_DIGIT: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject || *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); /* No need to skip more bytes - we know it's a 1-byte character */ } break; case OP_NOT_WHITESPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject || (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)) RRETURN(MATCH_NOMATCH); while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); } break; case OP_WHITESPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject || *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); /* No need to skip more bytes - we know it's a 1-byte character */ } break; case OP_NOT_WORDCHAR: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject || (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)) RRETURN(MATCH_NOMATCH); while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80); } break; case OP_WORDCHAR: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject || *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); /* No need to skip more bytes - we know it's a 1-byte character */ } break; default: RRETURN(PCRE_ERROR_INTERNAL); } /* End switch(ctype) */ else #endif /* SUPPORT_UTF8 */ /* Code for the non-UTF-8 case for minimum matching of operators other than OP_PROP and OP_NOTPROP. We can assume that there are the minimum number of bytes present, as this was tested above. */ switch(ctype) { case OP_ANY: for (i = 1; i <= min; i++) { if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); eptr++; } break; case OP_ALLANY: eptr += min; break; case OP_ANYBYTE: eptr += min; break; /* Because of the CRLF case, we can't assume the minimum number of bytes are present in this case. */ case OP_ANYNL: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); switch(*eptr++) { default: RRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; case 0x000a: break; case 0x000b: case 0x000c: case 0x0085: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } } break; case OP_NOT_HSPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); switch(*eptr++) { default: break; case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ RRETURN(MATCH_NOMATCH); } } break; case OP_HSPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); switch(*eptr++) { default: RRETURN(MATCH_NOMATCH); case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ break; } } break; case OP_NOT_VSPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); switch(*eptr++) { default: break; case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ RRETURN(MATCH_NOMATCH); } } break; case OP_VSPACE: for (i = 1; i <= min; i++) { if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); switch(*eptr++) { default: RRETURN(MATCH_NOMATCH); case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ break; } } break; case OP_NOT_DIGIT: for (i = 1; i <= min; i++) if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); break; case OP_DIGIT: for (i = 1; i <= min; i++) if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); break; case OP_NOT_WHITESPACE: for (i = 1; i <= min; i++) if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); break; case OP_WHITESPACE: for (i = 1; i <= min; i++) if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); break; case OP_NOT_WORDCHAR: for (i = 1; i <= min; i++) if ((md->ctypes[*eptr++] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); break; case OP_WORDCHAR: for (i = 1; i <= min; i++) if ((md->ctypes[*eptr++] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); break; default: RRETURN(PCRE_ERROR_INTERNAL); } } /* If min = max, continue at the same level without recursing */ if (min == max) continue; /* If minimizing, we have to test the rest of the pattern before each subsequent match. Again, separate the UTF-8 case for speed, and also separate the UCP cases. */ if (minimize) { #ifdef SUPPORT_UCP if (prop_type >= 0) { switch(prop_type) { case PT_ANY: for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); if (prop_fail_result) RRETURN(MATCH_NOMATCH); } /* Control never gets here */ case PT_LAMP: for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu || prop_chartype == ucp_Ll || prop_chartype == ucp_Lt) == prop_fail_result) RRETURN(MATCH_NOMATCH); } /* Control never gets here */ case PT_GC: for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } /* Control never gets here */ case PT_PC: for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } /* Control never gets here */ case PT_SC: for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) RRETURN(MATCH_NOMATCH); } /* Control never gets here */ default: RRETURN(PCRE_ERROR_INTERNAL); } } /* Match extended Unicode sequences. We will get here only if the support is in the binary; otherwise a compile-time error occurs. */ else if (ctype == OP_EXTUNI) { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH); while (eptr < md->end_subject) { int len = 1; if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); } prop_category = UCD_CATEGORY(c); if (prop_category != ucp_M) break; eptr += len; } } } else #endif /* SUPPORT_UCP */ #ifdef SUPPORT_UTF8 /* UTF-8 mode */ if (utf8) { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject || (ctype == OP_ANY && IS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); GETCHARINC(c, eptr); switch(ctype) { case OP_ANY: /* This is the non-NL case */ case OP_ALLANY: case OP_ANYBYTE: break; case OP_ANYNL: switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; case 0x000a: break; case 0x000b: case 0x000c: case 0x0085: case 0x2028: case 0x2029: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } break; case OP_NOT_HSPACE: switch(c) { default: break; case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ case 0x2000: /* EN QUAD */ case 0x2001: /* EM QUAD */ case 0x2002: /* EN SPACE */ case 0x2003: /* EM SPACE */ case 0x2004: /* THREE-PER-EM SPACE */ case 0x2005: /* FOUR-PER-EM SPACE */ case 0x2006: /* SIX-PER-EM SPACE */ case 0x2007: /* FIGURE SPACE */ case 0x2008: /* PUNCTUATION SPACE */ case 0x2009: /* THIN SPACE */ case 0x200A: /* HAIR SPACE */ case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ RRETURN(MATCH_NOMATCH); } break; case OP_HSPACE: switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ case 0x2000: /* EN QUAD */ case 0x2001: /* EM QUAD */ case 0x2002: /* EN SPACE */ case 0x2003: /* EM SPACE */ case 0x2004: /* THREE-PER-EM SPACE */ case 0x2005: /* FOUR-PER-EM SPACE */ case 0x2006: /* SIX-PER-EM SPACE */ case 0x2007: /* FIGURE SPACE */ case 0x2008: /* PUNCTUATION SPACE */ case 0x2009: /* THIN SPACE */ case 0x200A: /* HAIR SPACE */ case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ break; } break; case OP_NOT_VSPACE: switch(c) { default: break; case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ RRETURN(MATCH_NOMATCH); } break; case OP_VSPACE: switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ break; } break; case OP_NOT_DIGIT: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); break; case OP_DIGIT: if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); break; case OP_NOT_WHITESPACE: if (c < 256 && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); break; case OP_WHITESPACE: if (c >= 256 || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); break; case OP_NOT_WORDCHAR: if (c < 256 && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); break; case OP_WORDCHAR: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); break; default: RRETURN(PCRE_ERROR_INTERNAL); } } } else #endif /* Not UTF-8 mode */ { for (fi = min;; fi++) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (fi >= max || eptr >= md->end_subject || (ctype == OP_ANY && IS_NEWLINE(eptr))) RRETURN(MATCH_NOMATCH); c = *eptr++; switch(ctype) { case OP_ANY: /* This is the non-NL case */ case OP_ALLANY: case OP_ANYBYTE: break; case OP_ANYNL: switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x000d: if (eptr < md->end_subject && *eptr == 0x0a) eptr++; break; case 0x000a: break; case 0x000b: case 0x000c: case 0x0085: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH); break; } break; case OP_NOT_HSPACE: switch(c) { default: break; case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ RRETURN(MATCH_NOMATCH); } break; case OP_HSPACE: switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ break; } break; case OP_NOT_VSPACE: switch(c) { default: break; case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ RRETURN(MATCH_NOMATCH); } break; case OP_VSPACE: switch(c) { default: RRETURN(MATCH_NOMATCH); case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ break; } break; case OP_NOT_DIGIT: if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH); break; case OP_DIGIT: if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH); break; case OP_NOT_WHITESPACE: if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH); break; case OP_WHITESPACE: if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH); break; case OP_NOT_WORDCHAR: if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH); break; case OP_WORDCHAR: if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH); break; default: RRETURN(PCRE_ERROR_INTERNAL); } } } /* Control never gets here */ } /* If maximizing, it is worth using inline code for speed, doing the type test once at the start (i.e. keep it out of the loop). Again, keep the UTF-8 and UCP stuff separate. */ else { pp = eptr; /* Remember where we started */ #ifdef SUPPORT_UCP if (prop_type >= 0) { switch(prop_type) { case PT_ANY: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (prop_fail_result) break; eptr+= len; } break; case PT_LAMP: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == ucp_Lu || prop_chartype == ucp_Ll || prop_chartype == ucp_Lt) == prop_fail_result) break; eptr+= len; } break; case PT_GC: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); prop_category = UCD_CATEGORY(c); if ((prop_category == prop_value) == prop_fail_result) break; eptr+= len; } break; case PT_PC: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); prop_chartype = UCD_CHARTYPE(c); if ((prop_chartype == prop_value) == prop_fail_result) break; eptr+= len; } break; case PT_SC: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); prop_script = UCD_SCRIPT(c); if ((prop_script == prop_value) == prop_fail_result) break; eptr+= len; } break; } /* eptr is now past the end of the maximum run */ if (possessive) continue; for(;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; /* Stop if tried at original pos */ if (utf8) BACKCHAR(eptr); } } /* Match extended Unicode sequences. We will get here only if the support is in the binary; otherwise a compile-time error occurs. */ else if (ctype == OP_EXTUNI) { for (i = min; i < max; i++) { if (eptr >= md->end_subject) break; GETCHARINCTEST(c, eptr); prop_category = UCD_CATEGORY(c); if (prop_category == ucp_M) break; while (eptr < md->end_subject) { int len = 1; if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); } prop_category = UCD_CATEGORY(c); if (prop_category != ucp_M) break; eptr += len; } } /* eptr is now past the end of the maximum run */ if (possessive) continue; for(;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; /* Stop if tried at original pos */ for (;;) /* Move back over one extended */ { int len = 1; if (!utf8) c = *eptr; else { BACKCHAR(eptr); GETCHARLEN(c, eptr, len); } prop_category = UCD_CATEGORY(c); if (prop_category != ucp_M) break; eptr--; } } } else #endif /* SUPPORT_UCP */ #ifdef SUPPORT_UTF8 /* UTF-8 mode */ if (utf8) { switch(ctype) { case OP_ANY: if (max < INT_MAX) { for (i = min; i < max; i++) { if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } } /* Handle unlimited UTF-8 repeat */ else { for (i = min; i < max; i++) { if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } } break; case OP_ALLANY: if (max < INT_MAX) { for (i = min; i < max; i++) { if (eptr >= md->end_subject) break; eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; } } else eptr = md->end_subject; /* Unlimited UTF-8 repeat */ break; /* The byte case is the same as non-UTF8 */ case OP_ANYBYTE: c = max - min; if (c > (unsigned int)(md->end_subject - eptr)) c = md->end_subject - eptr; eptr += c; break; case OP_ANYNL: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (c == 0x000d) { if (++eptr >= md->end_subject) break; if (*eptr == 0x000a) eptr++; } else { if (c != 0x000a && (md->bsr_anycrlf || (c != 0x000b && c != 0x000c && c != 0x0085 && c != 0x2028 && c != 0x2029))) break; eptr += len; } } break; case OP_NOT_HSPACE: case OP_HSPACE: for (i = min; i < max; i++) { BOOL gotspace; int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); switch(c) { default: gotspace = FALSE; break; case 0x09: /* HT */ case 0x20: /* SPACE */ case 0xa0: /* NBSP */ case 0x1680: /* OGHAM SPACE MARK */ case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */ case 0x2000: /* EN QUAD */ case 0x2001: /* EM QUAD */ case 0x2002: /* EN SPACE */ case 0x2003: /* EM SPACE */ case 0x2004: /* THREE-PER-EM SPACE */ case 0x2005: /* FOUR-PER-EM SPACE */ case 0x2006: /* SIX-PER-EM SPACE */ case 0x2007: /* FIGURE SPACE */ case 0x2008: /* PUNCTUATION SPACE */ case 0x2009: /* THIN SPACE */ case 0x200A: /* HAIR SPACE */ case 0x202f: /* NARROW NO-BREAK SPACE */ case 0x205f: /* MEDIUM MATHEMATICAL SPACE */ case 0x3000: /* IDEOGRAPHIC SPACE */ gotspace = TRUE; break; } if (gotspace == (ctype == OP_NOT_HSPACE)) break; eptr += len; } break; case OP_NOT_VSPACE: case OP_VSPACE: for (i = min; i < max; i++) { BOOL gotspace; int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); switch(c) { default: gotspace = FALSE; break; case 0x0a: /* LF */ case 0x0b: /* VT */ case 0x0c: /* FF */ case 0x0d: /* CR */ case 0x85: /* NEL */ case 0x2028: /* LINE SEPARATOR */ case 0x2029: /* PARAGRAPH SEPARATOR */ gotspace = TRUE; break; } if (gotspace == (ctype == OP_NOT_VSPACE)) break; eptr += len; } break; case OP_NOT_DIGIT: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break; eptr+= len; } break; case OP_DIGIT: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break; eptr+= len; } break; case OP_NOT_WHITESPACE: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break; eptr+= len; } break; case OP_WHITESPACE: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break; eptr+= len; } break; case OP_NOT_WORDCHAR: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break; eptr+= len; } break; case OP_WORDCHAR: for (i = min; i < max; i++) { int len = 1; if (eptr >= md->end_subject) break; GETCHARLEN(c, eptr, len); if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break; eptr+= len; } break; default: RRETURN(PCRE_ERROR_INTERNAL); } /* eptr is now past the end of the maximum run */ if (possessive) continue; for(;;) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46); if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (eptr-- == pp) break; /* Stop if tried at original pos */ BACKCHAR(eptr); } } else #endif /* SUPPORT_UTF8 */ /* Not UTF-8 mode */ { switch(ctype) { case OP_ANY: for (i = min; i < max; i++) { if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; eptr++; } break; case OP_ALLANY: case OP_ANYBYTE: c = max - min; if (c > (unsigned int)(md->end_subject - eptr)) c = md->end_subject - eptr; eptr += c; break; case OP_ANYNL: for (i = min; i < max; i++) { if (eptr >= md->end_subject) break; c = *eptr; if (c == 0x000d) { if (++eptr >= md->end_subject) break; if (*eptr == 0x000a) eptr++; } else { if (c != 0x000a && (md->bsr_anycrlf || (c != 0x000b && c != 0x000c && c != 0x0085))) break; eptr++; } } break; case OP_NOT_HSPACE: for (i = min; i < max; i++) { if (eptr >= md->end_subject) break; c = *eptr; if (c == 0x09 || c == 0x20 || c == 0xa0) break; eptr++; } break; case OP_HSPACE: for (i = min; i < max; i++) { if (eptr >= md->end_subject) break; c = *eptr; if (c != 0x09 && c != 0x20 && c != 0xa0) break; eptr++; } break; case OP_NOT_VSPACE: for (i = min; i < max; i++) { if (eptr >= md->end_subject) break; c = *eptr; if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85) break; eptr++; } break; case OP_VSPACE: for (i = min; i < max; i++) { if (eptr >= md->end_subject) break; c = *eptr; if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85) break; eptr++; } break; case OP_NOT_DIGIT: for (i = min; i < max; i++) { if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0) break; eptr++; } break; case OP_DIGIT: for (i = min; i < max; i++) { if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0) break; eptr++; } break; case OP_NOT_WHITESPACE: for (i = min; i < max; i++) { if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0) break; eptr++; } break; case OP_WHITESPACE: for (i = min; i < max; i++) { if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0) break; eptr++; } break; case OP_NOT_WORDCHAR: for (i = min; i < max; i++) { if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0) break; eptr++; } break; case OP_WORDCHAR: for (i = min; i < max; i++) { if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0) break; eptr++; } break; default: RRETURN(PCRE_ERROR_INTERNAL); } /* eptr is now past the end of the maximum run */ if (possessive) continue; while (eptr >= pp) { RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47); eptr--; if (rrc != MATCH_NOMATCH) RRETURN(rrc); } } /* Get here if we can't make it match with any permitted repetitions */ RRETURN(MATCH_NOMATCH); } /* Control never gets here */ /* There's been some horrible disaster. Arrival here can only mean there is something seriously wrong in the code above or the OP_xxx definitions. */ default: DPRINTF(("Unknown opcode %d\n", *ecode)); RRETURN(PCRE_ERROR_UNKNOWN_OPCODE); } /* Do not stick any code in here without much thought; it is assumed that "continue" in the code above comes out to here to repeat the main loop. */ } /* End of main loop */ /* Control never reaches here */ /* When compiling to use the heap rather than the stack for recursive calls to match(), the RRETURN() macro jumps here. The number that is saved in frame->Xwhere indicates which label we actually want to return to. */ #ifdef NO_RECURSE #define LBL(val) case val: goto L_RM##val; HEAP_RETURN: switch (frame->Xwhere) { LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8) LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17) LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33) LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52) LBL(53) LBL(54) #ifdef SUPPORT_UTF8 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30) LBL(32) LBL(34) LBL(42) LBL(46) #ifdef SUPPORT_UCP LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45) #endif /* SUPPORT_UCP */ #endif /* SUPPORT_UTF8 */ default: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere)); return PCRE_ERROR_INTERNAL; } #undef LBL #endif /* NO_RECURSE */ } /*************************************************************************** **************************************************************************** RECURSION IN THE match() FUNCTION Undefine all the macros that were defined above to handle this. */ #ifdef NO_RECURSE #undef eptr #undef ecode #undef mstart #undef offset_top #undef ims #undef eptrb #undef flags #undef callpat #undef charptr #undef data #undef next #undef pp #undef prev #undef saved_eptr #undef new_recursive #undef cur_is_word #undef condition #undef prev_is_word #undef original_ims #undef ctype #undef length #undef max #undef min #undef number #undef offset #undef op #undef save_capture_last #undef save_offset1 #undef save_offset2 #undef save_offset3 #undef stacksave #undef newptrb #endif /* These two are defined as macros in both cases */ #undef fc #undef fi /*************************************************************************** ***************************************************************************/ | pcreexec.c | 427 |
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION | pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, int offsetcount)
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, PCRE_SPTR subject, int length, int start_offset, int options, int *offsets, int offsetcount) { int rc, resetcount, ocount; int first_byte = -1; int req_byte = -1; int req_byte2 = -1; int newline; unsigned long int ims; BOOL using_temporary_offsets = FALSE; BOOL anchored; BOOL startline; BOOL firstline; BOOL first_byte_caseless = FALSE; BOOL req_byte_caseless = FALSE; BOOL utf8; match_data match_block; match_data *md = &match_block; const uschar *tables; const uschar *start_bits = NULL; USPTR start_match = (USPTR)subject + start_offset; USPTR end_subject; USPTR req_byte_ptr = start_match - 1; pcre_study_data internal_study; const pcre_study_data *study; real_pcre internal_re; const real_pcre *external_re = (const real_pcre *)argument_re; const real_pcre *re = external_re; /* Plausibility checks */ if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION; if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; if (offsetcount < 0) return PCRE_ERROR_BADCOUNT; /* Fish out the optional data from the extra_data structure, first setting the default values. */ study = NULL; md->match_limit = MATCH_LIMIT; md->match_limit_recursion = MATCH_LIMIT_RECURSION; md->callout_data = NULL; /* The table pointer is always in native byte order. */ tables = external_re->tables; if (extra_data != NULL) { register unsigned int flags = extra_data->flags; if ((flags & PCRE_EXTRA_STUDY_DATA) != 0) study = (const pcre_study_data *)extra_data->study_data; if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) md->match_limit = extra_data->match_limit; if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0) md->match_limit_recursion = extra_data->match_limit_recursion; if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0) md->callout_data = extra_data->callout_data; if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; } /* If the exec call supplied NULL for tables, use the inbuilt ones. This is a feature that makes it possible to save compiled regex and re-use them in other programs later. */ if (tables == NULL) tables = _pcre_default_tables; /* Check that the first field in the block is the magic number. If it is not, test for a regex that was compiled on a host of opposite endianness. If this is the case, flipped values are put in internal_re and internal_study if there was study data too. */ if (re->magic_number != MAGIC_NUMBER) { re = _pcre_try_flipped(re, &internal_re, study, &internal_study); if (re == NULL) return PCRE_ERROR_BADMAGIC; if (study != NULL) study = &internal_study; } /* Set up other data */ anchored = ((re->options | options) & PCRE_ANCHORED) != 0; startline = (re->flags & PCRE_STARTLINE) != 0; firstline = (re->options & PCRE_FIRSTLINE) != 0; /* The code starts after the real_pcre block and the capture name table. */ md->start_code = (const uschar *)external_re + re->name_table_offset + re->name_count * re->name_entry_size; md->start_subject = (USPTR)subject; md->start_offset = start_offset; md->end_subject = md->start_subject + length; end_subject = md->end_subject; md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0; md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0; md->notbol = (options & PCRE_NOTBOL) != 0; md->noteol = (options & PCRE_NOTEOL) != 0; md->notempty = (options & PCRE_NOTEMPTY) != 0; md->partial = (options & PCRE_PARTIAL) != 0; md->hitend = FALSE; md->recursive = NULL; /* No recursion at top level */ md->lcc = tables + lcc_offset; md->ctypes = tables + ctypes_offset; /* Handle different \R options. */ switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) { case 0: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0) md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0; else #ifdef BSR_ANYCRLF md->bsr_anycrlf = TRUE; #else md->bsr_anycrlf = FALSE; #endif break; case PCRE_BSR_ANYCRLF: md->bsr_anycrlf = TRUE; break; case PCRE_BSR_UNICODE: md->bsr_anycrlf = FALSE; break; default: return PCRE_ERROR_BADNEWLINE; } /* Handle different types of newline. The three bits give eight cases. If nothing is set at run time, whatever was used at compile time applies. */ switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) & PCRE_NEWLINE_BITS) { case 0: newline = NEWLINE; break; /* Compile-time default */ case PCRE_NEWLINE_CR: newline = '\r'; break; case PCRE_NEWLINE_LF: newline = '\n'; break; case PCRE_NEWLINE_CR+ PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; case PCRE_NEWLINE_ANY: newline = -1; break; case PCRE_NEWLINE_ANYCRLF: newline = -2; break; default: return PCRE_ERROR_BADNEWLINE; } if (newline == -2) { md->nltype = NLTYPE_ANYCRLF; } else if (newline < 0) { md->nltype = NLTYPE_ANY; } else { md->nltype = NLTYPE_FIXED; if (newline > 255) { md->nllen = 2; md->nl[0] = (newline >> 8) & 255; md->nl[1] = newline & 255; } else { md->nllen = 1; md->nl[0] = newline; } } /* Partial matching is supported only for a restricted set of regexes at the moment. */ if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0) return PCRE_ERROR_BADPARTIAL; /* Check a UTF-8 string if required. Unfortunately there's no way of passing back the character offset. */ #ifdef SUPPORT_UTF8 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0) { if (_pcre_valid_utf8((uschar *)subject, length) >= 0) return PCRE_ERROR_BADUTF8; if (start_offset > 0 && start_offset < length) { int tb = ((uschar *)subject)[start_offset]; if (tb > 127) { tb &= 0xc0; if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET; } } } #endif /* The ims options can vary during the matching as a result of the presence of (?ims) items in the pattern. They are kept in a local variable so that restoring at the exit of a group is easy. */ ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL); /* If the expression has got more back references than the offsets supplied can hold, we get a temporary chunk of working store to use during the matching. Otherwise, we can use the vector supplied, rounding down its size to a multiple of 3. */ ocount = offsetcount - (offsetcount % 3); if (re->top_backref > 0 && re->top_backref >= ocount/3) { ocount = re->top_backref * 3 + 3; md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int)); if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY; using_temporary_offsets = TRUE; DPRINTF(("Got memory to hold back references\n")); } else md->offset_vector = offsets; md->offset_end = ocount; md->offset_max = (2*ocount)/3; md->offset_overflow = FALSE; md->capture_last = -1; /* Compute the minimum number of offsets that we need to reset each time. Doing this makes a huge difference to execution time when there aren't many brackets in the pattern. */ resetcount = 2 + re->top_bracket * 2; if (resetcount > offsetcount) resetcount = ocount; /* Reset the working variable associated with each extraction. These should never be used unless previously set, but they get saved and restored, and so we initialize them to avoid reading uninitialized locations. */ if (md->offset_vector != NULL) { register int *iptr = md->offset_vector + ocount; register int *iend = iptr - resetcount/2 + 1; while (--iptr >= iend) *iptr = -1; } /* Set up the first character to match, if available. The first_byte value is never set for an anchored regular expression, but the anchoring may be forced at run time, so we have to test for anchoring. The first char may be unset for an unanchored pattern, of course. If there's no first char and the pattern was studied, there may be a bitmap of possible first characters. */ if (!anchored) { if ((re->flags & PCRE_FIRSTSET) != 0) { first_byte = re->first_byte & 255; if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE) first_byte = md->lcc[first_byte]; } else if (!startline && study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0) start_bits = study->start_bits; } /* For anchored or unanchored matches, there may be a "last known required character" set. */ if ((re->flags & PCRE_REQCHSET) != 0) { req_byte = re->req_byte & 255; req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0; req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */ } /* ==========================================================================*/ /* Loop for handling unanchored repeated matching attempts; for anchored regexs the loop runs just once. */ for(;;) { USPTR save_end_subject = end_subject; USPTR new_start_match; /* Reset the maximum number of extractions we might see. */ if (md->offset_vector != NULL) { register int *iptr = md->offset_vector; register int *iend = iptr + resetcount; while (iptr < iend) *iptr++ = -1; } /* Advance to a unique first char if possible. If firstline is TRUE, the start of the match is constrained to the first line of a multiline string. That is, the match must be before or at the first newline. Implement this by temporarily adjusting end_subject so that we stop scanning at a newline. If the match fails at the newline, later code breaks this loop. */ if (firstline) { USPTR t = start_match; #ifdef SUPPORT_UTF8 if (utf8) { while (t < md->end_subject && !IS_NEWLINE(t)) { t++; while (t < end_subject && (*t & 0xc0) == 0x80) t++; } } else #endif while (t < md->end_subject && !IS_NEWLINE(t)) t++; end_subject = t; } /* Now advance to a unique first byte if there is one. */ if (first_byte >= 0) { if (first_byte_caseless) while (start_match < end_subject && md->lcc[*start_match] != first_byte) start_match++; else while (start_match < end_subject && *start_match != first_byte) start_match++; } /* Or to just after a linebreak for a multiline match */ else if (startline) { if (start_match > md->start_subject + start_offset) { #ifdef SUPPORT_UTF8 if (utf8) { while (start_match < end_subject && !WAS_NEWLINE(start_match)) { start_match++; while(start_match < end_subject && (*start_match & 0xc0) == 0x80) start_match++; } } else #endif while (start_match < end_subject && !WAS_NEWLINE(start_match)) start_match++; /* If we have just passed a CR and the newline option is ANY or ANYCRLF, and we are now at a LF, advance the match position by one more character. */ if (start_match[-1] == '\r' && (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) && start_match < end_subject && *start_match == '\n') start_match++; } } /* Or to a non-unique first byte after study */ else if (start_bits != NULL) { while (start_match < end_subject) { register unsigned int c = *start_match; if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break; } } /* Restore fudged end_subject */ end_subject = save_end_subject; #ifdef DEBUG /* Sigh. Some compilers never learn. */ printf(">>>> Match against: "); pchars(start_match, end_subject - start_match, TRUE, md); printf("\n"); #endif /* If req_byte is set, we know that that character must appear in the subject for the match to succeed. If the first character is set, req_byte must be later in the subject; otherwise the test starts at the match point. This optimization can save a huge amount of backtracking in patterns with nested unlimited repeats that aren't going to match. Writing separate code for cased/caseless versions makes it go faster, as does using an autoincrement and backing off on a match. HOWEVER: when the subject string is very, very long, searching to its end can take a long time, and give bad performance on quite ordinary patterns. This showed up when somebody was matching something like /^\d+C/ on a 32-megabyte string... so we don't do this when the string is sufficiently long. ALSO: this processing is disabled when partial matching is requested. */ if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX && !md->partial) { register USPTR p = start_match + ((first_byte >= 0)? 1 : 0); /* We don't need to repeat the search if we haven't yet reached the place we found it at last time. */ if (p > req_byte_ptr) { if (req_byte_caseless) { while (p < end_subject) { register int pp = *p++; if (pp == req_byte || pp == req_byte2) { p--; break; } } } else { while (p < end_subject) { if (*p++ == req_byte) { p--; break; } } } /* If we can't find the required character, break the matching loop, forcing a match failure. */ if (p >= end_subject) { rc = MATCH_NOMATCH; break; } /* If we have found the required character, save the point where we found it, so that we don't search again next time round the loop if the start hasn't passed this character yet. */ req_byte_ptr = p; } } /* OK, we can now run the match. */ md->start_match_ptr = start_match; md->match_call_count = 0; rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0); switch(rc) { /* NOMATCH and PRUNE advance by one character. THEN at this level acts exactly like PRUNE. */ case MATCH_NOMATCH: case MATCH_PRUNE: case MATCH_THEN: new_start_match = start_match + 1; #ifdef SUPPORT_UTF8 if (utf8) while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80) new_start_match++; #endif break; /* SKIP passes back the next starting point explicitly. */ case MATCH_SKIP: new_start_match = md->start_match_ptr; break; /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */ case MATCH_COMMIT: rc = MATCH_NOMATCH; goto ENDLOOP; /* Any other return is some kind of error. */ default: goto ENDLOOP; } /* Control reaches here for the various types of "no match at this point" result. Reset the code to MATCH_NOMATCH for subsequent checking. */ rc = MATCH_NOMATCH; /* If PCRE_FIRSTLINE is set, the match must happen before or at the first newline in the subject (though it may continue over the newline). Therefore, if we have just failed to match, starting at a newline, do not continue. */ if (firstline && IS_NEWLINE(start_match)) break; /* Advance to new matching position */ start_match = new_start_match; /* Break the loop if the pattern is anchored or if we have passed the end of the subject. */ if (anchored || start_match > end_subject) break; /* If we have just passed a CR and we are now at a LF, and the pattern does not contain any explicit matches for \r or \n, and the newline option is CRLF or ANY or ANYCRLF, advance the match position by one more character. */ if (start_match[-1] == '\r' && start_match < end_subject && *start_match == '\n' && (re->flags & PCRE_HASCRORLF) == 0 && (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF || md->nllen == 2)) start_match++; } /* End of for(;;) "bumpalong" loop */ /* ==========================================================================*/ /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping conditions is true: (1) The pattern is anchored or the match was failed by (*COMMIT); (2) We are past the end of the subject; (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because this option requests that a match occur at or before the first newline in the subject. When we have a match and the offset vector is big enough to deal with any backreferences, captured substring offsets will already be set up. In the case where we had to get some local store to hold offsets for backreference processing, copy those that we can. In this case there need not be overflow if certain parts of the pattern were not used, even though there are more capturing parentheses than vector slots. */ ENDLOOP: if (rc == MATCH_MATCH) { if (using_temporary_offsets) { if (offsetcount >= 4) { memcpy(offsets + 2, md->offset_vector + 2, (offsetcount - 2) * sizeof(int)); DPRINTF(("Copied offsets from temporary memory\n")); } if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE; DPRINTF(("Freeing temporary memory\n")); (pcre_free)(md->offset_vector); } /* Set the return code to the number of captured strings, or 0 if there are too many to fit into the vector. */ rc = md->offset_overflow? 0 : md->end_offset_top/2; /* If there is space, set up the whole thing as substring 0. The value of md->start_match_ptr might be modified if \K was encountered on the success matching path. */ if (offsetcount < 2) rc = 0; else { offsets[0] = md->start_match_ptr - md->start_subject; offsets[1] = md->end_match_ptr - md->start_subject; } DPRINTF((">>>> returning %d\n", rc)); return rc; } /* Control gets here if there has been an error, or if the overall match attempt has failed at all permitted starting positions. */ if (using_temporary_offsets) { DPRINTF(("Freeing temporary memory\n")); (pcre_free)(md->offset_vector); } if (rc != MATCH_NOMATCH) { DPRINTF((">>>> error: returning %d\n", rc)); return rc; } else if (md->partial && md->hitend) { DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n")); return PCRE_ERROR_PARTIAL; } else { DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n")); return PCRE_ERROR_NOMATCH; } } | pcreexec.c | 4388 |
pcrefinf.c | |||
Type | Function | Source | Line |
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION | pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what, void *where)
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what, void *where) { real_pcre internal_re; pcre_study_data internal_study; const real_pcre *re = (const real_pcre *)argument_re; const pcre_study_data *study = NULL; if (re == NULL || where == NULL) return PCRE_ERROR_NULL; if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0) study = (const pcre_study_data *)extra_data->study_data; if (re->magic_number != MAGIC_NUMBER) { re = _pcre_try_flipped(re, &internal_re, study, &internal_study); if (re == NULL) return PCRE_ERROR_BADMAGIC; if (study != NULL) study = &internal_study; } switch (what) { case PCRE_INFO_OPTIONS: *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS; break; case PCRE_INFO_SIZE: *((size_t *)where) = re->size; break; case PCRE_INFO_STUDYSIZE: *((size_t *)where) = (study == NULL)? 0 : study->size; break; case PCRE_INFO_CAPTURECOUNT: *((int *)where) = re->top_bracket; break; case PCRE_INFO_BACKREFMAX: *((int *)where) = re->top_backref; break; case PCRE_INFO_FIRSTBYTE: *((int *)where) = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte : ((re->flags & PCRE_STARTLINE) != 0)? -1 : -2; break; /* Make sure we pass back the pointer to the bit vector in the external block, not the internal copy (with flipped integer fields). */ case PCRE_INFO_FIRSTTABLE: *((const uschar **)where) = (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)? ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL; break; case PCRE_INFO_LASTLITERAL: *((int *)where) = ((re->flags & PCRE_REQCHSET) != 0)? re->req_byte : -1; break; case PCRE_INFO_NAMEENTRYSIZE: *((int *)where) = re->name_entry_size; break; case PCRE_INFO_NAMECOUNT: *((int *)where) = re->name_count; break; case PCRE_INFO_NAMETABLE: *((const uschar **)where) = (const uschar *)re + re->name_table_offset; break; case PCRE_INFO_DEFAULT_TABLES: *((const uschar **)where) = (const uschar *)(_pcre_default_tables); break; case PCRE_INFO_OKPARTIAL: *((int *)where) = (re->flags & PCRE_NOPARTIAL) == 0; break; case PCRE_INFO_JCHANGED: *((int *)where) = (re->flags & PCRE_JCHANGED) != 0; break; case PCRE_INFO_HASCRORLF: *((int *)where) = (re->flags & PCRE_HASCRORLF) != 0; break; default: return PCRE_ERROR_BADOPTION; } return 0; } | pcrefinf.c | 68 |
pcreget.c | |||
Type | Function | Source | Line |
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION | pcre_get_stringnumber(const pcre *code, const char *stringname)
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_get_stringnumber(const pcre *code, const char *stringname) { int rc; int entrysize; int top, bot; uschar *nametable; if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0) return rc; if (top <= 0) return PCRE_ERROR_NOSUBSTRING; if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0) return rc; if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0) return rc; bot = 0; while (top > bot) { int mid = (top + bot) / 2; uschar *entry = nametable + entrysize*mid; int c = strcmp(stringname, (char *)(entry + 2)); if (c == 0) return (entry[0] << 8) + entry[1]; if (c > 0) bot = mid + 1; else top = mid; } return PCRE_ERROR_NOSUBSTRING; } | pcreget.c | 68 |
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION | pcre_get_stringtable_entries(const pcre *code, const char *stringname, char **firstptr, char **lastptr)
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_get_stringtable_entries(const pcre *code, const char *stringname, char **firstptr, char **lastptr) { int rc; int entrysize; int top, bot; uschar *nametable, *lastentry; if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMECOUNT, &top)) != 0) return rc; if (top <= 0) return PCRE_ERROR_NOSUBSTRING; if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMEENTRYSIZE, &entrysize)) != 0) return rc; if ((rc = pcre_fullinfo(code, NULL, PCRE_INFO_NAMETABLE, &nametable)) != 0) return rc; lastentry = nametable + entrysize * (top - 1); bot = 0; while (top > bot) { int mid = (top + bot) / 2; uschar *entry = nametable + entrysize*mid; int c = strcmp(stringname, (char *)(entry + 2)); if (c == 0) { uschar *first = entry; uschar *last = entry; while (first > nametable) { if (strcmp(stringname, (char *)(first - entrysize + 2)) != 0) break; first -= entrysize; } while (last < lastentry) { if (strcmp(stringname, (char *)(last + entrysize + 2)) != 0) break; last += entrysize; } *firstptr = (char *)first; *lastptr = (char *)last; return entrysize; } if (c > 0) bot = mid + 1; else top = mid; } return PCRE_ERROR_NOSUBSTRING; } | pcreget.c | 117 |
STATIC INT | get_first_set(const pcre *code, const char *stringname, int *ovector)
static int get_first_set(const pcre *code, const char *stringname, int *ovector) { const real_pcre *re = (const real_pcre *)code; int entrysize; char *first, *last; uschar *entry; if ((re->options & PCRE_DUPNAMES) == 0 && (re->flags & PCRE_JCHANGED) == 0) return pcre_get_stringnumber(code, stringname); entrysize = pcre_get_stringtable_entries(code, stringname, &first, &last); if (entrysize <= 0) return entrysize; for (entry = (uschar *)first; entry <= (uschar *)last; entry += entrysize) { int n = (entry[0] << 8) + entry[1]; if (ovector[n*2] >= 0) return n; } return (first[0] << 8) + first[1]; } | pcreget.c | 185 |
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION | pcre_copy_substring(const char *subject, int *ovector, int stringcount, int stringnumber, char *buffer, int size)
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_copy_substring(const char *subject, int *ovector, int stringcount, int stringnumber, char *buffer, int size) { int yield; if (stringnumber < 0 || stringnumber >= stringcount) return PCRE_ERROR_NOSUBSTRING; stringnumber *= 2; yield = ovector[stringnumber+1] - ovector[stringnumber]; if (size < yield + 1) return PCRE_ERROR_NOMEMORY; memcpy(buffer, subject + ovector[stringnumber], yield); buffer[yield] = 0; return yield; } | pcreget.c | 234 |
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION | pcre_copy_named_substring(const pcre *code, const char *subject, int *ovector, int stringcount, const char *stringname, char *buffer, int size)
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_copy_named_substring(const pcre *code, const char *subject, int *ovector, int stringcount, const char *stringname, char *buffer, int size) { int n = get_first_set(code, stringname, ovector); if (n <= 0) return n; return pcre_copy_substring(subject, ovector, stringcount, n, buffer, size); } | pcreget.c | 279 |
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION | pcre_get_substring_list(const char *subject, int *ovector, int stringcount, const char ***listptr)
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_get_substring_list(const char *subject, int *ovector, int stringcount, const char ***listptr) { int i; int size = sizeof(char *); int double_count = stringcount * 2; char **stringlist; char *p; for (i = 0; i < double_count; i += 2) size += sizeof(char *) + ovector[i+1] - ovector[i] + 1; stringlist = (char **)(pcre_malloc)(size); if (stringlist == NULL) return PCRE_ERROR_NOMEMORY; *listptr = (const char **)stringlist; p = (char *)(stringlist + stringcount + 1); for (i = 0; i < double_count; i += 2) { int len = ovector[i+1] - ovector[i]; memcpy(p, subject + ovector[i], len); *stringlist++ = p; p += len; *p++ = 0; } *stringlist = NULL; return 0; } | pcreget.c | 311 |
PCRE_EXP_DEFN VOID PCRE_CALL_CONVENTION | pcre_free_substring_list(const char **pointer)
PCRE_EXP_DEFN void PCRE_CALL_CONVENTION pcre_free_substring_list(const char **pointer) { (pcre_free)((void *)pointer); } | pcreget.c | 356 |
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION | pcre_get_substring(const char *subject, int *ovector, int stringcount, int stringnumber, const char **stringptr)
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_get_substring(const char *subject, int *ovector, int stringcount, int stringnumber, const char **stringptr) { int yield; char *substring; if (stringnumber < 0 || stringnumber >= stringcount) return PCRE_ERROR_NOSUBSTRING; stringnumber *= 2; yield = ovector[stringnumber+1] - ovector[stringnumber]; substring = (char *)(pcre_malloc)(yield + 1); if (substring == NULL) return PCRE_ERROR_NOMEMORY; memcpy(substring, subject + ovector[stringnumber], yield); substring[yield] = 0; *stringptr = substring; return yield; } | pcreget.c | 389 |
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION | pcre_get_named_substring(const pcre *code, const char *subject, int *ovector, int stringcount, const char *stringname, const char **stringptr)
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_get_named_substring(const pcre *code, const char *subject, int *ovector, int stringcount, const char *stringname, const char **stringptr) { int n = get_first_set(code, stringname, ovector); if (n <= 0) return n; return pcre_get_substring(subject, ovector, stringcount, n, stringptr); } | pcreget.c | 436 |
PCRE_EXP_DEFN VOID PCRE_CALL_CONVENTION | pcre_free_substring(const char *pointer)
PCRE_EXP_DEFN void PCRE_CALL_CONVENTION pcre_free_substring(const char *pointer) { (pcre_free)((void *)pointer); } | pcreget.c | 459 |
pcreinfo.c | |||
Type | Function | Source | Line |
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION | pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_info(const pcre *argument_re, int *optptr, int *first_byte) { real_pcre internal_re; const real_pcre *re = (const real_pcre *)argument_re; if (re == NULL) return PCRE_ERROR_NULL; if (re->magic_number != MAGIC_NUMBER) { re = _pcre_try_flipped(re, &internal_re, NULL, NULL); if (re == NULL) return PCRE_ERROR_BADMAGIC; } if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS); if (first_byte != NULL) *first_byte = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte : ((re->flags & PCRE_STARTLINE) != 0)? -1 : -2; return re->top_bracket; } | pcreinfo.c | 75 |
pcremktb.c | |||
Type | Function | Source | Line |
CONST UNSIGNED CHAR * | pcre_maketables(void)
const unsigned char * pcre_maketables(void) { unsigned char *yield, *p; int i; #ifndef DFTABLES yield = (unsigned char*)(pcre_malloc)(tables_length); #else yield = (unsigned char*)malloc(tables_length); #endif if (yield == NULL) return NULL; p = yield; /* First comes the lower casing table */ for (i = 0; i < 256; i++) *p++ = tolower(i); /* Next the case-flipping table */ for (i = 0; i < 256; i++) *p++ = islower(i)? toupper(i) : tolower(i); /* Then the character class tables. Don't try to be clever and save effort on exclusive ones - in some locales things may be different. Note that the table for "space" includes everything "isspace" gives, including VT in the default locale. This makes it work for the POSIX class [:space:]. Note also that it is possible for a character to be alnum or alpha without being lower or upper, such as "male and female ordinals" (\xAA and \xBA) in the fr_FR locale (at least under Debian Linux's locales as of 12/2005). So we must test for alnum specially. */ memset(p, 0, cbit_length); for (i = 0; i < 256; i++) { if (isdigit(i)) p[cbit_digit + i/8] |= 1 << (i&7); if (isupper(i)) p[cbit_upper + i/8] |= 1 << (i&7); if (islower(i)) p[cbit_lower + i/8] |= 1 << (i&7); if (isalnum(i)) p[cbit_word + i/8] |= 1 << (i&7); if (i == '_') p[cbit_word + i/8] |= 1 << (i&7); if (isspace(i)) p[cbit_space + i/8] |= 1 << (i&7); if (isxdigit(i))p[cbit_xdigit + i/8] |= 1 << (i&7); if (isgraph(i)) p[cbit_graph + i/8] |= 1 << (i&7); if (isprint(i)) p[cbit_print + i/8] |= 1 << (i&7); if (ispunct(i)) p[cbit_punct + i/8] |= 1 << (i&7); if (iscntrl(i)) p[cbit_cntrl + i/8] |= 1 << (i&7); } p += cbit_length; /* Finally, the character type table. In this, we exclude VT from the white space chars, because Perl doesn't recognize it as such for \s and for comments within regexes. */ for (i = 0; i < 256; i++) { int x = 0; if (i != 0x0b && isspace(i)) x += ctype_space; if (isalpha(i)) x += ctype_letter; if (isdigit(i)) x += ctype_digit; if (isxdigit(i)) x += ctype_xdigit; if (isalnum(i) || i == '_') x += ctype_word; /* Note: strchr includes the terminating zero in the characters it considers. In this instance, that is ok because we want binary zero to be flagged as a meta-character, which in this sense is any character that terminates a run of data characters. */ if (strchr("\\*+?{^.$|()[", i) != 0) x += ctype_meta; *p++ = x; } return yield; } | pcremktb.c | 69 |
pcrenewl.c | |||
Type | Function | Source | Line |
BOOL | _pcre_is_newline(const uschar *ptr, int type, const uschar *endptr, int *lenptr, BOOL utf8)
BOOL _pcre_is_newline(const uschar *ptr, int type, const uschar *endptr, int *lenptr, BOOL utf8) { int c; if (utf8) { GETCHAR(c, ptr); } else c = *ptr; if (type == NLTYPE_ANYCRLF) switch(c) { case 0x000a: *lenptr = 1; return TRUE; /* LF */ case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1; return TRUE; /* CR */ default: return FALSE; } /* NLTYPE_ANY */ else switch(c) { case 0x000a: /* LF */ case 0x000b: /* VT */ case 0x000c: *lenptr = 1; return TRUE; /* FF */ case 0x000d: *lenptr = (ptr < endptr - 1 && ptr[1] == 0x0a)? 2 : 1; return TRUE; /* CR */ case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */ case 0x2028: /* LS */ case 0x2029: *lenptr = 3; return TRUE; /* PS */ default: return FALSE; } } | pcrenewl.c | 75 |
BOOL | _pcre_was_newline(const uschar *ptr, int type, const uschar *startptr, int *lenptr, BOOL utf8)
BOOL _pcre_was_newline(const uschar *ptr, int type, const uschar *startptr, int *lenptr, BOOL utf8) { int c; ptr--; #ifdef SUPPORT_UTF8 if (utf8) { BACKCHAR(ptr); GETCHAR(c, ptr); } else c = *ptr; #else /* no UTF-8 support */ c = *ptr; #endif /* SUPPORT_UTF8 */ if (type == NLTYPE_ANYCRLF) switch(c) { case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1; return TRUE; /* LF */ case 0x000d: *lenptr = 1; return TRUE; /* CR */ default: return FALSE; } else switch(c) { case 0x000a: *lenptr = (ptr > startptr && ptr[-1] == 0x0d)? 2 : 1; return TRUE; /* LF */ case 0x000b: /* VT */ case 0x000c: /* FF */ case 0x000d: *lenptr = 1; return TRUE; /* CR */ case 0x0085: *lenptr = utf8? 2 : 1; return TRUE; /* NEL */ case 0x2028: /* LS */ case 0x2029: *lenptr = 3; return TRUE; /* PS */ default: return FALSE; } } | pcrenewl.c | 125 |
pcreoutf.c | |||
Type | Function | Source | Line |
INT | _pcre_ord2utf8(int cvalue, uschar *buffer)
int _pcre_ord2utf8(int cvalue, uschar *buffer) { #ifdef SUPPORT_UTF8 register int i, j; for (i = 0; i < _pcre_utf8_table1_size; i++) if (cvalue <= _pcre_utf8_table1[i]) break; buffer += i; for (j = i; j > 0; j--) { *buffer-- = 0x80 | (cvalue & 0x3f); cvalue >>= 6; } *buffer = _pcre_utf8_table2[i] | cvalue; return i + 1; #else (void)(cvalue); /* Keep compiler happy; this function won't ever be */ (void)(buffer); /* called when SUPPORT_UTF8 is not defined. */ return 0; #endif } | pcreoutf.c | 65 |
pcrerefc.c | |||
Type | Function | Source | Line |
PCRE_EXP_DEFN INT PCRE_CALL_CONVENTION | pcre_refcount(pcre *argument_re, int adjust)
PCRE_EXP_DEFN int PCRE_CALL_CONVENTION pcre_refcount(pcre *argument_re, int adjust) { real_pcre *re = (real_pcre *)argument_re; if (re == NULL) return PCRE_ERROR_NULL; re->ref_count = (-adjust > re->ref_count)? 0 : (adjust + re->ref_count > 65535)? 65535 : re->ref_count + adjust; return re->ref_count; } | pcrerefc.c | 71 |
pcrestud.c | |||
Type | Function | Source | Line |
STATIC VOID | set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
static void set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd) { start_bits[c/8] |= (1 << (c&7)); if (caseless && (cd->ctypes[c] & ctype_letter) != 0) start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7)); } | pcrestud.c | 73 |
STATIC INT | set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, BOOL utf8, compile_data *cd)
static int set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, BOOL utf8, compile_data *cd) { register int c; int yield = SSB_DONE; #if 0 /* ========================================================================= */ /* The following comment and code was inserted in January 1999. In May 2006, when it was observed to cause compiler warnings about unused values, I took it out again. If anybody is still using OS/2, they will have to put it back manually. */ /* This next statement and the later reference to dummy are here in order to trick the optimizer of the IBM C compiler for OS/2 into generating correct code. Apparently IBM isn't going to fix the problem, and we would rather not disable optimization (in this module it actually makes a big difference, and the pcre module can use all the optimization it can get). */ volatile int dummy; /* ========================================================================= */ #endif do { const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE; BOOL try_next = TRUE; while (try_next) /* Loop for items in this branch */ { int rc; switch(*tcode) { /* Fail if we reach something we don't understand */ default: return SSB_FAIL; /* If we hit a bracket or a positive lookahead assertion, recurse to set bits from within the subpattern. If it can't find anything, we have to give up. If it finds some mandatory character(s), we are done for this branch. Otherwise, carry on scanning after the subpattern. */ case OP_BRA: case OP_SBRA: case OP_CBRA: case OP_SCBRA: case OP_ONCE: case OP_ASSERT: rc = set_start_bits(tcode, start_bits, caseless, utf8, cd); if (rc == SSB_FAIL) return SSB_FAIL; if (rc == SSB_DONE) try_next = FALSE; else { do tcode += GET(tcode, 1); while (*tcode == OP_ALT); tcode += 1 + LINK_SIZE; } break; /* If we hit ALT or KET, it means we haven't found anything mandatory in this branch, though we might have found something optional. For ALT, we continue with the next alternative, but we have to arrange that the final result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET, return SSB_CONTINUE: if this is the top level, that indicates failure, but after a nested subpattern, it causes scanning to continue. */ case OP_ALT: yield = SSB_CONTINUE; try_next = FALSE; break; case OP_KET: case OP_KETRMAX: case OP_KETRMIN: return SSB_CONTINUE; /* Skip over callout */ case OP_CALLOUT: tcode += 2 + 2*LINK_SIZE; break; /* Skip over lookbehind and negative lookahead assertions */ case OP_ASSERT_NOT: case OP_ASSERTBACK: case OP_ASSERTBACK_NOT: do tcode += GET(tcode, 1); while (*tcode == OP_ALT); tcode += 1 + LINK_SIZE; break; /* Skip over an option setting, changing the caseless flag */ case OP_OPT: caseless = (tcode[1] & PCRE_CASELESS) != 0; tcode += 2; break; /* BRAZERO does the bracket, but carries on. */ case OP_BRAZERO: case OP_BRAMINZERO: if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL) return SSB_FAIL; /* ========================================================================= See the comment at the head of this function concerning the next line, which was an old fudge for the benefit of OS/2. dummy = 1; ========================================================================= */ do tcode += GET(tcode,1); while (*tcode == OP_ALT); tcode += 1 + LINK_SIZE; break; /* SKIPZERO skips the bracket. */ case OP_SKIPZERO: tcode++; do tcode += GET(tcode,1); while (*tcode == OP_ALT); tcode += 1 + LINK_SIZE; break; /* Single-char * or ? sets the bit and tries the next item */ case OP_STAR: case OP_MINSTAR: case OP_POSSTAR: case OP_QUERY: case OP_MINQUERY: case OP_POSQUERY: set_bit(start_bits, tcode[1], caseless, cd); tcode += 2; #ifdef SUPPORT_UTF8 if (utf8 && tcode[-1] >= 0xc0) tcode += _pcre_utf8_table4[tcode[-1] & 0x3f]; #endif break; /* Single-char upto sets the bit and tries the next */ case OP_UPTO: case OP_MINUPTO: case OP_POSUPTO: set_bit(start_bits, tcode[3], caseless, cd); tcode += 4; #ifdef SUPPORT_UTF8 if (utf8 && tcode[-1] >= 0xc0) tcode += _pcre_utf8_table4[tcode[-1] & 0x3f]; #endif break; /* At least one single char sets the bit and stops */ case OP_EXACT: /* Fall through */ tcode += 2; case OP_CHAR: case OP_CHARNC: case OP_PLUS: case OP_MINPLUS: case OP_POSPLUS: set_bit(start_bits, tcode[1], caseless, cd); try_next = FALSE; break; /* Single character type sets the bits and stops */ case OP_NOT_DIGIT: for (c = 0; c < 32; c++) start_bits[c] |= ~cd->cbits[c+cbit_digit]; try_next = FALSE; break; case OP_DIGIT: for (c = 0; c < 32; c++) start_bits[c] |= cd->cbits[c+cbit_digit]; try_next = FALSE; break; /* The cbit_space table has vertical tab as whitespace; we have to discard it. */ case OP_NOT_WHITESPACE: for (c = 0; c < 32; c++) { int d = cd->cbits[c+cbit_space]; if (c == 1) d &= ~0x08; start_bits[c] |= ~d; } try_next = FALSE; break; /* The cbit_space table has vertical tab as whitespace; we have to discard it. */ case OP_WHITESPACE: for (c = 0; c < 32; c++) { int d = cd->cbits[c+cbit_space]; if (c == 1) d &= ~0x08; start_bits[c] |= d; } try_next = FALSE; break; case OP_NOT_WORDCHAR: for (c = 0; c < 32; c++) start_bits[c] |= ~cd->cbits[c+cbit_word]; try_next = FALSE; break; case OP_WORDCHAR: for (c = 0; c < 32; c++) start_bits[c] |= cd->cbits[c+cbit_word]; try_next = FALSE; break; /* One or more character type fudges the pointer and restarts, knowing it will hit a single character type and stop there. */ case OP_TYPEPLUS: case OP_TYPEMINPLUS: tcode++; break; case OP_TYPEEXACT: tcode += 3; break; /* Zero or more repeats of character types set the bits and then try again. */ case OP_TYPEUPTO: case OP_TYPEMINUPTO: case OP_TYPEPOSUPTO: tcode += 2; /* Fall through */ case OP_TYPESTAR: case OP_TYPEMINSTAR: case OP_TYPEPOSSTAR: case OP_TYPEQUERY: case OP_TYPEMINQUERY: case OP_TYPEPOSQUERY: switch(tcode[1]) { case OP_ANY: case OP_ALLANY: return SSB_FAIL; case OP_NOT_DIGIT: for (c = 0; c < 32; c++) start_bits[c] |= ~cd->cbits[c+cbit_digit]; break; case OP_DIGIT: for (c = 0; c < 32; c++) start_bits[c] |= cd->cbits[c+cbit_digit]; break; /* The cbit_space table has vertical tab as whitespace; we have to discard it. */ case OP_NOT_WHITESPACE: for (c = 0; c < 32; c++) { int d = cd->cbits[c+cbit_space]; if (c == 1) d &= ~0x08; start_bits[c] |= ~d; } break; /* The cbit_space table has vertical tab as whitespace; we have to discard it. */ case OP_WHITESPACE: for (c = 0; c < 32; c++) { int d = cd->cbits[c+cbit_space]; if (c == 1) d &= ~0x08; start_bits[c] |= d; } break; case OP_NOT_WORDCHAR: for (c = 0; c < 32; c++) start_bits[c] |= ~cd->cbits[c+cbit_word]; break; case OP_WORDCHAR: for (c = 0; c < 32; c++) start_bits[c] |= cd->cbits[c+cbit_word]; break; } tcode += 2; break; /* Character class where all the information is in a bit map: set the bits and either carry on or not, according to the repeat count. If it was a negative class, and we are operating with UTF-8 characters, any byte with a value >= 0xc4 is a potentially valid starter because it starts a character with a value > 255. */ case OP_NCLASS: #ifdef SUPPORT_UTF8 if (utf8) { start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */ memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */ } #endif /* Fall through */ case OP_CLASS: { tcode++; /* In UTF-8 mode, the bits in a bit map correspond to character values, not to byte values. However, the bit map we are constructing is for byte values. So we have to do a conversion for characters whose value is > 127. In fact, there are only two possible starting bytes for characters in the range 128 - 255. */ #ifdef SUPPORT_UTF8 if (utf8) { for (c = 0; c < 16; c++) start_bits[c] |= tcode[c]; for (c = 128; c < 256; c++) { if ((tcode[c/8] && (1 << (c&7))) != 0) { int d = (c >> 6) | 0xc0; /* Set bit for this starter */ start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */ c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */ } } } /* In non-UTF-8 mode, the two bit maps are completely compatible. */ else #endif { for (c = 0; c < 32; c++) start_bits[c] |= tcode[c]; } /* Advance past the bit map, and act on what follows */ tcode += 32; switch (*tcode) { case OP_CRSTAR: case OP_CRMINSTAR: case OP_CRQUERY: case OP_CRMINQUERY: tcode++; break; case OP_CRRANGE: case OP_CRMINRANGE: if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5; else try_next = FALSE; break; default: try_next = FALSE; break; } } break; /* End of bitmap class handling */ } /* End of switch */ } /* End of try_next loop */ code += GET(code, 1); /* Advance to next branch */ } while (*code == OP_ALT); return yield; } | pcrestud.c | 107 |
PCRE_EXP_DEFN PCRE_EXTRA * PCRE_CALL_CONVENTION | pcre_study(const pcre *external_re, int options, const char **errorptr)
PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION pcre_study(const pcre *external_re, int options, const char **errorptr) { uschar start_bits[32]; pcre_extra *extra; pcre_study_data *study; const uschar *tables; uschar *code; compile_data compile_block; const real_pcre *re = (const real_pcre *)external_re; *errorptr = NULL; if (re == NULL || re->magic_number != MAGIC_NUMBER) { *errorptr = "argument is not a compiled regular expression"; return NULL; } if ((options & ~PUBLIC_STUDY_OPTIONS) != 0) { *errorptr = "unknown or incorrect option bit(s) set"; return NULL; } code = (uschar *)re + re->name_table_offset + (re->name_count * re->name_entry_size); /* For an anchored pattern, or an unanchored pattern that has a first char, or a multiline pattern that matches only at "line starts", no further processing at present. */ if ((re->options & PCRE_ANCHORED) != 0 || (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0) return NULL; /* Set the character tables in the block that is passed around */ tables = re->tables; if (tables == NULL) (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, (void *)(&tables)); compile_block.lcc = tables + lcc_offset; compile_block.fcc = tables + fcc_offset; compile_block.cbits = tables + cbits_offset; compile_block.ctypes = tables + ctypes_offset; /* See if we can find a fixed set of initial characters for the pattern. */ memset(start_bits, 0, 32 * sizeof(uschar)); if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL; /* Get a pcre_extra block and a pcre_study_data block. The study data is put in the latter, which is pointed to by the former, which may also get additional data set later by the calling program. At the moment, the size of pcre_study_data is fixed. We nevertheless save it in a field for returning via the pcre_fullinfo() function so that if it becomes variable in the future, we don't have to change that code. */ extra = (pcre_extra *)(pcre_malloc) (sizeof(pcre_extra) + sizeof(pcre_study_data)); if (extra == NULL) { *errorptr = "failed to get memory"; return NULL; } study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra)); extra->flags = PCRE_EXTRA_STUDY_DATA; extra->study_data = study; study->size = sizeof(pcre_study_data); study->options = PCRE_STUDY_MAPPED; memcpy(study->start_bits, start_bits, sizeof(start_bits)); return extra; } | pcrestud.c | 507 |
pcretryf.c | |||
Type | Function | Source | Line |
STATIC UNSIGNED LONG INT | byteflip(unsigned long int value, int n)
static unsigned long int byteflip(unsigned long int value, int n) { if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8); return ((value & 0x000000ff) << 24) | ((value & 0x0000ff00) << 8) | ((value & 0x00ff0000) >> 8) | ((value & 0xff000000) >> 24); } | pcretryf.c | 69 |
REAL_PCRE * | _pcre_try_flipped(const real_pcre *re, real_pcre *internal_re, const pcre_study_data *study, pcre_study_data *internal_study)
real_pcre * _pcre_try_flipped(const real_pcre *re, real_pcre *internal_re, const pcre_study_data *study, pcre_study_data *internal_study) { if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER) return NULL; *internal_re = *re; /* To copy other fields */ internal_re->size = byteflip(re->size, sizeof(re->size)); internal_re->options = byteflip(re->options, sizeof(re->options)); internal_re->flags = (pcre_uint16)byteflip(re->flags, sizeof(re->flags)); internal_re->top_bracket = (pcre_uint16)byteflip(re->top_bracket, sizeof(re->top_bracket)); internal_re->top_backref = (pcre_uint16)byteflip(re->top_backref, sizeof(re->top_backref)); internal_re->first_byte = (pcre_uint16)byteflip(re->first_byte, sizeof(re->first_byte)); internal_re->req_byte = (pcre_uint16)byteflip(re->req_byte, sizeof(re->req_byte)); internal_re->name_table_offset = (pcre_uint16)byteflip(re->name_table_offset, sizeof(re->name_table_offset)); internal_re->name_entry_size = (pcre_uint16)byteflip(re->name_entry_size, sizeof(re->name_entry_size)); internal_re->name_count = (pcre_uint16)byteflip(re->name_count, sizeof(re->name_count)); if (study != NULL) { *internal_study = *study; /* To copy other fields */ internal_study->size = byteflip(study->size, sizeof(study->size)); internal_study->options = byteflip(study->options, sizeof(study->options)); } return internal_re; } | pcretryf.c | 101 |
pcrever.c | |||
Type | Function | Source | Line |
PCRE_EXP_DEFN CONST CHAR * PCRE_CALL_CONVENTION | pcre_version(void)
PCRE_EXP_DEFN const char * PCRE_CALL_CONVENTION pcre_version(void) { return (XSTRING(Z PCRE_PRERELEASE)[1] == 0)? XSTRING(PCRE_MAJOR.PCRE_MINOR PCRE_DATE) : XSTRING(PCRE_MAJOR.PCRE_MINOR) XSTRING(PCRE_PRERELEASE PCRE_DATE); } | pcrever.c | 82 |
pcrevutf.c | |||
Type | Function | Source | Line |
INT | _pcre_valid_utf8(const uschar *string, int length)
int _pcre_valid_utf8(const uschar *string, int length) { #ifdef SUPPORT_UTF8 register const uschar *p; if (length < 0) { for (p = string; *p != 0; p++); length = p - string; } for (p = string; length-- > 0; p++) { register int ab; register int c = *p; if (c < 128) continue; if (c < 0xc0) return p - string; ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ if (length < ab || ab > 3) return p - string; length -= ab; /* Check top bits in the second byte */ if ((*(++p) & 0xc0) != 0x80) return p - string; /* Check for overlong sequences for each different length, and for the excluded range 0xd000 to 0xdfff. */ switch (ab) { /* Check for xx00 000x (overlong sequence) */ case 1: if ((c & 0x3e) == 0) return p - string; continue; /* We know there aren't any more bytes to check */ /* Check for 1110 0000, xx0x xxxx (overlong sequence) or 1110 1101, 1010 xxxx (0xd000 - 0xdfff) */ case 2: if ((c == 0xe0 && (*p & 0x20) == 0) || (c == 0xed && *p >= 0xa0)) return p - string; break; /* Check for 1111 0000, xx00 xxxx (overlong sequence) or greater than 0x0010ffff (f4 8f bf bf) */ case 3: if ((c == 0xf0 && (*p & 0x30) == 0) || (c > 0xf4 ) || (c == 0xf4 && *p > 0x8f)) return p - string; break; #if 0 /* These cases can no longer occur, as we restrict to a maximum of four bytes nowadays. Leave the code here in case we ever want to add an option for longer sequences. */ /* Check for 1111 1000, xx00 0xxx */ case 4: if (c == 0xf8 && (*p & 0x38) == 0) return p - string; break; /* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */ case 5: if (c == 0xfe || c == 0xff || (c == 0xfc && (*p & 0x3c) == 0)) return p - string; break; #endif } /* Check for valid bytes after the 2nd, if any; all must start 10 */ while (--ab > 0) { if ((*(++p) & 0xc0) != 0x80) return p - string; } } #else (void)(string); /* Keep picky compilers happy */ (void)(length); #endif return -1; } | pcrevutf.c | 77 |
pcrexcls.c | |||
Type | Function | Source | Line |
BOOL | _pcre_xclass(int c, const uschar *data)
BOOL _pcre_xclass(int c, const uschar *data) { int t; BOOL negated = (*data & XCL_NOT) != 0; /* Character values < 256 are matched against a bitmap, if one is present. If not, we still carry on, because there may be ranges that start below 256 in the additional data. */ if (c < 256) { if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0) return !negated; /* char found */ } /* First skip the bit map if present. Then match against the list of Unicode properties or large chars or ranges that end with a large char. We won't ever encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */ if ((*data++ & XCL_MAP) != 0) data += 32; while ((t = *data++) != XCL_END) { int x, y; if (t == XCL_SINGLE) { GETCHARINC(x, data); if (c == x) return !negated; } else if (t == XCL_RANGE) { GETCHARINC(x, data); GETCHARINC(y, data); if (c >= x && c <= y) return !negated; } #ifdef SUPPORT_UCP else /* XCL_PROP & XCL_NOTPROP */ { const ucd_record * prop = GET_UCD(c); switch(*data) { case PT_ANY: if (t == XCL_PROP) return !negated; break; case PT_LAMP: if ((prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt) == (t == XCL_PROP)) return !negated; break; case PT_GC: if ((data[1] == _pcre_ucp_gentype[prop->chartype]) == (t == XCL_PROP)) return !negated; break; case PT_PC: if ((data[1] == prop->chartype) == (t == XCL_PROP)) return !negated; break; case PT_SC: if ((data[1] == prop->script) == (t == XCL_PROP)) return !negated; break; /* This should never occur, but compilers may mutter if there is no default. */ default: return FALSE; } data += 2; } #endif /* SUPPORT_UCP */ } return negated; /* char did not match */ } | pcrexcls.c | 67 |
_hbpcreg.c | |||
Type | Function | Source | Line |
STATIC VOID * | hb_pcre_grab( size_t size )
static void * hb_pcre_grab( size_t size ) { return hb_xgrab( size ); } #if 1 #include "_hbconf.h" #endif #include "pcreinal.h" #ifndef VPCOMPAT HB_EXTERN_BEGIN PCRE_EXP_DATA_DEFN void *(*pcre_malloc)(size_t) = hb_pcre_grab; PCRE_EXP_DATA_DEFN void (*pcre_free)(void *) = hb_xfree; PCRE_EXP_DATA_DEFN void *(*pcre_stack_malloc)(size_t) = hb_pcre_grab; PCRE_EXP_DATA_DEFN void (*pcre_stack_free)(void *) = hb_xfree; PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL; HB_EXTERN_END | _hbpcreg.c | 58 |
Page url: http://www.yourdomain.com/help/index.html?hbpcre.htm