update PCRE2 to version 10.31, fixes #15662

2025-11-11 13:10:58 +00:00 · 2018-05-24 00:13:24 -06:00
parent 38284bc6da
commit 5383ae005c
53 changed files with 19864 additions and 17166 deletions
--- a/thirdparty/pcre2/src/pcre2_dfa_match.c
+++ b/thirdparty/pcre2/src/pcre2_dfa_match.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-         New API code Copyright (c) 2016 University of Cambridge
+          New API code Copyright (c) 2016-2018 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -83,7 +83,7 @@ in others, so I abandoned this code. */
 #include "pcre2_internal.h"

 #define PUBLIC_DFA_MATCH_OPTIONS \
-  (PCRE2_ANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
+  (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
   PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
   PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART)

@@ -172,7 +172,7 @@ static const uint8_t coptable[] = {
  0,                             /* Assert not                             */
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
-  0, 0,                          /* ONCE, ONCE_NC                          */
+  0,                             /* ONCE                                   */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
@@ -245,7 +245,7 @@ static const uint8_t poptable[] = {
  0,                             /* Assert not                             */
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
-  0, 0,                          /* ONCE, ONCE_NC                          */
+  0,                             /* ONCE                                   */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
@@ -293,6 +293,66 @@ typedef struct stateblock {



+/*************************************************
+*               Process a callout                *
+*************************************************/
+
+/* This function is called to perform a callout.
+
+Arguments:
+  code              current code pointer
+  offsets           points to current capture offsets
+  current_subject   start of current subject match
+  ptr               current position in subject
+  mb                the match block
+  extracode         extra code offset when called from condition
+  lengthptr         where to return the callout length
+
+Returns:            the return from the callout
+*/
+
+static int
+do_callout(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
+  PCRE2_SPTR ptr, dfa_match_block *mb, PCRE2_SIZE extracode,
+  PCRE2_SIZE *lengthptr)
+{
+pcre2_callout_block *cb = mb->cb;
+
+*lengthptr = (code[extracode] == OP_CALLOUT)?
+  (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
+  (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
+
+if (mb->callout == NULL) return 0;    /* No callout provided */
+
+/* Fixed fields in the callout block are set once and for all at the start of
+matching. */
+
+cb->offset_vector    = offsets;
+cb->start_match      = (PCRE2_SIZE)(current_subject - mb->start_subject);
+cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
+cb->pattern_position = GET(code, 1 + extracode);
+cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
+
+if (code[extracode] == OP_CALLOUT)
+  {
+  cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
+  cb->callout_string_offset = 0;
+  cb->callout_string = NULL;
+  cb->callout_string_length = 0;
+  }
+else
+  {
+  cb->callout_number = 0;
+  cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
+  cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
+  cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
+  }
+
+return (mb->callout)(cb, mb->callout_data);
+}
+
+
+
 /*************************************************
 *     Match a Regular Expression - DFA engine    *
 *************************************************/
@@ -375,14 +435,10 @@ internal_dfa_match(
 {
 stateblock *active_states, *new_states, *temp_states;
 stateblock *next_active_state, *next_new_state;
-
 const uint8_t *ctypes, *lcc, *fcc;
 PCRE2_SPTR ptr;
 PCRE2_SPTR end_code;
-PCRE2_SPTR first_op;
-
 dfa_recursion_info new_recursive;
-
 int active_count, new_count, match_count;

 /* Some fields in the mb block are frequently referenced, so we load them into
@@ -400,7 +456,8 @@ BOOL utf = FALSE;

 BOOL reset_could_continue = FALSE;

-if (rlevel++ > mb->match_limit_recursion) return PCRE2_ERROR_RECURSIONLIMIT;
+if (mb->match_call_count++ >= mb->match_limit) return PCRE2_ERROR_MATCHLIMIT;
+if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
 offsetcount &= (uint32_t)(-2);  /* Round down */

 wscount -= 2;
@@ -417,21 +474,15 @@ active_states = (stateblock *)(workspace + 2);
 next_new_state = new_states = active_states + wscount;
 new_count = 0;

-first_op = this_start_code + 1 + LINK_SIZE +
-  ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
-    *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
-    ? IMM2_SIZE:0);
-
 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
 the alternative states onto the list, and find out where the end is. This
 makes is possible to use this function recursively, when we want to stop at a
 matching internal ket rather than at the end.

-If the first opcode in the first alternative is OP_REVERSE, we are dealing with
-a backward assertion. In that case, we have to find out the maximum amount to
-move back, and set up each alternative appropriately. */
+If we are dealing with a backward assertion we have to find out the maximum
+amount to move back, and set up each alternative appropriately. */

-if (*first_op == OP_REVERSE)
+if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
  {
  size_t max_back = 0;
  size_t gone_back;
@@ -457,7 +508,8 @@ if (*first_op == OP_REVERSE)
      {
      if (current_subject <= start_subject) break;
      current_subject--;
-      ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
+      ACROSSCHAR(current_subject > start_subject, current_subject,
+        current_subject--);
      }
    }
  else
@@ -476,15 +528,17 @@ if (*first_op == OP_REVERSE)
  if (current_subject < mb->start_used_ptr)
    mb->start_used_ptr = current_subject;

-  /* Now we can process the individual branches. */
+  /* Now we can process the individual branches. There will be an OP_REVERSE at
+  the start of each branch, except when the length of the branch is zero. */

  end_code = this_start_code;
  do
    {
-    size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
+    uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0;
+    size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE);
    if (back <= gone_back)
      {
-      int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
+      int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
      ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
      }
    end_code += GET(end_code, 1);
@@ -697,7 +751,7 @@ for (;;)
      case OP_TABLE_LENGTH +
        ((sizeof(coptable) == OP_TABLE_LENGTH) &&
         (sizeof(poptable) == OP_TABLE_LENGTH)):
-      break;
+      return 0;

 /* ========================================================================== */
      /* Reached a closing bracket. If not at the end of the pattern, carry
@@ -1371,25 +1425,14 @@ for (;;)
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
      if (clen > 0)
        {
-        uint32_t lgb, rgb;
-        PCRE2_SPTR nptr = ptr + clen;
        int ncount = 0;
        if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
          {
          active_count--;           /* Remove non-match possibility */
          next_active_state--;
          }
-        lgb = UCD_GRAPHBREAK(c);
-        while (nptr < end_subject)
-          {
-          dlen = 1;
-          if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
-          rgb = UCD_GRAPHBREAK(d);
-          if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
-          ncount++;
-          lgb = rgb;
-          nptr += dlen;
-          }
+        (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
+          &ncount);
        count++;
        ADD_NEW_DATA(-state_offset, count, ncount);
        }
@@ -1632,8 +1675,6 @@ for (;;)
      ADD_ACTIVE(state_offset + 2, 0);
      if (clen > 0)
        {
-        uint32_t lgb, rgb;
-        PCRE2_SPTR nptr = ptr + clen;
        int ncount = 0;
        if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
            codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
@@ -1641,17 +1682,8 @@ for (;;)
          active_count--;           /* Remove non-match possibility */
          next_active_state--;
          }
-        lgb = UCD_GRAPHBREAK(c);
-        while (nptr < end_subject)
-          {
-          dlen = 1;
-          if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
-          rgb = UCD_GRAPHBREAK(d);
-          if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
-          ncount++;
-          lgb = rgb;
-          nptr += dlen;
-          }
+        (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
+          &ncount);
        ADD_NEW_DATA(-(state_offset + count), 0, ncount);
        }
      break;
@@ -1904,25 +1936,15 @@ for (;;)
      count = current_state->count;  /* Number already matched */
      if (clen > 0)
        {
-        uint32_t lgb, rgb;
-        PCRE2_SPTR nptr = ptr + clen;
+        PCRE2_SPTR nptr;
        int ncount = 0;
        if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
          {
          active_count--;           /* Remove non-match possibility */
          next_active_state--;
          }
-        lgb = UCD_GRAPHBREAK(c);
-        while (nptr < end_subject)
-          {
-          dlen = 1;
-          if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
-          rgb = UCD_GRAPHBREAK(d);
-          if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
-          ncount++;
-          lgb = rgb;
-          nptr += dlen;
-          }
+        nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
+          &ncount);
        if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
            reset_could_continue = TRUE;
        if (++count >= (int)GET2(code, 1))
@@ -2099,20 +2121,9 @@ for (;;)
      case OP_EXTUNI:
      if (clen > 0)
        {
-        uint32_t lgb, rgb;
-        PCRE2_SPTR nptr = ptr + clen;
        int ncount = 0;
-        lgb = UCD_GRAPHBREAK(c);
-        while (nptr < end_subject)
-          {
-          dlen = 1;
-          if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
-          rgb = UCD_GRAPHBREAK(d);
-          if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
-          ncount++;
-          lgb = rgb;
-          nptr += dlen;
-          }
+        PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
+          end_subject, utf, &ncount);
        if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
            reset_could_continue = TRUE;
        ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
@@ -2136,6 +2147,7 @@ for (;;)
        case 0x2029:
 #endif  /* Not EBCDIC */
        if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
+        /* Fall through */

        case CHAR_LF:
        ADD_NEW(state_offset + 1, 0);
@@ -2225,7 +2237,7 @@ for (;;)
      case OP_NOTI:
      if (clen > 0)
        {
-        unsigned int otherd;
+        uint32_t otherd;
 #ifdef SUPPORT_UNICODE
        if (utf && d >= 128)
          otherd = UCD_OTHERCASE(d);
@@ -2539,11 +2551,13 @@ for (;;)
          if (isinclass)
            {
            int max = (int)GET2(ecode, 1 + IMM2_SIZE);
-            if (*ecode == OP_CRPOSRANGE)
+
+            if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
              {
              active_count--;           /* Remove non-match possibility */
              next_active_state--;
              }
+
            if (++count >= max && max != 0)   /* Max 0 => no limit */
              { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
            else
@@ -2613,45 +2627,10 @@ for (;;)
        if (code[LINK_SIZE + 1] == OP_CALLOUT
            || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
          {
-          PCRE2_SIZE callout_length = (code[LINK_SIZE + 1] == OP_CALLOUT)?
-            (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
-            (PCRE2_SIZE)GET(code, 2 + 3*LINK_SIZE);
-
-          rrc = 0;
-          if (mb->callout != NULL)
-            {
-            pcre2_callout_block cb;
-            cb.version          = 1;
-            cb.capture_top      = 1;
-            cb.capture_last     = 0;
-            cb.offset_vector    = offsets;
-            cb.mark             = NULL;   /* No (*MARK) support */
-            cb.subject          = start_subject;
-            cb.subject_length   = (PCRE2_SIZE)(end_subject - start_subject);
-            cb.start_match      = (PCRE2_SIZE)(current_subject - start_subject);
-            cb.current_position = (PCRE2_SIZE)(ptr - start_subject);
-            cb.pattern_position = GET(code, LINK_SIZE + 2);
-            cb.next_item_length = GET(code, LINK_SIZE + 2 + LINK_SIZE);
-
-            if (code[LINK_SIZE + 1] == OP_CALLOUT)
-              {
-              cb.callout_number = code[2 + 3*LINK_SIZE];
-              cb.callout_string_offset = 0;
-              cb.callout_string = NULL;
-              cb.callout_string_length = 0;
-              }
-            else
-              {
-              cb.callout_number = 0;
-              cb.callout_string_offset = GET(code, 2 + 4*LINK_SIZE);
-              cb.callout_string = code + (2 + 5*LINK_SIZE) + 1;
-              cb.callout_string_length =
-                callout_length - (1 + 4*LINK_SIZE) - 2;
-              }
-
-            if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
-              return rrc;   /* Abandon */
-            }
+          PCRE2_SIZE callout_length;
+          rrc = do_callout(code, offsets, current_subject, ptr, mb,
+            1 + LINK_SIZE, &callout_length);
+          if (rrc < 0) return rrc;                 /* Abandon */
          if (rrc > 0) break;                      /* Fail this thread */
          code += callout_length;                  /* Skip callout data */
          }
@@ -2889,7 +2868,6 @@ for (;;)

      /*-----------------------------------------------------------------*/
      case OP_ONCE:
-      case OP_ONCE_NC:
        {
        PCRE2_SIZE local_offsets[2];
        int local_workspace[1000];
@@ -2984,44 +2962,10 @@ for (;;)
      case OP_CALLOUT:
      case OP_CALLOUT_STR:
        {
-        unsigned int callout_length = (*code == OP_CALLOUT)
-            ? PRIV(OP_lengths)[OP_CALLOUT] : GET(code, 1 + 2*LINK_SIZE);
-        rrc = 0;
-
-        if (mb->callout != NULL)
-          {
-          pcre2_callout_block cb;
-          cb.version          = 1;
-          cb.capture_top      = 1;
-          cb.capture_last     = 0;
-          cb.offset_vector    = offsets;
-          cb.mark             = NULL;   /* No (*MARK) support */
-          cb.subject          = start_subject;
-          cb.subject_length   = (PCRE2_SIZE)(end_subject - start_subject);
-          cb.start_match      = (PCRE2_SIZE)(current_subject - start_subject);
-          cb.current_position = (PCRE2_SIZE)(ptr - start_subject);
-          cb.pattern_position = GET(code, 1);
-          cb.next_item_length = GET(code, 1 + LINK_SIZE);
-
-          if (*code == OP_CALLOUT)
-            {
-            cb.callout_number = code[1 + 2*LINK_SIZE];
-            cb.callout_string_offset = 0;
-            cb.callout_string = NULL;
-            cb.callout_string_length = 0;
-            }
-          else
-            {
-            cb.callout_number = 0;
-            cb.callout_string_offset = GET(code, 1 + 3*LINK_SIZE);
-            cb.callout_string = code + (1 + 4*LINK_SIZE) + 1;
-            cb.callout_string_length =
-              callout_length - (1 + 4*LINK_SIZE) - 2;
-            }
-
-          if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
-            return rrc;   /* Abandon */
-          }
+        PCRE2_SIZE callout_length;
+        rrc = do_callout(code, offsets, current_subject, ptr, mb, 0,
+          &callout_length);
+        if (rrc < 0) return rrc;   /* Abandon */
        if (rrc == 0)
          { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
        }
@@ -3069,7 +3013,7 @@ for (;;)
          )
        )
      match_count = PCRE2_ERROR_PARTIAL;
-    break;        /* In effect, "return", but see the comment below */
+    break;  /* Exit from loop along the subject string */
    }

  /* One or more states are active for the next character. */
@@ -3077,11 +3021,13 @@ for (;;)
  ptr += clen;    /* Advance to next subject character */
  }               /* Loop to move along the subject string */

-/* Control gets here from "break" a few lines above. We do it this way because
-if we use "return" above, we have compiler trouble. Some compilers warn if
-there's nothing here because they think the function doesn't return a value. On
-the other hand, if we put a dummy statement here, some more clever compilers
-complain that it can't be reached. Sigh. */
+/* Control gets here from "break" a few lines above. If we have a match and
+PCRE2_ENDANCHORED is set, the match fails. */
+
+if (match_count >= 0 &&
+    ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
+    ptr < end_subject)
+  match_count = PCRE2_ERROR_NOMATCH;

 return match_count;
 }
@@ -3138,6 +3084,7 @@ const uint8_t *start_bits = NULL;
 /* We need to have mb pointing to a match block, because the IS_NEWLINE macro
 is used below, and it expects NLBLOCK to be defined as a pointer. */

+pcre2_callout_block cb;
 dfa_match_block actual_match_block;
 dfa_match_block *mb = &actual_match_block;

@@ -3154,6 +3101,13 @@ if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
 if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
 if (start_offset > length) return PCRE2_ERROR_BADOFFSET;

+/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
+time. */
+
+if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
+   ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
+  return PCRE2_ERROR_BADOPTION;
+
 /* Check that the first field in the block is the magic number. If it is not,
 return with PCRE2_ERROR_BADMAGIC. */

@@ -3208,15 +3162,28 @@ startline = (re->flags & PCRE2_STARTLINE) != 0;
 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
 bumpalong_limit = end_subject;

-/* Get data from the match context, if present, and fill in the fields in the
-match block. It is an error to set an offset limit without setting the flag at
-compile time. */
+/* Initialize and set up the fixed fields in the callout block, with a pointer
+in the match block. */
+
+mb->cb = &cb;
+cb.version = 2;
+cb.subject = subject;
+cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
+cb.callout_flags = 0;
+cb.capture_top      = 1;      /* No capture support */
+cb.capture_last     = 0;
+cb.mark             = NULL;   /* No (*MARK) support */
+
+/* Get data from the match context, if present, and fill in the remaining
+fields in the match block. It is an error to set an offset limit without
+setting the flag at compile time. */

 if (mcontext == NULL)
  {
  mb->callout = NULL;
  mb->memctl = re->memctl;
-  mb->match_limit_recursion = PRIV(default_match_context).recursion_limit;
+  mb->match_limit = PRIV(default_match_context).match_limit;
+  mb->match_limit_depth = PRIV(default_match_context).depth_limit;
  }
 else
  {
@@ -3229,10 +3196,15 @@ else
  mb->callout = mcontext->callout;
  mb->callout_data = mcontext->callout_data;
  mb->memctl = mcontext->memctl;
-  mb->match_limit_recursion = mcontext->recursion_limit;
+  mb->match_limit = mcontext->match_limit;
+  mb->match_limit_depth = mcontext->depth_limit;
  }
-if (mb->match_limit_recursion > re->limit_recursion)
-  mb->match_limit_recursion = re->limit_recursion;
+
+if (mb->match_limit > re->limit_match)
+  mb->match_limit = re->limit_match;
+
+if (mb->match_limit_depth > re->limit_depth)
+  mb->match_limit_depth = re->limit_depth;

 mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
  re->name_count * re->name_entry_size;
@@ -3242,6 +3214,7 @@ mb->end_subject = end_subject;
 mb->start_offset = start_offset;
 mb->moptions = options;
 mb->poptions = re->overall_options;
+mb->match_call_count = 0;

 /* Process the \R and newline settings. */

@@ -3259,6 +3232,11 @@ switch(re->newline_convention)
  mb->nl[0] = CHAR_NL;
  break;

+  case PCRE2_NEWLINE_NUL:
+  mb->nllen = 1;
+  mb->nl[0] = CHAR_NUL;
+  break;
+
  case PCRE2_NEWLINE_CRLF:
  mb->nllen = 2;
  mb->nl[0] = CHAR_CR;
@@ -3325,34 +3303,27 @@ if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
  }
 #endif  /* SUPPORT_UNICODE */

-/* Set up the first code unit to match, if available. The first_codeunit value
-is never set for an anchored regular expression, but the anchoring may be
-forced at run time, so we have to test for anchoring. The first code unit may
-be unset for an unanchored pattern, of course. If there's no first code unit
-there may be a bitmap of possible first characters. */
+/* Set up the first code unit to match, if available. If there's no first code
+unit there may be a bitmap of possible first characters. */

-if (!anchored)
+if ((re->flags & PCRE2_FIRSTSET) != 0)
  {
-  if ((re->flags & PCRE2_FIRSTSET) != 0)
+  has_first_cu = TRUE;
+  first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
+  if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
    {
-    has_first_cu = TRUE;
-    first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
-    if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
-      {
-      first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
+    first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
-      if (utf && first_cu > 127)
-        first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
+    if (utf && first_cu > 127)
+      first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
 #endif
-      }
    }
-  else
-    if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
-      start_bits = re->start_bitmap;
  }
+else
+  if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
+    start_bits = re->start_bitmap;

-/* For anchored or unanchored matches, there may be a "last known required
-character" set. */
+/* There may be a "last known required code unit" set. */

 if ((re->flags & PCRE2_LASTSET) != 0)
  {
@@ -3393,13 +3364,13 @@ for (;;)
  if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
      (options & PCRE2_DFA_RESTART) == 0)
    {
-    PCRE2_SPTR save_end_subject = end_subject;
-
    /* If firstline is TRUE, the start of the match is constrained to the first
    line of a multiline string. That is, the match must be before or at the
-    first newline. Implement this by temporarily adjusting end_subject so that
-    we stop the optimization scans at a newline. If the match fails at the
-    newline, later code breaks this loop. */
+    first newline following the start of matching. Temporarily adjust
+    end_subject so that we stop the optimization scans for a first code unit
+    immediately after the first character of a newline (the first code unit can
+    legitimately be a newline). If the match fails at the newline, later code
+    breaks this loop. */

    if (firstline)
      {
@@ -3407,85 +3378,162 @@ for (;;)
 #ifdef SUPPORT_UNICODE
      if (utf)
        {
-        while (t < mb->end_subject && !IS_NEWLINE(t))
+        while (t < end_subject && !IS_NEWLINE(t))
          {
          t++;
-          ACROSSCHAR(t < end_subject, *t, t++);
+          ACROSSCHAR(t < end_subject, t, t++);
          }
        }
      else
 #endif
-      while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
+      while (t < end_subject && !IS_NEWLINE(t)) t++;
      end_subject = t;
      }

-    /* Advance to a unique first code unit if there is one. */
+    /* Anchored: check the first code unit if one is recorded. This may seem
+    pointless but it can help in detecting a no match case without scanning for
+    the required code unit. */

-    if (has_first_cu)
+    if (anchored)
      {
-      PCRE2_UCHAR smc;
-      if (first_cu != first_cu2)
-        while (start_match < end_subject &&
-          (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2)
-          start_match++;
-      else
-        while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu)
-          start_match++;
-      }
-
-    /* Or to just after a linebreak for a multiline match */
-
-    else if (startline)
-      {
-      if (start_match > mb->start_subject + start_offset)
+      if (has_first_cu || start_bits != NULL)
        {
-#ifdef SUPPORT_UNICODE
-        if (utf)
+        BOOL ok = start_match < end_subject;
+        if (ok)
          {
-          while (start_match < end_subject && !WAS_NEWLINE(start_match))
+          PCRE2_UCHAR c = UCHAR21TEST(start_match);
+          ok = has_first_cu && (c == first_cu || c == first_cu2);
+          if (!ok && start_bits != NULL)
            {
-            start_match++;
-            ACROSSCHAR(start_match < end_subject, *start_match,
-              start_match++);
+#if PCRE2_CODE_UNIT_WIDTH != 8
+            if (c > 255) c = 255;
+#endif
+            ok = (start_bits[c/8] & (1 << (c&7))) != 0;
            }
          }
-        else
-#endif
-        while (start_match < end_subject && !WAS_NEWLINE(start_match))
-          start_match++;
-
-        /* If we have just passed a CR and the newline option is ANY or
-        ANYCRLF, and we are now at a LF, advance the match position by one more
-        code unit. */
-
-        if (start_match[-1] == CHAR_CR &&
-             (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
-             start_match < end_subject &&
-             UCHAR21TEST(start_match) == CHAR_NL)
-          start_match++;
+        if (!ok) break;
        }
      }

-    /* Or to a non-unique first code unit if any have been identified. The
-    bitmap contains only 256 bits. When code units are 16 or 32 bits wide, all
-    code units greater than 254 set the 255 bit. */
+    /* Not anchored. Advance to a unique first code unit if there is one. In
+    8-bit mode, the use of memchr() gives a big speed up, even though we have
+    to call it twice in caseless mode, in order to find the earliest occurrence
+    of the character in either of its cases. */

-    else if (start_bits != NULL)
+    else
      {
-      while (start_match < end_subject)
+      if (has_first_cu)
        {
-        uint32_t c = UCHAR21TEST(start_match);
+        if (first_cu != first_cu2)  /* Caseless */
+          {
 #if PCRE2_CODE_UNIT_WIDTH != 8
-        if (c > 255) c = 255;
+          PCRE2_UCHAR smc;
+          while (start_match < end_subject &&
+                (smc = UCHAR21TEST(start_match)) != first_cu &&
+                  smc != first_cu2)
+            start_match++;
+#else  /* 8-bit code units */
+          PCRE2_SPTR pp1 =
+            memchr(start_match, first_cu, end_subject-start_match);
+          PCRE2_SPTR pp2 =
+            memchr(start_match, first_cu2, end_subject-start_match);
+          if (pp1 == NULL)
+            start_match = (pp2 == NULL)? end_subject : pp2;
+          else
+            start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
 #endif
-        if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
-        start_match++;
+          }
+
+        /* The caseful case */
+
+        else
+          {
+#if PCRE2_CODE_UNIT_WIDTH != 8
+          while (start_match < end_subject && UCHAR21TEST(start_match) !=
+                 first_cu)
+            start_match++;
+#else
+          start_match = memchr(start_match, first_cu, end_subject - start_match);
+          if (start_match == NULL) start_match = end_subject;
+#endif
+          }
+
+        /* If we can't find the required code unit, having reached the true end
+        of the subject, break the bumpalong loop, to force a match failure,
+        except when doing partial matching, when we let the next cycle run at
+        the end of the subject. To see why, consider the pattern /(?<=abc)def/,
+        which partially matches "abc", even though the string does not contain
+        the starting character "d". If we have not reached the true end of the
+        subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
+        we also let the cycle run, because the matching string is legitimately
+        allowed to start with the first code unit of a newline. */
+
+        if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
+            start_match >= mb->end_subject)
+          break;
        }
-      }
+
+      /* If there's no first code unit, advance to just after a linebreak for a
+      multiline match if required. */
+
+      else if (startline)
+        {
+        if (start_match > mb->start_subject + start_offset)
+          {
+#ifdef SUPPORT_UNICODE
+          if (utf)
+            {
+            while (start_match < end_subject && !WAS_NEWLINE(start_match))
+              {
+              start_match++;
+              ACROSSCHAR(start_match < end_subject, start_match, start_match++);
+              }
+            }
+          else
+#endif
+          while (start_match < end_subject && !WAS_NEWLINE(start_match))
+            start_match++;
+
+          /* If we have just passed a CR and the newline option is ANY or
+          ANYCRLF, and we are now at a LF, advance the match position by one
+          more code unit. */
+
+          if (start_match[-1] == CHAR_CR &&
+               (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
+               start_match < end_subject &&
+               UCHAR21TEST(start_match) == CHAR_NL)
+            start_match++;
+          }
+        }
+
+      /* If there's no first code unit or a requirement for a multiline line
+      start, advance to a non-unique first code unit if any have been
+      identified. The bitmap contains only 256 bits. When code units are 16 or
+      32 bits wide, all code units greater than 254 set the 255 bit. */
+
+      else if (start_bits != NULL)
+        {
+        while (start_match < end_subject)
+          {
+          uint32_t c = UCHAR21TEST(start_match);
+#if PCRE2_CODE_UNIT_WIDTH != 8
+          if (c > 255) c = 255;
+#endif
+          if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
+          start_match++;
+          }
+
+        /* See comment above in first_cu checking about the next line. */
+
+        if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
+            start_match >= mb->end_subject)
+          break;
+        }
+      }  /* End of first code unit handling */

    /* Restore fudged end_subject */

-    end_subject = save_end_subject;
+    end_subject = mb->end_subject;

    /* The following two optimizations are disabled for partial matching. */

@@ -3600,8 +3648,7 @@ for (;;)
 #ifdef SUPPORT_UNICODE
  if (utf)
    {
-    ACROSSCHAR(start_match < end_subject, *start_match,
-      start_match++);
+    ACROSSCHAR(start_match < end_subject, start_match, start_match++);
    }
 #endif
  if (start_match > end_subject) break;