pcre2: Update to upstream version 10.34

Changelog: https://vcs.pcre.org/pcre2/code/tags/pcre2-10.34/ChangeLog?view=markup
2025-11-14 13:41:12 +00:00 · 2020-04-30 15:09:03 +02:00
parent d29514acce
commit 824736d271
39 changed files with 7446 additions and 4735 deletions
--- a/thirdparty/pcre2/src/pcre2_match.c
+++ b/thirdparty/pcre2/src/pcre2_match.c
@@ -415,8 +415,7 @@ if (caseless)
  else
 #endif

-    /* Not in UTF mode */
-
+  /* Not in UTF mode */
    {
    for (; length > 0; length--)
      {
@@ -491,27 +490,32 @@ heap is used for a larger vector.
 *************************************************/

 /* These macros pack up tests that are used for partial matching several times
-in the code. We set the "hit end" flag if the pointer is at the end of the
-subject and also past the earliest inspected character (i.e. something has been
-matched, even if not part of the actual matched string). For hard partial
-matching, we then return immediately. The second one is used when we already
-know we are past the end of the subject. */
+in the code. The second one is used when we already know we are past the end of
+the subject. We set the "hit end" flag if the pointer is at the end of the
+subject and either (a) the pointer is past the earliest inspected character
+(i.e. something has been matched, even if not part of the actual matched
+string), or (b) the pattern contains a lookbehind. These are the conditions for
+which adding more characters may allow the current match to continue.
+
+For hard partial matching, we immediately return a partial match. Otherwise,
+carrying on means that a complete match on the current subject will be sought.
+A partial match is returned only if no complete match can be found. */

 #define CHECK_PARTIAL()\
-  if (mb->partial != 0 && Feptr >= mb->end_subject && \
-      Feptr > mb->start_used_ptr) \
+  if (Feptr >= mb->end_subject) \
    { \
-    mb->hitend = TRUE; \
-    if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \
+    SCHECK_PARTIAL(); \
    }

 #define SCHECK_PARTIAL()\
-  if (mb->partial != 0 && Feptr > mb->start_used_ptr) \
+  if (mb->partial != 0 && \
+      (Feptr > mb->start_used_ptr || mb->allowemptypartial)) \
    { \
    mb->hitend = TRUE; \
    if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \
    }

+
 /* These macros are used to implement backtracking. They simulate a recursive
 call to the match() function by means of a local vector of frames which
 remember the backtracking points. */
@@ -5127,6 +5131,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);

    case OP_ASSERT:
    case OP_ASSERTBACK:
+    case OP_ASSERT_NA:
+    case OP_ASSERTBACK_NA:
    Lframe_type = GF_NOCAPTURE | Fop;
    for (;;)
      {
@@ -5412,7 +5418,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
      {
      while (number-- > 0)
        {
-        if (Feptr <= mb->start_subject) RRETURN(MATCH_NOMATCH);
+        if (Feptr <= mb->check_subject) RRETURN(MATCH_NOMATCH);
        Feptr--;
        BACKCHAR(Feptr);
        }
@@ -5420,7 +5426,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
    else
 #endif

-    /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
+    /* No UTF-8 support, or not in UTF-8 mode: count is code unit count */

      {
      if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH);
@@ -5472,15 +5478,16 @@ fprintf(stderr, "++ op=%d\n", *Fecode);

      /* If we are at the end of an assertion that is a condition, return a
      match, discarding any intermediate backtracking points. Copy back the
-      captures into the frame before N so that they are set on return. Doing
-      this for all assertions, both positive and negative, seems to match what
-      Perl does. */
+      mark setting and the captures into the frame before N so that they are
+      set on return. Doing this for all assertions, both positive and negative,
+      seems to match what Perl does. */

      if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT)
        {
        memcpy((char *)P + offsetof(heapframe, ovector), Fovector,
          Foffset_top * sizeof(PCRE2_SIZE));
        P->offset_top = Foffset_top;
+        P->mark = Fmark;
        Fback_frame = (char *)F - (char *)P;
        RRETURN(MATCH_MATCH);
        }
@@ -5496,10 +5503,20 @@ fprintf(stderr, "++ op=%d\n", *Fecode);
      case OP_SCOND:
      break;

-      /* Positive assertions are like OP_ONCE, except that in addition the
+      /* Non-atomic positive assertions are like OP_BRA, except that the
      subject pointer must be put back to where it was at the start of the
      assertion. */

+      case OP_ASSERT_NA:
+      case OP_ASSERTBACK_NA:
+      if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
+      Feptr = P->eptr;
+      break;
+
+      /* Atomic positive assertions are like OP_ONCE, except that in addition
+      the subject pointer must be put back to where it was at the start of the
+      assertion. */
+
      case OP_ASSERT:
      case OP_ASSERTBACK:
      if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
@@ -5640,7 +5657,11 @@ fprintf(stderr, "++ op=%d\n", *Fecode);

    case OP_EOD:
    if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH);
-    SCHECK_PARTIAL();
+    if (mb->partial != 0)
+      {
+      mb->hitend = TRUE;
+      if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
+      }
    Fecode++;
    break;

@@ -5665,7 +5686,11 @@ fprintf(stderr, "++ op=%d\n", *Fecode);

    /* Either at end of string or \n before end. */

-    SCHECK_PARTIAL();
+    if (mb->partial != 0)
+      {
+      mb->hitend = TRUE;
+      if (mb->partial > 1) return PCRE2_ERROR_PARTIAL;
+      }
    Fecode++;
    break;

@@ -5743,7 +5768,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);

    case OP_NOT_WORD_BOUNDARY:
    case OP_WORD_BOUNDARY:
-    if (Feptr == mb->start_subject) prev_is_word = FALSE; else
+    if (Feptr == mb->check_subject) prev_is_word = FALSE; else
      {
      PCRE2_SPTR lastptr = Feptr - 1;
 #ifdef SUPPORT_UNICODE
@@ -5946,6 +5971,7 @@ in rrc. */
 #define LBL(val) case val: goto L_RM##val;

 RETURN_SWITCH:
+if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;
 if (Frdepth == 0) return rrc;                     /* Exit from the top level */
 F = (heapframe *)((char *)F - Fback_frame);       /* Backtrack */
 mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */
@@ -5999,9 +6025,9 @@ Arguments:

 Returns:          > 0 => success; value is the number of ovector pairs filled
                  = 0 => success, but ovector is not big enough
-                   -1 => failed to match (PCRE2_ERROR_NOMATCH)
-                   -2 => partial match (PCRE2_ERROR_PARTIAL)
-                 < -2 => some kind of unexpected problem
+                  = -1 => failed to match (PCRE2_ERROR_NOMATCH)
+                  = -2 => partial match (PCRE2_ERROR_PARTIAL)
+                  < -2 => some kind of unexpected problem
 */

 PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
@@ -6014,7 +6040,6 @@ int was_zero_terminated = 0;
 const uint8_t *start_bits = NULL;
 const pcre2_real_code *re = (const pcre2_real_code *)code;

-
 BOOL anchored;
 BOOL firstline;
 BOOL has_first_cu = FALSE;
@@ -6022,6 +6047,11 @@ BOOL has_req_cu = FALSE;
 BOOL startline;
 BOOL utf;

+#if PCRE2_CODE_UNIT_WIDTH == 8
+BOOL memchr_not_found_first_cu = FALSE;
+BOOL memchr_not_found_first_cu2 = FALSE;
+#endif
+
 PCRE2_UCHAR first_cu = 0;
 PCRE2_UCHAR first_cu2 = 0;
 PCRE2_UCHAR req_cu = 0;
@@ -6029,10 +6059,23 @@ PCRE2_UCHAR req_cu2 = 0;

 PCRE2_SPTR bumpalong_limit;
 PCRE2_SPTR end_subject;
+PCRE2_SPTR true_end_subject;
 PCRE2_SPTR start_match = subject + start_offset;
 PCRE2_SPTR req_cu_ptr = start_match - 1;
-PCRE2_SPTR start_partial = NULL;
-PCRE2_SPTR match_partial = NULL;
+PCRE2_SPTR start_partial;
+PCRE2_SPTR match_partial;
+
+#ifdef SUPPORT_JIT
+BOOL use_jit;
+#endif
+
+#ifdef SUPPORT_UNICODE
+BOOL allow_invalid;
+uint32_t fragment_options = 0;
+#ifdef SUPPORT_JIT
+BOOL jit_checked_utf = FALSE;
+#endif
+#endif

 PCRE2_SIZE frame_size;

@@ -6059,7 +6102,7 @@ if (length == PCRE2_ZERO_TERMINATED)
  length = PRIV(strlen)(subject);
  was_zero_terminated = 1;
  }
-end_subject = subject + length;
+true_end_subject = end_subject = subject + length;

 /* Plausibility checks */

@@ -6095,12 +6138,24 @@ options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
 #undef FF
 #undef OO

-/* These two settings are used in the code for checking a UTF string that
-follows immediately afterwards. Other values in the mb block are used only
-during interpretive processing, not when the JIT support is in use, so they are
-set up later. */
+/* If the pattern was successfully studied with JIT support, we will run the
+JIT executable instead of the rest of this function. Most options must be set
+at compile time for the JIT code to be usable. */
+
+#ifdef SUPPORT_JIT
+use_jit = (re->executable_jit != NULL &&
+          (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0);
+#endif
+
+/* Initialize UTF parameters. */

 utf = (re->overall_options & PCRE2_UTF) != 0;
+#ifdef SUPPORT_UNICODE
+allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0;
+#endif
+
+/* Convert the partial matching flags into an integer. */
+
 mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :
              ((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0;

@@ -6111,61 +6166,6 @@ if (mb->partial != 0 &&
   ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
  return PCRE2_ERROR_BADOPTION;

-/* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
-we must also check that a starting offset does not point into the middle of a
-multiunit character. We check only the portion of the subject that is going to
-be inspected during matching - from the offset minus the maximum back reference
-to the given length. This saves time when a small part of a large subject is
-being matched by the use of a starting offset. Note that the maximum lookbehind
-is a number of characters, not code units. */
-
-#ifdef SUPPORT_UNICODE
-if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
-  {
-  PCRE2_SPTR check_subject = start_match;  /* start_match includes offset */
-
-  if (start_offset > 0)
-    {
-#if PCRE2_CODE_UNIT_WIDTH != 32
-    unsigned int i;
-    if (start_match < end_subject && NOT_FIRSTCU(*start_match))
-      return PCRE2_ERROR_BADUTFOFFSET;
-    for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
-      {
-      check_subject--;
-      while (check_subject > subject &&
-#if PCRE2_CODE_UNIT_WIDTH == 8
-      (*check_subject & 0xc0) == 0x80)
-#else  /* 16-bit */
-      (*check_subject & 0xfc00) == 0xdc00)
-#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
-        check_subject--;
-      }
-#else
-    /* In the 32-bit library, one code unit equals one character. However,
-    we cannot just subtract the lookbehind and then compare pointers, because
-    a very large lookbehind could create an invalid pointer. */
-
-    if (start_offset >= re->max_lookbehind)
-      check_subject -= re->max_lookbehind;
-    else
-      check_subject = subject;
-#endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */
-    }
-
-  /* Validate the relevant portion of the subject. After an error, adjust the
-  offset to be an absolute offset in the whole string. */
-
-  match_data->rc = PRIV(valid_utf)(check_subject,
-    length - (check_subject - subject), &(match_data->startchar));
-  if (match_data->rc != 0)
-    {
-    match_data->startchar += check_subject - subject;
-    return match_data->rc;
-    }
-  }
-#endif  /* SUPPORT_UNICODE */
-
 /* It is an error to set an offset limit without setting the flag at compile
 time. */

@@ -6184,15 +6184,89 @@ if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
  }
 match_data->subject = NULL;

-/* If the pattern was successfully studied with JIT support, run the JIT
-executable instead of the rest of this function. Most options must be set at
-compile time for the JIT code to be usable. Fallback to the normal code path if
-an unsupported option is set or if JIT returns BADOPTION (which means that the
-selected normal or partial matching mode was not compiled). */
+/* Zero the error offset in case the first code unit is invalid UTF. */
+
+match_data->startchar = 0;
+
+
+/* ============================= JIT matching ============================== */
+
+/* Prepare for JIT matching. Check a UTF string for validity unless no check is
+requested or invalid UTF can be handled. We check only the portion of the
+subject that might be inspected during matching - from the offset minus the
+maximum lookbehind to the given length. This saves time when a small part of a
+large subject is being matched by the use of a starting offset. Note that the
+maximum lookbehind is a number of characters, not code units. */

 #ifdef SUPPORT_JIT
-if (re->executable_jit != NULL && (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0)
+if (use_jit)
  {
+#ifdef SUPPORT_UNICODE
+  if (utf && (options & PCRE2_NO_UTF_CHECK) == 0 && !allow_invalid)
+    {
+#if PCRE2_CODE_UNIT_WIDTH != 32
+    unsigned int i;
+#endif
+
+    /* For 8-bit and 16-bit UTF, check that the first code unit is a valid
+    character start. */
+
+#if PCRE2_CODE_UNIT_WIDTH != 32
+    if (start_match < end_subject && NOT_FIRSTCU(*start_match))
+      {
+      if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET;
+#if PCRE2_CODE_UNIT_WIDTH == 8
+      return PCRE2_ERROR_UTF8_ERR20;  /* Isolated 0x80 byte */
+#else
+      return PCRE2_ERROR_UTF16_ERR3;  /* Isolated low surrogate */
+#endif
+      }
+#endif  /* WIDTH != 32 */
+
+    /* Move back by the maximum lookbehind, just in case it happens at the very
+    start of matching. */
+
+#if PCRE2_CODE_UNIT_WIDTH != 32
+    for (i = re->max_lookbehind; i > 0 && start_match > subject; i--)
+      {
+      start_match--;
+      while (start_match > subject &&
+#if PCRE2_CODE_UNIT_WIDTH == 8
+      (*start_match & 0xc0) == 0x80)
+#else  /* 16-bit */
+      (*start_match & 0xfc00) == 0xdc00)
+#endif
+        start_match--;
+      }
+#else  /* PCRE2_CODE_UNIT_WIDTH != 32 */
+
+    /* In the 32-bit library, one code unit equals one character. However,
+    we cannot just subtract the lookbehind and then compare pointers, because
+    a very large lookbehind could create an invalid pointer. */
+
+    if (start_offset >= re->max_lookbehind)
+      start_match -= re->max_lookbehind;
+    else
+      start_match = subject;
+#endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */
+
+    /* Validate the relevant portion of the subject. Adjust the offset of an
+    invalid code point to be an absolute offset in the whole string. */
+
+    match_data->rc = PRIV(valid_utf)(start_match,
+      length - (start_match - subject), &(match_data->startchar));
+    if (match_data->rc != 0)
+      {
+      match_data->startchar += start_match - subject;
+      return match_data->rc;
+      }
+    jit_checked_utf = TRUE;
+    }
+#endif  /* SUPPORT_UNICODE */
+
+  /* If JIT returns BADOPTION, which means that the selected complete or
+  partial matching mode was not compiled, fall through to the interpreter. */
+
  rc = pcre2_jit_match(code, subject, length, start_offset, options,
    match_data, mcontext);
  if (rc != PCRE2_ERROR_JIT_BADOPTION)
@@ -6209,10 +6283,152 @@ if (re->executable_jit != NULL && (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0)
    return rc;
    }
  }
+#endif  /* SUPPORT_JIT */
+
+/* ========================= End of JIT matching ========================== */
+
+
+/* Proceed with non-JIT matching. The default is to allow lookbehinds to the
+start of the subject. A UTF check when there is a non-zero offset may change
+this. */
+
+mb->check_subject = subject;
+
+/* If a UTF subject string was not checked for validity in the JIT code above,
+check it here, and handle support for invalid UTF strings. The check above
+happens only when invalid UTF is not supported and PCRE2_NO_UTF_CHECK is unset.
+If we get here in those circumstances, it means the subject string is valid,
+but for some reason JIT matching was not successful. There is no need to check
+the subject again.
+
+We check only the portion of the subject that might be inspected during
+matching - from the offset minus the maximum lookbehind to the given length.
+This saves time when a small part of a large subject is being matched by the
+use of a starting offset. Note that the maximum lookbehind is a number of
+characters, not code units.
+
+Note also that support for invalid UTF forces a check, overriding the setting
+of PCRE2_NO_UTF_CHECK. */
+
+#ifdef SUPPORT_UNICODE
+if (utf &&
+#ifdef SUPPORT_JIT
+    !jit_checked_utf &&
+#endif
+    ((options & PCRE2_NO_UTF_CHECK) == 0 || allow_invalid))
+  {
+#if PCRE2_CODE_UNIT_WIDTH != 32
+  BOOL skipped_bad_start = FALSE;
 #endif

-/* Carry on with non-JIT matching. A NULL match context means "use a default
-context", but we take the memory control functions from the pattern. */
+  /* For 8-bit and 16-bit UTF, check that the first code unit is a valid
+  character start. If we are handling invalid UTF, just skip over such code
+  units. Otherwise, give an appropriate error. */
+
+#if PCRE2_CODE_UNIT_WIDTH != 32
+  if (allow_invalid)
+    {
+    while (start_match < end_subject && NOT_FIRSTCU(*start_match))
+      {
+      start_match++;
+      skipped_bad_start = TRUE;
+      }
+    }
+  else if (start_match < end_subject && NOT_FIRSTCU(*start_match))
+    {
+    if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET;
+#if PCRE2_CODE_UNIT_WIDTH == 8
+    return PCRE2_ERROR_UTF8_ERR20;  /* Isolated 0x80 byte */
+#else
+    return PCRE2_ERROR_UTF16_ERR3;  /* Isolated low surrogate */
+#endif
+    }
+#endif  /* WIDTH != 32 */
+
+  /* The mb->check_subject field points to the start of UTF checking;
+  lookbehinds can go back no further than this. */
+
+  mb->check_subject = start_match;
+
+  /* Move back by the maximum lookbehind, just in case it happens at the very
+  start of matching, but don't do this if we skipped bad 8-bit or 16-bit code
+  units above. */
+
+#if PCRE2_CODE_UNIT_WIDTH != 32
+  if (!skipped_bad_start)
+    {
+    unsigned int i;
+    for (i = re->max_lookbehind; i > 0 && mb->check_subject > subject; i--)
+      {
+      mb->check_subject--;
+      while (mb->check_subject > subject &&
+#if PCRE2_CODE_UNIT_WIDTH == 8
+      (*mb->check_subject & 0xc0) == 0x80)
+#else  /* 16-bit */
+      (*mb->check_subject & 0xfc00) == 0xdc00)
+#endif
+        mb->check_subject--;
+      }
+    }
+#else  /* PCRE2_CODE_UNIT_WIDTH != 32 */
+
+  /* In the 32-bit library, one code unit equals one character. However,
+  we cannot just subtract the lookbehind and then compare pointers, because
+  a very large lookbehind could create an invalid pointer. */
+
+  if (start_offset >= re->max_lookbehind)
+    mb->check_subject -= re->max_lookbehind;
+  else
+    mb->check_subject = subject;
+#endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */
+
+  /* Validate the relevant portion of the subject. There's a loop in case we
+  encounter bad UTF in the characters preceding start_match which we are
+  scanning because of a lookbehind. */
+
+  for (;;)
+    {
+    match_data->rc = PRIV(valid_utf)(mb->check_subject,
+      length - (mb->check_subject - subject), &(match_data->startchar));
+
+    if (match_data->rc == 0) break;   /* Valid UTF string */
+
+    /* Invalid UTF string. Adjust the offset to be an absolute offset in the
+    whole string. If we are handling invalid UTF strings, set end_subject to
+    stop before the bad code unit, and set the options to "not end of line".
+    Otherwise return the error. */
+
+    match_data->startchar += mb->check_subject - subject;
+    if (!allow_invalid || match_data->rc > 0) return match_data->rc;
+    end_subject = subject + match_data->startchar;
+
+    /* If the end precedes start_match, it means there is invalid UTF in the
+    extra code units we reversed over because of a lookbehind. Advance past the
+    first bad code unit, and then skip invalid character starting code units in
+    8-bit and 16-bit modes, and try again. */
+
+    if (end_subject < start_match)
+      {
+      mb->check_subject = end_subject + 1;
+#if PCRE2_CODE_UNIT_WIDTH != 32
+      while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject))
+        mb->check_subject++;
+#endif
+      }
+
+    /* Otherwise, set the not end of line option, and do the match. */
+
+    else
+      {
+      fragment_options = PCRE2_NOTEOL;
+      break;
+      }
+    }
+  }
+#endif  /* SUPPORT_UNICODE */
+
+/* A NULL match context means "use a default context", but we take the memory
+control functions from the pattern. */

 if (mcontext == NULL)
  {
@@ -6224,8 +6440,8 @@ else mb->memctl = mcontext->memctl;
 anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0;
 firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
 startline = (re->flags & PCRE2_STARTLINE) != 0;
-bumpalong_limit =  (mcontext->offset_limit == PCRE2_UNSET)?
-  end_subject : subject + mcontext->offset_limit;
+bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)?
+  true_end_subject : subject + mcontext->offset_limit;

 /* Initialize and set up the fixed fields in the callout block, with a pointer
 in the match block. */
@@ -6236,7 +6452,8 @@ cb.subject = subject;
 cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
 cb.callout_flags = 0;

-/* Fill in the remaining fields in the match block. */
+/* Fill in the remaining fields in the match block, except for moptions, which
+gets set later. */

 mb->callout = mcontext->callout;
 mb->callout_data = mcontext->callout_data;
@@ -6245,13 +6462,11 @@ mb->start_subject = subject;
 mb->start_offset = start_offset;
 mb->end_subject = end_subject;
 mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0;
-
-mb->moptions = options;                 /* Match options */
-mb->poptions = re->overall_options;     /* Pattern options */
-
+mb->allowemptypartial = (re->max_lookbehind > 0) ||
+    (re->flags & PCRE2_MATCH_EMPTY) != 0;
+mb->poptions = re->overall_options;          /* Pattern options */
 mb->ignore_skip_arg = 0;
-mb->mark = mb->nomatch_mark = NULL;     /* In case never set */
-mb->hitend = FALSE;
+mb->mark = mb->nomatch_mark = NULL;          /* In case never set */

 /* The name table is needed for finding all the numbers associated with a
 given name, for condition testing. The code follows the name table. */
@@ -6404,6 +6619,13 @@ if ((re->flags & PCRE2_LASTSET) != 0)
 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
 the loop runs just once. */

+#ifdef SUPPORT_UNICODE
+FRAGMENT_RESTART:
+#endif
+
+start_partial = match_partial = NULL;
+mb->hitend = FALSE;
+
 for(;;)
  {
  PCRE2_SPTR new_start_match;
@@ -6473,7 +6695,10 @@ for(;;)
    /* Not anchored. Advance to a unique first code unit if there is one. In
    8-bit mode, the use of memchr() gives a big speed up, even though we have
    to call it twice in caseless mode, in order to find the earliest occurrence
-    of the character in either of its cases. */
+    of the character in either of its cases. If a call to memchr() that
+    searches the rest of the subject fails to find one case, remember that in
+    order not to keep on repeating the search. This can make a huge difference
+    when the strings are very long and only one case is present. */

    else
      {
@@ -6487,11 +6712,29 @@ for(;;)
                (smc = UCHAR21TEST(start_match)) != first_cu &&
                  smc != first_cu2)
            start_match++;
+
 #else  /* 8-bit code units */
-          PCRE2_SPTR pp1 =
-            memchr(start_match, first_cu, end_subject-start_match);
-          PCRE2_SPTR pp2 =
-            memchr(start_match, first_cu2, end_subject-start_match);
+          PCRE2_SPTR pp1 = NULL;
+          PCRE2_SPTR pp2 = NULL;
+          PCRE2_SIZE cu2size = end_subject - start_match;
+
+          if (!memchr_not_found_first_cu)
+            {
+            pp1 = memchr(start_match, first_cu, end_subject - start_match);
+            if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
+              else cu2size = pp1 - start_match;
+            }
+
+          /* If pp1 is not NULL, we have arranged to search only as far as pp1,
+          to see if the other case is earlier, so we can set "not found" only
+          when both searches have returned NULL. */
+
+          if (!memchr_not_found_first_cu2)
+            {
+            pp2 = memchr(start_match, first_cu2, cu2size);
+            memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
+            }
+
          if (pp1 == NULL)
            start_match = (pp2 == NULL)? end_subject : pp2;
          else
@@ -6523,7 +6766,7 @@ for(;;)
        we also let the cycle run, because the matching string is legitimately
        allowed to start with the first code unit of a newline. */

-        if (!mb->partial && start_match >= mb->end_subject)
+        if (mb->partial == 0 && start_match >= mb->end_subject)
          {
          rc = MATCH_NOMATCH;
          break;
@@ -6582,7 +6825,7 @@ for(;;)

        /* See comment above in first_cu checking about the next few lines. */

-        if (!mb->partial && start_match >= mb->end_subject)
+        if (mb->partial == 0 && start_match >= mb->end_subject)
          {
          rc = MATCH_NOMATCH;
          break;
@@ -6596,8 +6839,10 @@ for(;;)

    /* The following two optimizations must be disabled for partial matching. */

-    if (!mb->partial)
+    if (mb->partial == 0)
      {
+      PCRE2_SPTR p;
+
      /* The minimum matching length is a lower bound; no string of that length
      may actually match the pattern. Although the value is, strictly, in
      characters, we treat it as code units to avoid spending too much time in
@@ -6621,60 +6866,57 @@ for(;;)
      memchr() twice in the caseless case because we only need to check for the
      presence of the character in either case, not find the first occurrence.

+      The search can be skipped if the code unit was found later than the
+      current starting point in a previous iteration of the bumpalong loop.
+
      HOWEVER: when the subject string is very, very long, searching to its end
      can take a long time, and give bad performance on quite ordinary
-      patterns. This showed up when somebody was matching something like
-      /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
-      sufficiently long. */
+      anchored patterns. This showed up when somebody was matching something
+      like /^\d+C/ on a 32-megabyte string... so we don't do this when the
+      string is sufficiently long, but it's worth searching a lot more for
+      unanchored patterns. */

-      if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
+      p = start_match + (has_first_cu? 1:0);
+      if (has_req_cu && p > req_cu_ptr)
        {
-        PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
+        PCRE2_SIZE check_length = end_subject - start_match;

-        /* We don't need to repeat the search if we haven't yet reached the
-        place we found it last time round the bumpalong loop. */
-
-        if (p > req_cu_ptr)
+        if (check_length < REQ_CU_MAX ||
+              (!anchored && check_length < REQ_CU_MAX * 1000))
          {
-          if (p < end_subject)
+          if (req_cu != req_cu2)  /* Caseless */
            {
-            if (req_cu != req_cu2)  /* Caseless */
-              {
 #if PCRE2_CODE_UNIT_WIDTH != 8
-              do
-                {
-                uint32_t pp = UCHAR21INCTEST(p);
-                if (pp == req_cu || pp == req_cu2) { p--; break; }
-                }
-              while (p < end_subject);
-
-#else  /* 8-bit code units */
-              PCRE2_SPTR pp = p;
-              p = memchr(pp, req_cu, end_subject - pp);
-              if (p == NULL)
-                {
-                p = memchr(pp, req_cu2, end_subject - pp);
-                if (p == NULL) p = end_subject;
-                }
-#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
+            while (p < end_subject)
+              {
+              uint32_t pp = UCHAR21INCTEST(p);
+              if (pp == req_cu || pp == req_cu2) { p--; break; }
              }
-
-            /* The caseful case */
-
-            else
-              {
-#if PCRE2_CODE_UNIT_WIDTH != 8
-              do
-                {
-                if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
-                }
-              while (p < end_subject);
-
 #else  /* 8-bit code units */
-              p = memchr(p, req_cu, end_subject - p);
+            PCRE2_SPTR pp = p;
+            p = memchr(pp, req_cu, end_subject - pp);
+            if (p == NULL)
+              {
+              p = memchr(pp, req_cu2, end_subject - pp);
              if (p == NULL) p = end_subject;
-#endif
              }
+#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
+            }
+
+          /* The caseful case */
+
+          else
+            {
+#if PCRE2_CODE_UNIT_WIDTH != 8
+            while (p < end_subject)
+              {
+              if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
+              }
+
+#else  /* 8-bit code units */
+            p = memchr(p, req_cu, end_subject - p);
+            if (p == NULL) p = end_subject;
+#endif
            }

          /* If we can't find the required code unit, break the bumpalong loop,
@@ -6714,6 +6956,11 @@ for(;;)

  mb->start_used_ptr = start_match;
  mb->last_used_ptr = start_match;
+#ifdef SUPPORT_UNICODE
+  mb->moptions = options | fragment_options;
+#else
+  mb->moptions = options;
+#endif
  mb->match_call_count = 0;
  mb->end_offset_top = 0;
  mb->skip_arg_count = 0;
@@ -6839,6 +7086,68 @@ for(;;)

 ENDLOOP:

+/* If end_subject != true_end_subject, it means we are handling invalid UTF,
+and have just processed a non-terminal fragment. If this resulted in no match
+or a partial match we must carry on to the next fragment (a partial match is
+returned to the caller only at the very end of the subject). A loop is used to
+avoid trying to match against empty fragments; if the pattern can match an
+empty string it would have done so already. */
+
+#ifdef SUPPORT_UNICODE
+if (utf && end_subject != true_end_subject &&
+    (rc == MATCH_NOMATCH || rc == PCRE2_ERROR_PARTIAL))
+  {
+  for (;;)
+    {
+    /* Advance past the first bad code unit, and then skip invalid character
+    starting code units in 8-bit and 16-bit modes. */
+
+    start_match = end_subject + 1;
+#if PCRE2_CODE_UNIT_WIDTH != 32
+    while (start_match < true_end_subject && NOT_FIRSTCU(*start_match))
+      start_match++;
+#endif
+
+    /* If we have hit the end of the subject, there isn't another non-empty
+    fragment, so give up. */
+
+    if (start_match >= true_end_subject)
+      {
+      rc = MATCH_NOMATCH;  /* In case it was partial */
+      break;
+      }
+
+    /* Check the rest of the subject */
+
+    mb->check_subject = start_match;
+    rc = PRIV(valid_utf)(start_match, length - (start_match - subject),
+      &(match_data->startchar));
+
+    /* The rest of the subject is valid UTF. */
+
+    if (rc == 0)
+      {
+      mb->end_subject = end_subject = true_end_subject;
+      fragment_options = PCRE2_NOTBOL;
+      goto FRAGMENT_RESTART;
+      }
+
+    /* A subsequent UTF error has been found; if the next fragment is
+    non-empty, set up to process it. Otherwise, let the loop advance. */
+
+    else if (rc < 0)
+      {
+      mb->end_subject = end_subject = start_match + match_data->startchar;
+      if (end_subject > start_match)
+        {
+        fragment_options = PCRE2_NOTBOL|PCRE2_NOTEOL;
+        goto FRAGMENT_RESTART;
+        }
+      }
+    }
+  }
+#endif  /* SUPPORT_UNICODE */
+
 /* Release an enlarged frame vector that is on the heap. */

 if (mb->match_frames != mb->stack_frames)