pcre2: Update to upstream version 10.40

Changelog: https://github.com/PCRE2Project/pcre2/blob/pcre2-10.40/ChangeLog
2025-11-30 16:26:50 +00:00 · 2022-05-17 16:38:55 +02:00
parent d5c1de784c
commit fd6eb2c2d2
48 changed files with 13966 additions and 9395 deletions
--- a/thirdparty/pcre2/src/pcre2_script_run.c
+++ b/thirdparty/pcre2/src/pcre2_script_run.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2018 University of Cambridge
+          New API code Copyright (c) 2016-2021 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -68,26 +68,26 @@ Arguments:
 Returns:    TRUE if this is a valid script run
 */

-/* These dummy values must be less than the negation of the largest offset in
-the PRIV(ucd_script_sets) vector, which is held in a 16-bit field in UCD
-records (and is only likely to be a few hundred). */
+/* These are states in the checking process. */

-#define SCRIPT_UNSET        (-99999)
-#define SCRIPT_HANPENDING   (-99998)
-#define SCRIPT_HANHIRAKATA  (-99997)
-#define SCRIPT_HANBOPOMOFO  (-99996)
-#define SCRIPT_HANHANGUL    (-99995)
-#define SCRIPT_LIST         (-99994)
+enum { SCRIPT_UNSET,          /* Requirement as yet unknown */
+       SCRIPT_MAP,            /* Bitmap contains acceptable scripts */
+       SCRIPT_HANPENDING,     /* Have had only Han characters */
+       SCRIPT_HANHIRAKATA,    /* Expect Han, Hiragana, or Katakana */
+       SCRIPT_HANBOPOMOFO,    /* Expect Han or Bopomofo */
+       SCRIPT_HANHANGUL       /* Expect Han or Hangul */
+       };

-#define INTERSECTION_LIST_SIZE 50
+#define UCD_MAPSIZE (ucp_Unknown/32 + 1)
+#define FULL_MAPSIZE (ucp_Script_Count/32 + 1)

 BOOL
 PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
 {
 #ifdef SUPPORT_UNICODE
-int require_script = SCRIPT_UNSET;
-uint8_t intersection_list[INTERSECTION_LIST_SIZE];
-const uint8_t *require_list = NULL;
+uint32_t require_state = SCRIPT_UNSET;
+uint32_t require_map[FULL_MAPSIZE];
+uint32_t map[FULL_MAPSIZE];
 uint32_t require_digitset = 0;
 uint32_t c;

@@ -101,11 +101,17 @@ if (ptr >= endptr) return TRUE;
 GETCHARINCTEST(c, ptr);
 if (ptr >= endptr) return TRUE;

+/* Initialize the require map. This is a full-size bitmap that has a bit for
+every script, as opposed to the maps in ucd_script_sets, which only have bits
+for scripts less than ucp_Unknown - those that appear in script extension
+lists. */
+
+for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
+
 /* Scan strings of two or more characters, checking the Unicode characteristics
-of each code point. We make use of the Script Extensions property. There is
-special code for scripts that can be combined with characters from the Han
-Chinese script. This may be used in conjunction with four other scripts in
-these combinations:
+of each code point. There is special code for scripts that can be combined with
+characters from the Han Chinese script. This may be used in conjunction with
+four other scripts in these combinations:

 . Han with Hiragana and Katakana is allowed (for Japanese).
 . Han with Bopomofo is allowed (for Taiwanese Mandarin).
@@ -119,310 +125,207 @@ Hence the SCRIPT_HANPENDING state. */
 for (;;)
  {
  const ucd_record *ucd = GET_UCD(c);
-  int32_t scriptx = ucd->scriptx;
+  uint32_t script = ucd->script;

-  /* If the script extension is Unknown, the string is not a valid script run.
-  Such characters can only form script runs of length one. */
+  /* If the script is Unknown, the string is not a valid script run. Such
+  characters can only form script runs of length one (see test above). */

-  if (scriptx == ucp_Unknown) return FALSE;
+  if (script == ucp_Unknown) return FALSE;

-  /* A character whose script extension is Inherited is always accepted with
-  any script, and plays no further part in this testing. A character whose
-  script is Common is always accepted, but must still be tested for a digit
-  below. The scriptx value at this point is non-zero, because zero is
-  ucp_Unknown, tested for above. */
+  /* A character without any script extensions whose script is Inherited or
+  Common is always accepted with any script. If there are extensions, the
+  following processing happens for all scripts. */

-  if (scriptx != ucp_Inherited)
+  if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common))
    {
-    if (scriptx != ucp_Common)
+    BOOL OK;
+
+    /* Set up a full-sized map for this character that can include bits for all
+    scripts. Copy the scriptx map for this character (which covers those
+    scripts that appear in script extension lists), set the remaining values to
+    zero, and then, except for Common or Inherited, add this script's bit to
+    the map. */
+
+    memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t));
+    memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
+    if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
+
+    /* Handle the different checking states */
+
+    switch(require_state)
      {
-      /* If the script extension value is positive, the character is not a mark
-      that can be used with many scripts. In the simple case we either set or
-      compare with the required script. However, handling the scripts that can
-      combine with Han are more complicated, as is the case when the previous
-      characters have been man-script marks. */
+      /* First significant character - it might follow Common or Inherited
+      characters that do not have any script extensions. */

-      if (scriptx > 0)
+      case SCRIPT_UNSET:
+      switch(script)
        {
-        switch(require_script)
-          {
-          /* Either the first significant character (require_script unset) or
-          after only Han characters. */
+        case ucp_Han:
+        require_state = SCRIPT_HANPENDING;
+        break;

-          case SCRIPT_UNSET:
-          case SCRIPT_HANPENDING:
-          switch(scriptx)
-            {
-            case ucp_Han:
-            require_script = SCRIPT_HANPENDING;
-            break;
+        case ucp_Hiragana:
+        case ucp_Katakana:
+        require_state = SCRIPT_HANHIRAKATA;
+        break;

-            case ucp_Hiragana:
-            case ucp_Katakana:
-            require_script = SCRIPT_HANHIRAKATA;
-            break;
+        case ucp_Bopomofo:
+        require_state = SCRIPT_HANBOPOMOFO;
+        break;

-            case ucp_Bopomofo:
-            require_script = SCRIPT_HANBOPOMOFO;
-            break;
+        case ucp_Hangul:
+        require_state = SCRIPT_HANHANGUL;
+        break;

-            case ucp_Hangul:
-            require_script = SCRIPT_HANHANGUL;
-            break;
+        default:
+        memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
+        require_state = SCRIPT_MAP;
+        break;
+        }
+      break;

-            /* Not a Han-related script. If expecting one, fail. Otherise set
-            the requirement to this script. */
+      /* The first significant character was Han. An inspection of the Unicode
+      11.0.0 files shows that there are the following types of Script Extension
+      list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
+      scripts:

-            default:
-            if (require_script == SCRIPT_HANPENDING) return FALSE;
-            require_script = scriptx;
-            break;
-            }
-          break;
+      . Bopomofo + Han
+      . Han + Hiragana + Katakana
+      . Hiragana + Katakana
+      . Bopomofo + Hangul + Han + Hiragana + Katakana

-          /* Previously encountered one of the "with Han" scripts. Check that
-          this character is appropriate. */
-
-          case SCRIPT_HANHIRAKATA:
-          if (scriptx != ucp_Han && scriptx != ucp_Hiragana && 
-              scriptx != ucp_Katakana)
-            return FALSE;
-          break;
-
-          case SCRIPT_HANBOPOMOFO:
-          if (scriptx != ucp_Han && scriptx != ucp_Bopomofo) return FALSE;
-          break;
-
-          case SCRIPT_HANHANGUL:
-          if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE;
-          break;
-
-          /* We have a list of scripts to check that is derived from one or
-          more previous characters. This is either one of the lists in
-          ucd_script_sets[] (for one previous character) or the intersection of
-          several lists for multiple characters. */
-
-          case SCRIPT_LIST:
-            {
-            const uint8_t *list;
-            for (list = require_list; *list != 0; list++)
-              {
-              if (*list == scriptx) break;
-              }
-            if (*list == 0) return FALSE;
-            }
-
-          /* The rest of the string must be in this script, but we have to 
-          allow for the Han complications. */
-          
-          switch(scriptx)
-            {
-            case ucp_Han:
-            require_script = SCRIPT_HANPENDING;
-            break;
-
-            case ucp_Hiragana:
-            case ucp_Katakana:
-            require_script = SCRIPT_HANHIRAKATA;
-            break;
-
-            case ucp_Bopomofo:
-            require_script = SCRIPT_HANBOPOMOFO;
-            break;
-
-            case ucp_Hangul:
-            require_script = SCRIPT_HANHANGUL;
-            break;
-
-            default:
-            require_script = scriptx;
-            break;
-            }  
-          break;
-
-          /* This is the easy case when a single script is required. */
-
-          default:
-          if (scriptx != require_script) return FALSE;
-          break;
-          }
-        }  /* End of handing positive scriptx */
-
-      /* If scriptx is negative, this character is a mark-type character that
-      has a list of permitted scripts. */
-
-      else
-        {
-        uint32_t chspecial;
-        const uint8_t *clist, *rlist;
-        const uint8_t *list = PRIV(ucd_script_sets) - scriptx;
-        
-        switch(require_script)
-          {
-          case SCRIPT_UNSET:
-          require_list = PRIV(ucd_script_sets) - scriptx;
-          require_script = SCRIPT_LIST;
-          break;
-
-          /* An inspection of the Unicode 11.0.0 files shows that there are the
-          following types of Script Extension list that involve the Han,
-          Bopomofo, Hiragana, Katakana, and Hangul scripts:
-
-          . Bopomofo + Han
-          . Han + Hiragana + Katakana
-          . Hiragana + Katakana
-          . Bopopmofo + Hangul + Han + Hiragana + Katakana
-
-          The following code tries to make sense of this. */
+      The following code tries to make sense of this. */

 #define FOUND_BOPOMOFO 1
 #define FOUND_HIRAGANA 2
 #define FOUND_KATAKANA 4
 #define FOUND_HANGUL   8

-          case SCRIPT_HANPENDING:
-          chspecial = 0;
-          for (; *list != 0; list++)
-            {
-            switch (*list)
-              {
-              case ucp_Bopomofo: chspecial |= FOUND_BOPOMOFO; break;
-              case ucp_Hiragana: chspecial |= FOUND_HIRAGANA; break;
-              case ucp_Katakana: chspecial |= FOUND_KATAKANA; break;
-              case ucp_Hangul:   chspecial |= FOUND_HANGUL; break;
-              default: break;
-              }
-            }
-
-           if (chspecial == 0) return FALSE;
-
-           if (chspecial == FOUND_BOPOMOFO)
-             {
-             require_script = SCRIPT_HANBOPOMOFO;
-             }
-           else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
-             {
-             require_script = SCRIPT_HANHIRAKATA;
-             }
-
-          /* Otherwise it must be allowed with all of them, so remain in
-          the pending state. */
-
-          break;
-
-          case SCRIPT_HANHIRAKATA:
-          for (; *list != 0; list++)
-            {
-            if (*list == ucp_Hiragana || *list == ucp_Katakana) break;
-            }
-          if (*list == 0) return FALSE;
-          break;
-
-          case SCRIPT_HANBOPOMOFO:
-          for (; *list != 0; list++)
-            {
-            if (*list == ucp_Bopomofo) break;
-            }
-          if (*list == 0) return FALSE;
-          break;
-
-          case SCRIPT_HANHANGUL:
-          for (; *list != 0; list++)
-            {
-            if (*list == ucp_Hangul) break;
-            }
-          if (*list == 0) return FALSE;
-          break;
-
-          /* Previously encountered one or more characters that are allowed
-          with a list of scripts. Build the intersection of the required list
-          with this character's list in intersection_list[]. This code is
-          written so that it still works OK if the required list is already in
-          that vector. */
-
-          case SCRIPT_LIST:
-            {
-            int i = 0;
-            for (rlist = require_list; *rlist != 0; rlist++)
-              {
-              for (clist = list; *clist != 0; clist++)
-                {
-                if (*rlist == *clist)
-                  {
-                  intersection_list[i++] = *rlist;
-                  break;
-                  }
-                }
-              }
-            if (i == 0) return FALSE;  /* No scripts in common */
-
-            /* If there's just one script in common, we can set it as the
-            unique required script. Otherwise, terminate the intersection list
-            and make it the required list. */
-
-            if (i == 1)
-              {
-              require_script = intersection_list[0];
-              }
-            else
-              {
-              intersection_list[i] = 0;
-              require_list = intersection_list;
-              }
-            }
-          break;
-
-          /* The previously set required script is a single script, not
-          Han-related. Check that it is in this character's list. */
-
-          default:
-          for (; *list != 0; list++)
-            {
-            if (*list == require_script) break;
-            }
-          if (*list == 0) return FALSE;
-          break;
-          }
-        }  /* End of handling negative scriptx */
-      }    /* End of checking non-Common character */
-
-    /* The character is in an acceptable script. We must now ensure that all
-    decimal digits in the string come from the same set. Some scripts (e.g.
-    Common, Arabic) have more than one set of decimal digits. This code does
-    not allow mixing sets, even within the same script. The vector called
-    PRIV(ucd_digit_sets)[] contains, in its first element, the number of
-    following elements, and then, in ascending order, the code points of the
-    '9' characters in every set of 10 digits. Each set is identified by the
-    offset in the vector of its '9' character. An initial check of the first
-    value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
-
-    if (ucd->chartype == ucp_Nd)
-      {
-      uint32_t digitset;
-
-      if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
+      case SCRIPT_HANPENDING:
+      if (script != ucp_Han)   /* Another Han does nothing */
        {
-        int mid;
-        int bot = 1;
-        int top = PRIV(ucd_digit_sets)[0];
-        for (;;)
+        uint32_t chspecial = 0;
+
+        if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
+        if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
+        if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
+        if (MAPBIT(map, ucp_Hangul) != 0)   chspecial |= FOUND_HANGUL;
+
+        if (chspecial == 0) return FALSE;   /* Not allowed with Han */
+
+        if (chspecial == FOUND_BOPOMOFO)
+          require_state = SCRIPT_HANBOPOMOFO;
+        else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
+          require_state = SCRIPT_HANHIRAKATA;
+
+        /* Otherwise this character must be allowed with all of them, so remain
+        in the pending state. */
+        }
+      break;
+
+      /* Previously encountered one of the "with Han" scripts. Check that
+      this character is appropriate. */
+
+      case SCRIPT_HANHIRAKATA:
+      if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
+          MAPBIT(map, ucp_Katakana) == 0) return FALSE;
+      break;
+
+      case SCRIPT_HANBOPOMOFO:
+      if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
+      break;
+
+      case SCRIPT_HANHANGUL:
+      if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
+      break;
+
+      /* Previously encountered one or more characters that are allowed with a
+      list of scripts. */
+
+      case SCRIPT_MAP:
+      OK = FALSE;
+
+      for (int i = 0; i < FULL_MAPSIZE; i++)
+        {
+        if ((require_map[i] & map[i]) != 0)
          {
-          if (top <= bot + 1)    /* <= rather than == is paranoia */
-            {
-            digitset = top;
-            break;
-            }
-          mid = (top + bot) / 2;
-          if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
+          OK = TRUE;
+          break;
          }
        }

-      /* A required value of 0 means "unset". */
+      if (!OK) return FALSE;

-      if (require_digitset == 0) require_digitset = digitset;
-        else if (digitset != require_digitset) return FALSE;
-      }   /* End digit handling */
-    }     /* End checking non-Inherited character */
+      /* The rest of the string must be in this script, but we have to
+      allow for the Han complications. */
+
+      switch(script)
+        {
+        case ucp_Han:
+        require_state = SCRIPT_HANPENDING;
+        break;
+
+        case ucp_Hiragana:
+        case ucp_Katakana:
+        require_state = SCRIPT_HANHIRAKATA;
+        break;
+
+        case ucp_Bopomofo:
+        require_state = SCRIPT_HANBOPOMOFO;
+        break;
+
+        case ucp_Hangul:
+        require_state = SCRIPT_HANHANGUL;
+        break;
+
+        /* Compute the intersection of the required list of scripts and the
+        allowed scripts for this character. */
+
+        default:
+        for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
+        break;
+        }
+
+      break;
+      }
+    }   /* End checking character's script and extensions. */
+
+  /* The character is in an acceptable script. We must now ensure that all
+  decimal digits in the string come from the same set. Some scripts (e.g.
+  Common, Arabic) have more than one set of decimal digits. This code does
+  not allow mixing sets, even within the same script. The vector called
+  PRIV(ucd_digit_sets)[] contains, in its first element, the number of
+  following elements, and then, in ascending order, the code points of the
+  '9' characters in every set of 10 digits. Each set is identified by the
+  offset in the vector of its '9' character. An initial check of the first
+  value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
+
+  if (ucd->chartype == ucp_Nd)
+    {
+    uint32_t digitset;
+
+    if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
+      {
+      int mid;
+      int bot = 1;
+      int top = PRIV(ucd_digit_sets)[0];
+      for (;;)
+        {
+        if (top <= bot + 1)    /* <= rather than == is paranoia */
+          {
+          digitset = top;
+          break;
+          }
+        mid = (top + bot) / 2;
+        if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
+        }
+      }
+
+    /* A required value of 0 means "unset". */
+
+    if (require_digitset == 0) require_digitset = digitset;
+      else if (digitset != require_digitset) return FALSE;
+    }   /* End digit handling */

  /* If we haven't yet got to the end, pick up the next character. */