1
0
mirror of https://github.com/godotengine/godot.git synced 2025-11-30 16:26:50 +00:00

pcre2: Update to upstream version 10.40

Changelog: https://github.com/PCRE2Project/pcre2/blob/pcre2-10.40/ChangeLog
This commit is contained in:
Rémi Verschelde
2022-05-17 16:38:55 +02:00
parent d5c1de784c
commit fd6eb2c2d2
48 changed files with 13966 additions and 9395 deletions

View File

@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
New API code Copyright (c) 2016-2018 University of Cambridge
New API code Copyright (c) 2016-2021 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -68,26 +68,26 @@ Arguments:
Returns: TRUE if this is a valid script run
*/
/* These dummy values must be less than the negation of the largest offset in
the PRIV(ucd_script_sets) vector, which is held in a 16-bit field in UCD
records (and is only likely to be a few hundred). */
/* These are states in the checking process. */
#define SCRIPT_UNSET (-99999)
#define SCRIPT_HANPENDING (-99998)
#define SCRIPT_HANHIRAKATA (-99997)
#define SCRIPT_HANBOPOMOFO (-99996)
#define SCRIPT_HANHANGUL (-99995)
#define SCRIPT_LIST (-99994)
/* States of the script-run checker. The current state is held in
require_state, which starts out as SCRIPT_UNSET. */
enum { SCRIPT_UNSET, /* Requirement as yet unknown */
SCRIPT_MAP, /* Bitmap contains acceptable scripts */
SCRIPT_HANPENDING, /* Have had only Han characters */
SCRIPT_HANHIRAKATA, /* Expect Han, Hiragana, or Katakana */
SCRIPT_HANBOPOMOFO, /* Expect Han or Bopomofo */
SCRIPT_HANHANGUL /* Expect Han or Hangul */
};
/* Number of bytes in the intersection_list[] work vector. */
#define INTERSECTION_LIST_SIZE 50
/* Number of 32-bit words in a script extension bitmap as stored in
PRIV(ucd_script_sets); these maps only have bits for scripts whose number is
less than ucp_Unknown. */
#define UCD_MAPSIZE (ucp_Unknown/32 + 1)
/* Number of 32-bit words in a full-size bitmap that has a bit for every
script (used for require_map[] and map[]). */
#define FULL_MAPSIZE (ucp_Script_Count/32 + 1)
BOOL
PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
{
#ifdef SUPPORT_UNICODE
int require_script = SCRIPT_UNSET;
uint8_t intersection_list[INTERSECTION_LIST_SIZE];
const uint8_t *require_list = NULL;
uint32_t require_state = SCRIPT_UNSET;
uint32_t require_map[FULL_MAPSIZE];
uint32_t map[FULL_MAPSIZE];
uint32_t require_digitset = 0;
uint32_t c;
@@ -101,11 +101,17 @@ if (ptr >= endptr) return TRUE;
GETCHARINCTEST(c, ptr);
if (ptr >= endptr) return TRUE;
/* Initialize the require map. This is a full-size bitmap that has a bit for
every script, as opposed to the maps in ucd_script_sets, which only have bits
for scripts less than ucp_Unknown - those that appear in script extension
lists. */
for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
/* Scan strings of two or more characters, checking the Unicode characteristics
of each code point. We make use of the Script Extensions property. There is
special code for scripts that can be combined with characters from the Han
Chinese script. This may be used in conjunction with four other scripts in
these combinations:
of each code point. There is special code for scripts that can be combined with
characters from the Han Chinese script. This may be used in conjunction with
four other scripts in these combinations:
. Han with Hiragana and Katakana is allowed (for Japanese).
. Han with Bopomofo is allowed (for Taiwanese Mandarin).
@@ -119,310 +125,207 @@ Hence the SCRIPT_HANPENDING state. */
for (;;)
{
const ucd_record *ucd = GET_UCD(c);
int32_t scriptx = ucd->scriptx;
uint32_t script = ucd->script;
/* If the script extension is Unknown, the string is not a valid script run.
Such characters can only form script runs of length one. */
/* If the script is Unknown, the string is not a valid script run. Such
characters can only form script runs of length one (see test above). */
if (scriptx == ucp_Unknown) return FALSE;
if (script == ucp_Unknown) return FALSE;
/* A character whose script extension is Inherited is always accepted with
any script, and plays no further part in this testing. A character whose
script is Common is always accepted, but must still be tested for a digit
below. The scriptx value at this point is non-zero, because zero is
ucp_Unknown, tested for above. */
/* A character without any script extensions whose script is Inherited or
Common is always accepted with any script. If there are extensions, the
following processing happens for all scripts. */
if (scriptx != ucp_Inherited)
if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common))
{
if (scriptx != ucp_Common)
BOOL OK;
/* Set up a full-sized map for this character that can include bits for all
scripts. Copy the scriptx map for this character (which covers those
scripts that appear in script extension lists), set the remaining values to
zero, and then, except for Common or Inherited, add this script's bit to
the map. */
memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t));
memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
/* Handle the different checking states */
switch(require_state)
{
/* If the script extension value is positive, the character is not a mark
that can be used with many scripts. In the simple case we either set or
compare with the required script. However, handling the scripts that can
combine with Han is more complicated, as is the case when the previous
characters have been many-script marks. */
/* First significant character - it might follow Common or Inherited
characters that do not have any script extensions. */
if (scriptx > 0)
case SCRIPT_UNSET:
switch(script)
{
switch(require_script)
{
/* Either the first significant character (require_script unset) or
after only Han characters. */
case ucp_Han:
require_state = SCRIPT_HANPENDING;
break;
case SCRIPT_UNSET:
case SCRIPT_HANPENDING:
switch(scriptx)
{
case ucp_Han:
require_script = SCRIPT_HANPENDING;
break;
case ucp_Hiragana:
case ucp_Katakana:
require_state = SCRIPT_HANHIRAKATA;
break;
case ucp_Hiragana:
case ucp_Katakana:
require_script = SCRIPT_HANHIRAKATA;
break;
case ucp_Bopomofo:
require_state = SCRIPT_HANBOPOMOFO;
break;
case ucp_Bopomofo:
require_script = SCRIPT_HANBOPOMOFO;
break;
case ucp_Hangul:
require_state = SCRIPT_HANHANGUL;
break;
case ucp_Hangul:
require_script = SCRIPT_HANHANGUL;
break;
default:
memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
require_state = SCRIPT_MAP;
break;
}
break;
/* Not a Han-related script. If expecting one, fail. Otherwise set
the requirement to this script. */
/* The first significant character was Han. An inspection of the Unicode
11.0.0 files shows that there are the following types of Script Extension
list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
scripts:
default:
if (require_script == SCRIPT_HANPENDING) return FALSE;
require_script = scriptx;
break;
}
break;
. Bopomofo + Han
. Han + Hiragana + Katakana
. Hiragana + Katakana
. Bopomofo + Hangul + Han + Hiragana + Katakana
/* Previously encountered one of the "with Han" scripts. Check that
this character is appropriate. */
case SCRIPT_HANHIRAKATA:
if (scriptx != ucp_Han && scriptx != ucp_Hiragana &&
scriptx != ucp_Katakana)
return FALSE;
break;
case SCRIPT_HANBOPOMOFO:
if (scriptx != ucp_Han && scriptx != ucp_Bopomofo) return FALSE;
break;
case SCRIPT_HANHANGUL:
if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE;
break;
/* We have a list of scripts to check that is derived from one or
more previous characters. This is either one of the lists in
ucd_script_sets[] (for one previous character) or the intersection of
several lists for multiple characters. */
case SCRIPT_LIST:
{
const uint8_t *list;
for (list = require_list; *list != 0; list++)
{
if (*list == scriptx) break;
}
if (*list == 0) return FALSE;
}
/* The rest of the string must be in this script, but we have to
allow for the Han complications. */
switch(scriptx)
{
case ucp_Han:
require_script = SCRIPT_HANPENDING;
break;
case ucp_Hiragana:
case ucp_Katakana:
require_script = SCRIPT_HANHIRAKATA;
break;
case ucp_Bopomofo:
require_script = SCRIPT_HANBOPOMOFO;
break;
case ucp_Hangul:
require_script = SCRIPT_HANHANGUL;
break;
default:
require_script = scriptx;
break;
}
break;
/* This is the easy case when a single script is required. */
default:
if (scriptx != require_script) return FALSE;
break;
}
} /* End of handing positive scriptx */
/* If scriptx is negative, this character is a mark-type character that
has a list of permitted scripts. */
else
{
uint32_t chspecial;
const uint8_t *clist, *rlist;
const uint8_t *list = PRIV(ucd_script_sets) - scriptx;
switch(require_script)
{
case SCRIPT_UNSET:
require_list = PRIV(ucd_script_sets) - scriptx;
require_script = SCRIPT_LIST;
break;
/* An inspection of the Unicode 11.0.0 files shows that there are the
following types of Script Extension list that involve the Han,
Bopomofo, Hiragana, Katakana, and Hangul scripts:
. Bopomofo + Han
. Han + Hiragana + Katakana
. Hiragana + Katakana
. Bopomofo + Hangul + Han + Hiragana + Katakana
The following code tries to make sense of this. */
The following code tries to make sense of this. */
#define FOUND_BOPOMOFO 1
#define FOUND_HIRAGANA 2
#define FOUND_KATAKANA 4
#define FOUND_HANGUL 8
case SCRIPT_HANPENDING:
chspecial = 0;
for (; *list != 0; list++)
{
switch (*list)
{
case ucp_Bopomofo: chspecial |= FOUND_BOPOMOFO; break;
case ucp_Hiragana: chspecial |= FOUND_HIRAGANA; break;
case ucp_Katakana: chspecial |= FOUND_KATAKANA; break;
case ucp_Hangul: chspecial |= FOUND_HANGUL; break;
default: break;
}
}
if (chspecial == 0) return FALSE;
if (chspecial == FOUND_BOPOMOFO)
{
require_script = SCRIPT_HANBOPOMOFO;
}
else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
{
require_script = SCRIPT_HANHIRAKATA;
}
/* Otherwise it must be allowed with all of them, so remain in
the pending state. */
break;
case SCRIPT_HANHIRAKATA:
for (; *list != 0; list++)
{
if (*list == ucp_Hiragana || *list == ucp_Katakana) break;
}
if (*list == 0) return FALSE;
break;
case SCRIPT_HANBOPOMOFO:
for (; *list != 0; list++)
{
if (*list == ucp_Bopomofo) break;
}
if (*list == 0) return FALSE;
break;
case SCRIPT_HANHANGUL:
for (; *list != 0; list++)
{
if (*list == ucp_Hangul) break;
}
if (*list == 0) return FALSE;
break;
/* Previously encountered one or more characters that are allowed
with a list of scripts. Build the intersection of the required list
with this character's list in intersection_list[]. This code is
written so that it still works OK if the required list is already in
that vector. */
case SCRIPT_LIST:
{
int i = 0;
for (rlist = require_list; *rlist != 0; rlist++)
{
for (clist = list; *clist != 0; clist++)
{
if (*rlist == *clist)
{
intersection_list[i++] = *rlist;
break;
}
}
}
if (i == 0) return FALSE; /* No scripts in common */
/* If there's just one script in common, we can set it as the
unique required script. Otherwise, terminate the intersection list
and make it the required list. */
if (i == 1)
{
require_script = intersection_list[0];
}
else
{
intersection_list[i] = 0;
require_list = intersection_list;
}
}
break;
/* The previously set required script is a single script, not
Han-related. Check that it is in this character's list. */
default:
for (; *list != 0; list++)
{
if (*list == require_script) break;
}
if (*list == 0) return FALSE;
break;
}
} /* End of handling negative scriptx */
} /* End of checking non-Common character */
/* The character is in an acceptable script. We must now ensure that all
decimal digits in the string come from the same set. Some scripts (e.g.
Common, Arabic) have more than one set of decimal digits. This code does
not allow mixing sets, even within the same script. The vector called
PRIV(ucd_digit_sets)[] contains, in its first element, the number of
following elements, and then, in ascending order, the code points of the
'9' characters in every set of 10 digits. Each set is identified by the
offset in the vector of its '9' character. An initial check of the first
value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
if (ucd->chartype == ucp_Nd)
{
uint32_t digitset;
if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
case SCRIPT_HANPENDING:
if (script != ucp_Han) /* Another Han does nothing */
{
int mid;
int bot = 1;
int top = PRIV(ucd_digit_sets)[0];
for (;;)
uint32_t chspecial = 0;
if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL;
if (chspecial == 0) return FALSE; /* Not allowed with Han */
if (chspecial == FOUND_BOPOMOFO)
require_state = SCRIPT_HANBOPOMOFO;
else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
require_state = SCRIPT_HANHIRAKATA;
/* Otherwise this character must be allowed with all of them, so remain
in the pending state. */
}
break;
/* Previously encountered one of the "with Han" scripts. Check that
this character is appropriate. */
case SCRIPT_HANHIRAKATA:
if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
MAPBIT(map, ucp_Katakana) == 0) return FALSE;
break;
case SCRIPT_HANBOPOMOFO:
if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
break;
case SCRIPT_HANHANGUL:
if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
break;
/* Previously encountered one or more characters that are allowed with a
list of scripts. */
case SCRIPT_MAP:
OK = FALSE;
for (int i = 0; i < FULL_MAPSIZE; i++)
{
if ((require_map[i] & map[i]) != 0)
{
if (top <= bot + 1) /* <= rather than == is paranoia */
{
digitset = top;
break;
}
mid = (top + bot) / 2;
if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
OK = TRUE;
break;
}
}
/* A required value of 0 means "unset". */
if (!OK) return FALSE;
if (require_digitset == 0) require_digitset = digitset;
else if (digitset != require_digitset) return FALSE;
} /* End digit handling */
} /* End checking non-Inherited character */
/* The rest of the string must be in this script, but we have to
allow for the Han complications. */
switch(script)
{
case ucp_Han:
require_state = SCRIPT_HANPENDING;
break;
case ucp_Hiragana:
case ucp_Katakana:
require_state = SCRIPT_HANHIRAKATA;
break;
case ucp_Bopomofo:
require_state = SCRIPT_HANBOPOMOFO;
break;
case ucp_Hangul:
require_state = SCRIPT_HANHANGUL;
break;
/* Compute the intersection of the required list of scripts and the
allowed scripts for this character. */
default:
for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
break;
}
break;
}
} /* End checking character's script and extensions. */
/* The character is in an acceptable script. We must now ensure that all
decimal digits in the string come from the same set. Some scripts (e.g.
Common, Arabic) have more than one set of decimal digits. This code does
not allow mixing sets, even within the same script. The vector called
PRIV(ucd_digit_sets)[] contains, in its first element, the number of
following elements, and then, in ascending order, the code points of the
'9' characters in every set of 10 digits. Each set is identified by the
offset in the vector of its '9' character. An initial check of the first
value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
if (ucd->chartype == ucp_Nd)
{
uint32_t digitset;
if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
{
int mid;
int bot = 1;
int top = PRIV(ucd_digit_sets)[0];
for (;;)
{
if (top <= bot + 1) /* <= rather than == is paranoia */
{
digitset = top;
break;
}
mid = (top + bot) / 2;
if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
}
}
/* A required value of 0 means "unset". */
if (require_digitset == 0) require_digitset = digitset;
else if (digitset != require_digitset) return FALSE;
} /* End digit handling */
/* If we haven't yet got to the end, pick up the next character. */