You've already forked godot
mirror of
https://github.com/godotengine/godot.git
synced 2025-11-13 13:31:48 +00:00
pcre2: Update to upstream version 10.33
This commit is contained in:
441
thirdparty/pcre2/src/pcre2_script_run.c
vendored
Normal file
441
thirdparty/pcre2/src/pcre2_script_run.c
vendored
Normal file
@@ -0,0 +1,441 @@
|
||||
/*************************************************
|
||||
* Perl-Compatible Regular Expressions *
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Original API code Copyright (c) 1997-2012 University of Cambridge
|
||||
New API code Copyright (c) 2016-2018 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the University of Cambridge nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
-----------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/* This module contains the function for checking a script run. */
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "pcre2_internal.h"
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Check script run *
|
||||
*************************************************/
|
||||
|
||||
/* A script run is conceptually a sequence of characters all in the same
|
||||
Unicode script. However, it isn't quite that simple. There are special rules
|
||||
for scripts that are commonly used together, and also special rules for digits.
|
||||
This function implements the appropriate checks, which is possible only when
|
||||
PCRE2 is compiled with Unicode support. The function returns TRUE if there is
|
||||
no Unicode support; however, it should never be called in that circumstance
|
||||
because an error is given by pcre2_compile() if a script run is called for in a
|
||||
version of PCRE2 compiled without Unicode support.
|
||||
|
||||
Arguments:
|
||||
pgr point to the first character
|
||||
endptr point after the last character
|
||||
utf TRUE if in UTF mode
|
||||
|
||||
Returns: TRUE if this is a valid script run
|
||||
*/
|
||||
|
||||
/* These dummy values must be less than the negation of the largest offset in
|
||||
the PRIV(ucd_script_sets) vector, which is held in a 16-bit field in UCD
|
||||
records (and is only likely to be a few hundred). */
|
||||
|
||||
#define SCRIPT_UNSET (-99999)
|
||||
#define SCRIPT_HANPENDING (-99998)
|
||||
#define SCRIPT_HANHIRAKATA (-99997)
|
||||
#define SCRIPT_HANBOPOMOFO (-99996)
|
||||
#define SCRIPT_HANHANGUL (-99995)
|
||||
#define SCRIPT_LIST (-99994)
|
||||
|
||||
#define INTERSECTION_LIST_SIZE 50
|
||||
|
||||
BOOL
|
||||
PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
|
||||
{
|
||||
#ifdef SUPPORT_UNICODE
|
||||
int require_script = SCRIPT_UNSET;
|
||||
uint8_t intersection_list[INTERSECTION_LIST_SIZE];
|
||||
const uint8_t *require_list = NULL;
|
||||
uint32_t require_digitset = 0;
|
||||
uint32_t c;
|
||||
|
||||
#if PCRE2_CODE_UNIT_WIDTH == 32
|
||||
(void)utf; /* Avoid compiler warning */
|
||||
#endif
|
||||
|
||||
/* Any string containing fewer than 2 characters is a valid script run. */
|
||||
|
||||
if (ptr >= endptr) return TRUE;
|
||||
GETCHARINCTEST(c, ptr);
|
||||
if (ptr >= endptr) return TRUE;
|
||||
|
||||
/* Scan strings of two or more characters, checking the Unicode characteristics
|
||||
of each code point. We make use of the Script Extensions property. There is
|
||||
special code for scripts that can be combined with characters from the Han
|
||||
Chinese script. This may be used in conjunction with four other scripts in
|
||||
these combinations:
|
||||
|
||||
. Han with Hiragana and Katakana is allowed (for Japanese).
|
||||
. Han with Bopomofo is allowed (for Taiwanese Mandarin).
|
||||
. Han with Hangul is allowed (for Korean).
|
||||
|
||||
If the first significant character's script is one of the four, the required
|
||||
script type is immediately known. However, if the first significant
|
||||
character's script is Han, we have to keep checking for a non-Han character.
|
||||
Hence the SCRIPT_HANPENDING state. */
|
||||
|
||||
for (;;)
|
||||
{
|
||||
const ucd_record *ucd = GET_UCD(c);
|
||||
int32_t scriptx = ucd->scriptx;
|
||||
|
||||
/* If the script extension is Unknown, the string is not a valid script run.
|
||||
Such characters can only form script runs of length one. */
|
||||
|
||||
if (scriptx == ucp_Unknown) return FALSE;
|
||||
|
||||
/* A character whose script extension is Inherited is always accepted with
|
||||
any script, and plays no further part in this testing. A character whose
|
||||
script is Common is always accepted, but must still be tested for a digit
|
||||
below. The scriptx value at this point is non-zero, because zero is
|
||||
ucp_Unknown, tested for above. */
|
||||
|
||||
if (scriptx != ucp_Inherited)
|
||||
{
|
||||
if (scriptx != ucp_Common)
|
||||
{
|
||||
/* If the script extension value is positive, the character is not a mark
|
||||
that can be used with many scripts. In the simple case we either set or
|
||||
compare with the required script. However, handling the scripts that can
|
||||
combine with Han are more complicated, as is the case when the previous
|
||||
characters have been man-script marks. */
|
||||
|
||||
if (scriptx > 0)
|
||||
{
|
||||
switch(require_script)
|
||||
{
|
||||
/* Either the first significant character (require_script unset) or
|
||||
after only Han characters. */
|
||||
|
||||
case SCRIPT_UNSET:
|
||||
case SCRIPT_HANPENDING:
|
||||
switch(scriptx)
|
||||
{
|
||||
case ucp_Han:
|
||||
require_script = SCRIPT_HANPENDING;
|
||||
break;
|
||||
|
||||
case ucp_Hiragana:
|
||||
case ucp_Katakana:
|
||||
require_script = SCRIPT_HANHIRAKATA;
|
||||
break;
|
||||
|
||||
case ucp_Bopomofo:
|
||||
require_script = SCRIPT_HANBOPOMOFO;
|
||||
break;
|
||||
|
||||
case ucp_Hangul:
|
||||
require_script = SCRIPT_HANHANGUL;
|
||||
break;
|
||||
|
||||
/* Not a Han-related script. If expecting one, fail. Otherise set
|
||||
the requirement to this script. */
|
||||
|
||||
default:
|
||||
if (require_script == SCRIPT_HANPENDING) return FALSE;
|
||||
require_script = scriptx;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
/* Previously encountered one of the "with Han" scripts. Check that
|
||||
this character is appropriate. */
|
||||
|
||||
case SCRIPT_HANHIRAKATA:
|
||||
if (scriptx != ucp_Han && scriptx != ucp_Hiragana &&
|
||||
scriptx != ucp_Katakana)
|
||||
return FALSE;
|
||||
break;
|
||||
|
||||
case SCRIPT_HANBOPOMOFO:
|
||||
if (scriptx != ucp_Han && scriptx != ucp_Bopomofo) return FALSE;
|
||||
break;
|
||||
|
||||
case SCRIPT_HANHANGUL:
|
||||
if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE;
|
||||
break;
|
||||
|
||||
/* We have a list of scripts to check that is derived from one or
|
||||
more previous characters. This is either one of the lists in
|
||||
ucd_script_sets[] (for one previous character) or the intersection of
|
||||
several lists for multiple characters. */
|
||||
|
||||
case SCRIPT_LIST:
|
||||
{
|
||||
const uint8_t *list;
|
||||
for (list = require_list; *list != 0; list++)
|
||||
{
|
||||
if (*list == scriptx) break;
|
||||
}
|
||||
if (*list == 0) return FALSE;
|
||||
}
|
||||
|
||||
/* The rest of the string must be in this script, but we have to
|
||||
allow for the Han complications. */
|
||||
|
||||
switch(scriptx)
|
||||
{
|
||||
case ucp_Han:
|
||||
require_script = SCRIPT_HANPENDING;
|
||||
break;
|
||||
|
||||
case ucp_Hiragana:
|
||||
case ucp_Katakana:
|
||||
require_script = SCRIPT_HANHIRAKATA;
|
||||
break;
|
||||
|
||||
case ucp_Bopomofo:
|
||||
require_script = SCRIPT_HANBOPOMOFO;
|
||||
break;
|
||||
|
||||
case ucp_Hangul:
|
||||
require_script = SCRIPT_HANHANGUL;
|
||||
break;
|
||||
|
||||
default:
|
||||
require_script = scriptx;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
/* This is the easy case when a single script is required. */
|
||||
|
||||
default:
|
||||
if (scriptx != require_script) return FALSE;
|
||||
break;
|
||||
}
|
||||
} /* End of handing positive scriptx */
|
||||
|
||||
/* If scriptx is negative, this character is a mark-type character that
|
||||
has a list of permitted scripts. */
|
||||
|
||||
else
|
||||
{
|
||||
uint32_t chspecial;
|
||||
const uint8_t *clist, *rlist;
|
||||
const uint8_t *list = PRIV(ucd_script_sets) - scriptx;
|
||||
|
||||
switch(require_script)
|
||||
{
|
||||
case SCRIPT_UNSET:
|
||||
require_list = PRIV(ucd_script_sets) - scriptx;
|
||||
require_script = SCRIPT_LIST;
|
||||
break;
|
||||
|
||||
/* An inspection of the Unicode 11.0.0 files shows that there are the
|
||||
following types of Script Extension list that involve the Han,
|
||||
Bopomofo, Hiragana, Katakana, and Hangul scripts:
|
||||
|
||||
. Bopomofo + Han
|
||||
. Han + Hiragana + Katakana
|
||||
. Hiragana + Katakana
|
||||
. Bopopmofo + Hangul + Han + Hiragana + Katakana
|
||||
|
||||
The following code tries to make sense of this. */
|
||||
|
||||
#define FOUND_BOPOMOFO 1
|
||||
#define FOUND_HIRAGANA 2
|
||||
#define FOUND_KATAKANA 4
|
||||
#define FOUND_HANGUL 8
|
||||
|
||||
case SCRIPT_HANPENDING:
|
||||
chspecial = 0;
|
||||
for (; *list != 0; list++)
|
||||
{
|
||||
switch (*list)
|
||||
{
|
||||
case ucp_Bopomofo: chspecial |= FOUND_BOPOMOFO; break;
|
||||
case ucp_Hiragana: chspecial |= FOUND_HIRAGANA; break;
|
||||
case ucp_Katakana: chspecial |= FOUND_KATAKANA; break;
|
||||
case ucp_Hangul: chspecial |= FOUND_HANGUL; break;
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
|
||||
if (chspecial == 0) return FALSE;
|
||||
|
||||
if (chspecial == FOUND_BOPOMOFO)
|
||||
{
|
||||
require_script = SCRIPT_HANBOPOMOFO;
|
||||
}
|
||||
else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
|
||||
{
|
||||
require_script = SCRIPT_HANHIRAKATA;
|
||||
}
|
||||
|
||||
/* Otherwise it must be allowed with all of them, so remain in
|
||||
the pending state. */
|
||||
|
||||
break;
|
||||
|
||||
case SCRIPT_HANHIRAKATA:
|
||||
for (; *list != 0; list++)
|
||||
{
|
||||
if (*list == ucp_Hiragana || *list == ucp_Katakana) break;
|
||||
}
|
||||
if (*list == 0) return FALSE;
|
||||
break;
|
||||
|
||||
case SCRIPT_HANBOPOMOFO:
|
||||
for (; *list != 0; list++)
|
||||
{
|
||||
if (*list == ucp_Bopomofo) break;
|
||||
}
|
||||
if (*list == 0) return FALSE;
|
||||
break;
|
||||
|
||||
case SCRIPT_HANHANGUL:
|
||||
for (; *list != 0; list++)
|
||||
{
|
||||
if (*list == ucp_Hangul) break;
|
||||
}
|
||||
if (*list == 0) return FALSE;
|
||||
break;
|
||||
|
||||
/* Previously encountered one or more characters that are allowed
|
||||
with a list of scripts. Build the intersection of the required list
|
||||
with this character's list in intersection_list[]. This code is
|
||||
written so that it still works OK if the required list is already in
|
||||
that vector. */
|
||||
|
||||
case SCRIPT_LIST:
|
||||
{
|
||||
int i = 0;
|
||||
for (rlist = require_list; *rlist != 0; rlist++)
|
||||
{
|
||||
for (clist = list; *clist != 0; clist++)
|
||||
{
|
||||
if (*rlist == *clist)
|
||||
{
|
||||
intersection_list[i++] = *rlist;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (i == 0) return FALSE; /* No scripts in common */
|
||||
|
||||
/* If there's just one script in common, we can set it as the
|
||||
unique required script. Otherwise, terminate the intersection list
|
||||
and make it the required list. */
|
||||
|
||||
if (i == 1)
|
||||
{
|
||||
require_script = intersection_list[0];
|
||||
}
|
||||
else
|
||||
{
|
||||
intersection_list[i] = 0;
|
||||
require_list = intersection_list;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
/* The previously set required script is a single script, not
|
||||
Han-related. Check that it is in this character's list. */
|
||||
|
||||
default:
|
||||
for (; *list != 0; list++)
|
||||
{
|
||||
if (*list == require_script) break;
|
||||
}
|
||||
if (*list == 0) return FALSE;
|
||||
break;
|
||||
}
|
||||
} /* End of handling negative scriptx */
|
||||
} /* End of checking non-Common character */
|
||||
|
||||
/* The character is in an acceptable script. We must now ensure that all
|
||||
decimal digits in the string come from the same set. Some scripts (e.g.
|
||||
Common, Arabic) have more than one set of decimal digits. This code does
|
||||
not allow mixing sets, even within the same script. The vector called
|
||||
PRIV(ucd_digit_sets)[] contains, in its first element, the number of
|
||||
following elements, and then, in ascending order, the code points of the
|
||||
'9' characters in every set of 10 digits. Each set is identified by the
|
||||
offset in the vector of its '9' character. An initial check of the first
|
||||
value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
|
||||
|
||||
if (ucd->chartype == ucp_Nd)
|
||||
{
|
||||
uint32_t digitset;
|
||||
|
||||
if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
|
||||
{
|
||||
int mid;
|
||||
int bot = 1;
|
||||
int top = PRIV(ucd_digit_sets)[0];
|
||||
for (;;)
|
||||
{
|
||||
if (top <= bot + 1) /* <= rather than == is paranoia */
|
||||
{
|
||||
digitset = top;
|
||||
break;
|
||||
}
|
||||
mid = (top + bot) / 2;
|
||||
if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
|
||||
}
|
||||
}
|
||||
|
||||
/* A required value of 0 means "unset". */
|
||||
|
||||
if (require_digitset == 0) require_digitset = digitset;
|
||||
else if (digitset != require_digitset) return FALSE;
|
||||
} /* End digit handling */
|
||||
} /* End checking non-Inherited character */
|
||||
|
||||
/* If we haven't yet got to the end, pick up the next character. */
|
||||
|
||||
if (ptr >= endptr) return TRUE;
|
||||
GETCHARINCTEST(c, ptr);
|
||||
} /* End checking loop */
|
||||
|
||||
#else /* NOT SUPPORT_UNICODE */
|
||||
(void)ptr;
|
||||
(void)endptr;
|
||||
(void)utf;
|
||||
return TRUE;
|
||||
#endif /* SUPPORT_UNICODE */
|
||||
}
|
||||
|
||||
/* End of pcre2_script_run.c */
|
||||
Reference in New Issue
Block a user