pcre2: Update to upstream version 10.40

Changelog: https://github.com/PCRE2Project/pcre2/blob/pcre2-10.40/ChangeLog
2025-11-10 13:00:37 +00:00 · 2022-05-17 16:38:55 +02:00
parent d5c1de784c
commit fd6eb2c2d2
48 changed files with 13966 additions and 9395 deletions
--- a/thirdparty/pcre2/src/pcre2_ucp.h
+++ b/thirdparty/pcre2/src/pcre2_ucp.h
@@ -7,7 +7,11 @@ and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
-          New API code Copyright (c) 2016-2018 University of Cambridge
+          New API code Copyright (c) 2016-2022 University of Cambridge
+
+This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
+Instead, modify the maint/GenerateUcpHeader.py script and run it to generate
+a new version of this code.

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -38,31 +42,27 @@ POSSIBILITY OF SUCH DAMAGE.
 -----------------------------------------------------------------------------
 */

-
 #ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD
 #define PCRE2_UCP_H_IDEMPOTENT_GUARD

-/* This file contains definitions of the property values that are returned by
-the UCD access macros. New values that are added for new releases of Unicode
-should always be at the end of each enum, for backwards compatibility.
+/* This file contains definitions of the Unicode property values that are
+returned by the UCD access macros and used throughout PCRE2.

-IMPORTANT: Note also that the specific numeric values of the enums have to be
-the same as the values that are generated by the maint/MultiStage2.py script,
-where the equivalent property descriptive names are listed in vectors.
-
-ALSO: The specific values of the first two enums are assumed for the table
-called catposstab in pcre2_compile.c. */
+IMPORTANT: The specific values of the first two enums (general and particular
+character categories) are assumed by the table called catposstab in the file
+pcre2_auto_possess.c. They are unlikely to change, but should be checked after
+an update. */

 /* These are the general character categories. */

 enum {
-  ucp_C,     /* Other */
-  ucp_L,     /* Letter */
-  ucp_M,     /* Mark */
-  ucp_N,     /* Number */
-  ucp_P,     /* Punctuation */
-  ucp_S,     /* Symbol */
-  ucp_Z      /* Separator */
+  ucp_C,
+  ucp_L,
+  ucp_M,
+  ucp_N,
+  ucp_P,
+  ucp_S,
+  ucp_Z,
 };

 /* These are the particular character categories. */
@@ -97,7 +97,98 @@ enum {
  ucp_So,    /* Other symbol */
  ucp_Zl,    /* Line separator */
  ucp_Zp,    /* Paragraph separator */
-  ucp_Zs     /* Space separator */
+  ucp_Zs,    /* Space separator */
+};
+
+/* These are Boolean properties. */
+
+enum {
+  ucp_ASCII,
+  ucp_ASCII_Hex_Digit,
+  ucp_Alphabetic,
+  ucp_Bidi_Control,
+  ucp_Bidi_Mirrored,
+  ucp_Case_Ignorable,
+  ucp_Cased,
+  ucp_Changes_When_Casefolded,
+  ucp_Changes_When_Casemapped,
+  ucp_Changes_When_Lowercased,
+  ucp_Changes_When_Titlecased,
+  ucp_Changes_When_Uppercased,
+  ucp_Dash,
+  ucp_Default_Ignorable_Code_Point,
+  ucp_Deprecated,
+  ucp_Diacritic,
+  ucp_Emoji,
+  ucp_Emoji_Component,
+  ucp_Emoji_Modifier,
+  ucp_Emoji_Modifier_Base,
+  ucp_Emoji_Presentation,
+  ucp_Extended_Pictographic,
+  ucp_Extender,
+  ucp_Grapheme_Base,
+  ucp_Grapheme_Extend,
+  ucp_Grapheme_Link,
+  ucp_Hex_Digit,
+  ucp_IDS_Binary_Operator,
+  ucp_IDS_Trinary_Operator,
+  ucp_ID_Continue,
+  ucp_ID_Start,
+  ucp_Ideographic,
+  ucp_Join_Control,
+  ucp_Logical_Order_Exception,
+  ucp_Lowercase,
+  ucp_Math,
+  ucp_Noncharacter_Code_Point,
+  ucp_Pattern_Syntax,
+  ucp_Pattern_White_Space,
+  ucp_Prepended_Concatenation_Mark,
+  ucp_Quotation_Mark,
+  ucp_Radical,
+  ucp_Regional_Indicator,
+  ucp_Sentence_Terminal,
+  ucp_Soft_Dotted,
+  ucp_Terminal_Punctuation,
+  ucp_Unified_Ideograph,
+  ucp_Uppercase,
+  ucp_Variation_Selector,
+  ucp_White_Space,
+  ucp_XID_Continue,
+  ucp_XID_Start,
+  /* This must be last */
+  ucp_Bprop_Count
+};
+
+/* Size of entries in ucd_boolprop_sets[] */
+
+#define ucd_boolprop_sets_item_size 2
+
+/* These are the bidi class values. */
+
+enum {
+  ucp_bidiAL,   /* Arabic letter */
+  ucp_bidiAN,   /* Arabic number */
+  ucp_bidiB,    /* Paragraph separator */
+  ucp_bidiBN,   /* Boundary neutral */
+  ucp_bidiCS,   /* Common separator */
+  ucp_bidiEN,   /* European number */
+  ucp_bidiES,   /* European separator */
+  ucp_bidiET,   /* European terminator */
+  ucp_bidiFSI,  /* First strong isolate */
+  ucp_bidiL,    /* Left to right */
+  ucp_bidiLRE,  /* Left to right embedding */
+  ucp_bidiLRI,  /* Left to right isolate */
+  ucp_bidiLRO,  /* Left to right override */
+  ucp_bidiNSM,  /* Non-spacing mark */
+  ucp_bidiON,   /* Other neutral */
+  ucp_bidiPDF,  /* Pop directional format */
+  ucp_bidiPDI,  /* Pop directional isolate */
+  ucp_bidiR,    /* Right to left */
+  ucp_bidiRLE,  /* Right to left embedding */
+  ucp_bidiRLI,  /* Right to left isolate */
+  ucp_bidiRLO,  /* Right to left override */
+  ucp_bidiS,    /* Segment separator */
+  ucp_bidiWS,   /* White space */
 };

 /* These are grapheme break properties. The Extended Pictographic property
@@ -115,191 +206,189 @@ enum {
  ucp_gbT,                     /*  8 Hangul syllable type T */
  ucp_gbLV,                    /*  9 Hangul syllable type LV */
  ucp_gbLVT,                   /* 10 Hangul syllable type LVT */
-  ucp_gbRegionalIndicator,     /* 11 */
+  ucp_gbRegional_Indicator,    /* 11 */
  ucp_gbOther,                 /* 12 */
  ucp_gbZWJ,                   /* 13 */
-  ucp_gbExtended_Pictographic  /* 14 */
+  ucp_gbExtended_Pictographic, /* 14 */
 };

 /* These are the script identifications. */

 enum {
-  ucp_Unknown,
-  ucp_Arabic,
-  ucp_Armenian,
-  ucp_Bengali,
-  ucp_Bopomofo,
-  ucp_Braille,
-  ucp_Buginese,
-  ucp_Buhid,
-  ucp_Canadian_Aboriginal,
-  ucp_Cherokee,
-  ucp_Common,
-  ucp_Coptic,
-  ucp_Cypriot,
-  ucp_Cyrillic,
-  ucp_Deseret,
-  ucp_Devanagari,
-  ucp_Ethiopic,
-  ucp_Georgian,
-  ucp_Glagolitic,
-  ucp_Gothic,
-  ucp_Greek,
-  ucp_Gujarati,
-  ucp_Gurmukhi,
-  ucp_Han,
-  ucp_Hangul,
-  ucp_Hanunoo,
-  ucp_Hebrew,
-  ucp_Hiragana,
-  ucp_Inherited,
-  ucp_Kannada,
-  ucp_Katakana,
-  ucp_Kharoshthi,
-  ucp_Khmer,
-  ucp_Lao,
+  /* Scripts which have characters in other scripts. */
  ucp_Latin,
-  ucp_Limbu,
-  ucp_Linear_B,
-  ucp_Malayalam,
-  ucp_Mongolian,
-  ucp_Myanmar,
-  ucp_New_Tai_Lue,
-  ucp_Ogham,
-  ucp_Old_Italic,
-  ucp_Old_Persian,
-  ucp_Oriya,
-  ucp_Osmanya,
-  ucp_Runic,
-  ucp_Shavian,
-  ucp_Sinhala,
-  ucp_Syloti_Nagri,
+  ucp_Greek,
+  ucp_Cyrillic,
+  ucp_Arabic,
  ucp_Syriac,
-  ucp_Tagalog,
-  ucp_Tagbanwa,
-  ucp_Tai_Le,
+  ucp_Thaana,
+  ucp_Devanagari,
+  ucp_Bengali,
+  ucp_Gurmukhi,
+  ucp_Gujarati,
+  ucp_Oriya,
  ucp_Tamil,
  ucp_Telugu,
-  ucp_Thaana,
-  ucp_Thai,
-  ucp_Tibetan,
-  ucp_Tifinagh,
-  ucp_Ugaritic,
+  ucp_Kannada,
+  ucp_Malayalam,
+  ucp_Sinhala,
+  ucp_Myanmar,
+  ucp_Georgian,
+  ucp_Hangul,
+  ucp_Mongolian,
+  ucp_Hiragana,
+  ucp_Katakana,
+  ucp_Bopomofo,
+  ucp_Han,
  ucp_Yi,
-  /* New for Unicode 5.0 */
-  ucp_Balinese,
-  ucp_Cuneiform,
-  ucp_Nko,
+  ucp_Tagalog,
+  ucp_Hanunoo,
+  ucp_Buhid,
+  ucp_Tagbanwa,
+  ucp_Limbu,
+  ucp_Tai_Le,
+  ucp_Linear_B,
+  ucp_Cypriot,
+  ucp_Buginese,
+  ucp_Coptic,
+  ucp_Glagolitic,
+  ucp_Syloti_Nagri,
  ucp_Phags_Pa,
-  ucp_Phoenician,
-  /* New for Unicode 5.1 */
-  ucp_Carian,
-  ucp_Cham,
+  ucp_Nko,
  ucp_Kayah_Li,
-  ucp_Lepcha,
-  ucp_Lycian,
-  ucp_Lydian,
-  ucp_Ol_Chiki,
-  ucp_Rejang,
-  ucp_Saurashtra,
-  ucp_Sundanese,
-  ucp_Vai,
-  /* New for Unicode 5.2 */
-  ucp_Avestan,
-  ucp_Bamum,
-  ucp_Egyptian_Hieroglyphs,
-  ucp_Imperial_Aramaic,
-  ucp_Inscriptional_Pahlavi,
-  ucp_Inscriptional_Parthian,
  ucp_Javanese,
  ucp_Kaithi,
-  ucp_Lisu,
-  ucp_Meetei_Mayek,
-  ucp_Old_South_Arabian,
-  ucp_Old_Turkic,
-  ucp_Samaritan,
-  ucp_Tai_Tham,
-  ucp_Tai_Viet,
-  /* New for Unicode 6.0.0 */
-  ucp_Batak,
-  ucp_Brahmi,
  ucp_Mandaic,
-  /* New for Unicode 6.1.0 */
  ucp_Chakma,
-  ucp_Meroitic_Cursive,
-  ucp_Meroitic_Hieroglyphs,
-  ucp_Miao,
  ucp_Sharada,
-  ucp_Sora_Sompeng,
  ucp_Takri,
-  /* New for Unicode 7.0.0 */
-  ucp_Bassa_Vah,
-  ucp_Caucasian_Albanian,
  ucp_Duployan,
-  ucp_Elbasan,
  ucp_Grantha,
  ucp_Khojki,
-  ucp_Khudawadi,
  ucp_Linear_A,
  ucp_Mahajani,
  ucp_Manichaean,
-  ucp_Mende_Kikakui,
  ucp_Modi,
-  ucp_Mro,
-  ucp_Nabataean,
-  ucp_Old_North_Arabian,
  ucp_Old_Permic,
-  ucp_Pahawh_Hmong,
-  ucp_Palmyrene,
  ucp_Psalter_Pahlavi,
+  ucp_Khudawadi,
+  ucp_Tirhuta,
+  ucp_Multani,
+  ucp_Adlam,
+  ucp_Masaram_Gondi,
+  ucp_Dogra,
+  ucp_Gunjala_Gondi,
+  ucp_Hanifi_Rohingya,
+  ucp_Sogdian,
+  ucp_Nandinagari,
+  ucp_Yezidi,
+  ucp_Cypro_Minoan,
+  ucp_Old_Uyghur,
+
+  /* Scripts which have no characters in other scripts. */
+  ucp_Unknown,
+  ucp_Common,
+  ucp_Armenian,
+  ucp_Hebrew,
+  ucp_Thai,
+  ucp_Lao,
+  ucp_Tibetan,
+  ucp_Ethiopic,
+  ucp_Cherokee,
+  ucp_Canadian_Aboriginal,
+  ucp_Ogham,
+  ucp_Runic,
+  ucp_Khmer,
+  ucp_Old_Italic,
+  ucp_Gothic,
+  ucp_Deseret,
+  ucp_Inherited,
+  ucp_Ugaritic,
+  ucp_Shavian,
+  ucp_Osmanya,
+  ucp_Braille,
+  ucp_New_Tai_Lue,
+  ucp_Tifinagh,
+  ucp_Old_Persian,
+  ucp_Kharoshthi,
+  ucp_Balinese,
+  ucp_Cuneiform,
+  ucp_Phoenician,
+  ucp_Sundanese,
+  ucp_Lepcha,
+  ucp_Ol_Chiki,
+  ucp_Vai,
+  ucp_Saurashtra,
+  ucp_Rejang,
+  ucp_Lycian,
+  ucp_Carian,
+  ucp_Lydian,
+  ucp_Cham,
+  ucp_Tai_Tham,
+  ucp_Tai_Viet,
+  ucp_Avestan,
+  ucp_Egyptian_Hieroglyphs,
+  ucp_Samaritan,
+  ucp_Lisu,
+  ucp_Bamum,
+  ucp_Meetei_Mayek,
+  ucp_Imperial_Aramaic,
+  ucp_Old_South_Arabian,
+  ucp_Inscriptional_Parthian,
+  ucp_Inscriptional_Pahlavi,
+  ucp_Old_Turkic,
+  ucp_Batak,
+  ucp_Brahmi,
+  ucp_Meroitic_Cursive,
+  ucp_Meroitic_Hieroglyphs,
+  ucp_Miao,
+  ucp_Sora_Sompeng,
+  ucp_Caucasian_Albanian,
+  ucp_Bassa_Vah,
+  ucp_Elbasan,
+  ucp_Pahawh_Hmong,
+  ucp_Mende_Kikakui,
+  ucp_Mro,
+  ucp_Old_North_Arabian,
+  ucp_Nabataean,
+  ucp_Palmyrene,
  ucp_Pau_Cin_Hau,
  ucp_Siddham,
-  ucp_Tirhuta,
  ucp_Warang_Citi,
-  /* New for Unicode 8.0.0 */
  ucp_Ahom,
  ucp_Anatolian_Hieroglyphs,
  ucp_Hatran,
-  ucp_Multani,
  ucp_Old_Hungarian,
  ucp_SignWriting,
-  /* New for Unicode 10.0.0 (no update since 8.0.0) */
-  ucp_Adlam,
  ucp_Bhaiksuki,
  ucp_Marchen,
  ucp_Newa,
  ucp_Osage,
  ucp_Tangut,
-  ucp_Masaram_Gondi,
  ucp_Nushu,
  ucp_Soyombo,
  ucp_Zanabazar_Square,
-  /* New for Unicode 11.0.0 */
-  ucp_Dogra,
-  ucp_Gunjala_Gondi,
-  ucp_Hanifi_Rohingya,
  ucp_Makasar,
  ucp_Medefaidrin,
  ucp_Old_Sogdian,
-  ucp_Sogdian,
-  /* New for Unicode 12.0.0 */
  ucp_Elymaic,
-  ucp_Nandinagari,
  ucp_Nyiakeng_Puachue_Hmong,
  ucp_Wancho,
-  /* New for Unicode 13.0.0 */
  ucp_Chorasmian,
  ucp_Dives_Akuru,
  ucp_Khitan_Small_Script,
-  ucp_Yezidi,
-  /* New for Unicode 14.0.0 */
-  ucp_Cypro_Minoan,
-  ucp_Old_Uyghur,
  ucp_Tangsa,
  ucp_Toto,
-  ucp_Vithkuqi
+  ucp_Vithkuqi,
+
+  /* This must be last */
+  ucp_Script_Count
 };

+/* Size of entries in ucd_script_sets[] */
+
+#define ucd_script_sets_item_size 3
+
 #endif  /* PCRE2_UCP_H_IDEMPOTENT_GUARD */

 /* End of pcre2_ucp.h */