diff --git a/BUILD.gn b/BUILD.gn
index e4ba69032b1afd2bdb7d5e02320a2d97d7763452..c41223a8780b4ffce90cdd4632c29c2851cbf305 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -110,6 +110,10 @@ ohos_shared_library("libpcre2") {
"updater",
]
license_file = "$PCRE2_LIB_DIR/LICENCE"
- part_name = "selinux"
- subsystem_name = "security"
+ innerapi_tags = [
+ "platformsdk_indirect",
+ "chipsetsdk_indirect",
+ ]
+ part_name = "pcre2"
+ subsystem_name = "thirdparty"
}
diff --git a/README.OpenSource b/README.OpenSource
index a0aa847d3a236aa40131e35f6267c6399d7b4058..cb1360c42d17a9be21f868927745b96027f99aeb 100644
--- a/README.OpenSource
+++ b/README.OpenSource
@@ -3,7 +3,7 @@
"Name": "PCRE2",
"License": "PCRE2 LICENCE",
"License File": "LICENSE",
- "Version Number": "pcre2-10.39",
+ "Version Number": "pcre2-10.40",
"Owner": "jiangxiaofeng8@huawei.com",
"Upstream URL": "https://github.com/PhilipHazel/pcre2.git",
"Description": "pcre2 is a re_working of the original PCRE1 library to provide an entirely new API."
diff --git a/bundle.json b/bundle.json
index 0ee0b30f5d0d54940601e410ed0ebf85e8df82ff..c935fa8d3124eb7e55175d4b245d00fddf3385aa 100644
--- a/bundle.json
+++ b/bundle.json
@@ -11,13 +11,13 @@
"scripts": {},
"licensePath": "pcre2/LICENSE",
"component": {
- "name": "thirdparty_pcre2",
- "subsystem": "",
+ "name": "pcre2",
+ "subsystem": "thirdparty",
"syscap": [],
"features": [],
- "adapted_system_type": [],
- "rom": "",
- "ram": "",
+ "adapted_system_type": [ "standard" ],
+ "rom": "512KB",
+ "ram": "512KB",
"deps": {
"components": [],
"third_party": []
diff --git a/pcre2/.gitignore b/pcre2/.gitignore
index e104501d760fb10e44e4fdc588462e3c5277a105..3e3284eeee3c0e6f266f7108f373fb9c7d5aa37e 100644
--- a/pcre2/.gitignore
+++ b/pcre2/.gitignore
@@ -7,6 +7,7 @@
*.o
*~
+__pycache__
.deps
.libs
diff --git a/pcre2/132html b/pcre2/132html
old mode 100755
new mode 100644
diff --git a/pcre2/AUTHORS b/pcre2/AUTHORS
index bec8a1e5adce4e605b9da70410ef09f57d054017..11ef898b2501732692fa2c76ad14e01215de0ce5 100644
--- a/pcre2/AUTHORS
+++ b/pcre2/AUTHORS
@@ -8,7 +8,7 @@ Email domain: gmail.com
Retired from University of Cambridge Computing Service,
Cambridge, England.
-Copyright (c) 1997-2021 University of Cambridge
+Copyright (c) 1997-2022 University of Cambridge
All rights reserved
@@ -19,7 +19,7 @@ Written by: Zoltan Herczeg
Email local part: hzmester
Emain domain: freemail.hu
-Copyright(c) 2010-2021 Zoltan Herczeg
+Copyright(c) 2010-2022 Zoltan Herczeg
All rights reserved.
@@ -30,7 +30,7 @@ Written by: Zoltan Herczeg
Email local part: hzmester
Emain domain: freemail.hu
-Copyright(c) 2009-2021 Zoltan Herczeg
+Copyright(c) 2009-2022 Zoltan Herczeg
All rights reserved.
####
diff --git a/pcre2/CMakeLists.txt b/pcre2/CMakeLists.txt
index 8010497f569a22c57caa151fb18fd4c3a0898d53..7febf337d7fb9e79231d3b6825532dc540cf752f 100644
--- a/pcre2/CMakeLists.txt
+++ b/pcre2/CMakeLists.txt
@@ -110,6 +110,11 @@ CMAKE_MINIMUM_REQUIRED(VERSION 3.0.0)
# GET_TARGET_PROPERTY. This should no longer be required.
# CMAKE_POLICY(SET CMP0026 OLD)
+# With a recent cmake, you can provide a rootdir to look for non
+# standard installed library dependencies, but to do so, the policy
+# needs to be set to new (by uncommenting the following)
+# CMAKE_POLICY(SET CMP0074 NEW)
+
# For FindReadline.cmake. This was changed to allow setting CMAKE_MODULE_PATH
# on the command line.
# SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
@@ -142,10 +147,16 @@ CHECK_INCLUDE_FILE(windows.h HAVE_WINDOWS_H)
CHECK_SYMBOL_EXISTS(bcopy "strings.h" HAVE_BCOPY)
CHECK_SYMBOL_EXISTS(memfd_create "sys/mman.h" HAVE_MEMFD_CREATE)
CHECK_SYMBOL_EXISTS(memmove "string.h" HAVE_MEMMOVE)
-CHECK_SYMBOL_EXISTS(realpath "stdlib.h" HAVE_REALPATH)
CHECK_SYMBOL_EXISTS(secure_getenv "stdlib.h" HAVE_SECURE_GETENV)
CHECK_SYMBOL_EXISTS(strerror "string.h" HAVE_STRERROR)
+CHECK_C_SOURCE_COMPILES(
+ "#include
- PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Allow \K in lookarounds PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \x{df800} to \x{dfff} - in UTF-8 and UTF-32 modes + PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Allow \K in lookarounds + PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \x{d800} to \x{dfff} in UTF-8 and UTF-32 modes PCRE2_EXTRA_ALT_BSUX Extended alternate \u, \U, and \x handling PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL Treat all invalid escapes as a literal following character PCRE2_EXTRA_ESCAPED_CR_IS_LF Interpret \r as \n diff --git a/pcre2/doc/html/pcre2_substitute.html b/pcre2/doc/html/pcre2_substitute.html index 10b2267e2d72e9858b30f35874ed6321a06181be..abf0a703044f9829dbeea051ef790dd97d3850ed 100644 --- a/pcre2/doc/html/pcre2_substitute.html +++ b/pcre2/doc/html/pcre2_substitute.html @@ -68,29 +68,29 @@ automatically added. The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for zero-terminated strings. The options are:Returns 1 if there is a rightmost literal code unit that must exist in any -matched string, other than at its start. The third argument should point to a +matched string, other than at its start. The third argument should point to a uint32_t variable. If there is no such value, 0 is returned. When 1 is returned, the code unit value itself can be retrieved using PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is @@ -2640,7 +2640,9 @@ The subject string is passed to pcre2_match() as a pointer in startoffset. The length and offset are in code units, not characters. That is, they are in bytes for the 8-bit library, 16-bit code units for the 16-bit library, and 32-bit code units for the 32-bit library, whether or not -UTF processing is enabled. +UTF processing is enabled. As a special case, if subject is NULL and +length is zero, the subject is assumed to be an empty string. 
If +length is non-zero, an error occurs if subject is NULL.- PCRE2_ANCHORED Match only at the first position - PCRE2_ENDANCHORED Pattern can match only at end of subject - PCRE2_NOTBOL Subject is not the beginning of a line - PCRE2_NOTEOL Subject is not the end of a line - PCRE2_NOTEMPTY An empty string is not a valid match - PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject is not a valid match - PCRE2_NO_JIT Do not use JIT matching - PCRE2_NO_UTF_CHECK Do not check the subject or replacement for UTF validity (only relevant if - PCRE2_UTF was set at compile time) - PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing - PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject - PCRE2_SUBSTITUTE_LITERAL The replacement string is literal - PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for 1st match - PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length + PCRE2_ANCHORED Match only at the first position + PCRE2_ENDANCHORED Match only at end of subject + PCRE2_NOTBOL Subject is not the beginning of a line + PCRE2_NOTEOL Subject is not the end of a line + PCRE2_NOTEMPTY An empty string is not a valid match + PCRE2_NOTEMPTY_ATSTART An empty string at the start of the subject is not a valid match + PCRE2_NO_JIT Do not use JIT matching + PCRE2_NO_UTF_CHECK Do not check for UTF validity in the subject or replacement + (only relevant if PCRE2_UTF was set at compile time) + PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing + PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject + PCRE2_SUBSTITUTE_LITERAL The replacement string is literal + PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for first match + PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length PCRE2_SUBSTITUTE_REPLACEMENT_ONLY Return only replacement string(s) - PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset - PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string + PCRE2_SUBSTITUTE_UNKNOWN_UNSET 
Treat unknown group as unset + PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty stringIf PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED, PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored.-If PCRE2_SUBSTITUTE_MATCHED is set, match_data must be non-zero; its +If PCRE2_SUBSTITUTE_MATCHED is set, match_data must be non-NULL; its contents must be the result of a call to pcre2_match() using the same pattern and subject.
diff --git a/pcre2/doc/html/pcre2api.html b/pcre2/doc/html/pcre2api.html index e2237e721b9dfeaf5e030e2dbf45735c296432ab..047e242a3b4d76f9f3bb285e2d588f07f5361ccd 100644 --- a/pcre2/doc/html/pcre2api.html +++ b/pcre2/doc/html/pcre2api.html @@ -1845,7 +1845,7 @@ undefined. It may cause your program to crash or loop.Note that this option can also be passed to pcre2_match() and -pcre_dfa_match(), to suppress UTF validity checking of the subject +pcre2_dfa_match(), to suppress UTF validity checking of the subject string.
@@ -2055,8 +2055,8 @@ point. However, this applies only to characters whose code points are less than \d.
-When PCRE2 is built with Unicode support (the default), the Unicode properties -of all characters can be tested with \p and \P, or, alternatively, the +When PCRE2 is built with Unicode support (the default), certain Unicode +character properties can be tested with \p and \P, or, alternatively, the PCRE2_UCP option can be set when a pattern is compiled; this causes \w and friends to use Unicode property support instead of the built-in tables. PCRE2_UCP also causes upper/lower casing operations on characters with code @@ -2316,7 +2316,7 @@ return zero. The third argument should point to a size_t variable. PCRE2_INFO_LASTCODETYPE
If startoffset is greater than the length of the subject, @@ -3394,12 +3396,17 @@ same number causes an error at compile time.
This function optionally calls pcre2_match() and then makes a copy of the subject string in outputbuffer, replacing parts that were matched with -the replacement string, whose length is supplied in rlength. This -can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. There is an -option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just the -replacement string(s). The default action is to perform just one replacement if -the pattern matches, but there is an option that requests multiple replacements -(see PCRE2_SUBSTITUTE_GLOBAL below). +the replacement string, whose length is supplied in rlength, which +can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a +special case, if replacement is NULL and rlength is zero, the +replacement is assumed to be an empty string. If rlength is non-zero, an +error occurs if replacement is NULL. +
++There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just +the replacement string(s). The default action is to perform just one +replacement if the pattern matches, but there is an option that requests +multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below).
If successful, pcre2_substitute() returns the number of substitutions @@ -3433,12 +3440,12 @@ block may or may not have been changed. As well as the usual options for pcre2_match(), a number of additional options can be set in the options argument of pcre2_substitute(). One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external -match_data block must be provided, and it must have been used for an -external call to pcre2_match(). The data in the match_data block -(return code, offset vector) is used for the first substitution instead of -calling pcre2_match() from within pcre2_substitute(). This allows -an application to check for a match before choosing to substitute, without -having to repeat the match. +match_data block must be provided, and it must have already been used for +an external call to pcre2_match() with the same pattern and subject +arguments. The data in the match_data block (return code, offset vector) +is then used for the first substitution instead of calling pcre2_match() +from within pcre2_substitute(). This allows an application to check for a +match before choosing to substitute, without having to repeat the match.
The contents of the externally supplied match data block are not changed when @@ -3583,7 +3590,7 @@ and force lower case. The escape sequences change the current state: \U and terminating a \Q quoted sequence) reverts to no case forcing. The sequences \u and \l force the next character (if it is a letter) to upper or lower case, respectively, and then the state automatically reverts to no case -forcing. Case forcing applies to all inserted characters, including those from +forcing. Case forcing applies to all inserted characters, including those from capture groups and letters within \Q...\E quoted sequences. If either PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode properties are used for case forcing characters whose code points are greater @@ -3655,7 +3662,9 @@ default.
PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the -match_data argument is NULL. +match_data argument is NULL or if the subject or replacement +arguments are NULL. For backward compatibility reasons an exception is made for +the replacement argument if the rlength argument is also 0.
PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the @@ -3810,12 +3819,13 @@ other alternatives. Ultimately, when it runs out of matches,
The function pcre2_dfa_match() is called to match a subject string against a compiled pattern, using a matching algorithm that scans the subject -string just once (not counting lookaround assertions), and does not backtrack. -This has different characteristics to the normal algorithm, and is not -compatible with Perl. Some of the features of PCRE2 patterns are not supported. -Nevertheless, there are times when this kind of matching can be useful. For a -discussion of the two matching algorithms, and a list of features that -pcre2_dfa_match() does not support, see the +string just once (not counting lookaround assertions), and does not backtrack +(except when processing lookaround assertions). This has different +characteristics to the normal algorithm, and is not compatible with Perl. Some +of the features of PCRE2 patterns are not supported. Nevertheless, there are +times when this kind of matching can be useful. For a discussion of the two +matching algorithms, and a list of features that pcre2_dfa_match() does +not support, see the pcre2matching documentation.
@@ -3850,7 +3860,7 @@ Here is an example of a simple call to pcre2_dfa_match():The unused bits of the options argument for pcre2_dfa_match() must @@ -4008,7 +4018,7 @@ Cambridge, England.
-Last updated: 30 August 2021
+Last updated: 14 December 2021
Copyright © 1997-2021 University of Cambridge.
diff --git a/pcre2/doc/html/pcre2build.html b/pcre2/doc/html/pcre2build.html
index a1c2e9595dbccc2aeb988084e944084f96e4112d..0d12155b093f764dc250520401b4b8805b73a87c 100644
--- a/pcre2/doc/html/pcre2build.html
+++ b/pcre2/doc/html/pcre2build.html
@@ -142,8 +142,9 @@ locked this out by setting PCRE2_NEVER_UTF.
UTF support allows the libraries to process character code points up to
0x10ffff in the strings that they handle. Unicode support also gives access to
the Unicode properties of characters, using pattern escapes such as \P, \p,
-and \X. Only the general category properties such as Lu and Nd are
-supported. Details are given in the
+and \X. Only the general category properties such as Lu and Nd,
+script names, and some bi-directional properties are supported. Details are
+given in the
pcre2pattern
documentation.
- --with-match-limit_depth=10000 + --with-match-limit-depth=10000to the configure command. This value can be overridden at run time. This depth limit indirectly limits the amount of heap memory that is used, but @@ -615,9 +616,9 @@ Cambridge, England.
-Last updated: 20 March 2020
+Last updated: 08 December 2021
-Copyright © 1997-2020 University of Cambridge.
+Copyright © 1997-2021 University of Cambridge.
Return to the PCRE2 index page. diff --git a/pcre2/doc/html/pcre2compat.html b/pcre2/doc/html/pcre2compat.html index eb826947b87ba0849c14022da799f24a24179943..5f390c1d2dd306475335fbabe3b845be0914bb1c 100644 --- a/pcre2/doc/html/pcre2compat.html +++ b/pcre2/doc/html/pcre2compat.html @@ -18,33 +18,41 @@ DIFFERENCES BETWEEN PCRE2 AND PERL
This document describes some of the differences in the ways that PCRE2 and Perl handle regular expressions. The differences described here are with respect to -Perl version 5.32.0, but as both Perl and PCRE2 are continually changing, the +Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the information may at times be out of date.
-1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does +1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the +behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the +next character unless it is the start of a newline sequence. This means that, +if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF +(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using +EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline +indicator. +
++2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does have are given in the pcre2unicode page.
-2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but +3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but they do not mean what you might think. For example, (?!a){3} does not assert that the next three characters are not "a". It just asserts that the next character is not "a" three times (in principle; PCRE2 optimizes this to run the assertion just once). Perl allows some repeat quantifiers on other assertions, -for example, \b* (but not \b{3}, though oddly it does allow ^{3}), but these -do not seem to have any use. PCRE2 does not allow any kind of quantifier on -non-lookaround assertions. +for example, \b* , but these do not seem to have any use. PCRE2 does not allow +any kind of quantifier on non-lookaround assertions.
-3. Capture groups that occur inside negative lookaround assertions are counted, +4. Capture groups that occur inside negative lookaround assertions are counted, but their entries in the offsets vector are set only when a negative assertion is a condition that has a matching branch (that is, the condition is false). Perl may set such capture groups in other circumstances.
-4. The following Perl escape sequences are not supported: \F, \l, \L, \u, +5. The following Perl escape sequences are not supported: \F, \l, \L, \u, \U, and \N when followed by a character name. \N on its own, matching a non-newline character, and \N{U+dd..}, matching a Unicode code point, are supported. The escapes that modify the case of following letters are @@ -55,26 +63,26 @@ PCRE2_EXTRA_ALT_BSUX options is set, \U and \u are interpreted as ECMAScript interprets them.
-5. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is +6. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is built with Unicode support (the default). The properties that can be tested with \p and \P are limited to the general category properties such as Lu and -Nd, script names such as Greek or Han, and the derived properties Any and L&. -Both PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its use -is limited. See the +Nd, script names such as Greek or Han, Bidi_Class, Bidi_Control, and the +derived properties Any and LC (synonym L&). Both PCRE2 and Perl support the Cs +(surrogate) property, but in PCRE2 its use is limited. See the pcre2pattern documentation for details. The long synonyms for property names that Perl supports (such as \p{Letter}) are not supported by PCRE2, nor is it permitted to prefix any of these properties with "Is".
-6. PCRE2 supports the \Q...\E escape for quoting substrings. Characters +7. PCRE2 supports the \Q...\E escape for quoting substrings. Characters in between are treated as literals. However, this is slightly different from Perl in that $ and @ are also handled as literals inside the quotes. In Perl, -they cause variable interpolation (but of course PCRE2 does not have -variables). Also, Perl does "double-quotish backslash interpolation" on any -backslashes between \Q and \E which, its documentation says, "may lead to -confusing results". PCRE2 treats a backslash between \Q and \E just like any -other character. Note the following examples: +they cause variable interpolation (PCRE2 does not have variables). Also, Perl +does "double-quotish backslash interpolation" on any backslashes between \Q +and \E which, its documentation says, "may lead to confusing results". PCRE2 +treats a backslash between \Q and \E just like any other character. Note the +following examples:
Pattern PCRE2 matches Perl matches @@ -88,19 +96,19 @@ The \Q...\E sequence is recognized both inside and outside character classes by both PCRE2 and Perl.Note that octal values of 100 or greater that are specified using this syntax must not be introduced by a leading zero, because no more than three octal @@ -776,199 +776,62 @@ can be used in any mode, though in 8-bit and 16-bit non-UTF modes these sequences are of course limited to testing characters whose code points are less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points greater than 0x10ffff (the Unicode limit) may be encountered. These are all -treated as being in the Unknown script and with an unassigned type. The extra -escape sequences are: +treated as being in the Unknown script and with an unassigned type. + +-7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code}) +8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code}) constructions. However, PCRE2 does have a "callout" feature, which allows an external function to be called during pattern matching. See the pcre2callout documentation for details.
-8. Subroutine calls (whether recursive or not) were treated as atomic groups up +9. Subroutine calls (whether recursive or not) were treated as atomic groups up to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking into subroutine calls is now supported, as in Perl.
-9. In PCRE2, if any of the backtracking control verbs are used in a group that +10. In PCRE2, if any of the backtracking control verbs are used in a group that is called as a subroutine (whether or not recursively), their effect is confined to that group; it does not extend to the surrounding pattern. This is not always the case in Perl. In particular, if (*THEN) is present in a group @@ -109,20 +117,20 @@ the group does not contain any | characters. Note that such groups are processed as anchored at the point where they are tested.
-10. If a pattern contains more than one backtracking control verb, the first +11. If a pattern contains more than one backtracking control verb, the first one that is backtracked onto acts. For example, in the pattern A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the same as PCRE2, but there are cases where it differs.
-11. There are some differences that are concerned with the settings of captured +12. There are some differences that are concerned with the settings of captured strings when part of a pattern is repeated. For example, matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to "b".
-12. PCRE2's handling of duplicate capture group numbers and names is not as +13. PCRE2's handling of duplicate capture group numbers and names is not as general as Perl's. This is a consequence of the fact the PCRE2 works internally just with numbers, using an external table to translate between numbers and names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)), where the two @@ -132,42 +140,43 @@ to distinguish which group matched, because both names map to capture group number 1. To avoid this confusing situation, an error is given at compile time.
-13. Perl used to recognize comments in some places that PCRE2 does not, for +14. Perl used to recognize comments in some places that PCRE2 does not, for example, between the ( and ? at the start of a group. If the /x modifier is set, Perl allowed white space between ( and ? though the latest Perls give an error (for a while it was just deprecated). There may still be some cases where Perl behaves differently.
-14. Perl, when in warning mode, gives warnings for character classes such as +15. Perl, when in warning mode, gives warnings for character classes such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as literals. PCRE2 has no warning features, so it gives an error in these cases because they are almost certainly user mistakes.
-15. In PCRE2, the upper/lower case character properties Lu and Ll are not +16. In PCRE2, the upper/lower case character properties Lu and Ll are not affected when case-independent matching is specified. For example, \p{Lu} always matches an upper case letter. I think Perl has changed in this respect; -in the release at the time of writing (5.32), \p{Lu} and \p{Ll} match all +in the release at the time of writing (5.34), \p{Lu} and \p{Ll} match all letters, regardless of case, when case independence is specified.
-16. From release 5.32.0, Perl locks out the use of \K in lookaround +17. From release 5.32.0, Perl locks out the use of \K in lookaround assertions. From release 10.38 PCRE2 does the same by default. However, there is an option for re-enabling the previous behaviour. When this option is set, \K is acted on when it occurs in positive assertions, but is ignored in negative assertions.
-17. PCRE2 provides some extensions to the Perl regular expression facilities. +18. PCRE2 provides some extensions to the Perl regular expression facilities. Perl 5.10 included new features that were not in earlier versions of Perl, some of which (such as named parentheses) were in PCRE2 for some time before. This -list is with respect to Perl 5.32: +list is with respect to Perl 5.34:
(a) Although lookbehind assertions in PCRE2 must match fixed length strings, each alternative toplevel branch of a lookbehind assertion can match a -different length of string. Perl requires them all to have the same length. +different length of string. Perl used to require them all to have the same +length, but the latest version has some variable length support.
(b) From PCRE2 10.23, backreferences to groups of fixed length are supported @@ -221,12 +230,12 @@ extension to the lookaround facilities. The default, Perl-compatible lookarounds are atomic.-18. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa +19. The Perl /a modifier restricts \d numbers to pure ascii, and the /aa modifier restricts /i case-insensitive matching to pure ascii, ignoring Unicode rules. This separation cannot be represented with PCRE2_UCP.
-19. Perl has different limits than PCRE2. See the +20. Perl has different limits than PCRE2. See the pcre2limit documentation for details. Perl went with 5.10 from recursion to iteration keeping the intermediate matches on the heap, which is ~10% slower but does not @@ -248,7 +257,7 @@ Cambridge, England. REVISION
-Last updated: 30 August 2021 +Last updated: 08 December 2021
Copyright © 1997-2021 University of Cambridge.
diff --git a/pcre2/doc/html/pcre2jit.html b/pcre2/doc/html/pcre2jit.html index e73a22984c126817aca7c6d6347687c91bc1fc8b..d89fa239995e526d94e6e383d1912e26978874c6 100644 --- a/pcre2/doc/html/pcre2jit.html +++ b/pcre2/doc/html/pcre2jit.html @@ -269,11 +269,11 @@ starts another match, that match must use a different JIT stack to the one used for currently suspended match(es).-In a multithread application, if you do not -specify a JIT stack, or if you assign or pass back NULL from a callback, that -is thread-safe, because each thread has its own machine stack. However, if you -assign or pass back a non-NULL JIT stack, this must be a different stack for -each thread so that the application is thread-safe. +In a multithread application, if you do not specify a JIT stack, or if you +assign or pass back NULL from a callback, that is thread-safe, because each +thread has its own machine stack. However, if you assign or pass back a +non-NULL JIT stack, this must be a different stack for each thread so that the +application is thread-safe.
Strictly speaking, even more is allowed. You can assign the same non-NULL stack @@ -382,8 +382,8 @@ out this complicated API. void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext);
-The JIT executable allocator does not free all memory when it is possible. -It expects new allocations, and keeps some free memory around to improve +The JIT executable allocator does not free all memory when it is possible. It +expects new allocations, and keeps some free memory around to improve allocation speed. However, in low memory conditions, it might be better to free all possible memory. You can cause this to happen by calling pcre2_jit_free_unused_memory(). Its argument is a general context, for custom @@ -442,10 +442,10 @@ that was not compiled.
When you call pcre2_match(), as well as testing for invalid options, a number of other sanity checks are performed on the arguments. For example, if -the subject pointer is NULL, an immediate error is given. Also, unless -PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the -interests of speed, these checks do not happen on the JIT fast path, and if -invalid data is passed, the result is undefined. +the subject pointer is NULL but the length is non-zero, an immediate error is +given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested +for validity. In the interests of speed, these checks do not happen on the JIT +fast path, and if invalid data is passed, the result is undefined.
Bypassing the sanity checks and the pcre2_match() wrapping can give @@ -466,9 +466,9 @@ Cambridge, England.
REVISION
-Last updated: 23 May 2019 +Last updated: 30 November 2021
-Copyright © 1997-2019 University of Cambridge. +Copyright © 1997-2021 University of Cambridge.
Return to the PCRE2 index page. diff --git a/pcre2/doc/html/pcre2pattern.html b/pcre2/doc/html/pcre2pattern.html index 9c2d66cf8267c5e212b2cbb88444f68ff57bbc2c..2c243010187143290d614546589e9d3c7933a075 100644 --- a/pcre2/doc/html/pcre2pattern.html +++ b/pcre2/doc/html/pcre2pattern.html @@ -534,7 +534,7 @@ for themselves. For example, outside a character class: \0113 is a tab followed by the character "3" \113 might be a backreference, otherwise the character with octal code 113 \377 might be a backreference, otherwise the value 255 (decimal) - \81 is always a backreference .sp + \81 is always a backreference
+Matching characters by Unicode property is not fast, because PCRE2 has to do a +multistage table lookup in order to find a character's property. That is why +the traditional escape sequences such as \d and \w do not use Unicode +properties in PCRE2 by default, though you can make them do so by setting the +PCRE2_UCP option or by starting the pattern with (*UCP). +
++The extra escape sequences that provide property support are:
\p{xx} a character with the xx property \P{xx} a character without the xx property \X a Unicode extended grapheme cluster-The property names represented by xx above are case-sensitive. There is -support for Unicode script names, Unicode general category properties, "Any", -which matches any character (including newline), and some special PCRE2 -properties (described in the -next section). -Other Perl properties such as "InMusicalSymbols" are not supported by PCRE2. -Note that \P{Any} does not match any characters, so always causes a match -failure. +The property names represented by xx above are not case-sensitive, and in +accordance with Unicode's "loose matching" rules, spaces, hyphens, and +underscores are ignored. There is support for Unicode script names, Unicode +general category properties, "Any", which matches any character (including +newline), Bidi_Class, a number of binary (yes/no) properties, and some special +PCRE2 properties (described +below). +Certain other Perl properties such as "InMusicalSymbols" are not supported by +PCRE2. Note that \P{Any} does not match any characters, so always causes a +match failure. + +
+There are three different syntax forms for matching a script. Each Unicode +character has a basic script and, optionally, a list of other scripts ("Script +Extensions") with which it is commonly used. Using the Adlam script as an +example, \p{sc:Adlam} matches characters whose basic script is Adlam, whereas +\p{scx:Adlam} matches, in addition, characters that have Adlam in their +extensions list. The full names "script" and "script extensions" for the +property types are recognized, and an equals sign is an alternative to the +colon. If a script name is given without a property type, for example, +\p{Adlam}, it is treated as \p{scx:Adlam}. Perl changed to this +interpretation at release 5.26 and PCRE2 changed at release 10.40.
-Sets of Unicode characters are defined as belonging to certain scripts. A -character from one of these sets can be matched using a script name. For -example: -
- \p{Greek} - \P{Han} -Unassigned characters (and in non-UTF 32-bit mode, characters with code points greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not part of an identified script are lumped together as "Common". The current list -of scripts is: - -
-Adlam, -Ahom, -Anatolian_Hieroglyphs, -Arabic, -Armenian, -Avestan, -Balinese, -Bamum, -Bassa_Vah, -Batak, -Bengali, -Bhaiksuki, -Bopomofo, -Brahmi, -Braille, -Buginese, -Buhid, -Canadian_Aboriginal, -Carian, -Caucasian_Albanian, -Chakma, -Cham, -Cherokee, -Chorasmian, -Common, -Coptic, -Cuneiform, -Cypriot, -Cypro_Minoan, -Cyrillic, -Deseret, -Devanagari, -Dives_Akuru, -Dogra, -Duployan, -Egyptian_Hieroglyphs, -Elbasan, -Elymaic, -Ethiopic, -Georgian, -Glagolitic, -Gothic, -Grantha, -Greek, -Gujarati, -Gunjala_Gondi, -Gurmukhi, -Han, -Hangul, -Hanifi_Rohingya, -Hanunoo, -Hatran, -Hebrew, -Hiragana, -Imperial_Aramaic, -Inherited, -Inscriptional_Pahlavi, -Inscriptional_Parthian, -Javanese, -Kaithi, -Kannada, -Katakana, -Kayah_Li, -Kharoshthi, -Khitan_Small_Script, -Khmer, -Khojki, -Khudawadi, -Lao, -Latin, -Lepcha, -Limbu, -Linear_A, -Linear_B, -Lisu, -Lycian, -Lydian, -Mahajani, -Makasar, -Malayalam, -Mandaic, -Manichaean, -Marchen, -Masaram_Gondi, -Medefaidrin, -Meetei_Mayek, -Mende_Kikakui, -Meroitic_Cursive, -Meroitic_Hieroglyphs, -Miao, -Modi, -Mongolian, -Mro, -Multani, -Myanmar, -Nabataean, -Nandinagari, -New_Tai_Lue, -Newa, -Nko, -Nushu, -Nyakeng_Puachue_Hmong, -Ogham, -Ol_Chiki, -Old_Hungarian, -Old_Italic, -Old_North_Arabian, -Old_Permic, -Old_Persian, -Old_Sogdian, -Old_South_Arabian, -Old_Turkic, -Old_Uyghur, -Oriya, -Osage, -Osmanya, -Pahawh_Hmong, -Palmyrene, -Pau_Cin_Hau, -Phags_Pa, -Phoenician, -Psalter_Pahlavi, -Rejang, -Runic, -Samaritan, -Saurashtra, -Sharada, -Shavian, -Siddham, -SignWriting, -Sinhala, -Sogdian, -Sora_Sompeng, -Soyombo, -Sundanese, -Syloti_Nagri, -Syriac, -Tagalog, -Tagbanwa, -Tai_Le, -Tai_Tham, -Tai_Viet, -Takri, -Tamil, -Tangsa, -Tangut, -Telugu, -Thaana, -Thai, -Tibetan, -Tifinagh, -Tirhuta, -Toto, -Ugaritic, -Unknown, -Vai, -Vithkuqi, -Wancho, -Warang_Citi, -Yezidi, -Yi, -Zanabazar_Square. +of recognized script names and their 4-character abbreviations can be obtained +by running this command: +
+ pcre2test -LS + ++
Each character has exactly one Unicode general category property, specified by a two-letter abbreviation. For compatibility with Perl, negation can be @@ -1030,9 +893,9 @@ The following general category property codes are supported: Zp Paragraph separator Zs Space separator -The special property L& is also supported: it matches a character that has -the Lu, Ll, or Lt property, in other words, a letter that is not classified as -a modifier or "other". +The special property LC, which has the synonym L&, is also supported: it +matches a character that has the Lu, Ll, or Lt property, in other words, a +letter that is not classified as a modifier or "other".
The Cs (Surrogate) property applies only to characters whose code points are in @@ -1059,12 +922,54 @@ Specifying caseless matching does not affect these escape sequences. For example, \p{Lu} always matches only upper case letters. This is different from the behaviour of current versions of Perl.
+-Matching characters by Unicode property is not fast, because PCRE2 has to do a -multistage table lookup in order to find a character's property. That is why -the traditional escape sequences such as \d and \w do not use Unicode -properties in PCRE2 by default, though you can make them do so by setting the -PCRE2_UCP option or by starting the pattern with (*UCP). +Unicode defines a number of binary properties, that is, properties whose only +values are true or false. You can obtain a list of those that are recognized by +\p and \P, along with their abbreviations, by running this command: +
+ pcre2test -LP + ++ +
+
+ \p{Bidi_Class:<class>} matches a character with the given class + \p{BC:<class>} matches a character with the given class ++The recognized classes are: +
+ AL Arabic letter + AN Arabic number + B paragraph separator + BN boundary neutral + CS common separator + EN European number + ES European separator + ET European terminator + FSI first strong isolate + L left-to-right + LRE left-to-right embedding + LRI left-to-right isolate + LRO left-to-right override + NSM non-spacing mark + ON other neutral + PDF pop directional format + PDI pop directional isolate + R right-to-left + RLE right-to-left embedding + RLI right-to-left isolate + RLO right-to-left override + S segment separator + WS white space ++An equals sign may be used instead of a colon. The class names are +case-insensitive; only the short names listed above are recognized.
Outside a character class, a dot in the pattern matches any one character in the subject string except (by default) a character that signifies the end of a -line. +line. One or more characters may be specified as line terminators (see +"Newline conventions" +above).
-When a line ending is defined as a single character, dot never matches that -character; when the two-character sequence CRLF is used, dot does not match CR -if it is immediately followed by LF, but otherwise it matches all characters -(including isolated CRs and LFs). When any Unicode line endings are being -recognized, dot does not match CR or LF or any of the other line ending -characters. +Dot never matches a single line-ending character. When the two-character +sequence CRLF is the only line ending, dot does not match CR if it is +immediately followed by LF, but otherwise it matches all characters (including +isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurrences +of CR or LF match dot. When all Unicode line endings are being recognized, dot +does not match CR or LF or any of the other line ending characters.
The behaviour of dot with regard to newlines can be changed. If the @@ -2180,10 +2087,10 @@ be easier to remember:
(*atomic:\d+)foo-This kind of parenthesized group "locks up" the part of the pattern it -contains once it has matched, and a failure further into the pattern is -prevented from backtracking into it. Backtracking past it to previous items, -however, works as normal. +This kind of parenthesized group "locks up" the part of the pattern it contains +once it has matched, and a failure further into the pattern is prevented from +backtracking into it. Backtracking past it to previous items, however, works as +normal.
An alternative description is that a group of this type matches exactly the @@ -3859,9 +3766,9 @@ Cambridge, England.
-Last updated: 30 August 2021
+Last updated: 12 January 2022
-Copyright © 1997-2021 University of Cambridge.
+Copyright © 1997-2022 University of Cambridge.
Return to the PCRE2 index page.
diff --git a/pcre2/doc/html/pcre2serialize.html b/pcre2/doc/html/pcre2serialize.html
index 18a8d7fa8d9248833e39c249412a190e0ec77248..df4098e1d38ad26bb8d2e3991f30d83acb029cfa 100644
--- a/pcre2/doc/html/pcre2serialize.html
+++ b/pcre2/doc/html/pcre2serialize.html
@@ -23,12 +23,12 @@ please consult the man page, in case the conversion went wrong.
SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS
int32_t pcre2_serialize_decode(pcre2_code **codes,
- int32_t number_of_codes, const uint32_t *bytes,
+ int32_t number_of_codes, const uint8_t *bytes,
pcre2_general_context *gcontext);
-int32_t pcre2_serialize_encode(pcre2_code **codes,
- int32_t number_of_codes, uint32_t **serialized_bytes,
+int32_t pcre2_serialize_encode(const pcre2_code **codes,
+ int32_t number_of_codes, uint8_t **serialized_bytes,
PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext);
@@ -154,7 +154,6 @@ mangagement functions for the decoded patterns. If this argument is NULL,
malloc() and free() are used. After deserialization, the byte
stream is no longer needed and can be discarded.
- int32_t number_of_codes; pcre2_code *list_of_codes[2]; uint8_t *bytes = <serialized data>; int32_t number_of_codes = diff --git a/pcre2/doc/html/pcre2syntax.html b/pcre2/doc/html/pcre2syntax.html index 735eb69fb1e4d35c977081df3c53c69f1e0c176e..8364c521534093eb951a901685f52d45126b16fb 100644 --- a/pcre2/doc/html/pcre2syntax.html +++ b/pcre2/doc/html/pcre2syntax.html @@ -19,29 +19,31 @@ please consult the man page, in case the conversion went wrong.
@@ -136,6 +138,11 @@ happening, \s and \w may also match characters with code points in the range sequences is changed to use Unicode properties and they match many more characters.
++Property descriptions in \p and \P are matched caselessly; hyphens, +underscores, and white space are ignored, in accordance with Unicode's "loose +matching" rules. +
@@ -152,6 +159,7 @@ characters. Lo Other letter Lt Title case letter Lu Upper case letter + Lc Ll, Lu, or Lt L& Ll, Lu, or Lt M Mark @@ -198,171 +206,58 @@ characters. Perl and POSIX space are now the same. Perl added VT to its space character set at release 5.18. -
SCRIPT NAMES FOR \p AND \P
--Adlam, -Ahom, -Anatolian_Hieroglyphs, -Arabic, -Armenian, -Avestan, -Balinese, -Bamum, -Bassa_Vah, -Batak, -Bengali, -Bhaiksuki, -Bopomofo, -Brahmi, -Braille, -Buginese, -Buhid, -Canadian_Aboriginal, -Carian, -Caucasian_Albanian, -Chakma, -Cham, -Cherokee, -Chorasmian, -Common, -Coptic, -Cuneiform, -Cypriot, -Cypro_Minoan, -Cyrillic, -Deseret, -Devanagari, -Dives_Akuru, -Dogra, -Duployan, -Egyptian_Hieroglyphs, -Elbasan, -Elymaic, -Ethiopic, -Georgian, -Glagolitic, -Gothic, -Grantha, -Greek, -Gujarati, -Gunjala_Gondi, -Gurmukhi, -Han, -Hangul, -Hanifi_Rohingya, -Hanunoo, -Hatran, -Hebrew, -Hiragana, -Imperial_Aramaic, -Inherited, -Inscriptional_Pahlavi, -Inscriptional_Parthian, -Javanese, -Kaithi, -Kannada, -Katakana, -Kayah_Li, -Kharoshthi, -Khitan_Small_Script, -Khmer, -Khojki, -Khudawadi, -Lao, -Latin, -Lepcha, -Limbu, -Linear_A, -Linear_B, -Lisu, -Lycian, -Lydian, -Mahajani, -Makasar, -Malayalam, -Mandaic, -Manichaean, -Marchen, -Masaram_Gondi, -Medefaidrin, -Meetei_Mayek, -Mende_Kikakui, -Meroitic_Cursive, -Meroitic_Hieroglyphs, -Miao, -Modi, -Mongolian, -Mro, -Multani, -Myanmar, -Nabataean, -Nandinagari, -New_Tai_Lue, -Newa, -Nko, -Nushu, -Nyakeng_Puachue_Hmong, -Ogham, -Ol_Chiki, -Old_Hungarian, -Old_Italic, -Old_North_Arabian, -Old_Permic, -Old_Persian, -Old_Sogdian, -Old_South_Arabian, -Old_Turkic, -Old_Uyghur, -Oriya, -Osage, -Osmanya, -Pahawh_Hmong, -Palmyrene, -Pau_Cin_Hau, -Phags_Pa, -Phoenician, -Psalter_Pahlavi, -Rejang, -Runic, -Samaritan, -Saurashtra, -Sharada, -Shavian, -Siddham, -SignWriting, -Sinhala, -Sogdian, -Sora_Sompeng, -Soyombo, -Sundanese, -Syloti_Nagri, -Syriac, -Tagalog, -Tagbanwa, -Tai_Le, -Tai_Tham, -Tai_Viet, -Takri, -Tamil, -Tangsa, -Tangut, -Telugu, -Thaana, -Thai, -Tibetan, -Tifinagh, -Tirhuta, -Toto, -Ugaritic, -Vai, -Vithkuqi, -Wancho, -Warang_Citi, -Yezidi, -Yi, -Zanabazar_Square. -
-
CHARACTER CLASSES
+
BINARY PROPERTIES FOR \p AND \P
++Unicode defines a number of binary properties, that is, properties whose only +values are true or false. You can obtain a list of those that are recognized by +\p and \P, along with their abbreviations, by running this command: +
+ pcre2test -LP ++ +
SCRIPT MATCHING WITH \p AND \P
++Many script names and their 4-letter abbreviations are recognized in +\p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P of +course). You can obtain a list of these scripts by running this command: +
+ pcre2test -LS ++ +
THE BIDI_CLASS PROPERTY FOR \p AND \P
++
+ \p{Bidi_Class:<class>} matches a character with the given class + \p{BC:<class>} matches a character with the given class ++The recognized classes are: ++ AL Arabic letter + AN Arabic number + B paragraph separator + BN boundary neutral + CS common separator + EN European number + ES European separator + ET European terminator + FSI first strong isolate + L left-to-right + LRE left-to-right embedding + LRI left-to-right isolate + LRO left-to-right override + NSM non-spacing mark + ON other neutral + PDF pop directional format + PDI pop directional isolate + R right-to-left + RLE right-to-left embedding + RLI right-to-left isolate + RLO right-to-left override + S segment separator + WS white space ++ +
CHARACTER CLASSES
[...] positive character class @@ -390,7 +285,7 @@ In PCRE2, POSIX character set names recognize only ASCII characters by default, but some of them use Unicode properties if PCRE2_UCP is set. You can use \Q...\E inside a character class. -
QUANTIFIERS
+
QUANTIFIERS
? 0 or 1, greedy @@ -411,7 +306,7 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use {n,}? n or more, lazy-
ANCHORS AND SIMPLE ASSERTIONS
+
ANCHORS AND SIMPLE ASSERTIONS
\b word boundary @@ -429,7 +324,7 @@ but some of them use Unicode properties if PCRE2_UCP is set. You can use \G first matching position in subject-
REPORTED MATCH POINT SETTING
+
REPORTED MATCH POINT SETTING
\K set reported start of match @@ -439,13 +334,13 @@ for compatibility with Perl. However, if the PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK option is set, the previous behaviour is re-enabled. When this option is set, \K is honoured in positive assertions, but ignored in negative ones. --
ALTERNATION
+
ALTERNATION
expr|expr|expr...-
CAPTURING
+
CAPTURING
(...) capture group @@ -460,20 +355,20 @@ In non-UTF modes, names may contain underscores and ASCII letters and digits; in UTF modes, any Unicode letters and Unicode decimal digits are permitted. In both cases, a name must not start with a digit. --
ATOMIC GROUPS
+
ATOMIC GROUPS
(?>...) atomic non-capture group (*atomic:...) atomic non-capture group-
COMMENT
+
COMMENT
(?#....) comment (not nestable)-
OPTION SETTING
+
OPTION SETTING
Changes of these options within a group are automatically cancelled at the end of the group. @@ -518,7 +413,7 @@ not increase them. LIMIT_RECURSION is an obsolete synonym for LIMIT_DEPTH. The application can lock out the use of (*UTF) and (*UCP) by setting the PCRE2_NEVER_UTF or PCRE2_NEVER_UCP options, respectively, at compile time.
-
NEWLINE CONVENTION
+
NEWLINE CONVENTION
These are recognized only at the very start of the pattern or after option settings with a similar syntax. @@ -531,7 +426,7 @@ settings with a similar syntax. (*NUL) the NUL character (binary zero)
WHAT \R MATCHES
+
WHAT \R MATCHES
These are recognized only at the very start of the pattern or after option setting with a similar syntax. @@ -540,7 +435,7 @@ setting with a similar syntax. (*BSR_UNICODE) any Unicode newline sequence
LOOKAHEAD AND LOOKBEHIND ASSERTIONS
+
LOOKAHEAD AND LOOKBEHIND ASSERTIONS
(?=...) ) @@ -561,7 +456,7 @@ setting with a similar syntax.Each top-level branch of a lookbehind must be of a fixed length. -
NON-ATOMIC LOOKAROUND ASSERTIONS
+
NON-ATOMIC LOOKAROUND ASSERTIONS
These assertions are specific to PCRE2 and are not Perl-compatible.
@@ -574,7 +469,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible. (*non_atomic_positive_lookbehind:...) )-
SCRIPT RUNS
+
SCRIPT RUNS
(*script_run:...) ) script run, can be backtracked into @@ -584,7 +479,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible. (*asr:...) )-
BACKREFERENCES
+
BACKREFERENCES
\n reference by number (can be ambiguous) @@ -601,7 +496,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible. (?P=name) reference by name (Python)-
SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)
+
SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)
(?R) recurse whole pattern @@ -620,7 +515,7 @@ These assertions are specific to PCRE2 and are not Perl-compatible. \g'-n' call subroutine by relative number (PCRE2 extension)-
CONDITIONAL PATTERNS
+
CONDITIONAL PATTERNS
(?(condition)yes-pattern) @@ -643,7 +538,7 @@ Note the ambiguity of (?(R) and (?(Rn) which might be named reference conditions or recursion tests. Such a condition is interpreted as a reference condition if the relevant named group exists. -
BACKTRACKING CONTROL
+
BACKTRACKING CONTROL
All backtracking control verbs may be in the form (*VERB:NAME). For (*MARK) the name is mandatory, for the others it is optional. (*SKIP) changes its behaviour @@ -670,7 +565,7 @@ pattern is not anchored. The effect of one of these verbs in a group called as a subroutine is confined to the subroutine call.
-
CALLOUTS
+
CALLOUTS
(?C) callout (assumed number 0) @@ -681,12 +576,12 @@ The allowed string delimiters are ` ' " ^ % # $ (which are the same for the start and the end), and the starting delimiter { matched with the ending delimiter }. To encode the ending delimiter within the string, double it. -
SEE ALSO
+
SEE ALSO
pcre2pattern(3), pcre2api(3), pcre2callout(3), pcre2matching(3), pcre2(3).
-
AUTHOR
+
AUTHOR
Philip Hazel
-
@@ -695,11 +590,11 @@ Retired from University Computing Service Cambridge, England.
REVISION
+
REVISION
-Last updated: 30 August 2021 +Last updated: 12 January 2022
-Copyright © 1997-2021 University of Cambridge. +Copyright © 1997-2022 University of Cambridge.
Return to the PCRE2 index page. diff --git a/pcre2/doc/html/pcre2test.html b/pcre2/doc/html/pcre2test.html index 3ee51cd5dd6674e7fef83ae842517bdc74081a97..373e5dff09e2f725e49d56a557714a1959a59805 100644 --- a/pcre2/doc/html/pcre2test.html +++ b/pcre2/doc/html/pcre2test.html @@ -78,7 +78,7 @@ to 8-bit code units for output.
In the rest of this document, the names of library functions and structures -are given in generic form, for example, pcre_compile(). The actual +are given in generic form, for example, pcre2_compile(). The actual names used in the libraries have a suffix _8, _16, or _32, as appropriate.
INPUT ENCODING
@@ -253,7 +253,19 @@ available, and the use of JIT for matching is verified. -LM List modifiers: write a list of available pattern and subject modifiers to the standard output, then exit with zero exit code. All other options are ignored. -If both -C and -LM are present, whichever is first is recognized. +If both -C and any -Lx options are present, whichever is first is recognized. + ++-LP +List properties: write a list of recognized Unicode properties to the standard +output, then exit with zero exit code. All other options are ignored. If both +-C and any -Lx options are present, whichever is first is recognized. +
++-LS +List scripts: write a list of recognized Unicode script names to the standard +output, then exit with zero exit code. All other options are ignored. If both +-C and any -Lx options are present, whichever is first is recognized.
-pattern modifier-list @@ -1239,6 +1251,8 @@ pattern, but can be overridden by modifiers on the subject. match_limit=<n> set a match limit memory show heap memory usage null_context match with a NULL context + null_replacement substitute with NULL replacement + null_subject match with NULL subject offset=<n> set starting offset offset_limit=<n> set offset limit ovector=<n> set size of output vector @@ -1668,7 +1682,7 @@ When testing pcre2_substitute(), this modifier also has the effect of passing the replacement string as zero-terminated.
-Passing a NULL context +Passing a NULL context, subject, or replacement
Normally, pcre2test passes a context block to pcre2_match(), @@ -1678,6 +1692,11 @@ testing that the matching and substitution functions behave correctly in this case (they use default values). This modifier cannot be used with the find_limits or substitute_callout modifiers.
++Similarly, for testing purposes, if the null_subject or +null_replacement modifier is set, the subject or replacement string +pointers are passed as NULL, respectively, to the relevant functions. +
THE ALTERNATIVE MATCHING FUNCTION
By default, pcre2test uses the standard PCRE2 matching function, @@ -2122,9 +2141,9 @@ Cambridge, England.
REVISION
-Last updated: 30 August 2021 +Last updated: 12 January 2022
-Copyright © 1997-2021 University of Cambridge. +Copyright © 1997-2022 University of Cambridge.
Return to the PCRE2 index page. diff --git a/pcre2/doc/html/pcre2unicode.html b/pcre2/doc/html/pcre2unicode.html index 76ca6ea281b78bf091ae7dc3dff6191d9ac501ce..a0d4270f447698385542c372b0182c18bbb0482b 100644 --- a/pcre2/doc/html/pcre2unicode.html +++ b/pcre2/doc/html/pcre2unicode.html @@ -50,17 +50,18 @@ UNICODE PROPERTY SUPPORT
When PCRE2 is built with Unicode support, the escape sequences \p{..}, \P{..}, and \X can be used. This is not dependent on the PCRE2_UTF setting. -The Unicode properties that can be tested are limited to the general category -properties such as Lu for an upper case letter or Nd for a decimal number, the -Unicode script names such as Arabic or Han, and the derived properties Any and -L&. Full lists are given in the +The Unicode properties that can be tested are a subset of those that Perl +supports. Currently they are limited to the general category properties such as +Lu for an upper case letter or Nd for a decimal number, the Unicode script +names such as Arabic or Han, Bidi_Class, Bidi_Control, and the derived +properties Any and LC (synonym L&). Full lists are given in the pcre2pattern and pcre2syntax -documentation. Only the short names for properties are supported. For example, -\p{L} matches a letter. Its Perl synonym, \p{Letter}, is not supported. -Furthermore, in Perl, many properties may optionally be prefixed by "Is", for -compatibility with Perl 5.6. PCRE2 does not support this. +documentation. In general, only the short names for properties are supported. +For example, \p{L} matches a letter. Its longer synonym, \p{Letter}, is not +supported. Furthermore, in Perl, many properties may optionally be prefixed by +"Is", for compatibility with Perl 5.6. PCRE2 does not support this.
WIDE CHARACTERS AND UTF MODES @@ -477,7 +478,7 @@ AUTHORPhilip Hazel
-University Computing Service +Retired from University Computing Service
Cambridge, England.
@@ -486,9 +487,9 @@ Cambridge, England. REVISION
-Last updated: 23 February 2020 +Last updated: 22 December 2021
-Copyright © 1997-2020 University of Cambridge. +Copyright © 1997-2021 University of Cambridge.
Return to the PCRE2 index page. diff --git a/pcre2/doc/pcre2.txt b/pcre2/doc/pcre2.txt index dde66a136d9679f0fb1d402fdc75400cb82d8bcc..641a1f9d2c6c7d9f83eb42f0fd668f74f15f8ac4 100644 --- a/pcre2/doc/pcre2.txt +++ b/pcre2/doc/pcre2.txt @@ -1815,7 +1815,7 @@ COMPILING A PATTERN to crash or loop. Note that this option can also be passed to pcre2_match() and - pcre_dfa_match(), to suppress UTF validity checking of the subject + pcre2_dfa_match(), to suppress UTF validity checking of the subject string. Note also that setting PCRE2_NO_UTF_CHECK at compile time does not dis- @@ -2012,13 +2012,13 @@ LOCALE SUPPORT code points are less than 256. By default, higher-valued code points never match escapes such as \w or \d. - When PCRE2 is built with Unicode support (the default), the Unicode - properties of all characters can be tested with \p and \P, or, alterna- - tively, the PCRE2_UCP option can be set when a pattern is compiled; - this causes \w and friends to use Unicode property support instead of - the built-in tables. PCRE2_UCP also causes upper/lower casing opera- - tions on characters with code points greater than 127 to use Unicode - properties. These effects apply even when PCRE2_UTF is not set. + When PCRE2 is built with Unicode support (the default), certain Unicode + character properties can be tested with \p and \P, or, alternatively, + the PCRE2_UCP option can be set when a pattern is compiled; this causes + \w and friends to use Unicode property support instead of the built-in + tables. PCRE2_UCP also causes upper/lower casing operations on charac- + ters with code points greater than 127 to use Unicode properties. These + effects apply even when PCRE2_UTF is not set. The use of locales with Unicode is discouraged. If you are handling characters with code points greater than 127, you should either use @@ -2579,7 +2579,9 @@ MATCHING A PATTERN: THE TRADITIONAL FUNCTION and offset are in code units, not characters. 
That is, they are in bytes for the 8-bit library, 16-bit code units for the 16-bit library, and 32-bit code units for the 32-bit library, whether or not UTF pro- - cessing is enabled. + cessing is enabled. As a special case, if subject is NULL and length is + zero, the subject is assumed to be an empty string. If length is non- + zero, an error occurs if subject is NULL. If startoffset is greater than the length of the subject, pcre2_match() returns PCRE2_ERROR_BADOFFSET. When the starting offset is zero, the @@ -3280,8 +3282,12 @@ CREATING A NEW STRING WITH SUBSTITUTIONS This function optionally calls pcre2_match() and then makes a copy of the subject string in outputbuffer, replacing parts that were matched - with the replacement string, whose length is supplied in rlength. This - can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. + with the replacement string, whose length is supplied in rlength, which + can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As + a special case, if replacement is NULL and rlength is zero, the re- + placement is assumed to be an empty string. If rlength is non-zero, an + error occurs if replacement is NULL. + There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to re- turn just the replacement string(s). The default action is to perform just one replacement if the pattern matches, but there is an option @@ -3315,89 +3321,90 @@ CREATING A NEW STRING WITH SUBSTITUTIONS As well as the usual options for pcre2_match(), a number of additional options can be set in the options argument of pcre2_substitute(). One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external - match_data block must be provided, and it must have been used for an - external call to pcre2_match(). The data in the match_data block (re- - turn code, offset vector) is used for the first substitution instead of - calling pcre2_match() from within pcre2_substitute(). 
This allows an - application to check for a match before choosing to substitute, without - having to repeat the match. - - The contents of the externally supplied match data block are not - changed when PCRE2_SUBSTITUTE_MATCHED is set. If PCRE2_SUBSTI- - TUTE_GLOBAL is also set, pcre2_match() is called after the first sub- - stitution to check for further matches, but this is done using an in- - ternally obtained match data block, thus always leaving the external + match_data block must be provided, and it must have already been used + for an external call to pcre2_match() with the same pattern and subject + arguments. The data in the match_data block (return code, offset vec- + tor) is then used for the first substitution instead of calling + pcre2_match() from within pcre2_substitute(). This allows an applica- + tion to check for a match before choosing to substitute, without having + to repeat the match. + + The contents of the externally supplied match data block are not + changed when PCRE2_SUBSTITUTE_MATCHED is set. If PCRE2_SUBSTI- + TUTE_GLOBAL is also set, pcre2_match() is called after the first sub- + stitution to check for further matches, but this is done using an in- + ternally obtained match data block, thus always leaving the external block unchanged. - The code argument is not used for matching before the first substitu- - tion when PCRE2_SUBSTITUTE_MATCHED is set, but it must be provided, - even when PCRE2_SUBSTITUTE_GLOBAL is not set, because it contains in- + The code argument is not used for matching before the first substitu- + tion when PCRE2_SUBSTITUTE_MATCHED is set, but it must be provided, + even when PCRE2_SUBSTITUTE_GLOBAL is not set, because it contains in- formation such as the UTF setting and the number of capturing parenthe- ses in the pattern. - The default action of pcre2_substitute() is to return a copy of the + The default action of pcre2_substitute() is to return a copy of the subject string with matched substrings replaced. 
However, if PCRE2_SUB- - STITUTE_REPLACEMENT_ONLY is set, only the replacement substrings are + STITUTE_REPLACEMENT_ONLY is set, only the replacement substrings are returned. In the global case, multiple replacements are concatenated in - the output buffer. Substitution callouts (see below) can be used to + the output buffer. Substitution callouts (see below) can be used to separate them if necessary. - The outlengthptr argument of pcre2_substitute() must point to a vari- - able that contains the length, in code units, of the output buffer. If - the function is successful, the value is updated to contain the length - in code units of the new string, excluding the trailing zero that is + The outlengthptr argument of pcre2_substitute() must point to a vari- + able that contains the length, in code units, of the output buffer. If + the function is successful, the value is updated to contain the length + in code units of the new string, excluding the trailing zero that is automatically added. - If the function is not successful, the value set via outlengthptr de- - pends on the type of error. For syntax errors in the replacement + If the function is not successful, the value set via outlengthptr de- + pends on the type of error. For syntax errors in the replacement string, the value is the offset in the replacement string where the er- - ror was detected. For other errors, the value is PCRE2_UNSET by de- + ror was detected. For other errors, the value is PCRE2_UNSET by de- fault. This includes the case of the output buffer being too small, un- less PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set. - PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output + PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is too small. The default action is to return PCRE2_ERROR_NOMEM- - ORY immediately. If this option is set, however, pcre2_substitute() + ORY immediately. 
If this option is set, however, pcre2_substitute() continues to go through the motions of matching and substituting (with- - out, of course, writing anything) in order to compute the size of buf- - fer that is needed. This value is passed back via the outlengthptr - variable, with the result of the function still being PCRE2_ER- + out, of course, writing anything) in order to compute the size of buf- + fer that is needed. This value is passed back via the outlengthptr + variable, with the result of the function still being PCRE2_ER- ROR_NOMEMORY. - Passing a buffer size of zero is a permitted way of finding out how - much memory is needed for given substitution. However, this does mean + Passing a buffer size of zero is a permitted way of finding out how + much memory is needed for given substitution. However, this does mean that the entire operation is carried out twice. Depending on the appli- - cation, it may be more efficient to allocate a large buffer and free - the excess afterwards, instead of using PCRE2_SUBSTITUTE_OVER- + cation, it may be more efficient to allocate a large buffer and free + the excess afterwards, instead of using PCRE2_SUBSTITUTE_OVER- FLOW_LENGTH. - The replacement string, which is interpreted as a UTF string in UTF - mode, is checked for UTF validity unless PCRE2_NO_UTF_CHECK is set. An + The replacement string, which is interpreted as a UTF string in UTF + mode, is checked for UTF validity unless PCRE2_NO_UTF_CHECK is set. An invalid UTF replacement string causes an immediate return with the rel- evant UTF error code. - If PCRE2_SUBSTITUTE_LITERAL is set, the replacement string is not in- + If PCRE2_SUBSTITUTE_LITERAL is set, the replacement string is not in- terpreted in any way. 
By default, however, a dollar character is an es- - cape character that can specify the insertion of characters from cap- - ture groups and names from (*MARK) or other control verbs in the pat- + cape character that can specify the insertion of characters from cap- + ture groups and names from (*MARK) or other control verbs in the pat- tern. The following forms are always recognized: $$ insert a dollar character $
or ${ } insert the contents of group $*MARK or ${*MARK} insert a control verb name - Either a group number or a group name can be given for . Curly - brackets are required only if the following character would be inter- + Either a group number or a group name can be given for . Curly + brackets are required only if the following character would be inter- preted as part of the number or name. The number may be zero to include - the entire matched string. For example, if the pattern a(b)c is - matched with "=abc=" and the replacement string "+$1$0$1+", the result + the entire matched string. For example, if the pattern a(b)c is + matched with "=abc=" and the replacement string "+$1$0$1+", the result is "=+babcb+=". - $*MARK inserts the name from the last encountered backtracking control - verb on the matching path that has a name. (*MARK) must always include - a name, but the other verbs need not. For example, in the case of + $*MARK inserts the name from the last encountered backtracking control + verb on the matching path that has a name. (*MARK) must always include + a name, but the other verbs need not. For example, in the case of (*MARK:A)(*PRUNE) the name inserted is "A", but for (*MARK:A)(*PRUNE:B) - the relevant name is "B". This facility can be used to perform simple + the relevant name is "B". This facility can be used to perform simple simultaneous substitutions, as this pcre2test example shows: /(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK} @@ -3405,15 +3412,15 @@ CREATING A NEW STRING WITH SUBSTITUTIONS 2: pear orange PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subject - string, replacing every matching substring. If this option is not set, - only the first matching substring is replaced. The search for matches - takes place in the original subject string (that is, previous replace- - ments do not affect it). 
Iteration is implemented by advancing the - startoffset value for each search, which is always passed the entire + string, replacing every matching substring. If this option is not set, + only the first matching substring is replaced. The search for matches + takes place in the original subject string (that is, previous replace- + ments do not affect it). Iteration is implemented by advancing the + startoffset value for each search, which is always passed the entire subject string. If an offset limit is set in the match context, search- ing stops when that limit is reached. - You can restrict the effect of a global substitution to a portion of + You can restrict the effect of a global substitution to a portion of the subject string by setting either or both of startoffset and an off- set limit. Here is a pcre2test example: @@ -3421,73 +3428,73 @@ CREATING A NEW STRING WITH SUBSTITUTIONS ABC ABC ABC ABC\=offset=3,offset_limit=12 2: ABC A!C A!C ABC - When continuing with global substitutions after matching a substring + When continuing with global substitutions after matching a substring with zero length, an attempt to find a non-empty match at the same off- set is performed. If this is not successful, the offset is advanced by one character except when CRLF is a valid newline sequence and the next - two characters are CR, LF. In this case, the offset is advanced by two + two characters are CR, LF. In this case, the offset is advanced by two characters. PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capture groups that do not appear in the pattern to be treated as unset groups. This option - should be used with care, because it means that a typo in a group name + should be used with care, because it means that a typo in a group name or number no longer causes the PCRE2_ERROR_NOSUBSTRING error. 
PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capture groups (including un- - known groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated - as empty strings when inserted as described above. If this option is + known groups when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated + as empty strings when inserted as described above. If this option is not set, an attempt to insert an unset group causes the PCRE2_ERROR_UN- - SET error. This option does not influence the extended substitution + SET error. This option does not influence the extended substitution syntax described below. - PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the - replacement string. Without this option, only the dollar character is - special, and only the group insertion forms listed above are valid. + PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to the + replacement string. Without this option, only the dollar character is + special, and only the group insertion forms listed above are valid. When PCRE2_SUBSTITUTE_EXTENDED is set, two things change: - Firstly, backslash in a replacement string is interpreted as an escape + Firstly, backslash in a replacement string is interpreted as an escape character. The usual forms such as \n or \x{ddd} can be used to specify - particular character codes, and backslash followed by any non-alphanu- - meric character quotes that character. Extended quoting can be coded + particular character codes, and backslash followed by any non-alphanu- + meric character quotes that character. Extended quoting can be coded using \Q...\E, exactly as in pattern strings. - There are also four escape sequences for forcing the case of inserted - letters. The insertion mechanism has three states: no case forcing, + There are also four escape sequences for forcing the case of inserted + letters. The insertion mechanism has three states: no case forcing, force upper case, and force lower case. 
The escape sequences change the current state: \U and \L change to upper or lower case forcing, respec- - tively, and \E (when not terminating a \Q quoted sequence) reverts to - no case forcing. The sequences \u and \l force the next character (if - it is a letter) to upper or lower case, respectively, and then the + tively, and \E (when not terminating a \Q quoted sequence) reverts to + no case forcing. The sequences \u and \l force the next character (if + it is a letter) to upper or lower case, respectively, and then the state automatically reverts to no case forcing. Case forcing applies to - all inserted characters, including those from capture groups and let- - ters within \Q...\E quoted sequences. If either PCRE2_UTF or PCRE2_UCP - was set when the pattern was compiled, Unicode properties are used for + all inserted characters, including those from capture groups and let- + ters within \Q...\E quoted sequences. If either PCRE2_UTF or PCRE2_UCP + was set when the pattern was compiled, Unicode properties are used for case forcing characters whose code points are greater than 127. Note that case forcing sequences such as \U...\E do not nest. For exam- - ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final - \E has no effect. Note also that the PCRE2_ALT_BSUX and PCRE2_EX- + ple, the result of processing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final + \E has no effect. Note also that the PCRE2_ALT_BSUX and PCRE2_EX- TRA_ALT_BSUX options do not apply to replacement strings. - The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more - flexibility to capture group substitution. The syntax is similar to + The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more + flexibility to capture group substitution. The syntax is similar to that used by Bash: ${ :- } ${ :+ : } - As before, may be a group number or a name. The first form speci- - fies a default value. 
If group is set, its value is inserted; if - not, is expanded and the result inserted. The second form - specifies strings that are expanded and inserted when group is set - or unset, respectively. The first form is just a convenient shorthand + As before, may be a group number or a name. The first form speci- + fies a default value. If group is set, its value is inserted; if + not, is expanded and the result inserted. The second form + specifies strings that are expanded and inserted when group is set + or unset, respectively. The first form is just a convenient shorthand for ${ :+${ }: } - Backslash can be used to escape colons and closing curly brackets in - the replacement strings. A change of the case forcing state within a - replacement string remains in force afterwards, as shown in this + Backslash can be used to escape colons and closing curly brackets in + the replacement strings. A change of the case forcing state within a + replacement string remains in force afterwards, as shown in this pcre2test example: /(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo @@ -3496,8 +3503,8 @@ CREATING A NEW STRING WITH SUBSTITUTIONS somebody 1: HELLO - The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended - substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause un- + The PCRE2_SUBSTITUTE_UNSET_EMPTY option does not affect these extended + substitutions. However, PCRE2_SUBSTITUTE_UNKNOWN_UNSET does cause un- known groups in the extended syntax forms to be treated as unset. If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_UNKNOWN_UNSET, @@ -3506,37 +3513,39 @@ CREATING A NEW STRING WITH SUBSTITUTIONS Substitution errors - In the event of an error, pcre2_substitute() returns a negative error - code. Except for PCRE2_ERROR_NOMATCH (which is never returned), errors + In the event of an error, pcre2_substitute() returns a negative error + code. 
Except for PCRE2_ERROR_NOMATCH (which is never returned), errors from pcre2_match() are passed straight back. PCRE2_ERROR_NOSUBSTRING is returned for a non-existent substring inser- tion, unless PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set. PCRE2_ERROR_UNSET is returned for an unset substring insertion (includ- - ing an unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) - when the simple (non-extended) syntax is used and PCRE2_SUBSTITUTE_UN- + ing an unknown substring when PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) + when the simple (non-extended) syntax is used and PCRE2_SUBSTITUTE_UN- SET_EMPTY is not set. - PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big + PCRE2_ERROR_NOMEMORY is returned if the output buffer is not big enough. If the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option is set, the size - of buffer that is needed is returned via outlengthptr. Note that this + of buffer that is needed is returned via outlengthptr. Note that this does not happen by default. PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the - match_data argument is NULL. + match_data argument is NULL or if the subject or replacement arguments + are NULL. For backward compatibility reasons an exception is made for + the replacement argument if the rlength argument is also 0. 
- PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in - the replacement string, with more particular errors being PCRE2_ER- + PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in + the replacement string, with more particular errors being PCRE2_ER- ROR_BADREPESCAPE (invalid escape sequence), PCRE2_ERROR_REPMISSINGBRACE - (closing curly bracket not found), PCRE2_ERROR_BADSUBSTITUTION (syntax - error in extended group substitution), and PCRE2_ERROR_BADSUBSPATTERN + (closing curly bracket not found), PCRE2_ERROR_BADSUBSTITUTION (syntax + error in extended group substitution), and PCRE2_ERROR_BADSUBSPATTERN (the pattern match ended before it started or the match started earlier - than the current position in the subject, which can happen if \K is + than the current position in the subject, which can happen if \K is used in an assertion). As for all PCRE2 errors, a text message that describes the error can be - obtained by calling the pcre2_get_error_message() function (see "Ob- + obtained by calling the pcre2_get_error_message() function (see "Ob- taining a textual error message" above). Substitution callouts @@ -3545,15 +3554,15 @@ CREATING A NEW STRING WITH SUBSTITUTIONS int (*callout_function)(pcre2_substitute_callout_block *, void *), void *callout_data); - The pcre2_set_substitution_callout() function can be used to specify a - callout function for pcre2_substitute(). This information is passed in + The pcre2_set_substitution_callout() function can be used to specify a + callout function for pcre2_substitute(). This information is passed in a match context. The callout function is called after each substitution has been processed, but it can cause the replacement not to happen. The - callout function is not called for simulated substitutions that happen + callout function is not called for simulated substitutions that happen as a result of the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. 
The first argument of the callout function is a pointer to a substitute - callout block structure, which contains the following fields, not nec- + callout block structure, which contains the following fields, not nec- essarily in this order: uint32_t version; @@ -3564,34 +3573,34 @@ CREATING A NEW STRING WITH SUBSTITUTIONS uint32_t oveccount; PCRE2_SIZE output_offsets[2]; - The version field contains the version number of the block format. The - current version is 0. The version number will increase in future if - more fields are added, but the intention is never to remove any of the + The version field contains the version number of the block format. The + current version is 0. The version number will increase in future if + more fields are added, but the intention is never to remove any of the existing fields. The subscount field is the number of the current match. It is 1 for the first callout, 2 for the second, and so on. The input and output point- ers are copies of the values passed to pcre2_substitute(). - The ovector field points to the ovector, which contains the result of + The ovector field points to the ovector, which contains the result of the most recent match. The oveccount field contains the number of pairs that are set in the ovector, and is always greater than zero. - The output_offsets vector contains the offsets of the replacement in - the output string. This has already been processed for dollar and (if + The output_offsets vector contains the offsets of the replacement in + the output string. This has already been processed for dollar and (if requested) backslash substitutions as described above. - The second argument of the callout function is the value passed as - callout_data when the function was registered. The value returned by + The second argument of the callout function is the value passed as + callout_data when the function was registered. 
The value returned by the callout function is interpreted as follows: - If the value is zero, the replacement is accepted, and, if PCRE2_SUB- - STITUTE_GLOBAL is set, processing continues with a search for the next - match. If the value is not zero, the current replacement is not ac- - cepted. If the value is greater than zero, processing continues when - PCRE2_SUBSTITUTE_GLOBAL is set. Otherwise (the value is less than zero - or PCRE2_SUBSTITUTE_GLOBAL is not set), the the rest of the input is - copied to the output and the call to pcre2_substitute() exits, return- + If the value is zero, the replacement is accepted, and, if PCRE2_SUB- + STITUTE_GLOBAL is set, processing continues with a search for the next + match. If the value is not zero, the current replacement is not ac- + cepted. If the value is greater than zero, processing continues when + PCRE2_SUBSTITUTE_GLOBAL is set. Otherwise (the value is less than zero + or PCRE2_SUBSTITUTE_GLOBAL is not set), the rest of the input is + copied to the output and the call to pcre2_substitute() exits, return- ing the number of matches so far. @@ -3600,56 +3609,56 @@ DUPLICATE CAPTURE GROUP NAMES int pcre2_substring_nametable_scan(const pcre2_code *code, PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last); - When a pattern is compiled with the PCRE2_DUPNAMES option, names for - capture groups are not required to be unique. Duplicate names are al- - ways allowed for groups with the same number, created by using the (?| + When a pattern is compiled with the PCRE2_DUPNAMES option, names for + capture groups are not required to be unique. Duplicate names are al- + ways allowed for groups with the same number, created by using the (?| feature. Indeed, if such groups are named, they are required to use the same names. - Normally, patterns that use duplicate names are such that in any one - match, only one of each set of identically-named groups participates.
+ Normally, patterns that use duplicate names are such that in any one + match, only one of each set of identically-named groups participates. An example is shown in the pcre2pattern documentation. - When duplicates are present, pcre2_substring_copy_byname() and - pcre2_substring_get_byname() return the first substring corresponding - to the given name that is set. Only if none are set is PCRE2_ERROR_UN- - SET is returned. The pcre2_substring_number_from_name() function re- - turns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are duplicate + When duplicates are present, pcre2_substring_copy_byname() and + pcre2_substring_get_byname() return the first substring corresponding + to the given name that is set. Only if none are set is PCRE2_ERROR_UN- + SET returned. The pcre2_substring_number_from_name() function re- + turns the error PCRE2_ERROR_NOUNIQUESUBSTRING when there are duplicate names. - If you want to get full details of all captured substrings for a given - name, you must use the pcre2_substring_nametable_scan() function. The - first argument is the compiled pattern, and the second is the name. If - the third and fourth arguments are NULL, the function returns a group + If you want to get full details of all captured substrings for a given + name, you must use the pcre2_substring_nametable_scan() function. The + first argument is the compiled pattern, and the second is the name. If + the third and fourth arguments are NULL, the function returns a group number for a unique name, or PCRE2_ERROR_NOUNIQUESUBSTRING otherwise. When the third and fourth arguments are not NULL, they must be pointers - to variables that are updated by the function. After it has run, they + to variables that are updated by the function. After it has run, they point to the first and last entries in the name-to-number table for the - given name, and the function returns the length of each entry in code - units.
In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are + given name, and the function returns the length of each entry in code + units. In both cases, PCRE2_ERROR_NOSUBSTRING is returned if there are no entries for the given name. The format of the name table is described above in the section entitled - Information about a pattern. Given all the relevant entries for the - name, you can extract each of their numbers, and hence the captured + Information about a pattern. Given all the relevant entries for the + name, you can extract each of their numbers, and hence the captured data. FINDING ALL POSSIBLE MATCHES AT ONE POSITION - The traditional matching function uses a similar algorithm to Perl, - which stops when it finds the first match at a given point in the sub- + The traditional matching function uses a similar algorithm to Perl, + which stops when it finds the first match at a given point in the sub- ject. If you want to find all possible matches, or the longest possible - match at a given position, consider using the alternative matching - function (see below) instead. If you cannot use the alternative func- + match at a given position, consider using the alternative matching + function (see below) instead. If you cannot use the alternative func- tion, you can kludge it up by making use of the callout facility, which is described in the pcre2callout documentation. What you have to do is to insert a callout right at the end of the pat- - tern. When your callout function is called, extract and save the cur- - rent matched substring. Then return 1, which forces pcre2_match() to - backtrack and try other alternatives. Ultimately, when it runs out of + tern. When your callout function is called, extract and save the cur- + rent matched substring. Then return 1, which forces pcre2_match() to + backtrack and try other alternatives. Ultimately, when it runs out of matches, pcre2_match() will yield PCRE2_ERROR_NOMATCH. 
@@ -3661,15 +3670,16 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount); - The function pcre2_dfa_match() is called to match a subject string - against a compiled pattern, using a matching algorithm that scans the + The function pcre2_dfa_match() is called to match a subject string + against a compiled pattern, using a matching algorithm that scans the subject string just once (not counting lookaround assertions), and does - not backtrack. This has different characteristics to the normal algo- - rithm, and is not compatible with Perl. Some of the features of PCRE2 - patterns are not supported. Nevertheless, there are times when this - kind of matching can be useful. For a discussion of the two matching - algorithms, and a list of features that pcre2_dfa_match() does not sup- - port, see the pcre2matching documentation. + not backtrack (except when processing lookaround assertions). This has + different characteristics to the normal algorithm, and is not compati- + ble with Perl. Some of the features of PCRE2 patterns are not sup- + ported. Nevertheless, there are times when this kind of matching can be + useful. For a discussion of the two matching algorithms, and a list of + features that pcre2_dfa_match() does not support, see the pcre2matching + documentation. The arguments for the pcre2_dfa_match() function are the same as for pcre2_match(), plus two extras. The ovector within the match data block @@ -3698,7 +3708,7 @@ MATCHING A PATTERN: THE ALTERNATIVE FUNCTION wspace, /* working space vector */ 20); /* number of elements (NOT size in bytes) */ - Option bits for pcre_dfa_match() + Option bits for pcre2_dfa_match() The unused bits of the options argument for pcre2_dfa_match() must be zero. The only bits that may be set are PCRE2_ANCHORED, @@ -3848,7 +3858,7 @@ AUTHOR REVISION - Last updated: 30 August 2021 + Last updated: 14 December 2021 Copyright (c) 1997-2021 University of Cambridge. 
------------------------------------------------------------------------------ @@ -3961,8 +3971,8 @@ UNICODE AND UTF SUPPORT 0x10ffff in the strings that they handle. Unicode support also gives access to the Unicode properties of characters, using pattern escapes such as \P, \p, and \X. Only the general category properties such as Lu - and Nd are supported. Details are given in the pcre2pattern documenta- - tion. + and Nd, script names, and some bi-directional properties are supported. + Details are given in the pcre2pattern documentation. Pattern escapes such as \d and \w do not by default make use of Unicode properties. The application can request that they do by setting the @@ -4128,7 +4138,7 @@ LIMITING PCRE2 RESOURCE USAGE for --with-match-limit. You can set a lower default limit by adding, for example, - --with-match-limit_depth=10000 + --with-match-limit-depth=10000 to the configure command. This value can be overridden at run time. This depth limit indirectly limits the amount of heap memory that is @@ -4444,8 +4454,8 @@ AUTHOR REVISION - Last updated: 20 March 2020 - Copyright (c) 1997-2020 University of Cambridge. + Last updated: 08 December 2021 + Copyright (c) 1997-2021 University of Cambridge. ------------------------------------------------------------------------------ @@ -4890,57 +4900,64 @@ DIFFERENCES BETWEEN PCRE2 AND PERL This document describes some of the differences in the ways that PCRE2 and Perl handle regular expressions. The differences described here are - with respect to Perl version 5.32.0, but as both Perl and PCRE2 are + with respect to Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the information may at times be out of date. - 1. PCRE2 has only a subset of Perl's Unicode support. Details of what + 1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, + the behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' 
+ matches the next character unless it is the start of a newline se- + quence. This means that, if the newline setting is CR, CRLF, or NUL, + '.' will match the code point LF (0x0A) in ASCII/Unicode environments, + and NL (either 0x15 or 0x25) when using EBCDIC. In Perl, '.' appears + never to match LF, even when 0x0A is not a newline indicator. + + 2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does have are given in the pcre2unicode page. - 2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized asser- + 3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized asser- tions, but they do not mean what you might think. For example, (?!a){3} does not assert that the next three characters are not "a". It just as- serts that the next character is not "a" three times (in principle; PCRE2 optimizes this to run the assertion just once). Perl allows some - repeat quantifiers on other assertions, for example, \b* (but not - \b{3}, though oddly it does allow ^{3}), but these do not seem to have - any use. PCRE2 does not allow any kind of quantifier on non-lookaround - assertions. - - 3. Capture groups that occur inside negative lookaround assertions are - counted, but their entries in the offsets vector are set only when a - negative assertion is a condition that has a matching branch (that is, - the condition is false). Perl may set such capture groups in other + repeat quantifiers on other assertions, for example, \b* , but these do + not seem to have any use. PCRE2 does not allow any kind of quantifier + on non-lookaround assertions. + + 4. Capture groups that occur inside negative lookaround assertions are + counted, but their entries in the offsets vector are set only when a + negative assertion is a condition that has a matching branch (that is, + the condition is false). Perl may set such capture groups in other circumstances. - 4. The following Perl escape sequences are not supported: \F, \l, \L, + 5. 
The following Perl escape sequences are not supported: \F, \l, \L, \u, \U, and \N when followed by a character name. \N on its own, match- - ing a non-newline character, and \N{U+dd..}, matching a Unicode code - point, are supported. The escapes that modify the case of following - letters are implemented by Perl's general string-handling and are not + ing a non-newline character, and \N{U+dd..}, matching a Unicode code + point, are supported. The escapes that modify the case of following + letters are implemented by Perl's general string-handling and are not part of its pattern matching engine. If any of these are encountered by - PCRE2, an error is generated by default. However, if either of the - PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX options is set, \U and \u are + PCRE2, an error is generated by default. However, if either of the + PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX options is set, \U and \u are interpreted as ECMAScript interprets them. - 5. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 + 6. The Perl escape sequences \p, \P, and \X are supported only if PCRE2 is built with Unicode support (the default). The properties that can be - tested with \p and \P are limited to the general category properties - such as Lu and Nd, script names such as Greek or Han, and the derived - properties Any and L&. Both PCRE2 and Perl support the Cs (surrogate) - property, but in PCRE2 its use is limited. See the pcre2pattern docu- - mentation for details. The long synonyms for property names that Perl - supports (such as \p{Letter}) are not supported by PCRE2, nor is it - permitted to prefix any of these properties with "Is". - - 6. PCRE2 supports the \Q...\E escape for quoting substrings. Characters + tested with \p and \P are limited to the general category properties + such as Lu and Nd, script names such as Greek or Han, Bidi_Class, + Bidi_Control, and the derived properties Any and LC (synonym L&). 
Both + PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its + use is limited. See the pcre2pattern documentation for details. The + long synonyms for property names that Perl supports (such as \p{Let- + ter}) are not supported by PCRE2, nor is it permitted to prefix any of + these properties with "Is". + + 7. PCRE2 supports the \Q...\E escape for quoting substrings. Characters in between are treated as literals. However, this is slightly different from Perl in that $ and @ are also handled as literals inside the - quotes. In Perl, they cause variable interpolation (but of course PCRE2 - does not have variables). Also, Perl does "double-quotish backslash in- - terpolation" on any backslashes between \Q and \E which, its documenta- - tion says, "may lead to confusing results". PCRE2 treats a backslash - between \Q and \E just like any other character. Note the following ex- - amples: + quotes. In Perl, they cause variable interpolation (PCRE2 does not have + variables). Also, Perl does "double-quotish backslash interpolation" on + any backslashes between \Q and \E which, its documentation says, "may + lead to confusing results". PCRE2 treats a backslash between \Q and \E + just like any other character. Note the following examples: Pattern PCRE2 matches Perl matches @@ -4951,81 +4968,82 @@ DIFFERENCES BETWEEN PCRE2 AND PERL \QA\B\E A\B A\B \Q\\E \ \\E - The \Q...\E sequence is recognized both inside and outside character + The \Q...\E sequence is recognized both inside and outside character classes by both PCRE2 and Perl. - 7. Fairly obviously, PCRE2 does not support the (?{code}) and + 8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code}) constructions. However, PCRE2 does have a "callout" feature, which allows an external function to be called during pattern matching. See the pcre2callout documentation for details. - 8. 
Subroutine calls (whether recursive or not) were treated as atomic - groups up to PCRE2 release 10.23, but from release 10.30 this changed, + 9. Subroutine calls (whether recursive or not) were treated as atomic + groups up to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking into subroutine calls is now supported, as in Perl. - 9. In PCRE2, if any of the backtracking control verbs are used in a - group that is called as a subroutine (whether or not recursively), - their effect is confined to that group; it does not extend to the sur- - rounding pattern. This is not always the case in Perl. In particular, - if (*THEN) is present in a group that is called as a subroutine, its + 10. In PCRE2, if any of the backtracking control verbs are used in a + group that is called as a subroutine (whether or not recursively), + their effect is confined to that group; it does not extend to the sur- + rounding pattern. This is not always the case in Perl. In particular, + if (*THEN) is present in a group that is called as a subroutine, its action is limited to that group, even if the group does not contain any - | characters. Note that such groups are processed as anchored at the + | characters. Note that such groups are processed as anchored at the point where they are tested. - 10. If a pattern contains more than one backtracking control verb, the - first one that is backtracked onto acts. For example, in the pattern - A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure + 11. If a pattern contains more than one backtracking control verb, the + first one that is backtracked onto acts. For example, in the pattern + A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the same as PCRE2, but there are cases where it differs. - 11. 
There are some differences that are concerned with the settings of - captured strings when part of a pattern is repeated. For example, - matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2 un- + 12. There are some differences that are concerned with the settings of + captured strings when part of a pattern is repeated. For example, + matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2 un- set, but in PCRE2 it is set to "b". - 12. PCRE2's handling of duplicate capture group numbers and names is - not as general as Perl's. This is a consequence of the fact the PCRE2 - works internally just with numbers, using an external table to trans- - late between numbers and names. In particular, a pattern such as - (?|(?<AA>A)|(?<BB>B)), where the two capture groups have the same number - but different names, is not supported, and causes an error at compile + 13. PCRE2's handling of duplicate capture group numbers and names is + not as general as Perl's. This is a consequence of the fact that PCRE2 + works internally just with numbers, using an external table to trans- + late between numbers and names. In particular, a pattern such as + (?|(?<AA>A)|(?<BB>B)), where the two capture groups have the same number + but different names, is not supported, and causes an error at compile time. If it were allowed, it would not be possible to distinguish which - group matched, because both names map to capture group number 1. To + group matched, because both names map to capture group number 1. To avoid this confusing situation, an error is given at compile time. - 13. Perl used to recognize comments in some places that PCRE2 does not, - for example, between the ( and ? at the start of a group. If the /x - modifier is set, Perl allowed white space between ( and ? though the - latest Perls give an error (for a while it was just deprecated). There + 14. Perl used to recognize comments in some places that PCRE2 does not, + for example, between the ( and ? at the start of a group.
If the /x + modifier is set, Perl allowed white space between ( and ? though the + latest Perls give an error (for a while it was just deprecated). There may still be some cases where Perl behaves differently. - 14. Perl, when in warning mode, gives warnings for character classes - such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as liter- + 15. Perl, when in warning mode, gives warnings for character classes + such as [A-\d] or [a-[:digit:]]. It then treats the hyphens as liter- als. PCRE2 has no warning features, so it gives an error in these cases because they are almost certainly user mistakes. - 15. In PCRE2, the upper/lower case character properties Lu and Ll are - not affected when case-independent matching is specified. For example, + 16. In PCRE2, the upper/lower case character properties Lu and Ll are + not affected when case-independent matching is specified. For example, \p{Lu} always matches an upper case letter. I think Perl has changed in - this respect; in the release at the time of writing (5.32), \p{Lu} and + this respect; in the release at the time of writing (5.34), \p{Lu} and \p{Ll} match all letters, regardless of case, when case independence is specified. - 16. From release 5.32.0, Perl locks out the use of \K in lookaround as- - sertions. From release 10.38 PCRE2 does the same by default. However, - there is an option for re-enabling the previous behaviour. When this - option is set, \K is acted on when it occurs in positive assertions, + 17. From release 5.32.0, Perl locks out the use of \K in lookaround as- + sertions. From release 10.38 PCRE2 does the same by default. However, + there is an option for re-enabling the previous behaviour. When this + option is set, \K is acted on when it occurs in positive assertions, but is ignored in negative assertions. - 17. PCRE2 provides some extensions to the Perl regular expression fa- - cilities. 
Perl 5.10 included new features that were not in earlier - versions of Perl, some of which (such as named parentheses) were in - PCRE2 for some time before. This list is with respect to Perl 5.32: + 18. PCRE2 provides some extensions to the Perl regular expression fa- + cilities. Perl 5.10 included new features that were not in earlier + versions of Perl, some of which (such as named parentheses) were in + PCRE2 for some time before. This list is with respect to Perl 5.34: - (a) Although lookbehind assertions in PCRE2 must match fixed length + (a) Although lookbehind assertions in PCRE2 must match fixed length strings, each alternative toplevel branch of a lookbehind assertion can - match a different length of string. Perl requires them all to have the - same length. + match a different length of string. Perl used to require them all to + have the same length, but the latest version has some variable length + support. (b) From PCRE2 10.23, backreferences to groups of fixed length are sup- ported in lookbehinds, provided that there is no possibility of refer- @@ -5067,12 +5085,12 @@ DIFFERENCES BETWEEN PCRE2 AND PERL an extension to the lookaround facilities. The default, Perl-compatible lookarounds are atomic. - 18. The Perl /a modifier restricts /d numbers to pure ascii, and the + 19. The Perl /a modifier restricts \d numbers to pure ascii, and the /aa modifier restricts /i case-insensitive matching to pure ascii, ig- noring Unicode rules. This separation cannot be represented with PCRE2_UCP. - 19. Perl has different limits than PCRE2. See the pcre2limit documenta- + 20. Perl has different limits than PCRE2. See the pcre2limit documenta- tion for details. Perl went with 5.10 from recursion to iteration keep- ing the intermediate matches on the heap, which is ~10% slower but does not fall into any stack-overflow limit.
PCRE2 made a similar change at @@ -5089,7 +5107,7 @@ AUTHOR REVISION - Last updated: 30 August 2021 + Last updated: 08 December 2021 Copyright (c) 1997-2021 University of Cambridge. ------------------------------------------------------------------------------ @@ -5434,7 +5452,7 @@ FREEING JIT SPECULATIVE MEMORY void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext); The JIT executable allocator does not free all memory when it is possi- - ble. It expects new allocations, and keeps some free memory around to + ble. It expects new allocations, and keeps some free memory around to improve allocation speed. However, in low memory conditions, it might be better to free all possible memory. You can cause this to happen by calling pcre2_jit_free_unused_memory(). Its argument is a general con- @@ -5492,12 +5510,13 @@ JIT FAST PATH API When you call pcre2_match(), as well as testing for invalid options, a number of other sanity checks are performed on the arguments. For exam- - ple, if the subject pointer is NULL, an immediate error is given. Also, - unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for - validity. In the interests of speed, these checks do not happen on the - JIT fast path, and if invalid data is passed, the result is undefined. + ple, if the subject pointer is NULL but the length is non-zero, an im- + mediate error is given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF + subject string is tested for validity. In the interests of speed, these + checks do not happen on the JIT fast path, and if invalid data is + passed, the result is undefined. - Bypassing the sanity checks and the pcre2_match() wrapping can give + Bypassing the sanity checks and the pcre2_match() wrapping can give speedups of more than 10%. @@ -5515,8 +5534,8 @@ AUTHOR REVISION - Last updated: 23 May 2019 - Copyright (c) 1997-2019 University of Cambridge. + Last updated: 30 November 2021 + Copyright (c) 1997-2021 University of Cambridge. 
------------------------------------------------------------------------------ @@ -6870,68 +6889,65 @@ BACKSLASH ters whose code points are less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points greater than 0x10ffff (the Unicode limit) may be encountered. These are all treated as being in the Un- - known script and with an unassigned type. The extra escape sequences - are: + known script and with an unassigned type. + + Matching characters by Unicode property is not fast, because PCRE2 has + to do a multistage table lookup in order to find a character's prop- + erty. That is why the traditional escape sequences such as \d and \w do + not use Unicode properties in PCRE2 by default, though you can make + them do so by setting the PCRE2_UCP option or by starting the pattern + with (*UCP). + + The extra escape sequences that provide property support are: \p{xx} a character with the xx property \P{xx} a character without the xx property \X a Unicode extended grapheme cluster - The property names represented by xx above are case-sensitive. There is - support for Unicode script names, Unicode general category properties, - "Any", which matches any character (including newline), and some spe- - cial PCRE2 properties (described in the next section). Other Perl - properties such as "InMusicalSymbols" are not supported by PCRE2. Note - that \P{Any} does not match any characters, so always causes a match - failure. - - Sets of Unicode characters are defined as belonging to certain scripts. - A character from one of these sets can be matched using a script name. - For example: - - \p{Greek} - \P{Han} + The property names represented by xx above are not case-sensitive, and + in accordance with Unicode's "loose matching" rules, spaces, hyphens, + and underscores are ignored. 
There is support for Unicode script names, + Unicode general category properties, "Any", which matches any character + (including newline), Bidi_Class, a number of binary (yes/no) proper- + ties, and some special PCRE2 properties (described below). Certain + other Perl properties such as "InMusicalSymbols" are not supported by + PCRE2. Note that \P{Any} does not match any characters, so always + causes a match failure. + + Script properties for \p and \P + + There are three different syntax forms for matching a script. Each Uni- + code character has a basic script and, optionally, a list of other + scripts ("Script Extensions") with which it is commonly used. Using the + Adlam script as an example, \p{sc:Adlam} matches characters whose basic + script is Adlam, whereas \p{scx:Adlam} matches, in addition, characters + that have Adlam in their extensions list. The full names "script" and + "script extensions" for the property types are recognized, and an equals + sign is an alternative to the colon. If a script name is given without + a property type, for example, \p{Adlam}, it is treated as \p{scx:Ad- + lam}. Perl changed to this interpretation at release 5.26 and PCRE2 + changed at release 10.40. Unassigned characters (and in non-UTF 32-bit mode, characters with code points greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not part of an identified script are lumped together as "Com- - mon".
The current list of scripts is: - - Adlam, Ahom, Anatolian_Hieroglyphs, Arabic, Armenian, Avestan, Bali- - nese, Bamum, Bassa_Vah, Batak, Bengali, Bhaiksuki, Bopomofo, Brahmi, - Braille, Buginese, Buhid, Canadian_Aboriginal, Carian, Caucasian_Alba- - nian, Chakma, Cham, Cherokee, Chorasmian, Common, Coptic, Cuneiform, - Cypriot, Cypro_Minoan, Cyrillic, Deseret, Devanagari, Dives_Akuru, Do- - gra, Duployan, Egyptian_Hieroglyphs, Elbasan, Elymaic, Ethiopic, Geor- - gian, Glagolitic, Gothic, Grantha, Greek, Gujarati, Gunjala_Gondi, Gur- - mukhi, Han, Hangul, Hanifi_Rohingya, Hanunoo, Hatran, Hebrew, Hiragana, - Imperial_Aramaic, Inherited, Inscriptional_Pahlavi, Inscrip- - tional_Parthian, Javanese, Kaithi, Kannada, Katakana, Kayah_Li, - Kharoshthi, Khitan_Small_Script, Khmer, Khojki, Khudawadi, Lao, Latin, - Lepcha, Limbu, Linear_A, Linear_B, Lisu, Lycian, Lydian, Mahajani, - Makasar, Malayalam, Mandaic, Manichaean, Marchen, Masaram_Gondi, Mede- - faidrin, Meetei_Mayek, Mende_Kikakui, Meroitic_Cursive, Meroitic_Hiero- - glyphs, Miao, Modi, Mongolian, Mro, Multani, Myanmar, Nabataean, Nandi- - nagari, New_Tai_Lue, Newa, Nko, Nushu, Nyakeng_Puachue_Hmong, Ogham, - Ol_Chiki, Old_Hungarian, Old_Italic, Old_North_Arabian, Old_Permic, - Old_Persian, Old_Sogdian, Old_South_Arabian, Old_Turkic, Old_Uyghur, - Oriya, Osage, Osmanya, Pahawh_Hmong, Palmyrene, Pau_Cin_Hau, Phags_Pa, - Phoenician, Psalter_Pahlavi, Rejang, Runic, Samaritan, Saurashtra, - Sharada, Shavian, Siddham, SignWriting, Sinhala, Sogdian, Sora_Sompeng, - Soyombo, Sundanese, Syloti_Nagri, Syriac, Tagalog, Tagbanwa, Tai_Le, - Tai_Tham, Tai_Viet, Takri, Tamil, Tangsa, Tangut, Telugu, Thaana, Thai, - Tibetan, Tifinagh, Tirhuta, Toto, Ugaritic, Unknown, Vai, Vithkuqi, - Wancho, Warang_Citi, Yezidi, Yi, Zanabazar_Square. + mon". 
The current list of recognized script names and their 4-character + abbreviations can be obtained by running this command: + + pcre2test -LS + + + The general category property for \p and \P Each character has exactly one Unicode general category property, spec- - ified by a two-letter abbreviation. For compatibility with Perl, nega- - tion can be specified by including a circumflex between the opening - brace and the property name. For example, \p{^Lu} is the same as + ified by a two-letter abbreviation. For compatibility with Perl, nega- + tion can be specified by including a circumflex between the opening + brace and the property name. For example, \p{^Lu} is the same as \P{Lu}. If only one letter is specified with \p or \P, it includes all the gen- - eral category properties that start with that letter. In this case, in - the absence of negation, the curly brackets in the escape sequence are + eral category properties that start with that letter. In this case, in + the absence of negation, the curly brackets in the escape sequence are optional; these two examples have the same effect: \p{L} @@ -6983,36 +6999,73 @@ BACKSLASH Zp Paragraph separator Zs Space separator - The special property L& is also supported: it matches a character that - has the Lu, Ll, or Lt property, in other words, a letter that is not - classified as a modifier or "other". + The special property LC, which has the synonym L&, is also supported: + it matches a character that has the Lu, Ll, or Lt property, in other + words, a letter that is not classified as a modifier or "other". - The Cs (Surrogate) property applies only to characters whose code - points are in the range U+D800 to U+DFFF. These characters are no dif- - ferent to any other character when PCRE2 is not in UTF mode (using the - 16-bit or 32-bit library). However, they are not valid in Unicode + The Cs (Surrogate) property applies only to characters whose code + points are in the range U+D800 to U+DFFF. 
These characters are no dif- + ferent to any other character when PCRE2 is not in UTF mode (using the + 16-bit or 32-bit library). However, they are not valid in Unicode strings and so cannot be tested by PCRE2 in UTF mode, unless UTF valid- - ity checking has been turned off (see the discussion of + ity checking has been turned off (see the discussion of PCRE2_NO_UTF_CHECK in the pcre2api page). - The long synonyms for property names that Perl supports (such as - \p{Letter}) are not supported by PCRE2, nor is it permitted to prefix + The long synonyms for property names that Perl supports (such as + \p{Letter}) are not supported by PCRE2, nor is it permitted to prefix any of these properties with "Is". No character that is in the Unicode table has the Cn (unassigned) prop- erty. Instead, this property is assumed for any code point that is not in the Unicode table. - Specifying caseless matching does not affect these escape sequences. - For example, \p{Lu} always matches only upper case letters. This is + Specifying caseless matching does not affect these escape sequences. + For example, \p{Lu} always matches only upper case letters. This is different from the behaviour of current versions of Perl. - Matching characters by Unicode property is not fast, because PCRE2 has - to do a multistage table lookup in order to find a character's prop- - erty. That is why the traditional escape sequences such as \d and \w do - not use Unicode properties in PCRE2 by default, though you can make - them do so by setting the PCRE2_UCP option or by starting the pattern - with (*UCP). + Binary (yes/no) properties for \p and \P + + Unicode defines a number of binary properties, that is, properties + whose only values are true or false. 
You can obtain a list of those + that are recognized by \p and \P, along with their abbreviations, by + running this command: + + pcre2test -LP + + + The Bidi_Class property for \p and \P + + \p{Bidi_Class:<class>} matches a character with the given class + \p{BC:<class>} matches a character with the given class + + The recognized classes are: + + AL Arabic letter + AN Arabic number + B paragraph separator + BN boundary neutral + CS common separator + EN European number + ES European separator + ET European terminator + FSI first strong isolate + L left-to-right + LRE left-to-right embedding + LRI left-to-right isolate + LRO left-to-right override + NSM non-spacing mark + ON other neutral + PDF pop directional format + PDI pop directional isolate + R right-to-left + RLE right-to-left embedding + RLI right-to-left isolate + RLO right-to-left override + S segment separator + WS white space + + An equals sign may be used instead of a colon. The class names are + case-insensitive; only the short names listed above are recognized. Extended grapheme clusters @@ -7267,14 +7320,16 @@ FULL STOP (PERIOD, DOT) AND \N Outside a character class, a dot in the pattern matches any one charac- ter in the subject string except (by default) a character that signi- - fies the end of a line. + fies the end of a line. One or more characters may be specified as line + terminators (see "Newline conventions" above). - When a line ending is defined as a single character, dot never matches - that character; when the two-character sequence CRLF is used, dot does - not match CR if it is immediately followed by LF, but otherwise it - matches all characters (including isolated CRs and LFs). When any Uni- - code line endings are being recognized, dot does not match CR or LF or - any of the other line ending characters. + Dot never matches a single line-ending character.
When the two-charac- + ter sequence CRLF is the only line ending, dot does not match CR if it + is immediately followed by LF, but otherwise it matches all characters + (including isolated CRs and LFs). When ANYCRLF is selected for line + endings, no occurrences of CR or LF match dot. When all Unicode line + endings are being recognized, dot does not match CR or LF or any of the + other line ending characters. The behaviour of dot with regard to newlines can be changed. If the PCRE2_DOTALL option is set, a dot matches any one character, without @@ -8068,7 +8123,7 @@ ATOMIC GROUPING AND POSSESSIVE QUANTIFIERS (*atomic:\d+)foo - This kind of parenthesized group "locks up" the part of the pattern it + This kind of parenthesized group "locks up" the part of the pattern it contains once it has matched, and a failure further into the pattern is prevented from backtracking into it. Backtracking past it to previous items, however, works as normal. @@ -9640,8 +9695,8 @@ AUTHOR REVISION - Last updated: 30 August 2021 - Copyright (c) 1997-2021 University of Cambridge. + Last updated: 12 January 2022 + Copyright (c) 1997-2022 University of Cambridge. ------------------------------------------------------------------------------ @@ -10312,11 +10367,11 @@ NAME SAVING AND RE-USING PRECOMPILED PCRE2 PATTERNS int32_t pcre2_serialize_decode(pcre2_code **codes, - int32_t number_of_codes, const uint32_t *bytes, + int32_t number_of_codes, const uint8_t *bytes, pcre2_general_context *gcontext); - int32_t pcre2_serialize_encode(pcre2_code **codes, - int32_t number_of_codes, uint32_t **serialized_bytes, + int32_t pcre2_serialize_encode(const pcre2_code **codes, + int32_t number_of_codes, uint8_t **serialized_bytes, PCRE2_SIZE *serialized_size, pcre2_general_context *gcontext); void pcre2_serialize_free(uint8_t *bytes); @@ -10440,7 +10495,6 @@ RE-USING PRECOMPILED PATTERNS If this argument is NULL, malloc() and free() are used.
After deserial- ization, the byte stream is no longer needed and can be discarded. - int32_t number_of_codes; pcre2_code *list_of_codes[2]; uint8_t *bytes = ; int32_t number_of_codes = @@ -10588,6 +10642,10 @@ CHARACTER TYPES iour of these escape sequences is changed to use Unicode properties and they match many more characters. + Property descriptions in \p and \P are matched caselessly; hyphens, un- + derscores, and white space are ignored, in accordance with Unicode's + "loose matching" rules. + GENERAL CATEGORY PROPERTIES FOR \p and \P @@ -10604,6 +10662,7 @@ GENERAL CATEGORY PROPERTIES FOR \p and \P Lo Other letter Lt Title case letter Lu Upper case letter + Lc Ll, Lu, or Lt L& Ll, Lu, or Lt M Mark @@ -10650,33 +10709,56 @@ PCRE2 SPECIAL CATEGORY PROPERTIES FOR \p and \P acter set at release 5.18. -SCRIPT NAMES FOR \p AND \P - - Adlam, Ahom, Anatolian_Hieroglyphs, Arabic, Armenian, Avestan, Bali- - nese, Bamum, Bassa_Vah, Batak, Bengali, Bhaiksuki, Bopomofo, Brahmi, - Braille, Buginese, Buhid, Canadian_Aboriginal, Carian, Caucasian_Alba- - nian, Chakma, Cham, Cherokee, Chorasmian, Common, Coptic, Cuneiform, - Cypriot, Cypro_Minoan, Cyrillic, Deseret, Devanagari, Dives_Akuru, Do- - gra, Duployan, Egyptian_Hieroglyphs, Elbasan, Elymaic, Ethiopic, Geor- - gian, Glagolitic, Gothic, Grantha, Greek, Gujarati, Gunjala_Gondi, Gur- - mukhi, Han, Hangul, Hanifi_Rohingya, Hanunoo, Hatran, Hebrew, Hiragana, - Imperial_Aramaic, Inherited, Inscriptional_Pahlavi, Inscrip- - tional_Parthian, Javanese, Kaithi, Kannada, Katakana, Kayah_Li, - Kharoshthi, Khitan_Small_Script, Khmer, Khojki, Khudawadi, Lao, Latin, - Lepcha, Limbu, Linear_A, Linear_B, Lisu, Lycian, Lydian, Mahajani, - Makasar, Malayalam, Mandaic, Manichaean, Marchen, Masaram_Gondi, Mede- - faidrin, Meetei_Mayek, Mende_Kikakui, Meroitic_Cursive, Meroitic_Hiero- - glyphs, Miao, Modi, Mongolian, Mro, Multani, Myanmar, Nabataean, Nandi- - nagari, New_Tai_Lue, Newa, Nko, Nushu, Nyakeng_Puachue_Hmong, Ogham, - Ol_Chiki, 
Old_Hungarian, Old_Italic, Old_North_Arabian, Old_Permic, - Old_Persian, Old_Sogdian, Old_South_Arabian, Old_Turkic, Old_Uyghur, - Oriya, Osage, Osmanya, Pahawh_Hmong, Palmyrene, Pau_Cin_Hau, Phags_Pa, - Phoenician, Psalter_Pahlavi, Rejang, Runic, Samaritan, Saurashtra, - Sharada, Shavian, Siddham, SignWriting, Sinhala, Sogdian, Sora_Sompeng, - Soyombo, Sundanese, Syloti_Nagri, Syriac, Tagalog, Tagbanwa, Tai_Le, - Tai_Tham, Tai_Viet, Takri, Tamil, Tangsa, Tangut, Telugu, Thaana, Thai, - Tibetan, Tifinagh, Tirhuta, Toto, Ugaritic, Vai, Vithkuqi, Wancho, - Warang_Citi, Yezidi, Yi, Zanabazar_Square. +BINARY PROPERTIES FOR \p AND \P + + Unicode defines a number of binary properties, that is, properties + whose only values are true or false. You can obtain a list of those + that are recognized by \p and \P, along with their abbreviations, by + running this command: + + pcre2test -LP + + +SCRIPT MATCHING WITH \p AND \P + + Many script names and their 4-letter abbreviations are recognized in + \p{sc:...} or \p{scx:...} items, or on their own with \p (and also \P + of course). 
You can obtain a list of these scripts by running this com- + mand: + + pcre2test -LS + + +THE BIDI_CLASS PROPERTY FOR \p AND \P + + \p{Bidi_Class:<class>} matches a character with the given class + \p{BC:<class>} matches a character with the given class + + The recognized classes are: + + AL Arabic letter + AN Arabic number + B paragraph separator + BN boundary neutral + CS common separator + EN European number + ES European separator + ET European terminator + FSI first strong isolate + L left-to-right + LRE left-to-right embedding + LRI left-to-right isolate + LRO left-to-right override + NSM non-spacing mark + ON other neutral + PDF pop directional format + PDI pop directional isolate + R right-to-left + RLE right-to-left embedding + RLI right-to-left isolate + RLO right-to-left override + S segment separator + WS white space CHARACTER CLASSES @@ -11008,8 +11090,8 @@ AUTHOR REVISION - Last updated: 30 August 2021 - Copyright (c) 1997-2021 University of Cambridge. + Last updated: 12 January 2022 + Copyright (c) 1997-2022 University of Cambridge. ------------------------------------------------------------------------------ @@ -11051,15 +11133,17 @@ UNICODE PROPERTY SUPPORT When PCRE2 is built with Unicode support, the escape sequences \p{..}, \P{..}, and \X can be used. This is not dependent on the PCRE2_UTF set- - ting. The Unicode properties that can be tested are limited to the - general category properties such as Lu for an upper case letter or Nd - for a decimal number, the Unicode script names such as Arabic or Han, - and the derived properties Any and L&. Full lists are given in the - pcre2pattern and pcre2syntax documentation. Only the short names for - properties are supported. For example, \p{L} matches a letter. Its Perl - synonym, \p{Letter}, is not supported. Furthermore, in Perl, many - properties may optionally be prefixed by "Is", for compatibility with - Perl 5.6. PCRE2 does not support this. + ting. 
The Unicode properties that can be tested are a subset of those + that Perl supports. Currently they are limited to the general category + properties such as Lu for an upper case letter or Nd for a decimal num- + ber, the Unicode script names such as Arabic or Han, Bidi_Class, + Bidi_Control, and the derived properties Any and LC (synonym L&). Full + lists are given in the pcre2pattern and pcre2syntax documentation. In + general, only the short names for properties are supported. For exam- + ple, \p{L} matches a letter. Its longer synonym, \p{Letter}, is not + supported. Furthermore, in Perl, many properties may optionally be pre- + fixed by "Is", for compatibility with Perl 5.6. PCRE2 does not support + this. WIDE CHARACTERS AND UTF MODES @@ -11437,14 +11521,14 @@ MATCHING IN INVALID UTF STRINGS AUTHOR Philip Hazel - University Computing Service + Retired from University Computing Service Cambridge, England. REVISION - Last updated: 23 February 2020 - Copyright (c) 1997-2020 University of Cambridge. + Last updated: 22 December 2021 + Copyright (c) 1997-2021 University of Cambridge. ------------------------------------------------------------------------------ diff --git a/pcre2/doc/pcre2_jit_stack_create.3 b/pcre2/doc/pcre2_jit_stack_create.3 index f0b29f0dc4fa88b87162f046100044f2d2ef7097..d332b72d7cdc733e2fbafdd93a6e6d7c1a38f234 100644 --- a/pcre2/doc/pcre2_jit_stack_create.3 +++ b/pcre2/doc/pcre2_jit_stack_create.3 @@ -22,7 +22,8 @@ allocation. The result can be passed to the JIT run-time code by calling \fBpcre2_jit_stack_assign()\fP to associate the stack with a compiled pattern, which can then be processed by \fBpcre2_match()\fP or \fBpcre2_jit_match()\fP. A maximum stack size of 512KiB to 1MiB should be more than enough for any -pattern. For more details, see the +pattern. If the stack couldn't be allocated or the values passed were not +reasonable, NULL will be returned. 
For more details, see the .\" HREF \fBpcre2jit\fP .\" diff --git a/pcre2/doc/pcre2_set_compile_extra_options.3 b/pcre2/doc/pcre2_set_compile_extra_options.3 index 58cefe57c369b49fe5beda5e4da9ef8cf3ce4293..0dcc8de86d764c3af019f9bbd76c74648c43450b 100644 --- a/pcre2/doc/pcre2_set_compile_extra_options.3 +++ b/pcre2/doc/pcre2_set_compile_extra_options.3 @@ -18,9 +18,9 @@ This function sets additional option bits for \fBpcre2_compile()\fP that are housed in a compile context. It completely replaces all the bits. The extra options are: .sp -.\" JOIN PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Allow \eK in lookarounds - PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \ex{df800} to \ex{dfff} +.\" JOIN + PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \ex{d800} to \ex{dfff} in UTF-8 and UTF-32 modes .\" JOIN PCRE2_EXTRA_ALT_BSUX Extended alternate \eu, \eU, and diff --git a/pcre2/doc/pcre2_substitute.3 b/pcre2/doc/pcre2_substitute.3 index cceb7846bdd5d2be9e0d95e2ccbb93fcb1f3790a..7ee4b6ac73f95d172aeee83f30fab015d2c4bfb8 100644 --- a/pcre2/doc/pcre2_substitute.3 +++ b/pcre2/doc/pcre2_substitute.3 @@ -55,32 +55,42 @@ automatically added. The subject and replacement lengths can be given as PCRE2_ZERO_TERMINATED for zero-terminated strings. 
The options are: .sp - PCRE2_ANCHORED Match only at the first position - PCRE2_ENDANCHORED Pattern can match only at end of subject - PCRE2_NOTBOL Subject is not the beginning of a line - PCRE2_NOTEOL Subject is not the end of a line - PCRE2_NOTEMPTY An empty string is not a valid match + PCRE2_ANCHORED Match only at the first position + PCRE2_ENDANCHORED Match only at end of subject .\" JOIN - PCRE2_NOTEMPTY_ATSTART An empty string at the start of the - subject is not a valid match - PCRE2_NO_JIT Do not use JIT matching + PCRE2_NOTBOL Subject is not the beginning of a + line + PCRE2_NOTEOL Subject is not the end of a line .\" JOIN - PCRE2_NO_UTF_CHECK Do not check the subject or replacement - for UTF validity (only relevant if - PCRE2_UTF was set at compile time) - PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing - PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the subject - PCRE2_SUBSTITUTE_LITERAL The replacement string is literal - PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for 1st match - PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length + PCRE2_NOTEMPTY An empty string is not a + valid match +.\" JOIN + PCRE2_NOTEMPTY_ATSTART An empty string at the start of + the subject is not a valid match + PCRE2_NO_JIT Do not use JIT matching +.\" JOIN + PCRE2_NO_UTF_CHECK Do not check for UTF validity in + the subject or replacement +.\" JOIN + (only relevant if PCRE2_UTF was + set at compile time) + PCRE2_SUBSTITUTE_EXTENDED Do extended replacement processing +.\" JOIN + PCRE2_SUBSTITUTE_GLOBAL Replace all occurrences in the + subject + PCRE2_SUBSTITUTE_LITERAL The replacement string is literal +.\" JOIN + PCRE2_SUBSTITUTE_MATCHED Use pre-existing match data for + first match + PCRE2_SUBSTITUTE_OVERFLOW_LENGTH If overflow, compute needed length PCRE2_SUBSTITUTE_REPLACEMENT_ONLY Return only replacement string(s) - PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset - PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = 
empty string + PCRE2_SUBSTITUTE_UNKNOWN_UNSET Treat unknown group as unset + PCRE2_SUBSTITUTE_UNSET_EMPTY Simple unset insert = empty string .sp If PCRE2_SUBSTITUTE_LITERAL is set, PCRE2_SUBSTITUTE_EXTENDED, PCRE2_SUBSTITUTE_UNKNOWN_UNSET, and PCRE2_SUBSTITUTE_UNSET_EMPTY are ignored. .P -If PCRE2_SUBSTITUTE_MATCHED is set, \fImatch_data\fP must be non-zero; its +If PCRE2_SUBSTITUTE_MATCHED is set, \fImatch_data\fP must be non-NULL; its contents must be the result of a call to \fBpcre2_match()\fP using the same pattern and subject. .P diff --git a/pcre2/doc/pcre2api.3 b/pcre2/doc/pcre2api.3 index 1ad6e261a1dcf4c8cc4ab7a0c9305ab403ed3468..edde3db7784291b5684d27ed26c2cd2521dbd79c 100644 --- a/pcre2/doc/pcre2api.3 +++ b/pcre2/doc/pcre2api.3 @@ -1,4 +1,4 @@ -.TH PCRE2API 3 "30 August 2021" "PCRE2 10.38" +.TH PCRE2API 3 "14 December 2021" "PCRE2 10.40" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .sp @@ -1794,7 +1794,7 @@ it is set, the effect of passing an invalid UTF string as a pattern is undefined. It may cause your program to crash or loop. .P Note that this option can also be passed to \fBpcre2_match()\fP and -\fBpcre_dfa_match()\fP, to suppress UTF validity checking of the subject +\fBpcre2_dfa_match()\fP, to suppress UTF validity checking of the subject string. .P Note also that setting PCRE2_NO_UTF_CHECK at compile time does not disable the @@ -2015,8 +2015,8 @@ point. However, this applies only to characters whose code points are less than 256. By default, higher-valued code points never match escapes such as \ew or \ed. 
.P -When PCRE2 is built with Unicode support (the default), the Unicode properties -of all characters can be tested with \ep and \eP, or, alternatively, the +When PCRE2 is built with Unicode support (the default), certain Unicode +character properties can be tested with \ep and \eP, or, alternatively, the PCRE2_UCP option can be set when a pattern is compiled; this causes \ew and friends to use Unicode property support instead of the built-in tables. PCRE2_UCP also causes upper/lower casing operations on characters with code @@ -2279,7 +2279,7 @@ return zero. The third argument should point to a \fBsize_t\fP variable. PCRE2_INFO_LASTCODETYPE .sp Returns 1 if there is a rightmost literal code unit that must exist in any -matched string, other than at its start. The third argument should point to a +matched string, other than at its start. The third argument should point to a \fBuint32_t\fP variable. If there is no such value, 0 is returned. When 1 is returned, the code unit value itself can be retrieved using PCRE2_INFO_LASTCODEUNIT. For anchored patterns, a last literal value is @@ -2624,7 +2624,9 @@ The subject string is passed to \fBpcre2_match()\fP as a pointer in \fIstartoffset\fP. The length and offset are in code units, not characters. That is, they are in bytes for the 8-bit library, 16-bit code units for the 16-bit library, and 32-bit code units for the 32-bit library, whether or not -UTF processing is enabled. +UTF processing is enabled. As a special case, if \fIsubject\fP is NULL and +\fIlength\fP is zero, the subject is assumed to be an empty string. If +\fIlength\fP is non-zero, an error occurs if \fIsubject\fP is NULL. .P If \fIstartoffset\fP is greater than the length of the subject, \fBpcre2_match()\fP returns PCRE2_ERROR_BADOFFSET. When the starting offset is @@ -3413,12 +3415,16 @@ same number causes an error at compile time. 
.P This function optionally calls \fBpcre2_match()\fP and then makes a copy of the subject string in \fIoutputbuffer\fP, replacing parts that were matched with -the \fIreplacement\fP string, whose length is supplied in \fBrlength\fP. This -can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. There is an -option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just the -replacement string(s). The default action is to perform just one replacement if -the pattern matches, but there is an option that requests multiple replacements -(see PCRE2_SUBSTITUTE_GLOBAL below). +the \fIreplacement\fP string, whose length is supplied in \fBrlength\fP, which +can be given as PCRE2_ZERO_TERMINATED for a zero-terminated string. As a +special case, if \fIreplacement\fP is NULL and \fIrlength\fP is zero, the +replacement is assumed to be an empty string. If \fIrlength\fP is non-zero, an +error occurs if \fIreplacement\fP is NULL. +.P +There is an option (see PCRE2_SUBSTITUTE_REPLACEMENT_ONLY below) to return just +the replacement string(s). The default action is to perform just one +replacement if the pattern matches, but there is an option that requests +multiple replacements (see PCRE2_SUBSTITUTE_GLOBAL below). .P If successful, \fBpcre2_substitute()\fP returns the number of substitutions that were carried out. This may be zero if no match was found, and is never @@ -3447,12 +3453,12 @@ block may or may not have been changed. As well as the usual options for \fBpcre2_match()\fP, a number of additional options can be set in the \fIoptions\fP argument of \fBpcre2_substitute()\fP. One such option is PCRE2_SUBSTITUTE_MATCHED. When this is set, an external -\fImatch_data\fP block must be provided, and it must have been used for an -external call to \fBpcre2_match()\fP. The data in the \fImatch_data\fP block -(return code, offset vector) is used for the first substitution instead of -calling \fBpcre2_match()\fP from within \fBpcre2_substitute()\fP. 
This allows -an application to check for a match before choosing to substitute, without -having to repeat the match. +\fImatch_data\fP block must be provided, and it must have already been used for +an external call to \fBpcre2_match()\fP with the same pattern and subject +arguments. The data in the \fImatch_data\fP block (return code, offset vector) +is then used for the first substitution instead of calling \fBpcre2_match()\fP +from within \fBpcre2_substitute()\fP. This allows an application to check for a +match before choosing to substitute, without having to repeat the match. .P The contents of the externally supplied match data block are not changed when PCRE2_SUBSTITUTE_MATCHED is set. If PCRE2_SUBSTITUTE_GLOBAL is also set, @@ -3584,7 +3590,7 @@ and force lower case. The escape sequences change the current state: \eU and terminating a \eQ quoted sequence) reverts to no case forcing. The sequences \eu and \el force the next character (if it is a letter) to upper or lower case, respectively, and then the state automatically reverts to no case -forcing. Case forcing applies to all inserted characters, including those from +forcing. Case forcing applies to all inserted characters, including those from capture groups and letters within \eQ...\eE quoted sequences. If either PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode properties are used for case forcing characters whose code points are greater @@ -3649,7 +3655,9 @@ needed is returned via \fIoutlengthptr\fP. Note that this does not happen by default. .P PCRE2_ERROR_NULL is returned if PCRE2_SUBSTITUTE_MATCHED is set but the -\fImatch_data\fP argument is NULL. +\fImatch_data\fP argument is NULL or if the \fIsubject\fP or \fIreplacement\fP +arguments are NULL. For backward compatibility reasons an exception is made for +the \fIreplacement\fP argument if the \fIrlength\fP argument is also 0. 
.P PCRE2_ERROR_BADREPLACEMENT is used for miscellaneous syntax errors in the replacement string, with more particular errors being PCRE2_ERROR_BADREPESCAPE @@ -3811,12 +3819,13 @@ other alternatives. Ultimately, when it runs out of matches, .P The function \fBpcre2_dfa_match()\fP is called to match a subject string against a compiled pattern, using a matching algorithm that scans the subject -string just once (not counting lookaround assertions), and does not backtrack. -This has different characteristics to the normal algorithm, and is not -compatible with Perl. Some of the features of PCRE2 patterns are not supported. -Nevertheless, there are times when this kind of matching can be useful. For a -discussion of the two matching algorithms, and a list of features that -\fBpcre2_dfa_match()\fP does not support, see the +string just once (not counting lookaround assertions), and does not backtrack +(except when processing lookaround assertions). This has different +characteristics to the normal algorithm, and is not compatible with Perl. Some +of the features of PCRE2 patterns are not supported. Nevertheless, there are +times when this kind of matching can be useful. For a discussion of the two +matching algorithms, and a list of features that \fBpcre2_dfa_match()\fP does +not support, see the .\" HREF \fBpcre2matching\fP .\" @@ -3848,7 +3857,7 @@ Here is an example of a simple call to \fBpcre2_dfa_match()\fP: wspace, /* working space vector */ 20); /* number of elements (NOT size in bytes) */ . -.SS "Option bits for \fBpcre_dfa_match()\fP" +.SS "Option bits for \fBpcre2_dfa_match()\fP" .rs .sp The unused bits of the \fIoptions\fP argument for \fBpcre2_dfa_match()\fP must @@ -4016,6 +4025,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 30 August 2021 +Last updated: 14 December 2021 Copyright (c) 1997-2021 University of Cambridge. 
.fi diff --git a/pcre2/doc/pcre2build.3 b/pcre2/doc/pcre2build.3 index 60931bfc5870f457b0224afc7c1fd18a870e9cc7..5fca3dce062477133acb3ee11aa7262682483c40 100644 --- a/pcre2/doc/pcre2build.3 +++ b/pcre2/doc/pcre2build.3 @@ -1,4 +1,4 @@ -.TH PCRE2BUILD 3 "20 March 2020" "PCRE2 10.35" +.TH PCRE2BUILD 3 "08 December 2021" "PCRE2 10.40" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) . @@ -122,8 +122,9 @@ locked this out by setting PCRE2_NEVER_UTF. UTF support allows the libraries to process character code points up to 0x10ffff in the strings that they handle. Unicode support also gives access to the Unicode properties of characters, using pattern escapes such as \eP, \ep, -and \eX. Only the general category properties such as \fILu\fP and \fINd\fP are -supported. Details are given in the +and \eX. Only the general category properties such as \fILu\fP and \fINd\fP, +script names, and some bi-directional properties are supported. Details are +given in the .\" HREF \fBpcre2pattern\fP .\" @@ -302,7 +303,7 @@ You can also explicitly limit the depth of nested backtracking in the for --with-match-limit. You can set a lower default limit by adding, for example, .sp - --with-match-limit_depth=10000 + --with-match-limit-depth=10000 .sp to the \fBconfigure\fP command. This value can be overridden at run time. This depth limit indirectly limits the amount of heap memory that is used, but @@ -633,6 +634,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 20 March 2020 -Copyright (c) 1997-2020 University of Cambridge. +Last updated: 08 December 2021 +Copyright (c) 1997-2021 University of Cambridge. 
.fi diff --git a/pcre2/doc/pcre2compat.3 b/pcre2/doc/pcre2compat.3 index 311d6ebe82b10f80398f01f8f7f8743dbd8bac17..8333d3e363b24f4cc26ed74b6fc2178ac4d02ee7 100644 --- a/pcre2/doc/pcre2compat.3 +++ b/pcre2/doc/pcre2compat.3 @@ -1,4 +1,4 @@ -.TH PCRE2COMPAT 3 "30 August 2021" "PCRE2 10.38" +.TH PCRE2COMPAT 3 "08 December 2021" "PCRE2 10.40" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "DIFFERENCES BETWEEN PCRE2 AND PERL" @@ -6,31 +6,38 @@ PCRE2 - Perl-compatible regular expressions (revised API) .sp This document describes some of the differences in the ways that PCRE2 and Perl handle regular expressions. The differences described here are with respect to -Perl version 5.32.0, but as both Perl and PCRE2 are continually changing, the +Perl version 5.34.0, but as both Perl and PCRE2 are continually changing, the information may at times be out of date. .P -1. PCRE2 has only a subset of Perl's Unicode support. Details of what it does +1. When PCRE2_DOTALL (equivalent to Perl's /s qualifier) is not set, the +behaviour of the '.' metacharacter differs from Perl. In PCRE2, '.' matches the +next character unless it is the start of a newline sequence. This means that, +if the newline setting is CR, CRLF, or NUL, '.' will match the code point LF +(0x0A) in ASCII/Unicode environments, and NL (either 0x15 or 0x25) when using +EBCDIC. In Perl, '.' appears never to match LF, even when 0x0A is not a newline +indicator. +.P +2. PCRE2 has only a subset of Perl's Unicode support. Details of what it does have are given in the .\" HREF \fBpcre2unicode\fP .\" page. .P -2. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but +3. Like Perl, PCRE2 allows repeat quantifiers on parenthesized assertions, but they do not mean what you might think. For example, (?!a){3} does not assert that the next three characters are not "a". 
It just asserts that the next character is not "a" three times (in principle; PCRE2 optimizes this to run the assertion just once). Perl allows some repeat quantifiers on other assertions, -for example, \eb* (but not \eb{3}, though oddly it does allow ^{3}), but these -do not seem to have any use. PCRE2 does not allow any kind of quantifier on -non-lookaround assertions. +for example, \eb* , but these do not seem to have any use. PCRE2 does not allow +any kind of quantifier on non-lookaround assertions. .P -3. Capture groups that occur inside negative lookaround assertions are counted, +4. Capture groups that occur inside negative lookaround assertions are counted, but their entries in the offsets vector are set only when a negative assertion is a condition that has a matching branch (that is, the condition is false). Perl may set such capture groups in other circumstances. .P -4. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu, +5. The following Perl escape sequences are not supported: \eF, \el, \eL, \eu, \eU, and \eN when followed by a character name. \eN on its own, matching a non-newline character, and \eN{U+dd..}, matching a Unicode code point, are supported. The escapes that modify the case of following letters are @@ -40,12 +47,12 @@ generated by default. However, if either of the PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX options is set, \eU and \eu are interpreted as ECMAScript interprets them. .P -5. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is +6. The Perl escape sequences \ep, \eP, and \eX are supported only if PCRE2 is built with Unicode support (the default). The properties that can be tested with \ep and \eP are limited to the general category properties such as Lu and -Nd, script names such as Greek or Han, and the derived properties Any and L&. -Both PCRE2 and Perl support the Cs (surrogate) property, but in PCRE2 its use -is limited. 
See the +Nd, script names such as Greek or Han, Bidi_Class, Bidi_Control, and the +derived properties Any and LC (synonym L&). Both PCRE2 and Perl support the Cs +(surrogate) property, but in PCRE2 its use is limited. See the .\" HREF \fBpcre2pattern\fP .\" @@ -53,14 +60,14 @@ documentation for details. The long synonyms for property names that Perl supports (such as \ep{Letter}) are not supported by PCRE2, nor is it permitted to prefix any of these properties with "Is". .P -6. PCRE2 supports the \eQ...\eE escape for quoting substrings. Characters +7. PCRE2 supports the \eQ...\eE escape for quoting substrings. Characters in between are treated as literals. However, this is slightly different from Perl in that $ and @ are also handled as literals inside the quotes. In Perl, -they cause variable interpolation (but of course PCRE2 does not have -variables). Also, Perl does "double-quotish backslash interpolation" on any -backslashes between \eQ and \eE which, its documentation says, "may lead to -confusing results". PCRE2 treats a backslash between \eQ and \eE just like any -other character. Note the following examples: +they cause variable interpolation (PCRE2 does not have variables). Also, Perl +does "double-quotish backslash interpolation" on any backslashes between \eQ +and \eE which, its documentation says, "may lead to confusing results". PCRE2 +treats a backslash between \eQ and \eE just like any other character. Note the +following examples: .sp Pattern PCRE2 matches Perl matches .sp @@ -75,7 +82,7 @@ other character. Note the following examples: The \eQ...\eE sequence is recognized both inside and outside character classes by both PCRE2 and Perl. .P -7. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code}) +8. Fairly obviously, PCRE2 does not support the (?{code}) and (??{code}) constructions. However, PCRE2 does have a "callout" feature, which allows an external function to be called during pattern matching. 
See the .\" HREF @@ -83,11 +90,11 @@ external function to be called during pattern matching. See the .\" documentation for details. .P -8. Subroutine calls (whether recursive or not) were treated as atomic groups up +9. Subroutine calls (whether recursive or not) were treated as atomic groups up to PCRE2 release 10.23, but from release 10.30 this changed, and backtracking into subroutine calls is now supported, as in Perl. .P -9. In PCRE2, if any of the backtracking control verbs are used in a group that +10. In PCRE2, if any of the backtracking control verbs are used in a group that is called as a subroutine (whether or not recursively), their effect is confined to that group; it does not extend to the surrounding pattern. This is not always the case in Perl. In particular, if (*THEN) is present in a group @@ -95,18 +102,18 @@ that is called as a subroutine, its action is limited to that group, even if the group does not contain any | characters. Note that such groups are processed as anchored at the point where they are tested. .P -10. If a pattern contains more than one backtracking control verb, the first +11. If a pattern contains more than one backtracking control verb, the first one that is backtracked onto acts. For example, in the pattern A(*COMMIT)B(*PRUNE)C a failure in B triggers (*COMMIT), but a failure in C triggers (*PRUNE). Perl's behaviour is more complex; in many cases it is the same as PCRE2, but there are cases where it differs. .P -11. There are some differences that are concerned with the settings of captured +12. There are some differences that are concerned with the settings of captured strings when part of a pattern is repeated. For example, matching "aba" against the pattern /^(a(b)?)+$/ in Perl leaves $2 unset, but in PCRE2 it is set to "b". .P -12. PCRE2's handling of duplicate capture group numbers and names is not as +13. PCRE2's handling of duplicate capture group numbers and names is not as general as Perl's. 
This is a consequence of the fact the PCRE2 works internally just with numbers, using an external table to translate between numbers and names. In particular, a pattern such as (?|(?<a>A)|(?<b>B)), where the two @@ -115,37 +122,38 @@ causes an error at compile time. If it were allowed, it would not be possible to distinguish which group matched, because both names map to capture group number 1. To avoid this confusing situation, an error is given at compile time. .P -13. Perl used to recognize comments in some places that PCRE2 does not, for +14. Perl used to recognize comments in some places that PCRE2 does not, for example, between the ( and ? at the start of a group. If the /x modifier is set, Perl allowed white space between ( and ? though the latest Perls give an error (for a while it was just deprecated). There may still be some cases where Perl behaves differently. .P -14. Perl, when in warning mode, gives warnings for character classes such as +15. Perl, when in warning mode, gives warnings for character classes such as [A-\ed] or [a-[:digit:]]. It then treats the hyphens as literals. PCRE2 has no warning features, so it gives an error in these cases because they are almost certainly user mistakes. .P -15. In PCRE2, the upper/lower case character properties Lu and Ll are not +16. In PCRE2, the upper/lower case character properties Lu and Ll are not affected when case-independent matching is specified. For example, \ep{Lu} always matches an upper case letter. I think Perl has changed in this respect; -in the release at the time of writing (5.32), \ep{Lu} and \ep{Ll} match all +in the release at the time of writing (5.34), \ep{Lu} and \ep{Ll} match all letters, regardless of case, when case independence is specified. .P -16. From release 5.32.0, Perl locks out the use of \eK in lookaround +17. From release 5.32.0, Perl locks out the use of \eK in lookaround assertions. From release 10.38 PCRE2 does the same by default. 
However, there is an option for re-enabling the previous behaviour. When this option is set, \eK is acted on when it occurs in positive assertions, but is ignored in negative assertions. .P -17. PCRE2 provides some extensions to the Perl regular expression facilities. +18. PCRE2 provides some extensions to the Perl regular expression facilities. Perl 5.10 included new features that were not in earlier versions of Perl, some of which (such as named parentheses) were in PCRE2 for some time before. This -list is with respect to Perl 5.32: +list is with respect to Perl 5.34: .sp (a) Although lookbehind assertions in PCRE2 must match fixed length strings, each alternative toplevel branch of a lookbehind assertion can match a -different length of string. Perl requires them all to have the same length. +different length of string. Perl used to require them all to have the same +length, but the latest version has some variable length support. .sp (b) From PCRE2 10.23, backreferences to groups of fixed length are supported in lookbehinds, provided that there is no possibility of referencing a @@ -186,11 +194,11 @@ the pattern. extension to the lookaround facilities. The default, Perl-compatible lookarounds are atomic. .P -18. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa +19. The Perl /a modifier restricts /d numbers to pure ascii, and the /aa modifier restricts /i case-insensitive matching to pure ascii, ignoring Unicode rules. This separation cannot be represented with PCRE2_UCP. .P -19. Perl has different limits than PCRE2. See the +20. Perl has different limits than PCRE2. See the .\" HREF \fBpcre2limit\fP .\" @@ -214,6 +222,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 30 August 2021 +Last updated: 08 December 2021 Copyright (c) 1997-2021 University of Cambridge. 
.fi diff --git a/pcre2/doc/pcre2jit.3 b/pcre2/doc/pcre2jit.3 index 9b7755054b94dcb78cbb05aab1aed100b84e36f8..f0b3b151226553477658a8a63b5b42b54f7b1fb1 100644 --- a/pcre2/doc/pcre2jit.3 +++ b/pcre2/doc/pcre2jit.3 @@ -1,4 +1,4 @@ -.TH PCRE2JIT 3 "23 May 2019" "PCRE2 10.34" +.TH PCRE2JIT 3 "30 November 2021" "PCRE2 10.40" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 JUST-IN-TIME COMPILER SUPPORT" @@ -251,11 +251,11 @@ non-sequential matches in one thread is to use callouts: if a callout function starts another match, that match must use a different JIT stack to the one used for currently suspended match(es). .P -In a multithread application, if you do not -specify a JIT stack, or if you assign or pass back NULL from a callback, that -is thread-safe, because each thread has its own machine stack. However, if you -assign or pass back a non-NULL JIT stack, this must be a different stack for -each thread so that the application is thread-safe. +In a multithread application, if you do not specify a JIT stack, or if you +assign or pass back NULL from a callback, that is thread-safe, because each +thread has its own machine stack. However, if you assign or pass back a +non-NULL JIT stack, this must be a different stack for each thread so that the +application is thread-safe. .P Strictly speaking, even more is allowed. You can assign the same non-NULL stack to a match context that is used by any number of patterns, as long as they are @@ -355,8 +355,8 @@ out this complicated API. .B void pcre2_jit_free_unused_memory(pcre2_general_context *\fIgcontext\fP); .fi .P -The JIT executable allocator does not free all memory when it is possible. -It expects new allocations, and keeps some free memory around to improve +The JIT executable allocator does not free all memory when it is possible. It +expects new allocations, and keeps some free memory around to improve allocation speed. 
However, in low memory conditions, it might be better to free all possible memory. You can cause this to happen by calling pcre2_jit_free_unused_memory(). Its argument is a general context, for custom @@ -416,10 +416,10 @@ that was not compiled. .P When you call \fBpcre2_match()\fP, as well as testing for invalid options, a number of other sanity checks are performed on the arguments. For example, if -the subject pointer is NULL, an immediate error is given. Also, unless -PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested for validity. In the -interests of speed, these checks do not happen on the JIT fast path, and if -invalid data is passed, the result is undefined. +the subject pointer is NULL but the length is non-zero, an immediate error is +given. Also, unless PCRE2_NO_UTF_CHECK is set, a UTF subject string is tested +for validity. In the interests of speed, these checks do not happen on the JIT +fast path, and if invalid data is passed, the result is undefined. .P Bypassing the sanity checks and the \fBpcre2_match()\fP wrapping can give speedups of more than 10%. @@ -445,6 +445,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 23 May 2019 -Copyright (c) 1997-2019 University of Cambridge. +Last updated: 30 November 2021 +Copyright (c) 1997-2021 University of Cambridge. .fi diff --git a/pcre2/doc/pcre2pattern.3 b/pcre2/doc/pcre2pattern.3 index 627f229a44f91264c3c2aabb8224d1bbd01c28c1..3088ec0fb28578987eab3c1a71d0fcf4e8207df7 100644 --- a/pcre2/doc/pcre2pattern.3 +++ b/pcre2/doc/pcre2pattern.3 @@ -1,4 +1,4 @@ -.TH PCRE2PATTERN 3 "3o0 August 2021" "PCRE2 10.38" +.TH PCRE2PATTERN 3 "12 January 2022" "PCRE2 10.40" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 REGULAR EXPRESSION DETAILS" @@ -509,7 +509,6 @@ for themselves. 
For example, outside a character class: .\" JOIN \e377 might be a backreference, otherwise the value 255 (decimal) -.\" JOIN \e81 is always a backreference .sp Note that octal values of 100 or greater that are specified using this syntax @@ -773,200 +772,64 @@ can be used in any mode, though in 8-bit and 16-bit non-UTF modes these sequences are of course limited to testing characters whose code points are less than U+0100 and U+10000, respectively. In 32-bit non-UTF mode, code points greater than 0x10ffff (the Unicode limit) may be encountered. These are all -treated as being in the Unknown script and with an unassigned type. The extra -escape sequences are: +treated as being in the Unknown script and with an unassigned type. +.P +Matching characters by Unicode property is not fast, because PCRE2 has to do a +multistage table lookup in order to find a character's property. That is why +the traditional escape sequences such as \ed and \ew do not use Unicode +properties in PCRE2 by default, though you can make them do so by setting the +PCRE2_UCP option or by starting the pattern with (*UCP). +.P +The extra escape sequences that provide property support are: .sp \ep{\fIxx\fP} a character with the \fIxx\fP property \eP{\fIxx\fP} a character without the \fIxx\fP property \eX a Unicode extended grapheme cluster .sp -The property names represented by \fIxx\fP above are case-sensitive. There is -support for Unicode script names, Unicode general category properties, "Any", -which matches any character (including newline), and some special PCRE2 -properties (described in the +The property names represented by \fIxx\fP above are not case-sensitive, and in +accordance with Unicode's "loose matching" rules, spaces, hyphens, and +underscores are ignored. 
There is support for Unicode script names, Unicode +general category properties, "Any", which matches any character (including +newline), Bidi_Class, a number of binary (yes/no) properties, and some special +PCRE2 properties (described .\" HTML .\" -next section). +below). .\" -Other Perl properties such as "InMusicalSymbols" are not supported by PCRE2. -Note that \eP{Any} does not match any characters, so always causes a match -failure. -.P -Sets of Unicode characters are defined as belonging to certain scripts. A -character from one of these sets can be matched using a script name. For -example: -.sp - \ep{Greek} - \eP{Han} +Certain other Perl properties such as "InMusicalSymbols" are not supported by +PCRE2. Note that \eP{Any} does not match any characters, so always causes a +match failure. +. +. +. +.SS "Script properties for \ep and \eP" +.rs .sp +There are three different syntax forms for matching a script. Each Unicode +character has a basic script and, optionally, a list of other scripts ("Script +Extensions") with which it is commonly used. Using the Adlam script as an +example, \ep{sc:Adlam} matches characters whose basic script is Adlam, whereas +\ep{scx:Adlam} matches, in addition, characters that have Adlam in their +extensions list. The full names "script" and "script extensions" for the +property types are recognized, and an equals sign is an alternative to the +colon. If a script name is given without a property type, for example, +\ep{Adlam}, it is treated as \ep{scx:Adlam}. Perl changed to this +interpretation at release 5.26 and PCRE2 changed at release 10.40. +.P Unassigned characters (and in non-UTF 32-bit mode, characters with code points greater than 0x10FFFF) are assigned the "Unknown" script. Others that are not part of an identified script are lumped together as "Common".
The current list -of scripts is: -.P -Adlam, -Ahom, -Anatolian_Hieroglyphs, -Arabic, -Armenian, -Avestan, -Balinese, -Bamum, -Bassa_Vah, -Batak, -Bengali, -Bhaiksuki, -Bopomofo, -Brahmi, -Braille, -Buginese, -Buhid, -Canadian_Aboriginal, -Carian, -Caucasian_Albanian, -Chakma, -Cham, -Cherokee, -Chorasmian, -Common, -Coptic, -Cuneiform, -Cypriot, -Cypro_Minoan, -Cyrillic, -Deseret, -Devanagari, -Dives_Akuru, -Dogra, -Duployan, -Egyptian_Hieroglyphs, -Elbasan, -Elymaic, -Ethiopic, -Georgian, -Glagolitic, -Gothic, -Grantha, -Greek, -Gujarati, -Gunjala_Gondi, -Gurmukhi, -Han, -Hangul, -Hanifi_Rohingya, -Hanunoo, -Hatran, -Hebrew, -Hiragana, -Imperial_Aramaic, -Inherited, -Inscriptional_Pahlavi, -Inscriptional_Parthian, -Javanese, -Kaithi, -Kannada, -Katakana, -Kayah_Li, -Kharoshthi, -Khitan_Small_Script, -Khmer, -Khojki, -Khudawadi, -Lao, -Latin, -Lepcha, -Limbu, -Linear_A, -Linear_B, -Lisu, -Lycian, -Lydian, -Mahajani, -Makasar, -Malayalam, -Mandaic, -Manichaean, -Marchen, -Masaram_Gondi, -Medefaidrin, -Meetei_Mayek, -Mende_Kikakui, -Meroitic_Cursive, -Meroitic_Hieroglyphs, -Miao, -Modi, -Mongolian, -Mro, -Multani, -Myanmar, -Nabataean, -Nandinagari, -New_Tai_Lue, -Newa, -Nko, -Nushu, -Nyakeng_Puachue_Hmong, -Ogham, -Ol_Chiki, -Old_Hungarian, -Old_Italic, -Old_North_Arabian, -Old_Permic, -Old_Persian, -Old_Sogdian, -Old_South_Arabian, -Old_Turkic, -Old_Uyghur, -Oriya, -Osage, -Osmanya, -Pahawh_Hmong, -Palmyrene, -Pau_Cin_Hau, -Phags_Pa, -Phoenician, -Psalter_Pahlavi, -Rejang, -Runic, -Samaritan, -Saurashtra, -Sharada, -Shavian, -Siddham, -SignWriting, -Sinhala, -Sogdian, -Sora_Sompeng, -Soyombo, -Sundanese, -Syloti_Nagri, -Syriac, -Tagalog, -Tagbanwa, -Tai_Le, -Tai_Tham, -Tai_Viet, -Takri, -Tamil, -Tangsa, -Tangut, -Telugu, -Thaana, -Thai, -Tibetan, -Tifinagh, -Tirhuta, -Toto, -Ugaritic, -Unknown, -Vai, -Vithkuqi, -Wancho, -Warang_Citi, -Yezidi, -Yi, -Zanabazar_Square. 
-.P +of recognized script names and their 4-character abbreviations can be obtained +by running this command: +.sp + pcre2test -LS +.sp +. +. +. +.SS "The general category property for \ep and \eP" +.rs +.sp Each character has exactly one Unicode general category property, specified by a two-letter abbreviation. For compatibility with Perl, negation can be specified by including a circumflex between the opening brace and the property @@ -1026,9 +889,9 @@ The following general category property codes are supported: Zp Paragraph separator Zs Space separator .sp -The special property L& is also supported: it matches a character that has -the Lu, Ll, or Lt property, in other words, a letter that is not classified as -a modifier or "other". +The special property LC, which has the synonym L&, is also supported: it +matches a character that has the Lu, Ll, or Lt property, in other words, a +letter that is not classified as a modifier or "other". .P The Cs (Surrogate) property applies only to characters whose code points are in the range U+D800 to U+DFFF. These characters are no different to any other @@ -1052,12 +915,53 @@ Unicode table. Specifying caseless matching does not affect these escape sequences. For example, \ep{Lu} always matches only upper case letters. This is different from the behaviour of current versions of Perl. -.P -Matching characters by Unicode property is not fast, because PCRE2 has to do a -multistage table lookup in order to find a character's property. That is why -the traditional escape sequences such as \ed and \ew do not use Unicode -properties in PCRE2 by default, though you can make them do so by setting the -PCRE2_UCP option or by starting the pattern with (*UCP). +. +. +.SS "Binary (yes/no) properties for \ep and \eP" +.rs +.sp +Unicode defines a number of binary properties, that is, properties whose only +values are true or false. 
You can obtain a list of those that are recognized by +\ep and \eP, along with their abbreviations, by running this command: +.sp + pcre2test -LP +.sp +. +. +.SS "The Bidi_Class property for \ep and \eP" +.rs +.sp + \ep{Bidi_Class: } matches a character with the given class + \ep{BC: } matches a character with the given class +.sp +The recognized classes are: +.sp + AL Arabic letter + AN Arabic number + B paragraph separator + BN boundary neutral + CS common separator + EN European number + ES European separator + ET European terminator + FSI first strong isolate + L left-to-right + LRE left-to-right embedding + LRI left-to-right isolate + LRO left-to-right override + NSM non-spacing mark + ON other neutral + PDF pop directional format + PDI pop directional isolate + R right-to-left + RLE right-to-left embedding + RLI right-to-left isolate + RLO right-to-left override + S segment separator + WS white space +.sp +An equals sign may be used instead of a colon. The class names are +case-insensitive; only the short names listed above are recognized. . . .SS Extended grapheme clusters @@ -1336,14 +1240,19 @@ end of the subject in both modes, and if all branches of a pattern start with .sp Outside a character class, a dot in the pattern matches any one character in the subject string except (by default) a character that signifies the end of a -line. +line. One or more characters may be specified as line terminators (see +.\" HTML +.\" +"Newline conventions" +.\" +above). .P -When a line ending is defined as a single character, dot never matches that -character; when the two-character sequence CRLF is used, dot does not match CR -if it is immediately followed by LF, but otherwise it matches all characters -(including isolated CRs and LFs). When any Unicode line endings are being -recognized, dot does not match CR or LF or any of the other line ending -characters. +Dot never matches a single line-ending character.
When the two-character +sequence CRLF is the only line ending, dot does not match CR if it is +immediately followed by LF, but otherwise it matches all characters (including +isolated CRs and LFs). When ANYCRLF is selected for line endings, no occurrences +of CR or LF match dot. When all Unicode line endings are being recognized, dot +does not match CR or LF or any of the other line ending characters. .P The behaviour of dot with regard to newlines can be changed. If the PCRE2_DOTALL option is set, a dot matches any one character, without exception. @@ -2186,10 +2095,10 @@ be easier to remember: .sp (*atomic:\ed+)foo .sp -This kind of parenthesized group "locks up" the part of the pattern it -contains once it has matched, and a failure further into the pattern is -prevented from backtracking into it. Backtracking past it to previous items, -however, works as normal. +This kind of parenthesized group "locks up" the part of the pattern it contains +once it has matched, and a failure further into the pattern is prevented from +backtracking into it. Backtracking past it to previous items, however, works as +normal. .P An alternative description is that a group of this type matches exactly the string of characters that an identical standalone pattern would match, if @@ -3905,6 +3814,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 30 August 2021 -Copyright (c) 1997-2021 University of Cambridge. +Last updated: 12 January 2022 +Copyright (c) 1997-2022 University of Cambridge.
.fi diff --git a/pcre2/doc/pcre2serialize.3 b/pcre2/doc/pcre2serialize.3 index 85aee9ba9706f45ac5b40716684973caf8a67cda..987bc3a4c6f8fd482df37ffa104a6846fcb57139 100644 --- a/pcre2/doc/pcre2serialize.3 +++ b/pcre2/doc/pcre2serialize.3 @@ -6,11 +6,11 @@ PCRE2 - Perl-compatible regular expressions (revised API) .sp .nf .B int32_t pcre2_serialize_decode(pcre2_code **\fIcodes\fP, -.B " int32_t \fInumber_of_codes\fP, const uint32_t *\fIbytes\fP," +.B " int32_t \fInumber_of_codes\fP, const uint8_t *\fIbytes\fP," .B " pcre2_general_context *\fIgcontext\fP);" .sp -.B int32_t pcre2_serialize_encode(pcre2_code **\fIcodes\fP, -.B " int32_t \fInumber_of_codes\fP, uint32_t **\fIserialized_bytes\fP," +.B int32_t pcre2_serialize_encode(const pcre2_code **\fIcodes\fP, +.B " int32_t \fInumber_of_codes\fP, uint8_t **\fIserialized_bytes\fP," .B " PCRE2_SIZE *\fIserialized_size\fP, pcre2_general_context *\fIgcontext\fP);" .sp .B void pcre2_serialize_free(uint8_t *\fIbytes\fP); @@ -141,7 +141,6 @@ mangagement functions for the decoded patterns. If this argument is NULL, \fBmalloc()\fP and \fBfree()\fP are used. After deserialization, the byte stream is no longer needed and can be discarded. .sp - int32_t number_of_codes; pcre2_code *list_of_codes[2]; uint8_t *bytes = ; int32_t number_of_codes = diff --git a/pcre2/doc/pcre2syntax.3 b/pcre2/doc/pcre2syntax.3 index 937c8172d06daa8e3fc148787bfd47e580f7291f..c0a496f4f42e0d8f431bc6491fdcc252dcea7c59 100644 --- a/pcre2/doc/pcre2syntax.3 +++ b/pcre2/doc/pcre2syntax.3 @@ -1,4 +1,4 @@ -.TH PCRE2SYNTAX 3 "30 August 2021" "PCRE2 10.38" +.TH PCRE2SYNTAX 3 "12 January 2022" "PCRE2 10.40" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .SH "PCRE2 REGULAR EXPRESSION SYNTAX SUMMARY" @@ -102,6 +102,10 @@ happening, \es and \ew may also match characters with code points in the range 128-255. If the PCRE2_UCP option is set, the behaviour of these escape sequences is changed to use Unicode properties and they match many more characters. 
+.P +Property descriptions in \ep and \eP are matched caselessly; hyphens, +underscores, and white space are ignored, in accordance with Unicode's "loose +matching" rules. . . .SH "GENERAL CATEGORY PROPERTIES FOR \ep and \eP" @@ -120,6 +124,7 @@ characters. Lo Other letter Lt Title case letter Lu Upper case letter + Lc Ll, Lu, or Lt L& Ll, Lu, or Lt .sp M Mark @@ -167,170 +172,59 @@ Perl and POSIX space are now the same. Perl added VT to its space character set at release 5.18. . . -.SH "SCRIPT NAMES FOR \ep AND \eP" +.SH "BINARY PROPERTIES FOR \ep AND \eP" +.rs +.sp +Unicode defines a number of binary properties, that is, properties whose only +values are true or false. You can obtain a list of those that are recognized by +\ep and \eP, along with their abbreviations, by running this command: +.sp + pcre2test -LP +. +. +. +.SH "SCRIPT MATCHING WITH \ep AND \eP" +.rs +.sp +Many script names and their 4-letter abbreviations are recognized in +\ep{sc:...} or \ep{scx:...} items, or on their own with \ep (and also \eP of +course). You can obtain a list of these scripts by running this command: +.sp + pcre2test -LS +. +. +. 
+.SH "THE BIDI_CLASS PROPERTY FOR \ep AND \eP" .rs .sp -Adlam, -Ahom, -Anatolian_Hieroglyphs, -Arabic, -Armenian, -Avestan, -Balinese, -Bamum, -Bassa_Vah, -Batak, -Bengali, -Bhaiksuki, -Bopomofo, -Brahmi, -Braille, -Buginese, -Buhid, -Canadian_Aboriginal, -Carian, -Caucasian_Albanian, -Chakma, -Cham, -Cherokee, -Chorasmian, -Common, -Coptic, -Cuneiform, -Cypriot, -Cypro_Minoan, -Cyrillic, -Deseret, -Devanagari, -Dives_Akuru, -Dogra, -Duployan, -Egyptian_Hieroglyphs, -Elbasan, -Elymaic, -Ethiopic, -Georgian, -Glagolitic, -Gothic, -Grantha, -Greek, -Gujarati, -Gunjala_Gondi, -Gurmukhi, -Han, -Hangul, -Hanifi_Rohingya, -Hanunoo, -Hatran, -Hebrew, -Hiragana, -Imperial_Aramaic, -Inherited, -Inscriptional_Pahlavi, -Inscriptional_Parthian, -Javanese, -Kaithi, -Kannada, -Katakana, -Kayah_Li, -Kharoshthi, -Khitan_Small_Script, -Khmer, -Khojki, -Khudawadi, -Lao, -Latin, -Lepcha, -Limbu, -Linear_A, -Linear_B, -Lisu, -Lycian, -Lydian, -Mahajani, -Makasar, -Malayalam, -Mandaic, -Manichaean, -Marchen, -Masaram_Gondi, -Medefaidrin, -Meetei_Mayek, -Mende_Kikakui, -Meroitic_Cursive, -Meroitic_Hieroglyphs, -Miao, -Modi, -Mongolian, -Mro, -Multani, -Myanmar, -Nabataean, -Nandinagari, -New_Tai_Lue, -Newa, -Nko, -Nushu, -Nyakeng_Puachue_Hmong, -Ogham, -Ol_Chiki, -Old_Hungarian, -Old_Italic, -Old_North_Arabian, -Old_Permic, -Old_Persian, -Old_Sogdian, -Old_South_Arabian, -Old_Turkic, -Old_Uyghur, -Oriya, -Osage, -Osmanya, -Pahawh_Hmong, -Palmyrene, -Pau_Cin_Hau, -Phags_Pa, -Phoenician, -Psalter_Pahlavi, -Rejang, -Runic, -Samaritan, -Saurashtra, -Sharada, -Shavian, -Siddham, -SignWriting, -Sinhala, -Sogdian, -Sora_Sompeng, -Soyombo, -Sundanese, -Syloti_Nagri, -Syriac, -Tagalog, -Tagbanwa, -Tai_Le, -Tai_Tham, -Tai_Viet, -Takri, -Tamil, -Tangsa, -Tangut, -Telugu, -Thaana, -Thai, -Tibetan, -Tifinagh, -Tirhuta, -Toto, -Ugaritic, -Vai, -Vithkuqi, -Wancho, -Warang_Citi, -Yezidi, -Yi, -Zanabazar_Square. 
+ \ep{Bidi_Class: } matches a character with the given class + \ep{BC: } matches a character with the given class +.sp +The recognized classes are: +.sp + AL Arabic letter + AN Arabic number + B paragraph separator + BN boundary neutral + CS common separator + EN European number + ES European separator + ET European terminator + FSI first strong isolate + L left-to-right + LRE left-to-right embedding + LRI left-to-right isolate + LRO left-to-right override + NSM non-spacing mark + ON other neutral + PDF pop directional format + PDI pop directional isolate + R right-to-left + RLE right-to-left embedding + RLI right-to-left isolate + RLO right-to-left override + S segment separator + WS white space . . .SH "CHARACTER CLASSES" @@ -684,6 +578,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 30 August 2021 -Copyright (c) 1997-2021 University of Cambridge. +Last updated: 12 January 2022 +Copyright (c) 1997-2022 University of Cambridge. .fi diff --git a/pcre2/doc/pcre2test.1 b/pcre2/doc/pcre2test.1 index d98e97441b998a4d1822146d5427048bdec09a5e..d374f3ea46413180f0785dd3e36ab134fe30ab8b 100644 --- a/pcre2/doc/pcre2test.1 +++ b/pcre2/doc/pcre2test.1 @@ -1,4 +1,4 @@ -.TH PCRE2TEST 1 "30 August 2021" "PCRE 10.38" +.TH PCRE2TEST 1 "12 January 2022" "PCRE 10.40" .SH NAME pcre2test - a program for testing Perl-compatible regular expressions. .SH SYNOPSIS @@ -47,7 +47,7 @@ format before being passed to the library functions. Results are converted back to 8-bit code units for output. .P In the rest of this document, the names of library functions and structures -are given in generic form, for example, \fBpcre_compile()\fP. The actual +are given in generic form, for example, \fBpcre2_compile()\fP. The actual names used in the libraries have a suffix _8, _16, or _32, as appropriate. . . @@ -211,7 +211,17 @@ available, and the use of JIT for matching is verified.
\fB-LM\fP List modifiers: write a list of available pattern and subject modifiers to the standard output, then exit with zero exit code. All other options are ignored. -If both -C and -LM are present, whichever is first is recognized. +If both -C and any -Lx options are present, whichever is first is recognized. +.TP 10 +\fB-LP\fP +List properties: write a list of recognized Unicode properties to the standard +output, then exit with zero exit code. All other options are ignored. If both +-C and any -Lx options are present, whichever is first is recognized. +.TP 10 +\fB-LS\fP +List scripts: write a list of recognized Unicode script names to the standard +output, then exit with zero exit code. All other options are ignored. If both +-C and any -Lx options are present, whichever is first is recognized. .TP 10 \fB-pattern\fP \fImodifier-list\fP Behave as if each pattern line contains the given modifiers. @@ -1206,6 +1216,8 @@ pattern, but can be overridden by modifiers on the subject. match_limit= set a match limit memory show heap memory usage null_context match with a NULL context + null_replacement substitute with NULL replacement + null_subject match with NULL subject offset= set starting offset offset_limit= set offset limit ovector= set size of output vector @@ -1629,7 +1641,7 @@ When testing \fBpcre2_substitute()\fP, this modifier also has the effect of passing the replacement string as zero-terminated. . . -.SS "Passing a NULL context" +.SS "Passing a NULL context, subject, or replacement" .rs .sp Normally, \fBpcre2test\fP passes a context block to \fBpcre2_match()\fP, @@ -1638,6 +1650,10 @@ If the \fBnull_context\fP modifier is set, however, NULL is passed. This is for testing that the matching and substitution functions behave correctly in this case (they use default values). This modifier cannot be used with the \fBfind_limits\fP or \fBsubstitute_callout\fP modifiers.
+.P +Similarly, for testing purposes, if the \fBnull_subject\fP or +\fBnull_replacement\fP modifier is set, the subject or replacement string +pointers are passed as NULL, respectively, to the relevant functions. . . .SH "THE ALTERNATIVE MATCHING FUNCTION" @@ -2103,6 +2119,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 30 August 2021 -Copyright (c) 1997-2021 University of Cambridge. +Last updated: 12 January 2022 +Copyright (c) 1997-2022 University of Cambridge. .fi diff --git a/pcre2/doc/pcre2test.txt b/pcre2/doc/pcre2test.txt index 217bed5f03afe99e3c00962c47a2a4fb5ef3c8aa..ed7dd20ec92a68263a1a63d826d1a1eea3c324f7 100644 --- a/pcre2/doc/pcre2test.txt +++ b/pcre2/doc/pcre2test.txt @@ -44,7 +44,7 @@ PCRE2's 8-BIT, 16-BIT AND 32-BIT LIBRARIES output. In the rest of this document, the names of library functions and struc- - tures are given in generic form, for example, pcre_compile(). The ac- + tures are given in generic form, for example, pcre2_compile(). The ac- tual names used in the libraries have a suffix _8, _16, or _32, as ap- propriate. @@ -197,7 +197,17 @@ COMMAND LINE OPTIONS -LM List modifiers: write a list of available pattern and subject modifiers to the standard output, then exit with zero exit - code. All other options are ignored. If both -C and -LM are + code. All other options are ignored. If both -C and any -Lx + options are present, whichever is first is recognized. + + -LP List properties: write a list of recognized Unicode proper- + ties to the standard output, then exit with zero exit code. + All other options are ignored. If both -C and any -Lx options + are present, whichever is first is recognized. + + -LS List scripts: write a list of recognized Unicode script names + to the standard output, then exit with zero exit code. All + other options are ignored. If both -C and any -Lx options are present, whichever is first is recognized.
-pattern modifier-list @@ -1111,6 +1121,8 @@ SUBJECT MODIFIERS match_limit= set a match limit memory show heap memory usage null_context match with a NULL context + null_replacement substitute with NULL replacement + null_subject match with NULL subject offset= set starting offset offset_limit= set offset limit ovector= set size of output vector @@ -1499,7 +1511,7 @@ SUBJECT MODIFIERS When testing pcre2_substitute(), this modifier also has the effect of passing the replacement string as zero-terminated. - Passing a NULL context + Passing a NULL context, subject, or replacement Normally, pcre2test passes a context block to pcre2_match(), pcre2_dfa_match(), pcre2_jit_match() or pcre2_substitute(). If the @@ -1508,6 +1520,10 @@ SUBJECT MODIFIERS in this case (they use default values). This modifier cannot be used with the find_limits or substitute_callout modifiers. + Similarly, for testing purposes, if the null_subject or null_replace- + ment modifier is set, the subject or replacement string pointers are + passed as NULL, respectively, to the relevant functions. + THE ALTERNATIVE MATCHING FUNCTION @@ -1933,5 +1949,5 @@ AUTHOR REVISION - Last updated: 30 August 2021 - Copyright (c) 1997-2021 University of Cambridge. + Last updated: 12 January 2022 + Copyright (c) 1997-2022 University of Cambridge. diff --git a/pcre2/doc/pcre2unicode.3 b/pcre2/doc/pcre2unicode.3 index 055a4ce42e59cacf3c42ca7083de61aa1de9465f..e7e37a395eb5798f65a28f9e5aff0ecab02f554a 100644 --- a/pcre2/doc/pcre2unicode.3 +++ b/pcre2/doc/pcre2unicode.3 @@ -1,4 +1,4 @@ -.TH PCRE2UNICODE 3 "23 February 2020" "PCRE2 10.35" +.TH PCRE2UNICODE 3 "22 December 2021" "PCRE2 10.40" .SH NAME PCRE - Perl-compatible regular expressions (revised API) .SH "UNICODE AND UTF SUPPORT" @@ -40,10 +40,11 @@ handled, as documented below. .sp When PCRE2 is built with Unicode support, the escape sequences \ep{..}, \eP{..}, and \eX can be used. This is not dependent on the PCRE2_UTF setting. 
-The Unicode properties that can be tested are limited to the general category -properties such as Lu for an upper case letter or Nd for a decimal number, the -Unicode script names such as Arabic or Han, and the derived properties Any and -L&. Full lists are given in the +The Unicode properties that can be tested are a subset of those that Perl +supports. Currently they are limited to the general category properties such as +Lu for an upper case letter or Nd for a decimal number, the Unicode script +names such as Arabic or Han, Bidi_Class, Bidi_Control, and the derived +properties Any and LC (synonym L&). Full lists are given in the .\" HREF \fBpcre2pattern\fP .\" @@ -51,10 +52,10 @@ and .\" HREF \fBpcre2syntax\fP .\" -documentation. Only the short names for properties are supported. For example, -\ep{L} matches a letter. Its Perl synonym, \ep{Letter}, is not supported. -Furthermore, in Perl, many properties may optionally be prefixed by "Is", for -compatibility with Perl 5.6. PCRE2 does not support this. +documentation. In general, only the short names for properties are supported. +For example, \ep{L} matches a letter. Its longer synonym, \ep{Letter}, is not +supported. Furthermore, in Perl, many properties may optionally be prefixed by +"Is", for compatibility with Perl 5.6. PCRE2 does not support this. . . .SH "WIDE CHARACTERS AND UTF MODES" @@ -448,7 +449,7 @@ can be useful when searching for UTF text in executable or other binary files. .sp .nf Philip Hazel -University Computing Service +Retired from University Computing Service Cambridge, England. .fi . @@ -457,6 +458,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 23 February 2020 -Copyright (c) 1997-2020 University of Cambridge. +Last updated: 22 December 2021 +Copyright (c) 1997-2021 University of Cambridge. 
.fi diff --git a/pcre2/maint/GenerateCommon.py b/pcre2/maint/GenerateCommon.py new file mode 100644 index 0000000000000000000000000000000000000000..03f9ac559111b0779047933bfc63f206d7040604 --- /dev/null +++ b/pcre2/maint/GenerateCommon.py @@ -0,0 +1,355 @@ +#! /usr/bin/python + +# PCRE2 UNICODE PROPERTY SUPPORT +# ------------------------------ + +# This file is a Python module containing common lists and functions for the +# GenerateXXX scripts that create various .c and .h files from Unicode data +# files. It was created as part of a re-organization of these scripts in +# December 2021. + + +import re + + +# --------------------------------------------------------------------------- +# DATA LISTS +# --------------------------------------------------------------------------- + +# BIDI classes in the DerivedBidiClass.txt file, with comments. + +bidi_classes = [ + 'AL', 'Arabic letter', + 'AN', 'Arabic number', + 'B', 'Paragraph separator', + 'BN', 'Boundary neutral', + 'CS', 'Common separator', + 'EN', 'European number', + 'ES', 'European separator', + 'ET', 'European terminator', + 'FSI', 'First strong isolate', + 'L', 'Left to right', + 'LRE', 'Left to right embedding', + 'LRI', 'Left to right isolate', + 'LRO', 'Left to right override', + 'NSM', 'Non-spacing mark', + 'ON', 'Other neutral', + 'PDF', 'Pop directional format', + 'PDI', 'Pop directional isolate', + 'R', 'Right to left', + 'RLE', 'Right to left embedding', + 'RLI', 'Right to left isolate', + 'RLO', 'Right to left override', + 'S', 'Segment separator', + 'WS', 'White space' + ] + +# Particular category property names, with comments. NOTE: If ever this list +# is changed, the table called "catposstab" in the pcre2_auto_possess.c file +# must be edited to keep in step.
+ +category_names = [ + 'Cc', 'Control', + 'Cf', 'Format', + 'Cn', 'Unassigned', + 'Co', 'Private use', + 'Cs', 'Surrogate', + 'Ll', 'Lower case letter', + 'Lm', 'Modifier letter', + 'Lo', 'Other letter', + 'Lt', 'Title case letter', + 'Lu', 'Upper case letter', + 'Mc', 'Spacing mark', + 'Me', 'Enclosing mark', + 'Mn', 'Non-spacing mark', + 'Nd', 'Decimal number', + 'Nl', 'Letter number', + 'No', 'Other number', + 'Pc', 'Connector punctuation', + 'Pd', 'Dash punctuation', + 'Pe', 'Close punctuation', + 'Pf', 'Final punctuation', + 'Pi', 'Initial punctuation', + 'Po', 'Other punctuation', + 'Ps', 'Open punctuation', + 'Sc', 'Currency symbol', + 'Sk', 'Modifier symbol', + 'Sm', 'Mathematical symbol', + 'So', 'Other symbol', + 'Zl', 'Line separator', + 'Zp', 'Paragraph separator', + 'Zs', 'Space separator' + ] + +# The Extended_Pictographic property is not found in the file where all the +# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt +# file, but we list it here so that the name has the correct index value. + +break_properties = [ + 'CR', ' 0', + 'LF', ' 1', + 'Control', ' 2', + 'Extend', ' 3', + 'Prepend', ' 4', + 'SpacingMark', ' 5', + 'L', ' 6 Hangul syllable type L', + 'V', ' 7 Hangul syllable type V', + 'T', ' 8 Hangul syllable type T', + 'LV', ' 9 Hangul syllable type LV', + 'LVT', '10 Hangul syllable type LVT', + 'Regional_Indicator', '11', + 'Other', '12', + 'ZWJ', '13', + 'Extended_Pictographic', '14' + ] + +# List of files from which the names of Boolean properties are obtained, along +# with a list of regex patterns for properties to be ignored, and a list of +# extra pattern names to add. 

# Imported here rather than in the FUNCTIONS section further down because
# getbpropslist() is executed while this module is being imported and calls
# sys.exit() on failure.
import sys

bool_propsfiles = ['PropList.txt', 'DerivedCoreProperties.txt', 'emoji-data.txt']
bool_propsignore = [r'^Other_', r'^Hyphen$']
bool_propsextras = ['ASCII', 'Bidi_Mirrored']


# ---------------------------------------------------------------------------
# GET BOOLEAN PROPERTY NAMES
# ---------------------------------------------------------------------------

# Get a list of Boolean property names from a number of files.
#
# Every file in bool_propsfiles is scanned for "range ; name" lines (text after
# '#' is discarded). A name is taken the first time it differs from the name
# on the previous data line, unless it matches one of the bool_propsignore
# patterns. The names in bool_propsextras are appended and the result is
# returned sorted. If a file cannot be opened a message is printed and the
# program exits with status 1.

def getbpropslist():
    bplist = []
    bplast = ""   # Deliberately carried over from one file to the next

    for filename in bool_propsfiles:
        path = 'Unicode.tables/' + filename
        try:
            # Read as UTF-8, which is how GenerateUcd.py reads these data files,
            # instead of depending on the locale's default encoding.
            file = open(path, 'r', encoding='utf-8')
        except IOError:
            print(f"** Couldn't open {path}\n")
            sys.exit(1)

        with file:
            for line in file:
                line = re.sub(r'#.*', '', line)
                data = list(map(str.strip, line.split(';')))
                if len(data) <= 1 or data[1] == bplast:
                    continue
                bplast = data[1]
                if not any(re.match(pat, bplast) for pat in bool_propsignore):
                    bplist.append(bplast)

    bplist.extend(bool_propsextras)
    bplist.sort()
    return bplist

bool_properties = getbpropslist()

# Number of 32-bit words needed to hold one bit per Boolean property
bool_props_list_item_size = (len(bool_properties) + 31) // 32



# ---------------------------------------------------------------------------
# COLLECTING PROPERTY NAMES AND ALIASES
# ---------------------------------------------------------------------------

script_names = ['Unknown']
abbreviations = {}

# Fill script_names with the script names in the order in which they first
# appear in Scripts.txt (after the initial 'Unknown'), and fill abbreviations
# with a tuple of alternative names for each script (from the "sc" lines of
# PropertyValueAliases.txt) and for each known Boolean property (from
# PropertyAliases.txt). A script whose abbreviation equals its full name gets
# an empty tuple.

def collect_property_names():
    global script_names
    global abbreviations

    names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_]+) #')

    last_script_name = ""
    with open("Unicode.tables/Scripts.txt", encoding='utf-8') as f:
        for line in f:
            match_obj = names_re.match(line)

            if match_obj is None or match_obj.group(1) == last_script_name:
                continue

            last_script_name = match_obj.group(1)
            script_names.append(last_script_name)

    # Sometimes there is comment in the line
    # so splitting around semicolon is not enough
    value_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_ ]+))?')

    with open("Unicode.tables/PropertyValueAliases.txt", encoding='utf-8') as f:
        for line in f:
            match_obj = value_alias_re.match(line)

            if match_obj is None:
                continue

            if match_obj.group(1) == "sc":
                short_name = match_obj.group(2)
                long_name = match_obj.group(3)
                extra_name = match_obj.group(4)
                if short_name == long_name:
                    abbreviations[long_name] = ()
                elif extra_name is None:
                    abbreviations[long_name] = (short_name,)
                else:
                    abbreviations[long_name] = (short_name, extra_name)

    # We can also collect Boolean property abbreviations into the same dictionary

    bin_alias_re = re.compile(r' *([A-Za-z_]+) *; *([A-Za-z_]+)(?: *; *([A-Za-z_]+))?')
    with open("Unicode.tables/PropertyAliases.txt", encoding='utf-8') as f:
        for line in f:
            match_obj = bin_alias_re.match(line)
            if match_obj is None:
                continue

            if match_obj.group(2) in bool_properties:
                if match_obj.group(3) is None:
                    abbreviations[match_obj.group(2)] = (match_obj.group(1),)
                else:
                    abbreviations[match_obj.group(2)] = (match_obj.group(1), match_obj.group(3))

collect_property_names()



# ---------------------------------------------------------------------------
# REORDERING SCRIPT NAMES
# ---------------------------------------------------------------------------

script_abbrevs = []

# Build script_abbrevs (one abbreviation per entry of script_names, falling
# back to the full name when there is none) and then reorder both lists so
# that every script mentioned in ScriptExtensions.txt comes before every
# script that is not. Relative order inside each group is preserved.

def reorder_scripts():
    global script_names
    global script_abbrevs
    global abbreviations

    for name in script_names:
        abbrevs = abbreviations[name]
        script_abbrevs.append(name if len(abbrevs) == 0 else abbrevs[0])

    extended_script_abbrevs = set()
    with open("Unicode.tables/ScriptExtensions.txt", encoding='utf-8') as f:
        names_re = re.compile(r'^[0-9A-F]{4,6}(?:\.\.[0-9A-F]{4,6})? +; ([A-Za-z_ ]+) #')

        for line in f:
            match_obj = names_re.match(line)

            if match_obj is None:
                continue

            for name in match_obj.group(1).split(" "):
                extended_script_abbrevs.add(name)

    new_script_names = []
    new_script_abbrevs = []

    # First pass: scripts that occur in some extension list
    for idx, abbrev in enumerate(script_abbrevs):
        if abbrev in extended_script_abbrevs:
            new_script_names.append(script_names[idx])
            new_script_abbrevs.append(abbrev)

    # Second pass: all the others
    for idx, abbrev in enumerate(script_abbrevs):
        if abbrev not in extended_script_abbrevs:
            new_script_names.append(script_names[idx])
            new_script_abbrevs.append(abbrev)

    script_names = new_script_names
    script_abbrevs = new_script_abbrevs

reorder_scripts()

# Number of 32-bit words needed for one bit per script that precedes 'Unknown'
script_list_item_size = (script_names.index('Unknown') + 31) // 32


# ---------------------------------------------------------------------------
# DERIVED LISTS
# ---------------------------------------------------------------------------

# Create general character property names from the first letters of the
# particular categories.

gcn_set = set(category_names[i][0] for i in range(0, len(category_names), 2))
general_category_names = list(gcn_set)
general_category_names.sort()


# ---------------------------------------------------------------------------
# FUNCTIONS
# ---------------------------------------------------------------------------

# Open an output file, using the command's argument or a default. Write common
# preliminary header information.

def open_output(default):
    """Open the output file and write the standard preamble to it.

    The file name is sys.argv[1] when exactly one command-line argument was
    given, otherwise `default`. More than one argument, or a failure to open
    the file, prints a message and exits with status 1. The preamble is the
    PCRE banner, a pointer back to the generating script (the part of
    sys.argv[0] after the last '/') and the licence text. The file object is
    returned still open for writing.
    """
    argc = len(sys.argv)
    if argc > 2:
        print('** Too many arguments: just give a file name')
        sys.exit(1)
    output_name = sys.argv[1] if argc == 2 else default

    try:
        file = open(output_name, "w")
    except IOError:
        print ("** Couldn't open %s" % output_name)
        sys.exit(1)

    # rfind() gives -1 when there is no '/', which makes the slice start at 0
    invoked_as = sys.argv[0]
    script_name = invoked_as[invoked_as.rfind('/') + 1:]

    banner = """\
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

 Written by Philip Hazel
 Original API code Copyright (c) 1997-2012 University of Cambridge
 New API code Copyright (c) 2016-2022 University of Cambridge

This module is auto-generated from Unicode data files. DO NOT EDIT MANUALLY!
"""

    pointer = ("Instead, modify the maint/%s script and run it to generate\n"
               "a new version of this code.\n\n" % script_name)

    licence = """\
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice,
 this list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in the
 documentation and/or other materials provided with the distribution.

 * Neither the name of the University of Cambridge nor the names of its
 contributors may be used to endorse or promote products derived from
 this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
\n"""

    file.write(banner + pointer + licence)
    return file

# End of UcpCommon.py

# diff --git a/pcre2/maint/GenerateTest26.py b/pcre2/maint/GenerateTest26.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..2afdf25e5d041d1d6a721aefc308a6c368c95c00
# --- /dev/null
# +++ b/pcre2/maint/GenerateTest26.py
# @@ -0,0 +1,188 @@
#! /usr/bin/python

# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------
#
# This file auto-generates unicode property tests and their expected output.
# It is recommended to re-run this generator after the unicode files are
# updated.
# The names of the generated files are `testinput26` and `testoutput26`

import re
import sys

from GenerateCommon import \
    script_names, \
    script_abbrevs

# Write the same text to both the test input and the expected output file.

def write_both(text):
    input_file.write(text)
    output_file.write(text)

# Return the test-file spelling of a code point: printable ASCII (32..127) is
# emitted as the character itself, everything else as \x{...} in hexadecimal,
# with a leading zero added for values below 16.

def to_string_char(ch_idx):
    if ch_idx < 128:
        if ch_idx < 16:
            return "\\x{0%x}" % ch_idx
        if ch_idx >= 32:
            return chr(ch_idx)
    return "\\x{%x}" % ch_idx

output_directory = ""

if len(sys.argv) > 2:
    print('** Too many arguments: just give a directory name')
    sys.exit(1)
if len(sys.argv) == 2:
    output_directory = sys.argv[1]
    if not output_directory.endswith("/"):
        output_directory += "/"

try:
    input_file = open(output_directory + "testinput26", "w")
    output_file = open(output_directory + "testoutput26", "w")
except IOError:
    print ("** Couldn't open output files")
    sys.exit(1)

write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")

# ---------------------------------------------------------------------------
# UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------

write_both("# Unicode Script Extension tests.\n\n")

# Generate, for every script except "Unknown", a set of \p{...} tests.
#
# One five-element record is kept per script (indexed like script_names):
#   [0] start of the first range listed for the script in Scripts.txt
#   [1] end of the most recently listed range for the script in Scripts.txt
#   [2] lowest code point listing the script in ScriptExtensions.txt, or None
#   [3] highest code point listing the script in ScriptExtensions.txt, or None
#   [4] first code point found that lists the script as an extension while
#       Scripts.txt gives it a different script (or no script), or None
#
# NOTE(review): the source this was recovered from had lost its indentation;
# the "extension only character" tests are placed inside the "has extensions"
# branch because the diagnostic in their else-arm only makes sense there.
# Confirm against the upstream file.

def gen_script_tests():
    script_data = [None] * len(script_names)
    char_data = [None] * 0x110000

    property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")
    prev_name = ""
    script_idx = -1

    with open("Unicode.tables/Scripts.txt") as f:
        for line in f:
            match_obj = property_re.match(line)

            if match_obj is None:
                continue

            name = match_obj.group(3)
            if name != prev_name:
                script_idx = script_names.index(name)
                prev_name = name

            low = int(match_obj.group(1), 16)
            high = low
            char_data[low] = name

            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)
                for code_point in range(low + 1, high + 1):
                    char_data[code_point] = name

            if script_data[script_idx] is None:
                script_data[script_idx] = [low, None, None, None, None]
            script_data[script_idx][1] = high

    extended_script_indicies = {}

    with open("Unicode.tables/ScriptExtensions.txt") as f:
        for line in f:
            match_obj = property_re.match(line)

            if match_obj is None:
                continue

            low = int(match_obj.group(1), 16)
            high = low
            if match_obj.group(2) is not None:
                high = int(match_obj.group(2), 16)

            for abbrev in match_obj.group(3).split(" "):
                if abbrev not in extended_script_indicies:
                    idx = script_abbrevs.index(abbrev)
                    extended_script_indicies[abbrev] = idx
                    rec = script_data[idx]
                    rec[2] = low
                    rec[3] = high
                else:
                    idx = extended_script_indicies[abbrev]
                    rec = script_data[idx]
                    if rec[2] > low:
                        rec[2] = low
                    if rec[3] < high:
                        rec[3] = high

                if rec[4] is None:
                    # Look for a character that has this script only as an
                    # extension, i.e. whose base script is something else.
                    name = script_names[idx]
                    for code_point in range(low, high + 1):
                        if char_data[code_point] != name:
                            rec[4] = code_point
                            break

    # Alternate between the short and long spelling of the property name
    long_property_name = False

    for idx, rec in enumerate(script_data):
        script_name = script_names[idx]

        if script_name == "Unknown":
            continue

        script_abbrev = script_abbrevs[idx]

        write_both("# Base script check\n")
        write_both("/^\\p{sc=%s}/utf\n" % script_name)
        write_both(" %s\n" % to_string_char(rec[0]))
        output_file.write(" 0: %s\n" % to_string_char(rec[0]))
        write_both("\n")

        write_both("/^\\p{Script=%s}/utf\n" % script_abbrev)
        write_both(" %s\n" % to_string_char(rec[1]))
        output_file.write(" 0: %s\n" % to_string_char(rec[1]))
        write_both("\n")

        if rec[2] is not None:
            property_name = "scx"
            if long_property_name:
                property_name = "Script_Extensions"

            write_both("# Script extension check\n")
            write_both("/^\\p{%s}/utf\n" % script_name)
            write_both(" %s\n" % to_string_char(rec[2]))
            output_file.write(" 0: %s\n" % to_string_char(rec[2]))
            write_both("\n")

            write_both("/^\\p{%s=%s}/utf\n" % (property_name, script_abbrev))
            write_both(" %s\n" % to_string_char(rec[3]))
            output_file.write(" 0: %s\n" % to_string_char(rec[3]))
            write_both("\n")

            long_property_name = not long_property_name

            if rec[4] is not None:
                write_both("# Script extension only character\n")
                write_both("/^\\p{%s}/utf\n" % script_name)
                write_both(" %s\n" % to_string_char(rec[4]))
                output_file.write(" 0: %s\n" % to_string_char(rec[4]))
                write_both("\n")

                write_both("/^\\p{sc=%s}/utf\n" % script_name)
                write_both(" %s\n" % to_string_char(rec[4]))
                output_file.write("No match\n")
                write_both("\n")
            else:
                print("External character has not found for %s" % script_name)

        # The code point just past the script's highest known character
        high = rec[1]
        if rec[3] is not None and rec[3] > rec[1]:
            high = rec[3]
        write_both("# Character not in script\n")
        write_both("/^\\p{%s}/utf\n" % script_name)
        write_both(" %s\n" % to_string_char(high + 1))
        output_file.write("No match\n")
        write_both("\n")


gen_script_tests()

write_both("# End of testinput26\n")

# Close explicitly so the generated files are complete on disk when the
# script finishes, rather than relying on interpreter shutdown.
input_file.close()
output_file.close()

# diff --git a/pcre2/maint/GenerateUcd.py b/pcre2/maint/GenerateUcd.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..6081800d0ce2a56295408e6252b77da262e26318
# --- /dev/null
# +++ b/pcre2/maint/GenerateUcd.py
# @@ -0,0 +1,923 @@
#! /usr/bin/python

# PCRE2 UNICODE PROPERTY SUPPORT
# ------------------------------
#
# This script generates the pcre2_ucd.c file from Unicode data files. This is
# the compressed Unicode property data used by PCRE2.
The script was created in +# December 2021 as part of the Unicode data generation refactoring. It is +# basically a re-working of the MultiStage2.py script that was submitted to the +# PCRE project by Peter Kankowski in 2008 as part of a previous upgrading of +# Unicode property support. A number of extensions have since been added. The +# main difference in the 2021 upgrade (apart from comments and layout) is that +# the data tables (e.g. list of script names) are now listed in or generated by +# a separate Python module that is shared with the other Generate scripts. +# +# This script must be run in the "maint" directory. It requires the following +# Unicode data tables: BidiMirroring.txt, CaseFolding.txt, +# DerivedBidiClass.txt, DerivedCoreProperties.txt, DerivedGeneralCategory.txt, +# GraphemeBreakProperty.txt, PropList.txt, PropertyAliases.txt, +# PropertyValueAliases.txt, ScriptExtensions.txt, Scripts.txt, and +# emoji-data.txt. These must be in the Unicode.tables subdirectory. +# +# The emoji-data.txt file is found in the "emoji" subdirectory even though it +# is technically part of a different (but coordinated) standard as shown +# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"), +# for example: +# +# http://unicode.org/Public/emoji/13.0/ReadMe.txt +# +# DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted" +# subdirectory of the Unicode database (UCD) on the Unicode web site; +# GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. The other files +# are in the top-level UCD directory. +# +# ----------------------------------------------------------------------------- +# Minor modifications made to the original script: +# Added #! line at start +# Removed tabs +# Made it work with Python 2.4 by rewriting two statements that needed 2.5 +# Consequent code tidy +# Adjusted data file names to take from the Unicode.tables directory +# Adjusted global table names by prefixing _pcre_.
+# Commented out stuff relating to the casefolding table, which isn't used; +# removed completely in 2012. +# Corrected size calculation +# Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed. +# Update for PCRE2: name changes, and SUPPORT_UCP is abolished. +# +# Major modifications made to the original script: +# Added code to add a grapheme break property field to records. +# +# Added code to search for sets of more than two characters that must match +# each other caselessly. A new table is output containing these sets, and +# offsets into the table are added to the main output records. This new +# code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer +# used. +# +# Update for Python3: +# . Processed with 2to3, but that didn't fix everything +# . Changed string.strip to str.strip +# . Added encoding='utf-8' to the open() call +# . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is +# required and the result of the division is a float +# +# Added code to scan the emoji-data.txt file to find the Extended Pictographic +# property, which is used by PCRE2 as a grapheme breaking property. This was +# done when updating to Unicode 11.0.0 (July 2018). +# +# Added code to add a Script Extensions field to records. This has increased +# their size from 8 to 12 bytes, only 10 of which are currently used. +# +# Added code to add a bidi class field to records by scanning the +# DerivedBidiClass.txt and PropList.txt files. This uses one of the two spare +# bytes, so now 11 out of 12 are in use. +# +# 01-March-2010: Updated list of scripts for Unicode 5.2.0 +# 30-April-2011: Updated list of scripts for Unicode 6.0.0 +# July-2012: Updated list of scripts for Unicode 6.1.0 +# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new +# field in the record to hold the value. Luckily, the +# structure had a hole in it, so the resulting table is +# not much bigger than before. 
+# 18-September-2012: Added code for multiple caseless sets. This uses the +# final hole in the structure. +# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0 +# 13-May-2014: Updated for PCRE2 +# 03-June-2014: Updated for Python 3 +# 20-June-2014: Updated for Unicode 7.0.0 +# 12-August-2014: Updated to put Unicode version into the file +# 19-June-2015: Updated for Unicode 8.0.0 +# 02-July-2017: Updated for Unicode 10.0.0 +# 03-July-2018: Updated for Unicode 11.0.0 +# 07-July-2018: Added code to scan emoji-data.txt for the Extended +# Pictographic property. +# 01-October-2018: Added the 'Unknown' script name +# 03-October-2018: Added new field for Script Extensions +# 27-July-2019: Updated for Unicode 12.1.0 +# 10-March-2020: Updated for Unicode 13.0.0 +# PCRE2-10.39: Updated for Unicode 14.0.0 +# 05-December-2021: Added code to scan DerivedBidiClass.txt for bidi class, +# and also PropList.txt for the Bidi_Control property +# 19-December-2021: Reworked script extensions lists to be bit maps instead +# of zero-terminated lists of script numbers. +# ---------------------------------------------------------------------------- +# +# Changes to the refactored script: +# +# 26-December-2021: Refactoring completed +# 10-January-2022: Addition of general Boolean property support +# 12-January-2022: Merge scriptx and bidiclass fields +# 14-January-2022: Enlarge Boolean property offset to 12 bits +# +# ---------------------------------------------------------------------------- +# +# +# The main tables generated by this script are used by macros defined in +# pcre2_internal.h. They look up Unicode character properties using short +# sequences of code that contains no branches, which makes for greater speed. +# +# Conceptually, there is a table of records (of type ucd_record), one for each +# Unicode character. 
Each record contains the script number, script extension +# value, character type, grapheme break type, offset to caseless matching set, +# offset to the character's other case, the bidi class, and offset to bitmap of +# Boolean properties. +# +# A real table covering all Unicode characters would be far too big. It can be +# efficiently compressed by observing that many characters have the same +# record, and many blocks of characters (taking 128 characters in a block) have +# the same set of records as other blocks. This leads to a 2-stage lookup +# process. +# +# This script constructs seven tables. The ucd_caseless_sets table contains +# lists of characters that all match each other caselessly. Each list is +# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than +# any valid character. The first list is empty; this is used for characters +# that are not part of any list. +# +# The ucd_digit_sets table contains the code points of the '9' characters in +# each set of 10 decimal digits in Unicode. This is used to ensure that digits +# in script runs all come from the same set. The first element in the vector +# contains the number of subsequent elements, which are in ascending order. +# +# Scripts are partitioned into two groups. Scripts that appear in at least one +# character's script extension list come first, followed by "Unknown" and then +# all the rest. This sorting is done automatically in the GenerateCommon.py +# script. A script's number is its index in the script_names list. +# +# The ucd_script_sets table contains bitmaps that represent lists of scripts +# for Script Extensions properties. Each bitmap consists of a fixed number of +# unsigned 32-bit numbers, enough to allocate a bit for every script that is +# used in any character's extension list, that is, enough for every script +# whose number is less than ucp_Unknown. A character's script extension value +# in its ucd record is an offset into the ucd_script_sets vector. 
The first +# bitmap has no bits set; characters that have no script extensions have zero +# as their script extensions value so that they use this map. +# +# The ucd_boolprop_sets table contains bitmaps that represent lists of Boolean +# properties. Each bitmap consists of a fixed number of unsigned 32-bit +# numbers, enough to allocate a bit for each supported Boolean property. +# +# The ucd_records table contains one instance of every unique character record +# that is required. The ucd_stage1 table is indexed by a character's block +# number, which is the character's code point divided by 128, since 128 is the +# size of each block. The result of a lookup in ucd_stage1 is a "virtual" block +# number. +# +# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by +# the offset of a character within its own block, and the result is the index +# number of the required record in the ucd_records vector. +# +# The following examples are correct for the Unicode 14.0.0 database. Future +# updates may change the actual lookup values. +# +# Example: lowercase "a" (U+0061) is in block 0 +# lookup 0 in stage1 table yields 0 +# lookup 97 (0x61) in the first table in stage2 yields 35 +# record 35 is { 0, 5, 12, 0, -32, 18432, 44 } +# 0 = ucp_Latin => Latin script +# 5 = ucp_Ll => Lower case letter +# 12 = ucp_gbOther => Grapheme break property "Other" +# 0 => Not part of a caseless set +# -32 (-0x20) => Other case is U+0041 +# 18432 = 0x4800 => Combined Bidi class + script extension values +# 44 => Offset to Boolean properties +# +# The top 5 bits of the sixth field are the Bidi class, with the rest being the +# script extension value, giving: +# +# 9 = ucp_bidiL => Bidi class left-to-right +# 0 => No special script extension property +# +# Almost all lowercase latin characters resolve to the same record. One or two +# are different because they are part of a multi-character caseless set (for +# example, k, K and the Kelvin symbol are such a set).
+# +# Example: hiragana letter A (U+3042) is in block 96 (0x60) +# lookup 96 in stage1 table yields 93 +# lookup 66 (0x42) in table 93 in stage2 yields 819 +# record 819 is { 20, 7, 12, 0, 0, 18432, 82 } +# 20 = ucp_Hiragana => Hiragana script +# 7 = ucp_Lo => Other letter +# 12 = ucp_gbOther => Grapheme break property "Other" +# 0 => Not part of a caseless set +# 0 => No other case +# 18432 = 0x4800 => Combined Bidi class + script extension values +# 82 => Offset to Boolean properties +# +# The top 5 bits of the sixth field are the Bidi class, with the rest being the +# script extension value, giving: +# +# 9 = ucp_bidiL => Bidi class left-to-right +# 0 => No special script extension property +# +# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39) +# lookup 57 in stage1 table yields 55 +# lookup 80 (0x50) in table 55 in stage2 yields 621 +# record 621 is { 84, 12, 3, 0, 0, 26762, 96 } +# 84 = ucp_Inherited => Script inherited from predecessor +# 12 = ucp_Mn => Non-spacing mark +# 3 = ucp_gbExtend => Grapheme break property "Extend" +# 0 => Not part of a caseless set +# 0 => No other case +# 26762 = 0x688A => Combined Bidi class + script extension values +# 96 => Offset to Boolean properties +# +# The top 5 bits of the sixth field are the Bidi class, with the rest being the +# script extension value, giving: +# +# 13 = ucp_bidiNSM => Bidi class non-spacing mark +# 138 => Script Extension list offset = 138 +# +# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8, +# 18, and 47 set. This means that this character is expected to be used with +# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha. +# +# Philip Hazel, last updated 14 January 2022. 
##############################################################################


# Import standard modules

import re
import string
import sys

# Import common data lists and functions

from GenerateCommon import \
    bidi_classes, \
    bool_properties, \
    bool_propsfiles, \
    bool_props_list_item_size, \
    break_properties, \
    category_names, \
    general_category_names, \
    script_abbrevs, \
    script_list_item_size, \
    script_names, \
    open_output

# Some general parameters

MAX_UNICODE = 0x110000
NOTACHAR = 0xffffffff


# ---------------------------------------------------------------------------
# DEFINE FUNCTIONS
# ---------------------------------------------------------------------------


# Parse a line of Scripts.txt, GraphemeBreakProperty.txt, DerivedBidiClass.txt
# or DerivedGeneralCategory.txt. Returns a function that maps a split data
# line to the index of its second field within the given list of names.

def make_get_names(enum):
    return lambda chardata: enum.index(chardata[1])


# Parse a line of CaseFolding.txt. For lines whose status field is 'C' or 'S'
# the result is the signed distance from the character to its folded form;
# other lines give 0.

def get_other_case(chardata):
    if chardata[1] == 'C' or chardata[1] == 'S':
        return int(chardata[2], 16) - int(chardata[0], 16)
    return 0


# Parse a line of ScriptExtensions.txt. Returns the offset, counted in 32-bit
# words, of the line's script list within the bitmaps that will be written
# from script_lists. A list identical to the one on the previous data line
# reuses that entry; otherwise a new tuple of script numbers is appended to
# the global script_lists.

def get_script_extension(chardata):
    global last_script_extension

    offset = len(script_lists) * script_list_item_size
    if last_script_extension == chardata[1]:
        return offset - script_list_item_size

    last_script_extension = chardata[1]
    script_lists.append(tuple(script_abbrevs.index(abbrev) for abbrev in last_script_extension.split(' ')))
    return offset


# Read a whole table in memory, setting/checking the Unicode version.
#
# file_name must have the form "<directory>/<base>.txt" and the file's first
# line must be "# <base>-<n.n.n>.txt", from which the version is taken. The
# first file read sets the global unicode_version; a later file with a
# different version produces a warning on stderr. get_value is called once for
# every data line, in file order, and its result is stored for each code point
# of the line's range that still holds default_value. The result is a list of
# MAX_UNICODE values.

def read_table(file_name, get_value, default_value):
    global unicode_version

    name_match = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
    file_base = name_match.group(1)
    version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"

    table = [default_value] * MAX_UNICODE

    # The with-statement closes the file even if a line fails to parse
    with open(file_name, 'r', encoding='utf-8') as file:
        version = re.match(version_pat, file.readline()).group(1)
        if unicode_version == "":
            unicode_version = version
        elif unicode_version != version:
            print("WARNING: Unicode version differs in %s" % file_name, file=sys.stderr)

        for line in file:
            line = re.sub(r'#.*', '', line)
            chardata = list(map(str.strip, line.split(';')))
            if len(chardata) <= 1:
                continue
            value = get_value(chardata)
            m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
            char = int(m.group(1), 16)
            if m.group(3) is None:
                last = char
            else:
                last = int(m.group(3), 16)
            for i in range(char, last + 1):
                # It is important not to overwrite a previously set value because in the
                # CaseFolding file there are lines to be ignored (returning the default
                # value of 0) which often come after a line which has already set data.
                if table[i] == default_value:
                    table[i] = value
    return table


# Get the smallest possible C language type for the values in a table. The
# result is a (type name, size in bytes) pair; unsigned types are tried before
# signed ones. Raises OverflowError if no 32-bit type can hold the values.

def get_type_size(table):
    type_size = [("uint8_t", 1), ("uint16_t", 2), ("uint32_t", 4),
                 ("signed char", 1), ("int16_t", 2), ("int32_t", 4)]
    limits = [(0, 255), (0, 65535), (0, 4294967295), (-128, 127),
              (-32768, 32767), (-2147483648, 2147483647)]
    minval = min(table)
    maxval = max(table)
    for num, (minlimit, maxlimit) in enumerate(limits):
        if minlimit <= minval and maxval <= maxlimit:
            return type_size[num]
    raise OverflowError("Too large to fit into C types")


# Get the total size of a list of tables, in bytes

def get_tables_size(*tables):
    total_size = 0
    for table in tables:
        _, size = get_type_size(table)
        total_size += size * len(table)
    return total_size


# Compress a table into the two stages. The table is cut into blocks of
# block_size values; each distinct block is stored once in stage2, and stage1
# holds, for every block of the input, the number of its block in stage2.

def compress_table(table, block_size):
    blocks = {} # Dictionary for finding identical blocks
    stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
    stage2 = [] # Stage 2 table contains the blocks with property values
    table = tuple(table)
    for i in range(0, len(table), block_size):
        block = table[i:i+block_size]
        start = blocks.get(block)
        if start is None:
            # Allocate a new block. Floor division keeps the block number an
            # int; "/" would store a float in the stage 1 table.
            start = len(stage2) // block_size
            stage2 += block
            blocks[block] = start
        stage1.append(start)
    return stage1, stage2


# Output a table as a C array definition on the global output file f. Without
# a block size the values are written 16 to a line, each line followed by a
# comment giving the code point at which it starts; with a block size the
# values are grouped under "block N" comments.

def write_table(table, table_name, block_size = None):
    type, size = get_type_size(table)
    ELEMS_PER_LINE = 16

    s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
    if block_size:
        s += ", block = %d" % block_size
    f.write(s + " */\n")
    table = tuple(table)
    if block_size is None:
        fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */\n"
        mult = MAX_UNICODE / len(table)
        for i in range(0, len(table), ELEMS_PER_LINE):
            f.write(fmt % (table[i:i+ELEMS_PER_LINE] + (int(i * mult),)))
    else:
        if block_size > ELEMS_PER_LINE:
            el = ELEMS_PER_LINE
        else:
            el = block_size
        fmt = "%3d," * el + "\n"
        if block_size > ELEMS_PER_LINE:
            fmt = fmt * (block_size // ELEMS_PER_LINE)
        for i in range(0, len(table), block_size):
            f.write(("\n/* block %d */\n" + fmt) % ((i // block_size,) + table[i:i+block_size]))
    f.write("};\n\n")


# Extract the unique combinations of properties into records. Returns a list
# giving, for every position in the input tables, the number of its record,
# and a dictionary that maps each distinct tuple of values to its record
# number (numbers are assigned in order of first appearance).

def combine_tables(*tables):
    records = {}
    index = []
    for t in zip(*tables):
        i = records.get(t)
        if i is None:
            i = records[t] = len(records)
        index.append(i)
    return index, records


# Create a record struct. Returns the size in bytes of one record, including
# alignment padding, and the text of a C structure definition with one field
# per property.

def get_record_size_struct(records):
    size = 0
    structure = 'typedef struct {\n'
    for i in range(len(records[0])):
        record_slice = [record[i] for record in records]
        slice_type, slice_size = get_type_size(record_slice)
        # add padding: round up to the nearest multiple of slice_size (the
        # mask trick works because slice_size is always 1, 2 or 4)
        size = (size + slice_size - 1) & -slice_size
        size += slice_size
        structure += '%s property_%d;\n' % (slice_type, i)

    # round up to the first item of the next structure in array
    record_slice = [record[0] for record in records]
    slice_type, slice_size = get_type_size(record_slice)
    size = (size + slice_size - 1) & -slice_size

    structure += '} ucd_record;\n*/\n'
    return size, structure


# Write records to the global output file f, in record number order, each
# followed by a comment containing its number.

def write_records(records, record_size):
    f.write('const ucd_record PRIV(ucd_records)[] = { ' + \
      '/* %d bytes, record size %d */\n' % (len(records) * record_size, record_size))
    records = list(zip(list(records.keys()), list(records.values())))
    records.sort(key = lambda x: x[1])
    for i, record in enumerate(records):
        f.write((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */\n') % (record[0] + (i,)))
    f.write('};\n\n')


# Write a bit set. Each element of the list is a collection of bit numbers; it
# is written to the global output file f as item_size 32-bit hexadecimal
# words, one element per line, followed by the closing brace of the table.

def write_bitsets(list, item_size):
    for d in list:
        bitwords = [0] * item_size
        for idx in d:
            bitwords[idx // 32] |= 1 << (idx & 31)
        s = " "
        for x in bitwords:
            f.write("%s" % s)
            s = ", "
            f.write("0x%08xu" % x)
        f.write(",\n")
    f.write("};\n\n")


# ---------------------------------------------------------------------------
# This bit of code must have been useful when the original script was being
# developed. Retain it just in case it is ever needed again.

# def test_record_size():
#   tests = [ \
#     ( [(3,), (6,), (6,), (1,)], 1 ), \
#     ( [(300,), (600,), (600,), (100,)], 2 ), \
#     ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
#     ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
#     ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
#     ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
#     ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
#     ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
#   ]
#   for test in tests:
#     size, struct = get_record_size_struct(test[0])
#     assert(size == test[1])
# test_record_size()
# ---------------------------------------------------------------------------



# ---------------------------------------------------------------------------
# MAIN CODE FOR CREATING TABLES
# ---------------------------------------------------------------------------

unicode_version = ""

# Some of the tables imported from GenerateCommon.py have alternate comment
# strings for use by GenerateUcpHeader. The comments are not wanted here, so
# remove them.

bidi_classes = bidi_classes[::2]
break_properties = break_properties[::2]
category_names = category_names[::2]

# Create the various tables from Unicode data files

script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_properties), break_properties.index('Other'))
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', make_get_names(bidi_classes), bidi_classes.index('L'))

# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
# we need to find the Extended_Pictographic property for emoji characters. This
# can be set as an additional grapheme break property, because the default for
# all the emojis is "other". We scan the emoji-data.txt file and modify the
# break-props table.

gb_other = break_properties.index('Other')
gb_pictographic = break_properties.index('Extended_Pictographic')

with open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = re.sub(r'#.*', '', line)
        chardata = list(map(str.strip, line.split(';')))
        if len(chardata) <= 1:
            continue
        if chardata[1] != "Extended_Pictographic":
            continue
        m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
        char = int(m.group(1), 16)
        if m.group(3) is None:
            last = char
        else:
            last = int(m.group(3), 16)
        for i in range(char, last + 1):
            if break_props[i] != gb_other:
                # The values must be applied with %; passing them as extra
                # print() arguments leaves the %x and %s unexpanded.
                print("WARNING: Emoji 0x%x has break property %s, not 'Other'"
                      % (i, break_properties[break_props[i]]), file=sys.stderr)
            break_props[i] = gb_pictographic

# Handle script extensions. The get_script_extension() function maintains a
# list of unique bitmaps representing lists of scripts, returning the offset
# in that list. Initialize the list with an empty set, which is used for
# characters that have no script extensions.

script_lists = [[]]
last_script_extension = ""
scriptx_bidi_class = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, 0)

# Merge the bidi class into the same field, above the low 11 bits that hold
# the script extension offset.

for idx in range(len(scriptx_bidi_class)):
    scriptx_bidi_class[idx] = scriptx_bidi_class[idx] | (bidi_class[idx] << 11)
bidi_class = None

# Find the Boolean properties of each character. This next bit of magic creates
# a list of empty lists. Using [[]] * MAX_UNICODE gives a list of references to
# the *same* list, which is not what we want.

bprops = [[] for _ in range(MAX_UNICODE)]

# Collect the properties from the various files. They are read as UTF-8, like
# the other data files above, rather than in the locale's default encoding.

for filename in bool_propsfiles:
    try:
        file = open('Unicode.tables/' + filename, 'r', encoding='utf-8')
    except IOError:
        print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
        sys.exit(1)

    with file:
        for line in file:
            line = re.sub(r'#.*', '', line)
            data = list(map(str.strip, line.split(';')))
            if len(data) <= 1:
                continue

            # Properties that are not in the supported list are skipped
            try:
                ix = bool_properties.index(data[1])
            except ValueError:
                continue

            m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', data[0])
            char = int(m.group(1), 16)
            if m.group(3) is None:
                last = char
            else:
                last = int(m.group(3), 16)

            for i in range(char, last + 1):
                bprops[i].append(ix)

# The ASCII property isn't listed in any files, but it is easy enough to add
# it manually.

ix = bool_properties.index("ASCII")
for i in range(128):
    bprops[i].append(ix)

# The Bidi_Mirrored property isn't listed in any property files. We have to
# deduce it from the file that lists the mirrored characters.

ix = bool_properties.index("Bidi_Mirrored")

try:
    file = open('Unicode.tables/BidiMirroring.txt', 'r', encoding='utf-8')
except IOError:
    print(f"** Couldn't open {'Unicode.tables/BidiMirroring.txt'}\n")
    sys.exit(1)

with file:
    for line in file:
        line = re.sub(r'#.*', '', line)
        data = list(map(str.strip, line.split(';')))
        if len(data) <= 1:
            continue
        c = int(data[0], 16)
        bprops[c].append(ix)

# Scan each character's boolean property list and create a list of unique
# lists, at the same time, setting the index in that list for each property in
# the bool_props vector.
+ +bool_props = [0] * MAX_UNICODE +bool_props_lists = [[]] + +for c in range(MAX_UNICODE): + s = set(bprops[c]) + for i in range(len(bool_props_lists)): + if s == set(bool_props_lists[i]): + break; + else: + bool_props_lists.append(bprops[c]) + i += 1 + + bool_props[c] = i * bool_props_list_item_size + +# This block of code was added by PH in September 2012. It scans the other_case +# table to find sets of more than two characters that must all match each other +# caselessly. Later in this script a table of these sets is written out. +# However, we have to do this work here in order to compute the offsets in the +# table that are inserted into the main table. + +# The CaseFolding.txt file lists pairs, but the common logic for reading data +# sets only one value, so first we go through the table and set "return" +# offsets for those that are not already set. + +for c in range(MAX_UNICODE): + if other_case[c] != 0 and other_case[c + other_case[c]] == 0: + other_case[c + other_case[c]] = -other_case[c] + +# Now scan again and create equivalence sets. + +caseless_sets = [] + +for c in range(MAX_UNICODE): + o = c + other_case[c] + + # Trigger when this character's other case does not point back here. We + # now have three characters that are case-equivalent. + + if other_case[o] != -other_case[c]: + t = o + other_case[o] + + # Scan the existing sets to see if any of the three characters are already + # part of a set. If so, unite the existing set with the new set. + + appended = 0 + for s in caseless_sets: + found = 0 + for x in s: + if x == c or x == o or x == t: + found = 1 + + # Add new characters to an existing set + + if found: + found = 0 + for y in [c, o, t]: + for x in s: + if x == y: + found = 1 + if not found: + s.append(y) + appended = 1 + + # If we have not added to an existing set, create a new one. + + if not appended: + caseless_sets.append([c, o, t]) + +# End of loop looking for caseless sets. 
+ +# Now scan the sets and set appropriate offsets for the characters. + +caseless_offsets = [0] * MAX_UNICODE + +offset = 1; +for s in caseless_sets: + for x in s: + caseless_offsets[x] = offset + offset += len(s) + 1 + +# End of block of code for creating offsets for caseless matching sets. + + +# Combine all the tables + +table, records = combine_tables(script, category, break_props, + caseless_offsets, other_case, scriptx_bidi_class, bool_props) + +# Find the record size and create a string definition of the structure for +# outputting as a comment. + +record_size, record_struct = get_record_size_struct(list(records.keys())) + +# Find the optimum block size for the two-stage table + +min_size = sys.maxsize +for block_size in [2 ** i for i in range(5,10)]: + size = len(records) * record_size + stage1, stage2 = compress_table(table, block_size) + size += get_tables_size(stage1, stage2) + #print "/* block size %5d => %5d bytes */" % (block_size, size) + if size < min_size: + min_size = size + min_stage1, min_stage2 = stage1, stage2 + min_block_size = block_size + + +# --------------------------------------------------------------------------- +# MAIN CODE FOR WRITING THE OUTPUT FILE +# --------------------------------------------------------------------------- + +# Open the output file (no return on failure). This call also writes standard +# header boilerplate. + +f = open_output("pcre2_ucd.c") + +# Output this file's heading text + +f.write("""\ +/* This file contains tables of Unicode properties that are extracted from +Unicode data files. See the comments at the start of maint/GenerateUcd.py for +details. + +As well as being part of the PCRE2 library, this file is #included by the +pcre2test program, which redefines the PRIV macro to change table names from +_pcre2_xxx to xxxx, thereby avoiding name clashes with the library. At present, +just one of these tables is actually needed. When compiling the library, some +headers are needed. 
*/ + +#ifndef PCRE2_PCRE2TEST +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#include "pcre2_internal.h" +#endif /* PCRE2_PCRE2TEST */ + +/* The tables herein are needed only when UCP support is built, and in PCRE2 +that happens automatically with UTF support. This module should not be +referenced otherwise, so it should not matter whether it is compiled or not. +However a comment was received about space saving - maybe the guy linked all +the modules rather than using a library - so we include a condition to cut out +the tables when not needed. But don't leave a totally empty module because some +compilers barf at that. Instead, just supply some small dummy tables. */ + +#ifndef SUPPORT_UNICODE +const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0}}; +const uint16_t PRIV(ucd_stage1)[] = {0}; +const uint16_t PRIV(ucd_stage2)[] = {0}; +const uint32_t PRIV(ucd_caseless_sets)[] = {0}; +#else +\n""") + +# --- Output some variable heading stuff --- + +f.write("/* Total size: %d bytes, block size: %d. */\n\n" % (min_size, min_block_size)) +f.write('const char *PRIV(unicode_version) = "{}";\n\n'.format(unicode_version)) + +f.write("""\ +/* When recompiling tables with a new Unicode version, please check the types +in this structure definition with those in pcre2_internal.h (the actual field +names will be different). +\n""") + +f.write(record_struct) + +f.write(""" +/* If the 32-bit library is run in non-32-bit mode, character values greater +than 0x10ffff may be encountered. For these we set up a special record. 
*/ + +#if PCRE2_CODE_UNIT_WIDTH == 32 +const ucd_record PRIV(dummy_ucd_record)[] = {{ + ucp_Unknown, /* script */ + ucp_Cn, /* type unassigned */ + ucp_gbOther, /* grapheme break property */ + 0, /* case set */ + 0, /* other case */ + 0 | (ucp_bidiL << UCD_BIDICLASS_SHIFT), /* script extension and bidi class */ + 0, /* bool properties offset */ + }}; +#endif +\n""") + +# --- Output the table of caseless character sets --- + +f.write("""\ +/* This table contains lists of characters that are caseless sets of +more than one character. Each list is terminated by NOTACHAR. */ + +const uint32_t PRIV(ucd_caseless_sets)[] = { + NOTACHAR, +""") + +for s in caseless_sets: + s = sorted(s) + for x in s: + f.write(' 0x%04x,' % x) + f.write(' NOTACHAR,\n') +f.write('};\n\n') + +# --- Other tables are not needed by pcre2test --- + +f.write("""\ +/* When #included in pcre2test, we don't need the table of digit sets, nor the +the large main UCD tables. */ + +#ifndef PCRE2_PCRE2TEST +\n""") + +# --- Read Scripts.txt again for the sets of 10 digits. --- + +digitsets = [] +file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8') + +for line in file: + m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line) + if m is None: + continue + first = int(m.group(1),16) + last = int(m.group(2),16) + if ((last - first + 1) % 10) != 0: + f.write("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last), + file=sys.stderr) + while first < last: + digitsets.append(first + 9) + first += 10 +file.close() +digitsets.sort() + +f.write("""\ +/* This table lists the code points for the '9' characters in each set of +decimal digits. It is used to ensure that all the digits in a script run come +from the same set. 
*/ + +const uint32_t PRIV(ucd_digit_sets)[] = { +""") + +f.write(" %d, /* Number of subsequent values */" % len(digitsets)) +count = 8 +for d in digitsets: + if count == 8: + f.write("\n ") + count = 0 + f.write(" 0x%05x," % d) + count += 1 +f.write("\n};\n\n") + +f.write("""\ +/* This vector is a list of script bitsets for the Script Extension property. +The number of 32-bit words in each bitset is #defined in pcre2_ucp.h as +ucd_script_sets_item_size. */ + +const uint32_t PRIV(ucd_script_sets)[] = { +""") +write_bitsets(script_lists, script_list_item_size) + +f.write("""\ +/* This vector is a list of bitsets for Boolean properties. The number of +32_bit words in each bitset is #defined as ucd_boolprop_sets_item_size in +pcre2_ucp.h. */ + +const uint32_t PRIV(ucd_boolprop_sets)[] = { +""") +write_bitsets(bool_props_lists, bool_props_list_item_size) + + +# Output the main UCD tables. + +f.write("""\ +/* These are the main two-stage UCD tables. The fields in each record are: +script (8 bits), character type (8 bits), grapheme break property (8 bits), +offset to multichar other cases or zero (8 bits), offset to other case or zero +(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed +into a 16-bit field, and offset in binary properties table (16 bits). */ +\n""") + +write_records(records, record_size) +write_table(min_stage1, 'PRIV(ucd_stage1)') +write_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size) + +f.write("#if UCD_BLOCK_SIZE != %d\n" % min_block_size) +f.write("""\ +#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h +#endif +#endif /* SUPPORT_UNICODE */ + +#endif /* PCRE2_PCRE2TEST */ + +/* End of pcre2_ucd.c */ +""") + +f.close + +# End diff --git a/pcre2/maint/GenerateUcpHeader.py b/pcre2/maint/GenerateUcpHeader.py new file mode 100644 index 0000000000000000000000000000000000000000..4fe43d5eb68fb084989a8a7f37810357d1f05816 --- /dev/null +++ b/pcre2/maint/GenerateUcpHeader.py @@ -0,0 +1,98 @@ +#! 
/usr/bin/python + +# PCRE2 UNICODE PROPERTY SUPPORT +# ------------------------------ + +# This script generates the pcre2_ucp.h file from Unicode data files. This +# header uses enumerations to give names to Unicode property types and script +# names. + +# This script was created in December 2021 as part of the Unicode data +# generation refactoring. + + +# Import common data lists and functions + +from GenerateCommon import \ + bidi_classes, \ + bool_properties, \ + bool_props_list_item_size, \ + break_properties, \ + category_names, \ + general_category_names, \ + script_list_item_size, \ + script_names, \ + open_output + +# Open the output file (no return on failure). This call also writes standard +# header boilerplate. + +f = open_output("pcre2_ucp.h") + +# Output this file's heading text + +f.write("""\ +#ifndef PCRE2_UCP_H_IDEMPOTENT_GUARD +#define PCRE2_UCP_H_IDEMPOTENT_GUARD + +/* This file contains definitions of the Unicode property values that are +returned by the UCD access macros and used throughout PCRE2. + +IMPORTANT: The specific values of the first two enums (general and particular +character categories) are assumed by the table called catposstab in the file +pcre2_auto_possess.c. They are unlikely to change, but should be checked after +an update. */ +\n""") + +f.write("/* These are the general character categories. */\n\nenum {\n") +for i in general_category_names: + f.write(" ucp_%s,\n" % i) +f.write("};\n\n") + +f.write("/* These are the particular character categories. */\n\nenum {\n") +for i in range(0, len(category_names), 2): + f.write(" ucp_%s, /* %s */\n" % (category_names[i], category_names[i+1])) +f.write("};\n\n") + +f.write("/* These are Boolean properties. 
*/\n\nenum {\n") +for i in bool_properties: + f.write(" ucp_%s,\n" % i) + +f.write(" /* This must be last */\n") +f.write(" ucp_Bprop_Count\n};\n\n") + +f.write("/* Size of entries in ucd_boolprop_sets[] */\n\n") +f.write("#define ucd_boolprop_sets_item_size %d\n\n" % bool_props_list_item_size) + +f.write("/* These are the bidi class values. */\n\nenum {\n") +for i in range(0, len(bidi_classes), 2): + sp = ' ' * (4 - len(bidi_classes[i])) + f.write(" ucp_bidi%s,%s /* %s */\n" % (bidi_classes[i], sp, bidi_classes[i+1])) +f.write("};\n\n") + +f.write("/* These are grapheme break properties. The Extended Pictographic " + "property\ncomes from the emoji-data.txt file. */\n\nenum {\n") +for i in range(0, len(break_properties), 2): + sp = ' ' * (21 - len(break_properties[i])) + f.write(" ucp_gb%s,%s /* %s */\n" % (break_properties[i], sp, break_properties[i+1])) +f.write("};\n\n") + +f.write("/* These are the script identifications. */\n\nenum {\n /* Scripts which has characters in other scripts. */\n") +for i in script_names: + if i == "Unknown": + f.write("\n /* Scripts which has no characters in other scripts. */\n") + f.write(" ucp_%s,\n" % i) +f.write("\n") + +f.write(" /* This must be last */\n") +f.write(" ucp_Script_Count\n};\n\n") + +f.write("/* Size of entries in ucd_script_sets[] */\n\n") +f.write("#define ucd_script_sets_item_size %d\n\n" % script_list_item_size) + +f.write("#endif /* PCRE2_UCP_H_IDEMPOTENT_GUARD */\n\n") +f.write("/* End of pcre2_ucp.h */\n") + +f.close() + +# End diff --git a/pcre2/maint/GenerateUcpTables.py b/pcre2/maint/GenerateUcpTables.py new file mode 100644 index 0000000000000000000000000000000000000000..528ff91669245a6bfd959c469b552c20d6dbe226 --- /dev/null +++ b/pcre2/maint/GenerateUcpTables.py @@ -0,0 +1,203 @@ +#! /usr/bin/python + +# PCRE2 UNICODE PROPERTY SUPPORT +# ------------------------------ + +# This script generates the pcre2_ucptables.c file, which contains tables for +# recognizing Unicode property names. 
It is #included by pcre2_tables.c. In +# order to reduce the number of relocations when loading the PCRE2 library, the +# names are held as a single large string, with offsets in the table. This is +# tedious to maintain by hand. Therefore, a script is used to generate the +# table. + +# This script was created in December 2021 based on the previous GenerateUtt +# script, whose output had to be manually edited into pcre2_tables.c. Here is +# the history of the original script: + +# ----------------------------------------------------------------------------- +# Modified by PH 17-March-2009 to generate the more verbose form that works +# for UTF-support in EBCDIC as well as ASCII environments. +# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0. +# Modified by PH 04-May-2010 to add new "X.." special categories. +# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0 +# Modified by ChPe 30-September-2012 to add this note; no other changes were +# necessary for Unicode 6.2.0 support. +# Modfied by PH 26-February-2013 to add the Xuc special category. +# Comment modified by PH 13-May-2014 to update to PCRE2 file names. +# Script updated to Python 3 by running it through the 2to3 converter. +# Added script names for Unicode 7.0.0, 20-June-2014. +# Added script names for Unicode 8.0.0, 19-June-2015. +# Added script names for Unicode 10.0.0, 02-July-2017. +# Added script names for Unicode 11.0.0, 03-July-2018. +# Added 'Unknown' script, 01-October-2018. +# Added script names for Unicode 12.1.0, 27-July-2019. +# Added script names for Unicode 13.0.0, 10-March-2020. +# Added Script names for Unicode 14.0.0, PCRE2-10.39 +# Added support for bidi class and bidi control, 06-December-2021 +# This also involved lower casing strings and removing underscores, in +# accordance with Unicode's "loose matching" rules, which Perl observes. 
+# Changed default script type from PT_SC to PT_SCX, 18-December-2021 +# ----------------------------------------------------------------------------- +# +# Note subsequent changes here: +# +# 27-December-2021: Added support for 4-letter script abbreviations. +# 10-January-2022: Further updates for Boolean property support +# ----------------------------------------------------------------------------- + + +# Import common data lists and functions + +from GenerateCommon import \ + abbreviations, \ + bool_properties, \ + bidi_classes, \ + category_names, \ + general_category_names, \ + script_names, \ + open_output + +# Open the output file (no return on failure). This call also writes standard +# header boilerplate. + +f = open_output("pcre2_ucptables.c") + +# The list in bidi_classes contains just the Unicode classes such as AN, LRE, +# etc., along with comments. We need to add "bidi" in front of each value, in +# order to create names that don't clash with other types of property. + +bidi_class_names = [] +for i in range(0, len(bidi_classes), 2): + bidi_class_names.append("bidi" + bidi_classes[i]) + +# Remove the comments from other lists that contain them. + +category_names = category_names[::2] + +# Create standardized versions of the names by lowercasing and removing +# underscores. + +def stdname(x): + return x.lower().replace('_', '') + +def stdnames(x): + y = [''] * len(x) + for i in range(len(x)): + y[i] = stdname(x[i]) + return y + +std_category_names = stdnames(category_names) +std_general_category_names = stdnames(general_category_names) +std_bidi_class_names = stdnames(bidi_class_names) +std_bool_properties = stdnames(bool_properties) + +# Create the table, starting with the Unicode script, category and bidi class +# names. We keep both the standardized name and the original, because the +# latter is used for the ucp_xx names. NOTE: for the script abbreviations, we +# still use the full original names. 
+ +utt_table = [] + +scx_end = script_names.index('Unknown') + +for idx, name in enumerate(script_names): + pt_type = 'PT_SCX' if idx < scx_end else 'PT_SC' + utt_table.append((stdname(name), name, pt_type)) + for abbrev in abbreviations[name]: + utt_table.append((stdname(abbrev), name, pt_type)) + +# Add the remaining property lists + +utt_table += list(zip(std_category_names, category_names, ['PT_PC'] * len(category_names))) +utt_table += list(zip(std_general_category_names, general_category_names, ['PT_GC'] * len(general_category_names))) +utt_table += list(zip(std_bidi_class_names, bidi_class_names, ['PT_BIDICL'] * len(bidi_class_names))) + +for name in bool_properties: + utt_table.append((stdname(name), name, 'PT_BOOL')) + if name in abbreviations: + for abbrev in abbreviations[name]: + utt_table.append((stdname(abbrev), name, 'PT_BOOL')) + +# Now add specials and synonyms. Note both the standardized and capitalized +# forms are needed. + +utt_table.append(('any', 'Any', 'PT_ANY')) +utt_table.append(('l&', 'L&', 'PT_LAMP')) +utt_table.append(('lc', 'LC', 'PT_LAMP')) +utt_table.append(('xan', 'Xan', 'PT_ALNUM')) +utt_table.append(('xps', 'Xps', 'PT_PXSPACE')) +utt_table.append(('xsp', 'Xsp', 'PT_SPACE')) +utt_table.append(('xuc', 'Xuc', 'PT_UCNC')) +utt_table.append(('xwd', 'Xwd', 'PT_WORD')) + +# Remove duplicates from the table and then sort it. + +utt_table = list(set(utt_table)) +utt_table.sort() + +# Output file-specific heading + +f.write("""\ +#ifdef SUPPORT_UNICODE + +/* The PRIV(utt)[] table below translates Unicode property names into type and +code values. It is searched by binary chop, so must be in collating sequence of +name. Originally, the table contained pointers to the name strings in the first +field of each entry. However, that leads to a large number of relocations when +a shared library is dynamically loaded. A significant reduction is made by +putting all the names into a single, large string and using offsets instead. 
+All letters are lower cased, and underscores are removed, in accordance with +the "loose matching" rules that Unicode advises and Perl uses. */ +\n""") + +# We have to use STR_ macros to define the strings so that it all works in +# UTF-8 mode on EBCDIC platforms. + +for utt in utt_table: + f.write('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND'))) + for c in utt[0]: + if c == '&': + f.write(' STR_AMPERSAND') + else: + f.write(' STR_%s' % c); + f.write(' "\\0"\n') + +# Output the long string of concatenated names + +f.write('\nconst char PRIV(utt_names)[] =\n'); +last = '' +for utt in utt_table: + if utt == utt_table[-1]: + last = ';' + f.write(' STRING_%s0%s\n' % (utt[0].replace('&', '_AMPERSAND'), last)) + +# Output the property type table + +f.write('\nconst ucp_type_table PRIV(utt)[] = {\n') +offset = 0 +last = ',' +for utt in utt_table: + if utt[2] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE', + 'PT_SPACE', 'PT_UCNC', 'PT_WORD'): + value = '0' + else: + value = 'ucp_' + utt[1] + if utt == utt_table[-1]: + last = '' + f.write(' { %3d, %s, %s }%s\n' % (offset, utt[2], value, last)) + offset += len(utt[0]) + 1 +f.write('};\n\n') + +# Ending text + +f.write("""\ +const size_t PRIV(utt_size) = sizeof(PRIV(utt)) / sizeof(ucp_type_table); + +#endif /* SUPPORT_UNICODE */ + +/* End of pcre2_ucptables.c */ +""") + +f.close + +# End diff --git a/pcre2/maint/GenerateUtt.py b/pcre2/maint/GenerateUtt.py deleted file mode 100755 index eea6efce83fef3556e812639e4bc61ece21cae4c..0000000000000000000000000000000000000000 --- a/pcre2/maint/GenerateUtt.py +++ /dev/null @@ -1,140 +0,0 @@ -#! /usr/bin/python - -# Generate utt tables. Note: this script has now been converted to Python 3. - -# The source file pcre2_tables.c contains (amongst other things), a table that -# is indexed by script name. In order to reduce the number of relocations when -# loading the library, the names are held as a single large string, with -# offsets in the table. 
This is tedious to maintain by hand. Therefore, this -# script is used to generate the table. The output is sent to stdout; usually -# that should be directed to a temporary file. Then pcre2_tables.c can be -# edited by replacing the relevant definitions and table therein with the -# temporary file. - -# Modified by PH 17-March-2009 to generate the more verbose form that works -# for UTF-support in EBCDIC as well as ASCII environments. -# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0. -# Modified by PH 04-May-2010 to add new "X.." special categories. -# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0 -# Modified by ChPe 30-September-2012 to add this note; no other changes were -# necessary for Unicode 6.2.0 support. -# Modfied by PH 26-February-2013 to add the Xuc special category. -# Comment modified by PH 13-May-2014 to update to PCRE2 file names. -# Script updated to Python 3 by running it through the 2to3 converter. -# Added script names for Unicode 7.0.0, 20-June-2014. -# Added script names for Unicode 8.0.0, 19-June-2015. -# Added script names for Unicode 10.0.0, 02-July-2017. -# Added script names for Unicode 11.0.0, 03-July-2018. -# Added 'Unknown' script, 01-October-2018. -# Added script names for Unicode 12.1.0, 27-July-2019. -# Added script names for Unicode 13.0.0, 10-March-2020. 
-# Added Script names for Unicode 14.0.0, PCRE2-10.39 - -script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', \ - 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', \ - 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', \ - 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', \ - 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', \ - 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', \ - 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', \ - # New for Unicode 5.0 - 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', \ - # New for Unicode 5.1 - 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', \ - # New for Unicode 5.2 - 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', \ - 'Inscriptional_Pahlavi', 'Inscriptional_Parthian', \ - 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', \ - 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', \ - # New for Unicode 6.0.0 - 'Batak', 'Brahmi', 'Mandaic', \ -# New for Unicode 6.1.0 - 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri', -# New for Unicode 7.0.0 - 'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi', - 'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean', - 'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi', - 'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi', -# New for Unicode 8.0.0 - 'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian', - 'SignWriting', -# New for Unicode 10.0.0 - 'Adlam', 'Bhaiksuki', 
'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi', - 'Nushu', 'Soyombo', 'Zanabazar_Square', -# New for Unicode 11.0.0 - 'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin', - 'Old_Sogdian', 'Sogdian', -# New for Unicode 12.0.0 - 'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho', -# New for Unicode 13.0.0 - 'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi', -# New for Unicode 14.0.0 - 'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi' - ] - -category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', - 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps', - 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ] - -general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z'] - -# First add the Unicode script and category names. - -utt_table = list(zip(script_names, ['PT_SC'] * len(script_names))) -utt_table += list(zip(category_names, ['PT_PC'] * len(category_names))) -utt_table += list(zip(general_category_names, ['PT_GC'] * len(general_category_names))) - -# Now add our own specials. - -utt_table.append(('Any', 'PT_ANY')) -utt_table.append(('L&', 'PT_LAMP')) -utt_table.append(('Xan', 'PT_ALNUM')) -utt_table.append(('Xps', 'PT_PXSPACE')) -utt_table.append(('Xsp', 'PT_SPACE')) -utt_table.append(('Xuc', 'PT_UCNC')) -utt_table.append(('Xwd', 'PT_WORD')) - -# Sort the table. - -utt_table.sort() - -# We have to use STR_ macros to define the strings so that it all works in -# UTF-8 mode on EBCDIC platforms. 
- -for utt in utt_table: - print('#define STRING_%s0' % (utt[0].replace('&', '_AMPERSAND')), end=' ') - for c in utt[0]: - if c == '_': - print('STR_UNDERSCORE', end=' ') - elif c == '&': - print('STR_AMPERSAND', end=' ') - else: - print('STR_%s' % c, end=' '); - print('"\\0"') - -# Print the actual table, using the string names - -print('') -print('const char PRIV(utt_names)[] ='); -last = '' -for utt in utt_table: - if utt == utt_table[-1]: - last = ';' - print(' STRING_%s0%s' % (utt[0].replace('&', '_AMPERSAND'), last)) -# This was how it was done before the EBCDIC-compatible modification. -# print ' "%s\\0"%s' % (utt[0], last) - -print('\nconst ucp_type_table PRIV(utt)[] = {') -offset = 0 -last = ',' -for utt in utt_table: - if utt[1] in ('PT_ANY', 'PT_LAMP', 'PT_ALNUM', 'PT_PXSPACE', - 'PT_SPACE', 'PT_UCNC', 'PT_WORD'): - value = '0' - else: - value = 'ucp_' + utt[0] - if utt == utt_table[-1]: - last = '' - print(' { %3d, %s, %s }%s' % (offset, utt[1], value, last)) - offset += len(utt[0]) + 1 -print('};') diff --git a/pcre2/maint/ManyConfigTests b/pcre2/maint/ManyConfigTests old mode 100755 new mode 100644 diff --git a/pcre2/maint/MultiStage2.py b/pcre2/maint/MultiStage2.py deleted file mode 100755 index 10fa412905ea564429f77416b51f68642f132ca7..0000000000000000000000000000000000000000 --- a/pcre2/maint/MultiStage2.py +++ /dev/null @@ -1,819 +0,0 @@ -#! /usr/bin/python - -# Multistage table builder -# (c) Peter Kankowski, 2008 - -############################################################################## -# This script was submitted to the PCRE project by Peter Kankowski as part of -# the upgrading of Unicode property support. The new code speeds up property -# matching many times. The script is for the use of PCRE maintainers, to -# generate the pcre2_ucd.c file that contains a digested form of the Unicode -# data tables. A number of extensions have been added to the original script. 
-# -# The script has now been upgraded to Python 3 for PCRE2, and should be run in -# the maint subdirectory, using the command -# -# [python3] ./MultiStage2.py >../src/pcre2_ucd.c -# -# It requires six Unicode data tables: DerivedGeneralCategory.txt, -# GraphemeBreakProperty.txt, Scripts.txt, ScriptExtensions.txt, -# CaseFolding.txt, and emoji-data.txt. These must be in the -# maint/Unicode.tables subdirectory. -# -# DerivedGeneralCategory.txt is found in the "extracted" subdirectory of the -# Unicode database (UCD) on the Unicode web site; GraphemeBreakProperty.txt is -# in the "auxiliary" subdirectory. Scripts.txt, ScriptExtensions.txt, and -# CaseFolding.txt are directly in the UCD directory. -# -# The emoji-data.txt file is found in the "emoji" subdirectory even though it -# is technically part of a different (but coordinated) standard as shown -# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"), -# for example: -# -# http://unicode.org/Public/emoji/13.0/ReadMe.txt -# -# ----------------------------------------------------------------------------- -# Minor modifications made to this script: -# Added #! line at start -# Removed tabs -# Made it work with Python 2.4 by rewriting two statements that needed 2.5 -# Consequent code tidy -# Adjusted data file names to take from the Unicode.tables directory -# Adjusted global table names by prefixing _pcre_. -# Commented out stuff relating to the casefolding table, which isn't used; -# removed completely in 2012. -# Corrected size calculation -# Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed. -# Update for PCRE2: name changes, and SUPPORT_UCP is abolished. -# -# Major modifications made to this script: -# Added code to add a grapheme break property field to records. -# -# Added code to search for sets of more than two characters that must match -# each other caselessly. 
A new table is output containing these sets, and -# offsets into the table are added to the main output records. This new -# code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer -# used. -# -# Update for Python3: -# . Processed with 2to3, but that didn't fix everything -# . Changed string.strip to str.strip -# . Added encoding='utf-8' to the open() call -# . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is -# required and the result of the division is a float -# -# Added code to scan the emoji-data.txt file to find the Extended Pictographic -# property, which is used by PCRE2 as a grapheme breaking property. This was -# done when updating to Unicode 11.0.0 (July 2018). -# -# Added code to add a Script Extensions field to records. This has increased -# their size from 8 to 12 bytes, only 10 of which are currently used. -# -# 01-March-2010: Updated list of scripts for Unicode 5.2.0 -# 30-April-2011: Updated list of scripts for Unicode 6.0.0 -# July-2012: Updated list of scripts for Unicode 6.1.0 -# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new -# field in the record to hold the value. Luckily, the -# structure had a hole in it, so the resulting table is -# not much bigger than before. -# 18-September-2012: Added code for multiple caseless sets. This uses the -# final hole in the structure. -# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0 -# 13-May-2014: Updated for PCRE2 -# 03-June-2014: Updated for Python 3 -# 20-June-2014: Updated for Unicode 7.0.0 -# 12-August-2014: Updated to put Unicode version into the file -# 19-June-2015: Updated for Unicode 8.0.0 -# 02-July-2017: Updated for Unicode 10.0.0 -# 03-July-2018: Updated for Unicode 11.0.0 -# 07-July-2018: Added code to scan emoji-data.txt for the Extended -# Pictographic property. 
-# 01-October-2018: Added the 'Unknown' script name -# 03-October-2018: Added new field for Script Extensions -# 27-July-2019: Updated for Unicode 12.1.0 -# 10-March-2020: Updated for Unicode 13.0.0 -# PCRE2-10.39: Updated for Unicode 14.0.0 -# ---------------------------------------------------------------------------- -# -# -# The main tables generated by this script are used by macros defined in -# pcre2_internal.h. They look up Unicode character properties using short -# sequences of code that contains no branches, which makes for greater speed. -# -# Conceptually, there is a table of records (of type ucd_record), containing a -# script number, script extension value, character type, grapheme break type, -# offset to caseless matching set, offset to the character's other case, for -# every Unicode character. However, a real table covering all Unicode -# characters would be far too big. It can be efficiently compressed by -# observing that many characters have the same record, and many blocks of -# characters (taking 128 characters in a block) have the same set of records as -# other blocks. This leads to a 2-stage lookup process. -# -# This script constructs six tables. The ucd_caseless_sets table contains -# lists of characters that all match each other caselessly. Each list is -# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than -# any valid character. The first list is empty; this is used for characters -# that are not part of any list. -# -# The ucd_digit_sets table contains the code points of the '9' characters in -# each set of 10 decimal digits in Unicode. This is used to ensure that digits -# in script runs all come from the same set. The first element in the vector -# contains the number of subsequent elements, which are in ascending order. -# -# The ucd_script_sets vector contains lists of script numbers that are the -# Script Extensions properties of certain characters. Each list is terminated -# by zero (ucp_Unknown). 
A character with more than one script listed for its -# Script Extension property has a negative value in its record. This is the -# negated offset to the start of the relevant list in the ucd_script_sets -# vector. -# -# The ucd_records table contains one instance of every unique record that is -# required. The ucd_stage1 table is indexed by a character's block number, -# which is the character's code point divided by 128, since 128 is the size -# of each block. The result of a lookup in ucd_stage1 a "virtual" block number. -# -# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by -# the offset of a character within its own block, and the result is the index -# number of the required record in the ucd_records vector. -# -# The following examples are correct for the Unicode 11.0.0 database. Future -# updates may make change the actual lookup values. -# -# Example: lowercase "a" (U+0061) is in block 0 -# lookup 0 in stage1 table yields 0 -# lookup 97 (0x61) in the first table in stage2 yields 17 -# record 17 is { 34, 5, 12, 0, -32, 34, 0 } -# 34 = ucp_Latin => Latin script -# 5 = ucp_Ll => Lower case letter -# 12 = ucp_gbOther => Grapheme break property "Other" -# 0 => Not part of a caseless set -# -32 (-0x20) => Other case is U+0041 -# 34 = ucp_Latin => No special Script Extension property -# 0 => Dummy value, unused at present -# -# Almost all lowercase latin characters resolve to the same record. One or two -# are different because they are part of a multi-character caseless set (for -# example, k, K and the Kelvin symbol are such a set). 
-# -# Example: hiragana letter A (U+3042) is in block 96 (0x60) -# lookup 96 in stage1 table yields 90 -# lookup 66 (0x42) in table 90 in stage2 yields 564 -# record 564 is { 27, 7, 12, 0, 0, 27, 0 } -# 27 = ucp_Hiragana => Hiragana script -# 7 = ucp_Lo => Other letter -# 12 = ucp_gbOther => Grapheme break property "Other" -# 0 => Not part of a caseless set -# 0 => No other case -# 27 = ucp_Hiragana => No special Script Extension property -# 0 => Dummy value, unused at present -# -# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39) -# lookup 57 in stage1 table yields 55 -# lookup 80 (0x50) in table 55 in stage2 yields 458 -# record 458 is { 28, 12, 3, 0, 0, -101, 0 } -# 28 = ucp_Inherited => Script inherited from predecessor -# 12 = ucp_Mn => Non-spacing mark -# 3 = ucp_gbExtend => Grapheme break property "Extend" -# 0 => Not part of a caseless set -# 0 => No other case -# -101 => Script Extension list offset = 101 -# 0 => Dummy value, unused at present -# -# At offset 101 in the ucd_script_sets vector we find the list 3, 15, 107, 29, -# and terminator 0. This means that this character is expected to be used with -# any of those scripts, which are Bengali, Devanagari, Grantha, and Kannada. 
-# -# Philip Hazel, 03 July 2008 -############################################################################## - - -import re -import string -import sys - -MAX_UNICODE = 0x110000 -NOTACHAR = 0xffffffff - - -# Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt -def make_get_names(enum): - return lambda chardata: enum.index(chardata[1]) - -# Parse a line of CaseFolding.txt -def get_other_case(chardata): - if chardata[1] == 'C' or chardata[1] == 'S': - return int(chardata[2], 16) - int(chardata[0], 16) - return 0 - -# Parse a line of ScriptExtensions.txt -def get_script_extension(chardata): - this_script_list = list(chardata[1].split(' ')) - if len(this_script_list) == 1: - return script_abbrevs.index(this_script_list[0]) - - script_numbers = [] - for d in this_script_list: - script_numbers.append(script_abbrevs.index(d)) - script_numbers.append(0) - script_numbers_length = len(script_numbers) - - for i in range(1, len(script_lists) - script_numbers_length + 1): - for j in range(0, script_numbers_length): - found = True - if script_lists[i+j] != script_numbers[j]: - found = False - break - if found: - return -i - - # Not found in existing lists - - return_value = len(script_lists) - script_lists.extend(script_numbers) - return -return_value - -# Read the whole table in memory, setting/checking the Unicode version -def read_table(file_name, get_value, default_value): - global unicode_version - - f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name) - file_base = f.group(1) - version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$" - file = open(file_name, 'r', encoding='utf-8') - f = re.match(version_pat, file.readline()) - version = f.group(1) - if unicode_version == "": - unicode_version = version - elif unicode_version != version: - print("WARNING: Unicode version differs in %s", file_name, file=sys.stderr) - - table = [default_value] * MAX_UNICODE - for line in file: - line = re.sub(r'#.*', '', line) - chardata = 
list(map(str.strip, line.split(';'))) - if len(chardata) <= 1: - continue - value = get_value(chardata) - m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0]) - char = int(m.group(1), 16) - if m.group(3) is None: - last = char - else: - last = int(m.group(3), 16) - for i in range(char, last + 1): - # It is important not to overwrite a previously set - # value because in the CaseFolding file there are lines - # to be ignored (returning the default value of 0) - # which often come after a line which has already set - # data. - if table[i] == default_value: - table[i] = value - file.close() - return table - -# Get the smallest possible C language type for the values -def get_type_size(table): - type_size = [("uint8_t", 1), ("uint16_t", 2), ("uint32_t", 4), - ("signed char", 1), ("pcre_int16", 2), ("pcre_int32", 4)] - limits = [(0, 255), (0, 65535), (0, 4294967295), - (-128, 127), (-32768, 32767), (-2147483648, 2147483647)] - minval = min(table) - maxval = max(table) - for num, (minlimit, maxlimit) in enumerate(limits): - if minlimit <= minval and maxval <= maxlimit: - return type_size[num] - else: - raise OverflowError("Too large to fit into C types") - -def get_tables_size(*tables): - total_size = 0 - for table in tables: - type, size = get_type_size(table) - total_size += size * len(table) - return total_size - -# Compress the table into the two stages -def compress_table(table, block_size): - blocks = {} # Dictionary for finding identical blocks - stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table) - stage2 = [] # Stage 2 table contains the blocks with property values - table = tuple(table) - for i in range(0, len(table), block_size): - block = table[i:i+block_size] - start = blocks.get(block) - if start is None: - # Allocate a new block - start = len(stage2) / block_size - stage2 += block - blocks[block] = start - stage1.append(start) - - return stage1, stage2 - -# Print a table -def print_table(table, table_name, 
block_size = None): - type, size = get_type_size(table) - ELEMS_PER_LINE = 16 - - s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table)) - if block_size: - s += ", block = %d" % block_size - print(s + " */") - table = tuple(table) - if block_size is None: - fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */" - mult = MAX_UNICODE / len(table) - for i in range(0, len(table), ELEMS_PER_LINE): - print(fmt % (table[i:i+ELEMS_PER_LINE] + - (int(i * mult),))) - else: - if block_size > ELEMS_PER_LINE: - el = ELEMS_PER_LINE - else: - el = block_size - fmt = "%3d," * el + "\n" - if block_size > ELEMS_PER_LINE: - fmt = fmt * int(block_size / ELEMS_PER_LINE) - for i in range(0, len(table), block_size): - print(("/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size])) - print("};\n") - -# Extract the unique combinations of properties into records -def combine_tables(*tables): - records = {} - index = [] - for t in zip(*tables): - i = records.get(t) - if i is None: - i = records[t] = len(records) - index.append(i) - return index, records - -def get_record_size_struct(records): - size = 0 - structure = '/* When recompiling tables with a new Unicode version, please check the\n' + \ - 'types in this structure definition from pcre2_internal.h (the actual\n' + \ - 'field names will be different):\n\ntypedef struct {\n' - for i in range(len(records[0])): - record_slice = [record[i] for record in records] - slice_type, slice_size = get_type_size(record_slice) - # add padding: round up to the nearest power of slice_size - size = (size + slice_size - 1) & -slice_size - size += slice_size - structure += '%s property_%d;\n' % (slice_type, i) - - # round up to the first item of the next structure in array - record_slice = [record[0] for record in records] - slice_type, slice_size = get_type_size(record_slice) - size = (size + slice_size - 1) & -slice_size - - structure += '} ucd_record;\n*/\n' - return size, structure - -def test_record_size(): - tests = [ \ 
- ( [(3,), (6,), (6,), (1,)], 1 ), \ - ( [(300,), (600,), (600,), (100,)], 2 ), \ - ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \ - ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \ - ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \ - ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \ - ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \ - ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \ - ] - for test in tests: - size, struct = get_record_size_struct(test[0]) - assert(size == test[1]) - #print struct - -def print_records(records, record_size): - print('const ucd_record PRIV(ucd_records)[] = { ' + \ - '/* %d bytes, record size %d */' % (len(records) * record_size, record_size)) - - records = list(zip(list(records.keys()), list(records.values()))) - records.sort(key = lambda x: x[1]) - for i, record in enumerate(records): - print((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,))) - print('};\n') - -script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal', - 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian', - 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana', - 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam', - 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic', - 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana', - 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi', -# New for Unicode 5.0 - 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician', -# New for Unicode 5.1 - 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai', -# New for Unicode 5.2 - 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic', - 
'Inscriptional_Pahlavi', 'Inscriptional_Parthian', - 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek', - 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet', -# New for Unicode 6.0.0 - 'Batak', 'Brahmi', 'Mandaic', -# New for Unicode 6.1.0 - 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri', -# New for Unicode 7.0.0 - 'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi', - 'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean', - 'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi', - 'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi', -# New for Unicode 8.0.0 - 'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian', - 'SignWriting', -# New for Unicode 10.0.0 - 'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi', - 'Nushu', 'Soyombo', 'Zanabazar_Square', -# New for Unicode 11.0.0 - 'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin', - 'Old_Sogdian', 'Sogdian', -# New for Unicode 12.0.0 - 'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho', -# New for Unicode 13.0.0 - 'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi', -# New for Unicode 14.0.0 - 'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi' - ] - -script_abbrevs = [ - 'Zzzz', 'Arab', 'Armn', 'Beng', 'Bopo', 'Brai', 'Bugi', 'Buhd', 'Cans', - 'Cher', 'Zyyy', 'Copt', 'Cprt', 'Cyrl', 'Dsrt', 'Deva', 'Ethi', 'Geor', - 'Glag', 'Goth', 'Grek', 'Gujr', 'Guru', 'Hani', 'Hang', 'Hano', 'Hebr', - 'Hira', 'Zinh', 'Knda', 'Kana', 'Khar', 'Khmr', 'Laoo', 'Latn', 'Limb', - 'Linb', 'Mlym', 'Mong', 'Mymr', 'Talu', 'Ogam', 'Ital', 'Xpeo', 'Orya', - 'Osma', 'Runr', 'Shaw', 'Sinh', 'Sylo', 'Syrc', 'Tglg', 'Tagb', 'Tale', - 'Taml', 'Telu', 'Thaa', 'Thai', 'Tibt', 'Tfng', 'Ugar', 'Yiii', -#New for Unicode 5.0 - 'Bali', 'Xsux', 'Nkoo', 'Phag', 'Phnx', -#New for Unicode 5.1 - 'Cari', 'Cham', 
'Kali', 'Lepc', 'Lyci', 'Lydi', 'Olck', 'Rjng', 'Saur', - 'Sund', 'Vaii', -#New for Unicode 5.2 - 'Avst', 'Bamu', 'Egyp', 'Armi', 'Phli', 'Prti', 'Java', 'Kthi', 'Lisu', - 'Mtei', 'Sarb', 'Orkh', 'Samr', 'Lana', 'Tavt', -#New for Unicode 6.0.0 - 'Batk', 'Brah', 'Mand', -#New for Unicode 6.1.0 - 'Cakm', 'Merc', 'Mero', 'Plrd', 'Shrd', 'Sora', 'Takr', -#New for Unicode 7.0.0 - 'Bass', 'Aghb', 'Dupl', 'Elba', 'Gran', 'Khoj', 'Sind', 'Lina', 'Mahj', - 'Mani', 'Mend', 'Modi', 'Mroo', 'Nbat', 'Narb', 'Perm', 'Hmng', 'Palm', - 'Phlp', 'Pauc', 'Sidd', 'Tirh', 'Wara', -#New for Unicode 8.0.0 - 'Ahom', 'Hluw', 'Hatr', 'Mult', 'Hung', 'Sgnw', -#New for Unicode 10.0.0 - 'Adlm', 'Bhks', 'Marc', 'Newa', 'Osge', 'Tang', 'Gonm', 'Nshu', 'Soyo', - 'Zanb', -#New for Unicode 11.0.0 - 'Dogr', 'Gong', 'Rohg', 'Maka', 'Medf', 'Sogo', 'Sogd', -#New for Unicode 12.0.0 - 'Elym', 'Nand', 'Hmnp', 'Wcho', -#New for Unicode 13.0.0 - 'Chrs', 'Diak', 'Kits', 'Yezi', -#New for Unicode 14.0.0 - 'Cpmn', 'Ougr', 'Tngs', 'Toto', 'Vith' - ] - -category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', - 'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps', - 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ] - -# The Extended_Pictographic property is not found in the file where all the -# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt -# file, but we list it here so that the name has the correct index value. 
- -break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend', - 'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'Regional_Indicator', 'Other', - 'ZWJ', 'Extended_Pictographic' ] - -test_record_size() -unicode_version = "" - -script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown')) -category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn')) -break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_property_names), break_property_names.index('Other')) -other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0) - -# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now -# we need to find the Extended_Pictographic property for emoji characters. This -# can be set as an additional grapheme break property, because the default for -# all the emojis is "other". We scan the emoji-data.txt file and modify the -# break-props table. - -file = open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8') -for line in file: - line = re.sub(r'#.*', '', line) - chardata = list(map(str.strip, line.split(';'))) - if len(chardata) <= 1: - continue - - if chardata[1] != "Extended_Pictographic": - continue - - m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0]) - char = int(m.group(1), 16) - if m.group(3) is None: - last = char - else: - last = int(m.group(3), 16) - for i in range(char, last + 1): - if break_props[i] != break_property_names.index('Other'): - print("WARNING: Emoji 0x%x has break property %s, not 'Other'", - i, break_property_names[break_props[i]], file=sys.stderr) - break_props[i] = break_property_names.index('Extended_Pictographic') -file.close() - -# The Script Extensions property default value is the Script value. 
Parse the -# file, setting 'Unknown' as the default (this will never be a Script Extension -# value), then scan it and fill in the default from Scripts. Code added by PH -# in October 2018. Positive values are used for just a single script for a -# code point. Negative values are negated offsets in a list of lists of -# multiple scripts. Initialize this list with a single entry, as the zeroth -# element is never used. - -script_lists = [0] -script_abbrevs_default = script_abbrevs.index('Zzzz') -scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, script_abbrevs_default) - -for i in range(0, MAX_UNICODE): - if scriptx[i] == script_abbrevs_default: - scriptx[i] = script[i] - -# With the addition of the new Script Extensions field, we need some padding -# to get the Unicode records up to 12 bytes (multiple of 4). Set a value -# greater than 255 to make the field 16 bits. - -padding_dummy = [0] * MAX_UNICODE -padding_dummy[0] = 256 - -# This block of code was added by PH in September 2012. I am not a Python -# programmer, so the style is probably dreadful, but it does the job. It scans -# the other_case table to find sets of more than two characters that must all -# match each other caselessly. Later in this script a table of these sets is -# written out. However, we have to do this work here in order to compute the -# offsets in the table that are inserted into the main table. - -# The CaseFolding.txt file lists pairs, but the common logic for reading data -# sets only one value, so first we go through the table and set "return" -# offsets for those that are not already set. - -for c in range(MAX_UNICODE): - if other_case[c] != 0 and other_case[c + other_case[c]] == 0: - other_case[c + other_case[c]] = -other_case[c] - -# Now scan again and create equivalence sets. - -sets = [] - -for c in range(MAX_UNICODE): - o = c + other_case[c] - - # Trigger when this character's other case does not point back here. 
We - # now have three characters that are case-equivalent. - - if other_case[o] != -other_case[c]: - t = o + other_case[o] - - # Scan the existing sets to see if any of the three characters are already - # part of a set. If so, unite the existing set with the new set. - - appended = 0 - for s in sets: - found = 0 - for x in s: - if x == c or x == o or x == t: - found = 1 - - # Add new characters to an existing set - - if found: - found = 0 - for y in [c, o, t]: - for x in s: - if x == y: - found = 1 - if not found: - s.append(y) - appended = 1 - - # If we have not added to an existing set, create a new one. - - if not appended: - sets.append([c, o, t]) - -# End of loop looking for caseless sets. - -# Now scan the sets and set appropriate offsets for the characters. - -caseless_offsets = [0] * MAX_UNICODE - -offset = 1; -for s in sets: - for x in s: - caseless_offsets[x] = offset - offset += len(s) + 1 - -# End of block of code for creating offsets for caseless matching sets. - - -# Combine the tables - -table, records = combine_tables(script, category, break_props, - caseless_offsets, other_case, scriptx, padding_dummy) - -record_size, record_struct = get_record_size_struct(list(records.keys())) - -# Find the optimum block size for the two-stage table -min_size = sys.maxsize -for block_size in [2 ** i for i in range(5,10)]: - size = len(records) * record_size - stage1, stage2 = compress_table(table, block_size) - size += get_tables_size(stage1, stage2) - #print "/* block size %5d => %5d bytes */" % (block_size, size) - if size < min_size: - min_size = size - min_stage1, min_stage2 = stage1, stage2 - min_block_size = block_size - -print("/* This module is generated by the maint/MultiStage2.py script.") -print("Do not modify it by hand. 
Instead modify the script and run it") -print("to regenerate this code.") -print() -print("As well as being part of the PCRE2 library, this module is #included") -print("by the pcre2test program, which redefines the PRIV macro to change") -print("table names from _pcre2_xxx to xxxx, thereby avoiding name clashes") -print("with the library. At present, just one of these tables is actually") -print("needed. */") -print() -print("#ifndef PCRE2_PCRE2TEST") -print() -print("#ifdef HAVE_CONFIG_H") -print("#include \"config.h\"") -print("#endif") -print() -print("#include \"pcre2_internal.h\"") -print() -print("#endif /* PCRE2_PCRE2TEST */") -print() -print("/* Unicode character database. */") -print("/* This file was autogenerated by the MultiStage2.py script. */") -print("/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size)) -print() -print("/* The tables herein are needed only when UCP support is built,") -print("and in PCRE2 that happens automatically with UTF support.") -print("This module should not be referenced otherwise, so") -print("it should not matter whether it is compiled or not. However") -print("a comment was received about space saving - maybe the guy linked") -print("all the modules rather than using a library - so we include a") -print("condition to cut out the tables when not needed. But don't leave") -print("a totally empty module because some compilers barf at that.") -print("Instead, just supply some small dummy tables. */") -print() -print("#ifndef SUPPORT_UNICODE") -print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0 }};") -print("const uint16_t PRIV(ucd_stage1)[] = {0};") -print("const uint16_t PRIV(ucd_stage2)[] = {0};") -print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};") -print("#else") -print() -print("const char *PRIV(unicode_version) = \"{}\";".format(unicode_version)) -print() -print("/* If the 32-bit library is run in non-32-bit mode, character values") -print("greater than 0x10ffff may be encountered. 
For these we set up a") -print("special record. */") -print() -print("#if PCRE2_CODE_UNIT_WIDTH == 32") -print("const ucd_record PRIV(dummy_ucd_record)[] = {{") -print(" ucp_Unknown, /* script */") -print(" ucp_Cn, /* type unassigned */") -print(" ucp_gbOther, /* grapheme break property */") -print(" 0, /* case set */") -print(" 0, /* other case */") -print(" ucp_Unknown, /* script extension */") -print(" 0, /* dummy filler */") -print(" }};") -print("#endif") -print() -print(record_struct) - -# --- Added by PH: output the table of caseless character sets --- - -print("/* This table contains lists of characters that are caseless sets of") -print("more than one character. Each list is terminated by NOTACHAR. */\n") - -print("const uint32_t PRIV(ucd_caseless_sets)[] = {") -print(" NOTACHAR,") -for s in sets: - s = sorted(s) - for x in s: - print(' 0x%04x,' % x, end=' ') - print(' NOTACHAR,') -print('};') -print() - -# ------ - -print("/* When #included in pcre2test, we don't need the table of digit") -print("sets, nor the the large main UCD tables. */") -print() -print("#ifndef PCRE2_PCRE2TEST") -print() - -# --- Added by PH: read Scripts.txt again for the sets of 10 digits. --- - -digitsets = [] -file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8') - -for line in file: - m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line) - if m is None: - continue - first = int(m.group(1),16) - last = int(m.group(2),16) - if ((last - first + 1) % 10) != 0: - print("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last), - file=sys.stderr) - while first < last: - digitsets.append(first + 9) - first += 10 -file.close() -digitsets.sort() - -print("/* This table lists the code points for the '9' characters in each") -print("set of decimal digits. It is used to ensure that all the digits in") -print("a script run come from the same set. 
*/\n") -print("const uint32_t PRIV(ucd_digit_sets)[] = {") - -print(" %d, /* Number of subsequent values */" % len(digitsets), end='') -count = 8 -for d in digitsets: - if count == 8: - print("\n ", end='') - count = 0 - print(" 0x%05x," % d, end='') - count += 1 -print("\n};\n") - -print("/* This vector is a list of lists of scripts for the Script Extension") -print("property. Each sublist is zero-terminated. */\n") -print("const uint8_t PRIV(ucd_script_sets)[] = {") - -count = 0 -print(" /* 0 */", end='') -for d in script_lists: - print(" %3d," % d, end='') - count += 1 - if d == 0: - print("\n /* %3d */" % count, end='') -print("\n};\n") - -# Output the main UCD tables. - -print("/* These are the main two-stage UCD tables. The fields in each record are:") -print("script (8 bits), character type (8 bits), grapheme break property (8 bits),") -print("offset to multichar other cases or zero (8 bits), offset to other case") -print("or zero (32 bits, signed), script extension (16 bits, signed), and a dummy") -print("16-bit field to make the whole thing a multiple of 4 bytes. */\n") - -print_records(records, record_size) -print_table(min_stage1, 'PRIV(ucd_stage1)') -print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size) -print("#if UCD_BLOCK_SIZE != %d" % min_block_size) -print("#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h") -print("#endif") -print("#endif /* SUPPORT_UNICODE */") -print() -print("#endif /* PCRE2_PCRE2TEST */") - - -# This code was part of the original contribution, but is commented out as it -# was never used. A two-stage table has sufficed. 
- -""" - -# Three-stage tables: - -# Find the optimum block size for 3-stage table -min_size = sys.maxint -for stage3_block in [2 ** i for i in range(2,6)]: - stage_i, stage3 = compress_table(table, stage3_block) - for stage2_block in [2 ** i for i in range(5,10)]: - size = len(records) * 4 - stage1, stage2 = compress_table(stage_i, stage2_block) - size += get_tables_size(stage1, stage2, stage3) - # print "/* %5d / %3d => %5d bytes */" % (stage2_block, stage3_block, size) - if size < min_size: - min_size = size - min_stage1, min_stage2, min_stage3 = stage1, stage2, stage3 - min_stage2_block, min_stage3_block = stage2_block, stage3_block - -print "/* Total size: %d bytes" % min_size */ -print_records(records) -print_table(min_stage1, 'ucd_stage1') -print_table(min_stage2, 'ucd_stage2', min_stage2_block) -print_table(min_stage3, 'ucd_stage3', min_stage3_block) - -""" diff --git a/pcre2/maint/README b/pcre2/maint/README index ab9845c37738cc2905c356beed290b6a0428d871..f21ff87d82d11e9825efd2befb04cbd18118fba4 100644 --- a/pcre2/maint/README +++ b/pcre2/maint/README @@ -16,92 +16,114 @@ and also contains some notes for maintainers. Its contents are: Files in the maint directory ============================ -GenerateUtt.py A Python script to generate part of the pcre2_tables.c file - that contains Unicode script names in a long string with - offsets, which is tedious to maintain by hand. - -ManyConfigTests A shell script that runs "configure, make, test" a number of - times with different configuration settings. - -MultiStage2.py A Python script that generates the file pcre2_ucd.c from six - Unicode data files, which are themselves downloaded from the - Unicode web site. Run this script in the "maint" directory. - The generated file is written to stdout. It contains the - tables for a 2-stage lookup of Unicode properties, along with - some auxiliary tables. +GenerateCommon.py + A Python module containing data and functions that are used by the other + Generate scripts. 
+ +GenerateTest26.py + A Python script that generates input and expected output test data for test + 26, which tests certain aspects of Unicode property support. + +GenerateUcd.py + A Python script that generates the file pcre2_ucd.c from GenerateCommon.py + and Unicode data files, which are themselves downloaded from the Unicode web + site. The generated file contains the tables for a 2-stage lookup of Unicode + properties, along with some auxiliary tables. The script starts with a long + comment that gives details of the tables it constructs. + +GenerateUcpHeader.py + A Python script that generates the file pcre2_ucp.h from GenerateCommon.py + and Unicode data files. The generated file defines constants for various + Unicode property values. + +GenerateUcpTables.py + A Python script that generates the file pcre2_ucptables.c from + GenerateCommon.py and Unicode data files. The generated file contains tables + for looking up Unicode property names. + +ManyConfigTests + A shell script that runs "configure, make, test" a number of times with + different configuration settings. pcre2_chartables.c.non-standard - This is a set of character tables that came from a Windows - system. It has characters greater than 128 that are set as - spaces, amongst other things. I kept it so that it can be - used for testing from time to time. - -README This file. - -Unicode.tables The files in this directory were downloaded from the Unicode - web site. They contain information about Unicode characters - and scripts. The ones used by the MultiStage2.py script are - CaseFolding.txt, DerivedGeneralCategory.txt, Scripts.txt, - ScriptExtensions.txt, GraphemeBreakProperty.txt, and - emoji-data.txt. I've kept UnicodeData.txt (which is no longer - used by the script) because it is useful occasionally for - manually looking up the details of certain characters. 
- However, note that character names in this file such as - "Arabic sign sanah" do NOT mean that the character is in a - particular script (in this case, Arabic). Scripts.txt and - ScriptExtensions.txt are where to look for script information. - -ucptest.c A short C program for testing the Unicode property macros - that do lookups in the pcre2_ucd.c data, mainly useful after - rebuilding the Unicode property table. Compile and run this in - the "maint" directory (see comments at its head). This program - can also be used to find characters with specific properties. - -ucptestdata A directory containing four files, testinput{1,2} and - testoutput{1,2}, for use in conjunction with the ucptest - program. - -utf8.c A short, freestanding C program for converting a Unicode code - point into a sequence of bytes in the UTF-8 encoding, and vice - versa. If its argument is a hex number such as 0x1234, it - outputs a list of the equivalent UTF-8 bytes. If its argument - is a sequence of concatenated UTF-8 bytes (e.g. e188b4) it - treats them as a UTF-8 character and outputs the equivalent - code point in hex. See comments at its head for details. + This is a set of character tables that came from a Windows system. It has + characters greater than 128 that are set as spaces, amongst other things. I + kept it so that it can be used for testing from time to time. + +README + This file. + +Unicode.tables + The files in this directory were downloaded from the Unicode web site. They + contain information about Unicode characters and scripts, and are used by the + Generate scripts. There is also UnicodeData.txt, which is no longer used by + any script but is kept because it is useful occasionally for manually looking + up the details of certain characters. However, note that character names in this + file such as "Arabic sign sanah" do NOT mean that the character is in a + particular script (in this case, Arabic). Scripts.txt and + ScriptExtensions.txt are where to look for script information. 
+ +ucptest.c + A program for testing the Unicode property macros that do lookups in the + pcre2_ucd.c data, mainly useful after rebuilding the Unicode property tables. + Compile and run this in the "maint" directory (see comments at its head). + This program can also be used to find characters with specific properties and + to list which properties are supported. + +ucptestdata + A directory containing four files, testinput{1,2} and testoutput{1,2}, for + use in conjunction with the ucptest program. + +utf8.c + A short, freestanding C program for converting a Unicode code point into a + sequence of bytes in the UTF-8 encoding, and vice versa. If its argument is a + hex number such as 0x1234, it outputs a list of the equivalent UTF-8 bytes. + If its argument is a sequence of concatenated UTF-8 bytes (e.g. e188b4) it + treats them as a UTF-8 character and outputs the equivalent code point in + hex. See comments at its head for details. Updating to a new Unicode release ================================= When there is a new release of Unicode, the files in Unicode.tables must be -refreshed from the web site. If the new version of Unicode adds new character -scripts, the source file pcre2_ucp.h and both the MultiStage2.py and the -GenerateUtt.py scripts must be edited to add the new names. I have been adding -each new group at the end of the relevant list, with a comment. Note also that -both the pcre2syntax.3 and pcre2pattern.3 man pages contain lists of Unicode -script names. - -MultiStage2.py has two lists: the full names and the abbreviations that are -found in the ScriptExtensions.txt file. A list of script names and their -abbreviations can be found in the PropertyValueAliases.txt file on the -Unicode web site. 
There is also a Wikipedia page that lists them, and notes the -Unicode version in which they were introduced: - -https://en.wikipedia.org/wiki/Unicode_scripts#Table_of_Unicode_scripts - -Once the script name lists have been updated, MultiStage2.py can be run to -generate a new version of pcre2_ucd.c, and GenerateUtt.py can be run to -generate the tricky tables for inclusion in pcre2_tables.c (which must be -hand-edited). If MultiStage2.py gives the error "ValueError: list.index(x): x -not in list", the cause is usually a missing (or misspelt) name in one of the -lists of scripts. - -The ucptest program can be compiled and used to check that the new tables in -pcre2_ucd.c work properly, using the data files in ucptestdata to check a -number of test characters. It used to be necessary to update the source -ucptest.c whenever new Unicode scripts were added, but this is no longer -required because that program now uses the lists in the PCRE2 source. However, -adding a few tests for new scripts to the files in ucptestdata is a good idea. +refreshed from the web site. Once that is done, the four Python scripts that +generate files from the Unicode data can be run from within the "maint" +directory. + +Note: Previously, it was necessary to update lists of scripts and their +abbreviations by hand before running the Python scripts. This is no longer +necessary because the scripts have been upgraded to extract this information +themselves. Also, there used to be explicit lists of scripts in two of the man +pages. This is no longer the case. 
+ +You can give an output file name as an argument to the following scripts, but +by default: + +GenerateUcd.py creates pcre2_ucd.c ) +GenerateUcpHeader.py creates pcre2_ucp.h ) in the current directory +GenerateUcpTables.py creates pcre2_ucptables.c ) + +These files can be compared against the existing versions in the src directory +to check on any changes before replacing the old files, but you can also +generate directly into the final location by running: + +./GenerateUcd.py ../src/pcre2_ucd.c +./GenerateUcpHeader.py ../src/pcre2_ucp.h +./GenerateUcpTables.py ../src/pcre2_ucptables.c + +Once the .c and .h files are in the ../src directory, the ucptest program can +be compiled and used to check that the new tables work properly. The data files +in ucptestdata are set up to check a number of test characters. See the +comments at the start of ucptest.c. If there are new scripts, adding a few +tests to the files in ucptestdata is a good idea. + +Finally, you should run the GenerateTest26.py script to regenerate new versions +of the input and expected output from a series of Unicode property tests that +are automatically generated from the Unicode data files. By default, the files +are written to testinput26 and testoutput26 in the current directory, but you +can give an alternative directory name as an argument to the script. These +files should eventually be installed in the main testdata directory. Preparing for a PCRE2 release @@ -439,4 +461,4 @@ years. 
Philip Hazel Email local part: Philip.Hazel Email domain: gmail.com -Last updated: 26 August 2021 +Last updated: 10 January 2022 diff --git a/pcre2/maint/Unicode.tables/BidiMirroring.txt b/pcre2/maint/Unicode.tables/BidiMirroring.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd8e2c5d00177655b5a84b79415fe7693e272148 --- /dev/null +++ b/pcre2/maint/Unicode.tables/BidiMirroring.txt @@ -0,0 +1,633 @@ +# BidiMirroring-14.0.0.txt +# Date: 2021-08-08, 22:55:00 GMT [KW, RP] +# © 2021 Unicode®, Inc. +# For terms of use, see https://www.unicode.org/terms_of_use.html +# +# Unicode Character Database +# For documentation, see https://www.unicode.org/reports/tr44/ +# +# Bidi_Mirroring_Glyph Property +# +# This file is an informative contributory data file in the +# Unicode Character Database. +# +# This data file lists characters that have the Bidi_Mirrored=Yes property +# value, for which there is another Unicode character that typically has a glyph +# that is the mirror image of the original character's glyph. +# +# The repertoire covered by the file is Unicode 14.0.0. +# +# The file contains a list of lines with mappings from one code point +# to another one for character-based mirroring. +# Note that for "real" mirroring, a rendering engine needs to select +# appropriate alternative glyphs, and that many Unicode characters do not +# have a mirror-image Unicode character. +# +# Each mapping line contains two fields, separated by a semicolon (';'). +# Each of the two fields contains a code point represented as a +# variable-length hexadecimal value with 4 to 6 digits. +# A comment indicates where the characters are "BEST FIT" mirroring. +# +# Code points for which Bidi_Mirrored=Yes, but for which no appropriate +# characters exist with mirrored glyphs, are +# listed as comments at the end of the file. 
+# +# Formally, the default value of the Bidi_Mirroring_Glyph property +# for each code point is <none>, unless a mapping to +# some other character is specified in this data file. When a code +# point has the default value for the Bidi_Mirroring_Glyph property, +# that means that no other character exists whose glyph is suitable +# for character-based mirroring. +# +# For information on bidi mirroring, see UAX #9: Unicode Bidirectional Algorithm, +# at https://www.unicode.org/reports/tr9/ +# +# This file was originally created by Markus Scherer. +# Extended for Unicode 3.2, 4.0, 4.1, 5.0, 5.1, 5.2, and 6.0 by Ken Whistler, +# and for subsequent versions by Ken Whistler, Laurentiu Iancu, and Roozbeh Pournader. +# +# Historical and Compatibility Information: +# +# The OpenType Mirroring Pairs List (OMPL) is frozen to match the +# Unicode 5.1 version of the Bidi_Mirroring_Glyph property (2008). +# See https://www.microsoft.com/typography/otspec/ompl.txt +# +# The Unicode 6.1 version of the Bidi_Mirroring_Glyph property (2011) +# added one mirroring pair: 27CB <--> 27CD. +# +# The Unicode 11.0 version of the Bidi_Mirroring_Glyph property (2018) +# underwent a substantial revision, to formally recognize all of the +# exact mirroring pairs and "BEST FIT" mirroring pairs that had been +# added after the freezing of the OMPL list. As a result, starting +# with Unicode 11.0, the bmg mapping values more accurately reflect +# the current status of glyphs for Bidi_Mirrored characters in +# the Unicode Standard, but this listing now extends significantly +# beyond the frozen OMPL list. Implementers should be aware of this +# intentional distinction. 
+# +# ############################################################ +# +# Property: Bidi_Mirroring_Glyph +# +# @missing: 0000..10FFFF; <none> + +0028; 0029 # LEFT PARENTHESIS +0029; 0028 # RIGHT PARENTHESIS +003C; 003E # LESS-THAN SIGN +003E; 003C # GREATER-THAN SIGN +005B; 005D # LEFT SQUARE BRACKET +005D; 005B # RIGHT SQUARE BRACKET +007B; 007D # LEFT CURLY BRACKET +007D; 007B # RIGHT CURLY BRACKET +00AB; 00BB # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +00BB; 00AB # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +0F3A; 0F3B # TIBETAN MARK GUG RTAGS GYON +0F3B; 0F3A # TIBETAN MARK GUG RTAGS GYAS +0F3C; 0F3D # TIBETAN MARK ANG KHANG GYON +0F3D; 0F3C # TIBETAN MARK ANG KHANG GYAS +169B; 169C # OGHAM FEATHER MARK +169C; 169B # OGHAM REVERSED FEATHER MARK +2039; 203A # SINGLE LEFT-POINTING ANGLE QUOTATION MARK +203A; 2039 # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK +2045; 2046 # LEFT SQUARE BRACKET WITH QUILL +2046; 2045 # RIGHT SQUARE BRACKET WITH QUILL +207D; 207E # SUPERSCRIPT LEFT PARENTHESIS +207E; 207D # SUPERSCRIPT RIGHT PARENTHESIS +208D; 208E # SUBSCRIPT LEFT PARENTHESIS +208E; 208D # SUBSCRIPT RIGHT PARENTHESIS +2208; 220B # ELEMENT OF +2209; 220C # [BEST FIT] NOT AN ELEMENT OF +220A; 220D # SMALL ELEMENT OF +220B; 2208 # CONTAINS AS MEMBER +220C; 2209 # [BEST FIT] DOES NOT CONTAIN AS MEMBER +220D; 220A # SMALL CONTAINS AS MEMBER +2215; 29F5 # DIVISION SLASH +221F; 2BFE # RIGHT ANGLE +2220; 29A3 # ANGLE +2221; 299B # MEASURED ANGLE +2222; 29A0 # SPHERICAL ANGLE +2224; 2AEE # DOES NOT DIVIDE +223C; 223D # TILDE OPERATOR +223D; 223C # REVERSED TILDE +2243; 22CD # ASYMPTOTICALLY EQUAL TO +2245; 224C # APPROXIMATELY EQUAL TO +224C; 2245 # ALL EQUAL TO +2252; 2253 # APPROXIMATELY EQUAL TO OR THE IMAGE OF +2253; 2252 # IMAGE OF OR APPROXIMATELY EQUAL TO +2254; 2255 # COLON EQUALS +2255; 2254 # EQUALS COLON +2264; 2265 # LESS-THAN OR EQUAL TO +2265; 2264 # GREATER-THAN OR EQUAL TO +2266; 2267 # LESS-THAN OVER EQUAL TO +2267; 2266 # GREATER-THAN OVER EQUAL TO +2268; 2269 # 
[BEST FIT] LESS-THAN BUT NOT EQUAL TO +2269; 2268 # [BEST FIT] GREATER-THAN BUT NOT EQUAL TO +226A; 226B # MUCH LESS-THAN +226B; 226A # MUCH GREATER-THAN +226E; 226F # [BEST FIT] NOT LESS-THAN +226F; 226E # [BEST FIT] NOT GREATER-THAN +2270; 2271 # [BEST FIT] NEITHER LESS-THAN NOR EQUAL TO +2271; 2270 # [BEST FIT] NEITHER GREATER-THAN NOR EQUAL TO +2272; 2273 # [BEST FIT] LESS-THAN OR EQUIVALENT TO +2273; 2272 # [BEST FIT] GREATER-THAN OR EQUIVALENT TO +2274; 2275 # [BEST FIT] NEITHER LESS-THAN NOR EQUIVALENT TO +2275; 2274 # [BEST FIT] NEITHER GREATER-THAN NOR EQUIVALENT TO +2276; 2277 # LESS-THAN OR GREATER-THAN +2277; 2276 # GREATER-THAN OR LESS-THAN +2278; 2279 # [BEST FIT] NEITHER LESS-THAN NOR GREATER-THAN +2279; 2278 # [BEST FIT] NEITHER GREATER-THAN NOR LESS-THAN +227A; 227B # PRECEDES +227B; 227A # SUCCEEDS +227C; 227D # PRECEDES OR EQUAL TO +227D; 227C # SUCCEEDS OR EQUAL TO +227E; 227F # [BEST FIT] PRECEDES OR EQUIVALENT TO +227F; 227E # [BEST FIT] SUCCEEDS OR EQUIVALENT TO +2280; 2281 # [BEST FIT] DOES NOT PRECEDE +2281; 2280 # [BEST FIT] DOES NOT SUCCEED +2282; 2283 # SUBSET OF +2283; 2282 # SUPERSET OF +2284; 2285 # [BEST FIT] NOT A SUBSET OF +2285; 2284 # [BEST FIT] NOT A SUPERSET OF +2286; 2287 # SUBSET OF OR EQUAL TO +2287; 2286 # SUPERSET OF OR EQUAL TO +2288; 2289 # [BEST FIT] NEITHER A SUBSET OF NOR EQUAL TO +2289; 2288 # [BEST FIT] NEITHER A SUPERSET OF NOR EQUAL TO +228A; 228B # [BEST FIT] SUBSET OF WITH NOT EQUAL TO +228B; 228A # [BEST FIT] SUPERSET OF WITH NOT EQUAL TO +228F; 2290 # SQUARE IMAGE OF +2290; 228F # SQUARE ORIGINAL OF +2291; 2292 # SQUARE IMAGE OF OR EQUAL TO +2292; 2291 # SQUARE ORIGINAL OF OR EQUAL TO +2298; 29B8 # CIRCLED DIVISION SLASH +22A2; 22A3 # RIGHT TACK +22A3; 22A2 # LEFT TACK +22A6; 2ADE # ASSERTION +22A8; 2AE4 # TRUE +22A9; 2AE3 # FORCES +22AB; 2AE5 # DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE +22B0; 22B1 # PRECEDES UNDER RELATION +22B1; 22B0 # SUCCEEDS UNDER RELATION +22B2; 22B3 # NORMAL SUBGROUP OF +22B3; 22B2 # 
CONTAINS AS NORMAL SUBGROUP +22B4; 22B5 # NORMAL SUBGROUP OF OR EQUAL TO +22B5; 22B4 # CONTAINS AS NORMAL SUBGROUP OR EQUAL TO +22B6; 22B7 # ORIGINAL OF +22B7; 22B6 # IMAGE OF +22B8; 27DC # MULTIMAP +22C9; 22CA # LEFT NORMAL FACTOR SEMIDIRECT PRODUCT +22CA; 22C9 # RIGHT NORMAL FACTOR SEMIDIRECT PRODUCT +22CB; 22CC # LEFT SEMIDIRECT PRODUCT +22CC; 22CB # RIGHT SEMIDIRECT PRODUCT +22CD; 2243 # REVERSED TILDE EQUALS +22D0; 22D1 # DOUBLE SUBSET +22D1; 22D0 # DOUBLE SUPERSET +22D6; 22D7 # LESS-THAN WITH DOT +22D7; 22D6 # GREATER-THAN WITH DOT +22D8; 22D9 # VERY MUCH LESS-THAN +22D9; 22D8 # VERY MUCH GREATER-THAN +22DA; 22DB # LESS-THAN EQUAL TO OR GREATER-THAN +22DB; 22DA # GREATER-THAN EQUAL TO OR LESS-THAN +22DC; 22DD # EQUAL TO OR LESS-THAN +22DD; 22DC # EQUAL TO OR GREATER-THAN +22DE; 22DF # EQUAL TO OR PRECEDES +22DF; 22DE # EQUAL TO OR SUCCEEDS +22E0; 22E1 # [BEST FIT] DOES NOT PRECEDE OR EQUAL +22E1; 22E0 # [BEST FIT] DOES NOT SUCCEED OR EQUAL +22E2; 22E3 # [BEST FIT] NOT SQUARE IMAGE OF OR EQUAL TO +22E3; 22E2 # [BEST FIT] NOT SQUARE ORIGINAL OF OR EQUAL TO +22E4; 22E5 # [BEST FIT] SQUARE IMAGE OF OR NOT EQUAL TO +22E5; 22E4 # [BEST FIT] SQUARE ORIGINAL OF OR NOT EQUAL TO +22E6; 22E7 # [BEST FIT] LESS-THAN BUT NOT EQUIVALENT TO +22E7; 22E6 # [BEST FIT] GREATER-THAN BUT NOT EQUIVALENT TO +22E8; 22E9 # [BEST FIT] PRECEDES BUT NOT EQUIVALENT TO +22E9; 22E8 # [BEST FIT] SUCCEEDS BUT NOT EQUIVALENT TO +22EA; 22EB # [BEST FIT] NOT NORMAL SUBGROUP OF +22EB; 22EA # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP +22EC; 22ED # [BEST FIT] NOT NORMAL SUBGROUP OF OR EQUAL TO +22ED; 22EC # [BEST FIT] DOES NOT CONTAIN AS NORMAL SUBGROUP OR EQUAL +22F0; 22F1 # UP RIGHT DIAGONAL ELLIPSIS +22F1; 22F0 # DOWN RIGHT DIAGONAL ELLIPSIS +22F2; 22FA # ELEMENT OF WITH LONG HORIZONTAL STROKE +22F3; 22FB # ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE +22F4; 22FC # SMALL ELEMENT OF WITH VERTICAL BAR AT END OF HORIZONTAL STROKE +22F6; 22FD # ELEMENT OF WITH OVERBAR +22F7; 22FE # 
SMALL ELEMENT OF WITH OVERBAR +22FA; 22F2 # CONTAINS WITH LONG HORIZONTAL STROKE +22FB; 22F3 # CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE +22FC; 22F4 # SMALL CONTAINS WITH VERTICAL BAR AT END OF HORIZONTAL STROKE +22FD; 22F6 # CONTAINS WITH OVERBAR +22FE; 22F7 # SMALL CONTAINS WITH OVERBAR +2308; 2309 # LEFT CEILING +2309; 2308 # RIGHT CEILING +230A; 230B # LEFT FLOOR +230B; 230A # RIGHT FLOOR +2329; 232A # LEFT-POINTING ANGLE BRACKET +232A; 2329 # RIGHT-POINTING ANGLE BRACKET +2768; 2769 # MEDIUM LEFT PARENTHESIS ORNAMENT +2769; 2768 # MEDIUM RIGHT PARENTHESIS ORNAMENT +276A; 276B # MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT +276B; 276A # MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT +276C; 276D # MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT +276D; 276C # MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT +276E; 276F # HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT +276F; 276E # HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT +2770; 2771 # HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT +2771; 2770 # HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT +2772; 2773 # LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT +2773; 2772 # LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT +2774; 2775 # MEDIUM LEFT CURLY BRACKET ORNAMENT +2775; 2774 # MEDIUM RIGHT CURLY BRACKET ORNAMENT +27C3; 27C4 # OPEN SUBSET +27C4; 27C3 # OPEN SUPERSET +27C5; 27C6 # LEFT S-SHAPED BAG DELIMITER +27C6; 27C5 # RIGHT S-SHAPED BAG DELIMITER +27C8; 27C9 # REVERSE SOLIDUS PRECEDING SUBSET +27C9; 27C8 # SUPERSET PRECEDING SOLIDUS +27CB; 27CD # MATHEMATICAL RISING DIAGONAL +27CD; 27CB # MATHEMATICAL FALLING DIAGONAL +27D5; 27D6 # LEFT OUTER JOIN +27D6; 27D5 # RIGHT OUTER JOIN +27DC; 22B8 # LEFT MULTIMAP +27DD; 27DE # LONG RIGHT TACK +27DE; 27DD # LONG LEFT TACK +27E2; 27E3 # WHITE CONCAVE-SIDED DIAMOND WITH LEFTWARDS TICK +27E3; 27E2 # WHITE CONCAVE-SIDED DIAMOND WITH RIGHTWARDS TICK +27E4; 27E5 # WHITE SQUARE WITH LEFTWARDS TICK +27E5; 27E4 # WHITE SQUARE WITH RIGHTWARDS TICK +27E6; 27E7 # MATHEMATICAL LEFT WHITE 
SQUARE BRACKET +27E7; 27E6 # MATHEMATICAL RIGHT WHITE SQUARE BRACKET +27E8; 27E9 # MATHEMATICAL LEFT ANGLE BRACKET +27E9; 27E8 # MATHEMATICAL RIGHT ANGLE BRACKET +27EA; 27EB # MATHEMATICAL LEFT DOUBLE ANGLE BRACKET +27EB; 27EA # MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET +27EC; 27ED # MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET +27ED; 27EC # MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET +27EE; 27EF # MATHEMATICAL LEFT FLATTENED PARENTHESIS +27EF; 27EE # MATHEMATICAL RIGHT FLATTENED PARENTHESIS +2983; 2984 # LEFT WHITE CURLY BRACKET +2984; 2983 # RIGHT WHITE CURLY BRACKET +2985; 2986 # LEFT WHITE PARENTHESIS +2986; 2985 # RIGHT WHITE PARENTHESIS +2987; 2988 # Z NOTATION LEFT IMAGE BRACKET +2988; 2987 # Z NOTATION RIGHT IMAGE BRACKET +2989; 298A # Z NOTATION LEFT BINDING BRACKET +298A; 2989 # Z NOTATION RIGHT BINDING BRACKET +298B; 298C # LEFT SQUARE BRACKET WITH UNDERBAR +298C; 298B # RIGHT SQUARE BRACKET WITH UNDERBAR +298D; 2990 # LEFT SQUARE BRACKET WITH TICK IN TOP CORNER +298E; 298F # RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER +298F; 298E # LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER +2990; 298D # RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER +2991; 2992 # LEFT ANGLE BRACKET WITH DOT +2992; 2991 # RIGHT ANGLE BRACKET WITH DOT +2993; 2994 # LEFT ARC LESS-THAN BRACKET +2994; 2993 # RIGHT ARC GREATER-THAN BRACKET +2995; 2996 # DOUBLE LEFT ARC GREATER-THAN BRACKET +2996; 2995 # DOUBLE RIGHT ARC LESS-THAN BRACKET +2997; 2998 # LEFT BLACK TORTOISE SHELL BRACKET +2998; 2997 # RIGHT BLACK TORTOISE SHELL BRACKET +299B; 2221 # MEASURED ANGLE OPENING LEFT +29A0; 2222 # SPHERICAL ANGLE OPENING LEFT +29A3; 2220 # REVERSED ANGLE +29A4; 29A5 # ANGLE WITH UNDERBAR +29A5; 29A4 # REVERSED ANGLE WITH UNDERBAR +29A8; 29A9 # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING UP AND RIGHT +29A9; 29A8 # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING UP AND LEFT +29AA; 29AB # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING DOWN AND RIGHT +29AB; 29AA # MEASURED 
ANGLE WITH OPEN ARM ENDING IN ARROW POINTING DOWN AND LEFT +29AC; 29AD # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING RIGHT AND UP +29AD; 29AC # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING LEFT AND UP +29AE; 29AF # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING RIGHT AND DOWN +29AF; 29AE # MEASURED ANGLE WITH OPEN ARM ENDING IN ARROW POINTING LEFT AND DOWN +29B8; 2298 # CIRCLED REVERSE SOLIDUS +29C0; 29C1 # CIRCLED LESS-THAN +29C1; 29C0 # CIRCLED GREATER-THAN +29C4; 29C5 # SQUARED RISING DIAGONAL SLASH +29C5; 29C4 # SQUARED FALLING DIAGONAL SLASH +29CF; 29D0 # LEFT TRIANGLE BESIDE VERTICAL BAR +29D0; 29CF # VERTICAL BAR BESIDE RIGHT TRIANGLE +29D1; 29D2 # BOWTIE WITH LEFT HALF BLACK +29D2; 29D1 # BOWTIE WITH RIGHT HALF BLACK +29D4; 29D5 # TIMES WITH LEFT HALF BLACK +29D5; 29D4 # TIMES WITH RIGHT HALF BLACK +29D8; 29D9 # LEFT WIGGLY FENCE +29D9; 29D8 # RIGHT WIGGLY FENCE +29DA; 29DB # LEFT DOUBLE WIGGLY FENCE +29DB; 29DA # RIGHT DOUBLE WIGGLY FENCE +29E8; 29E9 # DOWN-POINTING TRIANGLE WITH LEFT HALF BLACK +29E9; 29E8 # DOWN-POINTING TRIANGLE WITH RIGHT HALF BLACK +29F5; 2215 # REVERSE SOLIDUS OPERATOR +29F8; 29F9 # BIG SOLIDUS +29F9; 29F8 # BIG REVERSE SOLIDUS +29FC; 29FD # LEFT-POINTING CURVED ANGLE BRACKET +29FD; 29FC # RIGHT-POINTING CURVED ANGLE BRACKET +2A2B; 2A2C # MINUS SIGN WITH FALLING DOTS +2A2C; 2A2B # MINUS SIGN WITH RISING DOTS +2A2D; 2A2E # PLUS SIGN IN LEFT HALF CIRCLE +2A2E; 2A2D # PLUS SIGN IN RIGHT HALF CIRCLE +2A34; 2A35 # MULTIPLICATION SIGN IN LEFT HALF CIRCLE +2A35; 2A34 # MULTIPLICATION SIGN IN RIGHT HALF CIRCLE +2A3C; 2A3D # INTERIOR PRODUCT +2A3D; 2A3C # RIGHTHAND INTERIOR PRODUCT +2A64; 2A65 # Z NOTATION DOMAIN ANTIRESTRICTION +2A65; 2A64 # Z NOTATION RANGE ANTIRESTRICTION +2A79; 2A7A # LESS-THAN WITH CIRCLE INSIDE +2A7A; 2A79 # GREATER-THAN WITH CIRCLE INSIDE +2A7B; 2A7C # [BEST FIT] LESS-THAN WITH QUESTION MARK ABOVE +2A7C; 2A7B # [BEST FIT] GREATER-THAN WITH QUESTION MARK ABOVE +2A7D; 2A7E # LESS-THAN OR SLANTED 
EQUAL TO +2A7E; 2A7D # GREATER-THAN OR SLANTED EQUAL TO +2A7F; 2A80 # LESS-THAN OR SLANTED EQUAL TO WITH DOT INSIDE +2A80; 2A7F # GREATER-THAN OR SLANTED EQUAL TO WITH DOT INSIDE +2A81; 2A82 # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE +2A82; 2A81 # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE +2A83; 2A84 # LESS-THAN OR SLANTED EQUAL TO WITH DOT ABOVE RIGHT +2A84; 2A83 # GREATER-THAN OR SLANTED EQUAL TO WITH DOT ABOVE LEFT +2A85; 2A86 # [BEST FIT] LESS-THAN OR APPROXIMATE +2A86; 2A85 # [BEST FIT] GREATER-THAN OR APPROXIMATE +2A87; 2A88 # [BEST FIT] LESS-THAN AND SINGLE-LINE NOT EQUAL TO +2A88; 2A87 # [BEST FIT] GREATER-THAN AND SINGLE-LINE NOT EQUAL TO +2A89; 2A8A # [BEST FIT] LESS-THAN AND NOT APPROXIMATE +2A8A; 2A89 # [BEST FIT] GREATER-THAN AND NOT APPROXIMATE +2A8B; 2A8C # LESS-THAN ABOVE DOUBLE-LINE EQUAL ABOVE GREATER-THAN +2A8C; 2A8B # GREATER-THAN ABOVE DOUBLE-LINE EQUAL ABOVE LESS-THAN +2A8D; 2A8E # [BEST FIT] LESS-THAN ABOVE SIMILAR OR EQUAL +2A8E; 2A8D # [BEST FIT] GREATER-THAN ABOVE SIMILAR OR EQUAL +2A8F; 2A90 # [BEST FIT] LESS-THAN ABOVE SIMILAR ABOVE GREATER-THAN +2A90; 2A8F # [BEST FIT] GREATER-THAN ABOVE SIMILAR ABOVE LESS-THAN +2A91; 2A92 # LESS-THAN ABOVE GREATER-THAN ABOVE DOUBLE-LINE EQUAL +2A92; 2A91 # GREATER-THAN ABOVE LESS-THAN ABOVE DOUBLE-LINE EQUAL +2A93; 2A94 # LESS-THAN ABOVE SLANTED EQUAL ABOVE GREATER-THAN ABOVE SLANTED EQUAL +2A94; 2A93 # GREATER-THAN ABOVE SLANTED EQUAL ABOVE LESS-THAN ABOVE SLANTED EQUAL +2A95; 2A96 # SLANTED EQUAL TO OR LESS-THAN +2A96; 2A95 # SLANTED EQUAL TO OR GREATER-THAN +2A97; 2A98 # SLANTED EQUAL TO OR LESS-THAN WITH DOT INSIDE +2A98; 2A97 # SLANTED EQUAL TO OR GREATER-THAN WITH DOT INSIDE +2A99; 2A9A # DOUBLE-LINE EQUAL TO OR LESS-THAN +2A9A; 2A99 # DOUBLE-LINE EQUAL TO OR GREATER-THAN +2A9B; 2A9C # DOUBLE-LINE SLANTED EQUAL TO OR LESS-THAN +2A9C; 2A9B # DOUBLE-LINE SLANTED EQUAL TO OR GREATER-THAN +2A9D; 2A9E # [BEST FIT] SIMILAR OR LESS-THAN +2A9E; 2A9D # [BEST FIT] SIMILAR OR GREATER-THAN +2A9F; 
2AA0 # [BEST FIT] SIMILAR ABOVE LESS-THAN ABOVE EQUALS SIGN +2AA0; 2A9F # [BEST FIT] SIMILAR ABOVE GREATER-THAN ABOVE EQUALS SIGN +2AA1; 2AA2 # DOUBLE NESTED LESS-THAN +2AA2; 2AA1 # DOUBLE NESTED GREATER-THAN +2AA6; 2AA7 # LESS-THAN CLOSED BY CURVE +2AA7; 2AA6 # GREATER-THAN CLOSED BY CURVE +2AA8; 2AA9 # LESS-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL +2AA9; 2AA8 # GREATER-THAN CLOSED BY CURVE ABOVE SLANTED EQUAL +2AAA; 2AAB # SMALLER THAN +2AAB; 2AAA # LARGER THAN +2AAC; 2AAD # SMALLER THAN OR EQUAL TO +2AAD; 2AAC # LARGER THAN OR EQUAL TO +2AAF; 2AB0 # PRECEDES ABOVE SINGLE-LINE EQUALS SIGN +2AB0; 2AAF # SUCCEEDS ABOVE SINGLE-LINE EQUALS SIGN +2AB1; 2AB2 # [BEST FIT] PRECEDES ABOVE SINGLE-LINE NOT EQUAL TO +2AB2; 2AB1 # [BEST FIT] SUCCEEDS ABOVE SINGLE-LINE NOT EQUAL TO +2AB3; 2AB4 # PRECEDES ABOVE EQUALS SIGN +2AB4; 2AB3 # SUCCEEDS ABOVE EQUALS SIGN +2AB5; 2AB6 # [BEST FIT] PRECEDES ABOVE NOT EQUAL TO +2AB6; 2AB5 # [BEST FIT] SUCCEEDS ABOVE NOT EQUAL TO +2AB7; 2AB8 # [BEST FIT] PRECEDES ABOVE ALMOST EQUAL TO +2AB8; 2AB7 # [BEST FIT] SUCCEEDS ABOVE ALMOST EQUAL TO +2AB9; 2ABA # [BEST FIT] PRECEDES ABOVE NOT ALMOST EQUAL TO +2ABA; 2AB9 # [BEST FIT] SUCCEEDS ABOVE NOT ALMOST EQUAL TO +2ABB; 2ABC # DOUBLE PRECEDES +2ABC; 2ABB # DOUBLE SUCCEEDS +2ABD; 2ABE # SUBSET WITH DOT +2ABE; 2ABD # SUPERSET WITH DOT +2ABF; 2AC0 # SUBSET WITH PLUS SIGN BELOW +2AC0; 2ABF # SUPERSET WITH PLUS SIGN BELOW +2AC1; 2AC2 # SUBSET WITH MULTIPLICATION SIGN BELOW +2AC2; 2AC1 # SUPERSET WITH MULTIPLICATION SIGN BELOW +2AC3; 2AC4 # SUBSET OF OR EQUAL TO WITH DOT ABOVE +2AC4; 2AC3 # SUPERSET OF OR EQUAL TO WITH DOT ABOVE +2AC5; 2AC6 # SUBSET OF ABOVE EQUALS SIGN +2AC6; 2AC5 # SUPERSET OF ABOVE EQUALS SIGN +2AC7; 2AC8 # [BEST FIT] SUBSET OF ABOVE TILDE OPERATOR +2AC8; 2AC7 # [BEST FIT] SUPERSET OF ABOVE TILDE OPERATOR +2AC9; 2ACA # [BEST FIT] SUBSET OF ABOVE ALMOST EQUAL TO +2ACA; 2AC9 # [BEST FIT] SUPERSET OF ABOVE ALMOST EQUAL TO +2ACB; 2ACC # [BEST FIT] SUBSET OF ABOVE NOT EQUAL TO +2ACC; 
2ACB # [BEST FIT] SUPERSET OF ABOVE NOT EQUAL TO +2ACD; 2ACE # SQUARE LEFT OPEN BOX OPERATOR +2ACE; 2ACD # SQUARE RIGHT OPEN BOX OPERATOR +2ACF; 2AD0 # CLOSED SUBSET +2AD0; 2ACF # CLOSED SUPERSET +2AD1; 2AD2 # CLOSED SUBSET OR EQUAL TO +2AD2; 2AD1 # CLOSED SUPERSET OR EQUAL TO +2AD3; 2AD4 # SUBSET ABOVE SUPERSET +2AD4; 2AD3 # SUPERSET ABOVE SUBSET +2AD5; 2AD6 # SUBSET ABOVE SUBSET +2AD6; 2AD5 # SUPERSET ABOVE SUPERSET +2ADE; 22A6 # SHORT LEFT TACK +2AE3; 22A9 # DOUBLE VERTICAL BAR LEFT TURNSTILE +2AE4; 22A8 # VERTICAL BAR DOUBLE LEFT TURNSTILE +2AE5; 22AB # DOUBLE VERTICAL BAR DOUBLE LEFT TURNSTILE +2AEC; 2AED # DOUBLE STROKE NOT SIGN +2AED; 2AEC # REVERSED DOUBLE STROKE NOT SIGN +2AEE; 2224 # DOES NOT DIVIDE WITH REVERSED NEGATION SLASH +2AF7; 2AF8 # TRIPLE NESTED LESS-THAN +2AF8; 2AF7 # TRIPLE NESTED GREATER-THAN +2AF9; 2AFA # DOUBLE-LINE SLANTED LESS-THAN OR EQUAL TO +2AFA; 2AF9 # DOUBLE-LINE SLANTED GREATER-THAN OR EQUAL TO +2BFE; 221F # REVERSED RIGHT ANGLE +2E02; 2E03 # LEFT SUBSTITUTION BRACKET +2E03; 2E02 # RIGHT SUBSTITUTION BRACKET +2E04; 2E05 # LEFT DOTTED SUBSTITUTION BRACKET +2E05; 2E04 # RIGHT DOTTED SUBSTITUTION BRACKET +2E09; 2E0A # LEFT TRANSPOSITION BRACKET +2E0A; 2E09 # RIGHT TRANSPOSITION BRACKET +2E0C; 2E0D # LEFT RAISED OMISSION BRACKET +2E0D; 2E0C # RIGHT RAISED OMISSION BRACKET +2E1C; 2E1D # LEFT LOW PARAPHRASE BRACKET +2E1D; 2E1C # RIGHT LOW PARAPHRASE BRACKET +2E20; 2E21 # LEFT VERTICAL BAR WITH QUILL +2E21; 2E20 # RIGHT VERTICAL BAR WITH QUILL +2E22; 2E23 # TOP LEFT HALF BRACKET +2E23; 2E22 # TOP RIGHT HALF BRACKET +2E24; 2E25 # BOTTOM LEFT HALF BRACKET +2E25; 2E24 # BOTTOM RIGHT HALF BRACKET +2E26; 2E27 # LEFT SIDEWAYS U BRACKET +2E27; 2E26 # RIGHT SIDEWAYS U BRACKET +2E28; 2E29 # LEFT DOUBLE PARENTHESIS +2E29; 2E28 # RIGHT DOUBLE PARENTHESIS +2E55; 2E56 # LEFT SQUARE BRACKET WITH STROKE +2E56; 2E55 # RIGHT SQUARE BRACKET WITH STROKE +2E57; 2E58 # LEFT SQUARE BRACKET WITH DOUBLE STROKE +2E58; 2E57 # RIGHT SQUARE BRACKET WITH DOUBLE 
STROKE +2E59; 2E5A # TOP HALF LEFT PARENTHESIS +2E5A; 2E59 # TOP HALF RIGHT PARENTHESIS +2E5B; 2E5C # BOTTOM HALF LEFT PARENTHESIS +2E5C; 2E5B # BOTTOM HALF RIGHT PARENTHESIS +3008; 3009 # LEFT ANGLE BRACKET +3009; 3008 # RIGHT ANGLE BRACKET +300A; 300B # LEFT DOUBLE ANGLE BRACKET +300B; 300A # RIGHT DOUBLE ANGLE BRACKET +300C; 300D # [BEST FIT] LEFT CORNER BRACKET +300D; 300C # [BEST FIT] RIGHT CORNER BRACKET +300E; 300F # [BEST FIT] LEFT WHITE CORNER BRACKET +300F; 300E # [BEST FIT] RIGHT WHITE CORNER BRACKET +3010; 3011 # LEFT BLACK LENTICULAR BRACKET +3011; 3010 # RIGHT BLACK LENTICULAR BRACKET +3014; 3015 # LEFT TORTOISE SHELL BRACKET +3015; 3014 # RIGHT TORTOISE SHELL BRACKET +3016; 3017 # LEFT WHITE LENTICULAR BRACKET +3017; 3016 # RIGHT WHITE LENTICULAR BRACKET +3018; 3019 # LEFT WHITE TORTOISE SHELL BRACKET +3019; 3018 # RIGHT WHITE TORTOISE SHELL BRACKET +301A; 301B # LEFT WHITE SQUARE BRACKET +301B; 301A # RIGHT WHITE SQUARE BRACKET +FE59; FE5A # SMALL LEFT PARENTHESIS +FE5A; FE59 # SMALL RIGHT PARENTHESIS +FE5B; FE5C # SMALL LEFT CURLY BRACKET +FE5C; FE5B # SMALL RIGHT CURLY BRACKET +FE5D; FE5E # SMALL LEFT TORTOISE SHELL BRACKET +FE5E; FE5D # SMALL RIGHT TORTOISE SHELL BRACKET +FE64; FE65 # SMALL LESS-THAN SIGN +FE65; FE64 # SMALL GREATER-THAN SIGN +FF08; FF09 # FULLWIDTH LEFT PARENTHESIS +FF09; FF08 # FULLWIDTH RIGHT PARENTHESIS +FF1C; FF1E # FULLWIDTH LESS-THAN SIGN +FF1E; FF1C # FULLWIDTH GREATER-THAN SIGN +FF3B; FF3D # FULLWIDTH LEFT SQUARE BRACKET +FF3D; FF3B # FULLWIDTH RIGHT SQUARE BRACKET +FF5B; FF5D # FULLWIDTH LEFT CURLY BRACKET +FF5D; FF5B # FULLWIDTH RIGHT CURLY BRACKET +FF5F; FF60 # FULLWIDTH LEFT WHITE PARENTHESIS +FF60; FF5F # FULLWIDTH RIGHT WHITE PARENTHESIS +FF62; FF63 # [BEST FIT] HALFWIDTH LEFT CORNER BRACKET +FF63; FF62 # [BEST FIT] HALFWIDTH RIGHT CORNER BRACKET + +# The following characters have no appropriate mirroring character. +# For these characters it is up to the rendering system +# to provide mirrored glyphs. 
+ +# 2140; DOUBLE-STRUCK N-ARY SUMMATION +# 2201; COMPLEMENT +# 2202; PARTIAL DIFFERENTIAL +# 2203; THERE EXISTS +# 2204; THERE DOES NOT EXIST +# 2211; N-ARY SUMMATION +# 2216; SET MINUS +# 221A; SQUARE ROOT +# 221B; CUBE ROOT +# 221C; FOURTH ROOT +# 221D; PROPORTIONAL TO +# 2226; NOT PARALLEL TO +# 222B; INTEGRAL +# 222C; DOUBLE INTEGRAL +# 222D; TRIPLE INTEGRAL +# 222E; CONTOUR INTEGRAL +# 222F; SURFACE INTEGRAL +# 2230; VOLUME INTEGRAL +# 2231; CLOCKWISE INTEGRAL +# 2232; CLOCKWISE CONTOUR INTEGRAL +# 2233; ANTICLOCKWISE CONTOUR INTEGRAL +# 2239; EXCESS +# 223B; HOMOTHETIC +# 223E; INVERTED LAZY S +# 223F; SINE WAVE +# 2240; WREATH PRODUCT +# 2241; NOT TILDE +# 2242; MINUS TILDE +# 2244; NOT ASYMPTOTICALLY EQUAL TO +# 2246; APPROXIMATELY BUT NOT ACTUALLY EQUAL TO +# 2247; NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO +# 2248; ALMOST EQUAL TO +# 2249; NOT ALMOST EQUAL TO +# 224A; ALMOST EQUAL OR EQUAL TO +# 224B; TRIPLE TILDE +# 225F; QUESTIONED EQUAL TO +# 2260; NOT EQUAL TO +# 2262; NOT IDENTICAL TO +# 228C; MULTISET +# 22A7; MODELS +# 22AA; TRIPLE VERTICAL BAR RIGHT TURNSTILE +# 22AC; DOES NOT PROVE +# 22AD; NOT TRUE +# 22AE; DOES NOT FORCE +# 22AF; NEGATED DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE +# 22BE; RIGHT ANGLE WITH ARC +# 22BF; RIGHT TRIANGLE +# 22F5; ELEMENT OF WITH DOT ABOVE +# 22F8; ELEMENT OF WITH UNDERBAR +# 22F9; ELEMENT OF WITH TWO HORIZONTAL STROKES +# 22FF; Z NOTATION BAG MEMBERSHIP +# 2320; TOP HALF INTEGRAL +# 2321; BOTTOM HALF INTEGRAL +# 27C0; THREE DIMENSIONAL ANGLE +# 27CC; LONG DIVISION +# 27D3; LOWER RIGHT CORNER WITH DOT +# 27D4; UPPER LEFT CORNER WITH DOT +# 299C; RIGHT ANGLE VARIANT WITH SQUARE +# 299D; MEASURED RIGHT ANGLE WITH DOT +# 299E; ANGLE WITH S INSIDE +# 299F; ACUTE ANGLE +# 29A2; TURNED ANGLE +# 29A6; OBLIQUE ANGLE OPENING UP +# 29A7; OBLIQUE ANGLE OPENING DOWN +# 29C2; CIRCLE WITH SMALL CIRCLE TO THE RIGHT +# 29C3; CIRCLE WITH TWO HORIZONTAL STROKES TO THE RIGHT +# 29C9; TWO JOINED SQUARES +# 29CE; RIGHT TRIANGLE ABOVE 
LEFT TRIANGLE +# 29DC; INCOMPLETE INFINITY +# 29E1; INCREASES AS +# 29E3; EQUALS SIGN AND SLANTED PARALLEL +# 29E4; EQUALS SIGN AND SLANTED PARALLEL WITH TILDE ABOVE +# 29E5; IDENTICAL TO AND SLANTED PARALLEL +# 29F4; RULE-DELAYED +# 29F6; SOLIDUS WITH OVERBAR +# 29F7; REVERSE SOLIDUS WITH HORIZONTAL STROKE +# 2A0A; MODULO TWO SUM +# 2A0B; SUMMATION WITH INTEGRAL +# 2A0C; QUADRUPLE INTEGRAL OPERATOR +# 2A0D; FINITE PART INTEGRAL +# 2A0E; INTEGRAL WITH DOUBLE STROKE +# 2A0F; INTEGRAL AVERAGE WITH SLASH +# 2A10; CIRCULATION FUNCTION +# 2A11; ANTICLOCKWISE INTEGRATION +# 2A12; LINE INTEGRATION WITH RECTANGULAR PATH AROUND POLE +# 2A13; LINE INTEGRATION WITH SEMICIRCULAR PATH AROUND POLE +# 2A14; LINE INTEGRATION NOT INCLUDING THE POLE +# 2A15; INTEGRAL AROUND A POINT OPERATOR +# 2A16; QUATERNION INTEGRAL OPERATOR +# 2A17; INTEGRAL WITH LEFTWARDS ARROW WITH HOOK +# 2A18; INTEGRAL WITH TIMES SIGN +# 2A19; INTEGRAL WITH INTERSECTION +# 2A1A; INTEGRAL WITH UNION +# 2A1B; INTEGRAL WITH OVERBAR +# 2A1C; INTEGRAL WITH UNDERBAR +# 2A1E; LARGE LEFT TRIANGLE OPERATOR +# 2A1F; Z NOTATION SCHEMA COMPOSITION +# 2A20; Z NOTATION SCHEMA PIPING +# 2A21; Z NOTATION SCHEMA PROJECTION +# 2A24; PLUS SIGN WITH TILDE ABOVE +# 2A26; PLUS SIGN WITH TILDE BELOW +# 2A29; MINUS SIGN WITH COMMA ABOVE +# 2A3E; Z NOTATION RELATIONAL COMPOSITION +# 2A57; SLOPING LARGE OR +# 2A58; SLOPING LARGE AND +# 2A6A; TILDE OPERATOR WITH DOT ABOVE +# 2A6B; TILDE OPERATOR WITH RISING DOTS +# 2A6C; SIMILAR MINUS SIMILAR +# 2A6D; CONGRUENT WITH DOT ABOVE +# 2A6F; ALMOST EQUAL TO WITH CIRCUMFLEX ACCENT +# 2A70; APPROXIMATELY EQUAL OR EQUAL TO +# 2A73; EQUALS SIGN ABOVE TILDE OPERATOR +# 2A74; DOUBLE COLON EQUAL +# 2AA3; DOUBLE NESTED LESS-THAN WITH UNDERBAR +# 2ADC; FORKING +# 2AE2; VERTICAL BAR TRIPLE RIGHT TURNSTILE +# 2AE6; LONG DASH FROM LEFT MEMBER OF DOUBLE VERTICAL +# 2AF3; PARALLEL WITH TILDE OPERATOR +# 2AFB; TRIPLE SOLIDUS BINARY RELATION +# 2AFD; DOUBLE SOLIDUS OPERATOR +# 1D6DB; MATHEMATICAL BOLD 
PARTIAL DIFFERENTIAL +# 1D715; MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL +# 1D74F; MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL +# 1D789; MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL +# 1D7C3; MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL + +# EOF diff --git a/pcre2/maint/Unicode.tables/DerivedBidiClass.txt b/pcre2/maint/Unicode.tables/DerivedBidiClass.txt new file mode 100644 index 0000000000000000000000000000000000000000..4012dc25da1ae38de025aa8baf5a864f555b8fa5 --- /dev/null +++ b/pcre2/maint/Unicode.tables/DerivedBidiClass.txt @@ -0,0 +1,2524 @@ +# DerivedBidiClass-14.0.0.txt +# Date: 2021-07-10, 00:35:02 GMT +# © 2021 Unicode®, Inc. +# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. +# For terms of use, see http://www.unicode.org/terms_of_use.html +# +# Unicode Character Database +# For documentation, see http://www.unicode.org/reports/tr44/ + +# ================================================ + +# Bidi Class (listing UnicodeData.txt, field 4: see UAX #44: https://www.unicode.org/reports/tr44/) +# Unlike other properties, unassigned code points in blocks +# reserved for right-to-left scripts are given either types R or AL. +# +# The unassigned code points that default to AL are in the ranges: +# [\u0600-\u07BF \u0860-\u08FF \uFB50-\uFDCF \uFDF0-\uFDFF \uFE70-\uFEFF +# \U00010D00-\U00010D3F \U00010F30-\U00010F6F +# \U0001EC70-\U0001ECBF \U0001ED00-\U0001ED4F \U0001EE00-\U0001EEFF] +# +# This includes code points in the Arabic, Syriac, and Thaana blocks, among others. +# +# The unassigned code points that default to R are in the ranges: +# [\u0590-\u05FF \u07C0-\u085F \uFB1D-\uFB4F +# \U00010800-\U00010CFF \U00010D40-\U00010F2F \U00010F70-\U00010FFF +# \U0001E800-\U0001EC6F \U0001ECC0-\U0001ECFF \U0001ED50-\U0001EDFF \U0001EF00-\U0001EFFF] +# +# This includes code points in the Hebrew, NKo, and Phoenician blocks, among others. 
+# +# The unassigned code points that default to ET are in the range: +# [\u20A0-\u20CF] +# +# This consists of code points in the Currency Symbols block. +# +# The unassigned code points that default to BN have one of the following properties: +# Default_Ignorable_Code_Point +# Noncharacter_Code_Point +# +# For all other cases: + +# All code points not explicitly listed for Bidi_Class +# have the value Left_To_Right (L). + +# @missing: 0000..10FFFF; Left_To_Right + +# ================================================ + +# Bidi_Class=Left_To_Right + +0041..005A ; L # L& [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z +0061..007A ; L # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z +00AA ; L # Lo FEMININE ORDINAL INDICATOR +00B5 ; L # L& MICRO SIGN +00BA ; L # Lo MASCULINE ORDINAL INDICATOR +00C0..00D6 ; L # L& [23] LATIN CAPITAL LETTER A WITH GRAVE..LATIN CAPITAL LETTER O WITH DIAERESIS +00D8..00F6 ; L # L& [31] LATIN CAPITAL LETTER O WITH STROKE..LATIN SMALL LETTER O WITH DIAERESIS +00F8..01BA ; L # L& [195] LATIN SMALL LETTER O WITH STROKE..LATIN SMALL LETTER EZH WITH TAIL +01BB ; L # Lo LATIN LETTER TWO WITH STROKE +01BC..01BF ; L # L& [4] LATIN CAPITAL LETTER TONE FIVE..LATIN LETTER WYNN +01C0..01C3 ; L # Lo [4] LATIN LETTER DENTAL CLICK..LATIN LETTER RETROFLEX CLICK +01C4..0293 ; L # L& [208] LATIN CAPITAL LETTER DZ WITH CARON..LATIN SMALL LETTER EZH WITH CURL +0294 ; L # Lo LATIN LETTER GLOTTAL STOP +0295..02AF ; L # L& [27] LATIN LETTER PHARYNGEAL VOICED FRICATIVE..LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL +02B0..02B8 ; L # Lm [9] MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y +02BB..02C1 ; L # Lm [7] MODIFIER LETTER TURNED COMMA..MODIFIER LETTER REVERSED GLOTTAL STOP +02D0..02D1 ; L # Lm [2] MODIFIER LETTER TRIANGULAR COLON..MODIFIER LETTER HALF TRIANGULAR COLON +02E0..02E4 ; L # Lm [5] MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP +02EE ; L # Lm MODIFIER LETTER DOUBLE APOSTROPHE +0370..0373 ; L # L& [4] GREEK 
CAPITAL LETTER HETA..GREEK SMALL LETTER ARCHAIC SAMPI +0376..0377 ; L # L& [2] GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA..GREEK SMALL LETTER PAMPHYLIAN DIGAMMA +037A ; L # Lm GREEK YPOGEGRAMMENI +037B..037D ; L # L& [3] GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL +037F ; L # L& GREEK CAPITAL LETTER YOT +0386 ; L # L& GREEK CAPITAL LETTER ALPHA WITH TONOS +0388..038A ; L # L& [3] GREEK CAPITAL LETTER EPSILON WITH TONOS..GREEK CAPITAL LETTER IOTA WITH TONOS +038C ; L # L& GREEK CAPITAL LETTER OMICRON WITH TONOS +038E..03A1 ; L # L& [20] GREEK CAPITAL LETTER UPSILON WITH TONOS..GREEK CAPITAL LETTER RHO +03A3..03F5 ; L # L& [83] GREEK CAPITAL LETTER SIGMA..GREEK LUNATE EPSILON SYMBOL +03F7..0481 ; L # L& [139] GREEK CAPITAL LETTER SHO..CYRILLIC SMALL LETTER KOPPA +0482 ; L # So CYRILLIC THOUSANDS SIGN +048A..052F ; L # L& [166] CYRILLIC CAPITAL LETTER SHORT I WITH TAIL..CYRILLIC SMALL LETTER EL WITH DESCENDER +0531..0556 ; L # L& [38] ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITAL LETTER FEH +0559 ; L # Lm ARMENIAN MODIFIER LETTER LEFT HALF RING +055A..055F ; L # Po [6] ARMENIAN APOSTROPHE..ARMENIAN ABBREVIATION MARK +0560..0588 ; L # L& [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE +0589 ; L # Po ARMENIAN FULL STOP +0903 ; L # Mc DEVANAGARI SIGN VISARGA +0904..0939 ; L # Lo [54] DEVANAGARI LETTER SHORT A..DEVANAGARI LETTER HA +093B ; L # Mc DEVANAGARI VOWEL SIGN OOE +093D ; L # Lo DEVANAGARI SIGN AVAGRAHA +093E..0940 ; L # Mc [3] DEVANAGARI VOWEL SIGN AA..DEVANAGARI VOWEL SIGN II +0949..094C ; L # Mc [4] DEVANAGARI VOWEL SIGN CANDRA O..DEVANAGARI VOWEL SIGN AU +094E..094F ; L # Mc [2] DEVANAGARI VOWEL SIGN PRISHTHAMATRA E..DEVANAGARI VOWEL SIGN AW +0950 ; L # Lo DEVANAGARI OM +0958..0961 ; L # Lo [10] DEVANAGARI LETTER QA..DEVANAGARI LETTER VOCALIC LL +0964..0965 ; L # Po [2] DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA +0966..096F ; L # Nd [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE +0970 ; L 
# Po DEVANAGARI ABBREVIATION SIGN +0971 ; L # Lm DEVANAGARI SIGN HIGH SPACING DOT +0972..0980 ; L # Lo [15] DEVANAGARI LETTER CANDRA A..BENGALI ANJI +0982..0983 ; L # Mc [2] BENGALI SIGN ANUSVARA..BENGALI SIGN VISARGA +0985..098C ; L # Lo [8] BENGALI LETTER A..BENGALI LETTER VOCALIC L +098F..0990 ; L # Lo [2] BENGALI LETTER E..BENGALI LETTER AI +0993..09A8 ; L # Lo [22] BENGALI LETTER O..BENGALI LETTER NA +09AA..09B0 ; L # Lo [7] BENGALI LETTER PA..BENGALI LETTER RA +09B2 ; L # Lo BENGALI LETTER LA +09B6..09B9 ; L # Lo [4] BENGALI LETTER SHA..BENGALI LETTER HA +09BD ; L # Lo BENGALI SIGN AVAGRAHA +09BE..09C0 ; L # Mc [3] BENGALI VOWEL SIGN AA..BENGALI VOWEL SIGN II +09C7..09C8 ; L # Mc [2] BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI +09CB..09CC ; L # Mc [2] BENGALI VOWEL SIGN O..BENGALI VOWEL SIGN AU +09CE ; L # Lo BENGALI LETTER KHANDA TA +09D7 ; L # Mc BENGALI AU LENGTH MARK +09DC..09DD ; L # Lo [2] BENGALI LETTER RRA..BENGALI LETTER RHA +09DF..09E1 ; L # Lo [3] BENGALI LETTER YYA..BENGALI LETTER VOCALIC LL +09E6..09EF ; L # Nd [10] BENGALI DIGIT ZERO..BENGALI DIGIT NINE +09F0..09F1 ; L # Lo [2] BENGALI LETTER RA WITH MIDDLE DIAGONAL..BENGALI LETTER RA WITH LOWER DIAGONAL +09F4..09F9 ; L # No [6] BENGALI CURRENCY NUMERATOR ONE..BENGALI CURRENCY DENOMINATOR SIXTEEN +09FA ; L # So BENGALI ISSHAR +09FC ; L # Lo BENGALI LETTER VEDIC ANUSVARA +09FD ; L # Po BENGALI ABBREVIATION SIGN +0A03 ; L # Mc GURMUKHI SIGN VISARGA +0A05..0A0A ; L # Lo [6] GURMUKHI LETTER A..GURMUKHI LETTER UU +0A0F..0A10 ; L # Lo [2] GURMUKHI LETTER EE..GURMUKHI LETTER AI +0A13..0A28 ; L # Lo [22] GURMUKHI LETTER OO..GURMUKHI LETTER NA +0A2A..0A30 ; L # Lo [7] GURMUKHI LETTER PA..GURMUKHI LETTER RA +0A32..0A33 ; L # Lo [2] GURMUKHI LETTER LA..GURMUKHI LETTER LLA +0A35..0A36 ; L # Lo [2] GURMUKHI LETTER VA..GURMUKHI LETTER SHA +0A38..0A39 ; L # Lo [2] GURMUKHI LETTER SA..GURMUKHI LETTER HA +0A3E..0A40 ; L # Mc [3] GURMUKHI VOWEL SIGN AA..GURMUKHI VOWEL SIGN II +0A59..0A5C ; L # Lo [4] GURMUKHI 
LETTER KHHA..GURMUKHI LETTER RRA +0A5E ; L # Lo GURMUKHI LETTER FA +0A66..0A6F ; L # Nd [10] GURMUKHI DIGIT ZERO..GURMUKHI DIGIT NINE +0A72..0A74 ; L # Lo [3] GURMUKHI IRI..GURMUKHI EK ONKAR +0A76 ; L # Po GURMUKHI ABBREVIATION SIGN +0A83 ; L # Mc GUJARATI SIGN VISARGA +0A85..0A8D ; L # Lo [9] GUJARATI LETTER A..GUJARATI VOWEL CANDRA E +0A8F..0A91 ; L # Lo [3] GUJARATI LETTER E..GUJARATI VOWEL CANDRA O +0A93..0AA8 ; L # Lo [22] GUJARATI LETTER O..GUJARATI LETTER NA +0AAA..0AB0 ; L # Lo [7] GUJARATI LETTER PA..GUJARATI LETTER RA +0AB2..0AB3 ; L # Lo [2] GUJARATI LETTER LA..GUJARATI LETTER LLA +0AB5..0AB9 ; L # Lo [5] GUJARATI LETTER VA..GUJARATI LETTER HA +0ABD ; L # Lo GUJARATI SIGN AVAGRAHA +0ABE..0AC0 ; L # Mc [3] GUJARATI VOWEL SIGN AA..GUJARATI VOWEL SIGN II +0AC9 ; L # Mc GUJARATI VOWEL SIGN CANDRA O +0ACB..0ACC ; L # Mc [2] GUJARATI VOWEL SIGN O..GUJARATI VOWEL SIGN AU +0AD0 ; L # Lo GUJARATI OM +0AE0..0AE1 ; L # Lo [2] GUJARATI LETTER VOCALIC RR..GUJARATI LETTER VOCALIC LL +0AE6..0AEF ; L # Nd [10] GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE +0AF0 ; L # Po GUJARATI ABBREVIATION SIGN +0AF9 ; L # Lo GUJARATI LETTER ZHA +0B02..0B03 ; L # Mc [2] ORIYA SIGN ANUSVARA..ORIYA SIGN VISARGA +0B05..0B0C ; L # Lo [8] ORIYA LETTER A..ORIYA LETTER VOCALIC L +0B0F..0B10 ; L # Lo [2] ORIYA LETTER E..ORIYA LETTER AI +0B13..0B28 ; L # Lo [22] ORIYA LETTER O..ORIYA LETTER NA +0B2A..0B30 ; L # Lo [7] ORIYA LETTER PA..ORIYA LETTER RA +0B32..0B33 ; L # Lo [2] ORIYA LETTER LA..ORIYA LETTER LLA +0B35..0B39 ; L # Lo [5] ORIYA LETTER VA..ORIYA LETTER HA +0B3D ; L # Lo ORIYA SIGN AVAGRAHA +0B3E ; L # Mc ORIYA VOWEL SIGN AA +0B40 ; L # Mc ORIYA VOWEL SIGN II +0B47..0B48 ; L # Mc [2] ORIYA VOWEL SIGN E..ORIYA VOWEL SIGN AI +0B4B..0B4C ; L # Mc [2] ORIYA VOWEL SIGN O..ORIYA VOWEL SIGN AU +0B57 ; L # Mc ORIYA AU LENGTH MARK +0B5C..0B5D ; L # Lo [2] ORIYA LETTER RRA..ORIYA LETTER RHA +0B5F..0B61 ; L # Lo [3] ORIYA LETTER YYA..ORIYA LETTER VOCALIC LL +0B66..0B6F ; L # Nd [10] ORIYA DIGIT 
ZERO..ORIYA DIGIT NINE +0B70 ; L # So ORIYA ISSHAR +0B71 ; L # Lo ORIYA LETTER WA +0B72..0B77 ; L # No [6] ORIYA FRACTION ONE QUARTER..ORIYA FRACTION THREE SIXTEENTHS +0B83 ; L # Lo TAMIL SIGN VISARGA +0B85..0B8A ; L # Lo [6] TAMIL LETTER A..TAMIL LETTER UU +0B8E..0B90 ; L # Lo [3] TAMIL LETTER E..TAMIL LETTER AI +0B92..0B95 ; L # Lo [4] TAMIL LETTER O..TAMIL LETTER KA +0B99..0B9A ; L # Lo [2] TAMIL LETTER NGA..TAMIL LETTER CA +0B9C ; L # Lo TAMIL LETTER JA +0B9E..0B9F ; L # Lo [2] TAMIL LETTER NYA..TAMIL LETTER TTA +0BA3..0BA4 ; L # Lo [2] TAMIL LETTER NNA..TAMIL LETTER TA +0BA8..0BAA ; L # Lo [3] TAMIL LETTER NA..TAMIL LETTER PA +0BAE..0BB9 ; L # Lo [12] TAMIL LETTER MA..TAMIL LETTER HA +0BBE..0BBF ; L # Mc [2] TAMIL VOWEL SIGN AA..TAMIL VOWEL SIGN I +0BC1..0BC2 ; L # Mc [2] TAMIL VOWEL SIGN U..TAMIL VOWEL SIGN UU +0BC6..0BC8 ; L # Mc [3] TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI +0BCA..0BCC ; L # Mc [3] TAMIL VOWEL SIGN O..TAMIL VOWEL SIGN AU +0BD0 ; L # Lo TAMIL OM +0BD7 ; L # Mc TAMIL AU LENGTH MARK +0BE6..0BEF ; L # Nd [10] TAMIL DIGIT ZERO..TAMIL DIGIT NINE +0BF0..0BF2 ; L # No [3] TAMIL NUMBER TEN..TAMIL NUMBER ONE THOUSAND +0C01..0C03 ; L # Mc [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA +0C05..0C0C ; L # Lo [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L +0C0E..0C10 ; L # Lo [3] TELUGU LETTER E..TELUGU LETTER AI +0C12..0C28 ; L # Lo [23] TELUGU LETTER O..TELUGU LETTER NA +0C2A..0C39 ; L # Lo [16] TELUGU LETTER PA..TELUGU LETTER HA +0C3D ; L # Lo TELUGU SIGN AVAGRAHA +0C41..0C44 ; L # Mc [4] TELUGU VOWEL SIGN U..TELUGU VOWEL SIGN VOCALIC RR +0C58..0C5A ; L # Lo [3] TELUGU LETTER TSA..TELUGU LETTER RRRA +0C5D ; L # Lo TELUGU LETTER NAKAARA POLLU +0C60..0C61 ; L # Lo [2] TELUGU LETTER VOCALIC RR..TELUGU LETTER VOCALIC LL +0C66..0C6F ; L # Nd [10] TELUGU DIGIT ZERO..TELUGU DIGIT NINE +0C77 ; L # Po TELUGU SIGN SIDDHAM +0C7F ; L # So TELUGU SIGN TUUMU +0C80 ; L # Lo KANNADA SIGN SPACING CANDRABINDU +0C82..0C83 ; L # Mc [2] KANNADA SIGN ANUSVARA..KANNADA SIGN 
VISARGA +0C84 ; L # Po KANNADA SIGN SIDDHAM +0C85..0C8C ; L # Lo [8] KANNADA LETTER A..KANNADA LETTER VOCALIC L +0C8E..0C90 ; L # Lo [3] KANNADA LETTER E..KANNADA LETTER AI +0C92..0CA8 ; L # Lo [23] KANNADA LETTER O..KANNADA LETTER NA +0CAA..0CB3 ; L # Lo [10] KANNADA LETTER PA..KANNADA LETTER LLA +0CB5..0CB9 ; L # Lo [5] KANNADA LETTER VA..KANNADA LETTER HA +0CBD ; L # Lo KANNADA SIGN AVAGRAHA +0CBE ; L # Mc KANNADA VOWEL SIGN AA +0CBF ; L # Mn KANNADA VOWEL SIGN I +0CC0..0CC4 ; L # Mc [5] KANNADA VOWEL SIGN II..KANNADA VOWEL SIGN VOCALIC RR +0CC6 ; L # Mn KANNADA VOWEL SIGN E +0CC7..0CC8 ; L # Mc [2] KANNADA VOWEL SIGN EE..KANNADA VOWEL SIGN AI +0CCA..0CCB ; L # Mc [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO +0CD5..0CD6 ; L # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK +0CDD..0CDE ; L # Lo [2] KANNADA LETTER NAKAARA POLLU..KANNADA LETTER FA +0CE0..0CE1 ; L # Lo [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL +0CE6..0CEF ; L # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE +0CF1..0CF2 ; L # Lo [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA +0D02..0D03 ; L # Mc [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA +0D04..0D0C ; L # Lo [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L +0D0E..0D10 ; L # Lo [3] MALAYALAM LETTER E..MALAYALAM LETTER AI +0D12..0D3A ; L # Lo [41] MALAYALAM LETTER O..MALAYALAM LETTER TTTA +0D3D ; L # Lo MALAYALAM SIGN AVAGRAHA +0D3E..0D40 ; L # Mc [3] MALAYALAM VOWEL SIGN AA..MALAYALAM VOWEL SIGN II +0D46..0D48 ; L # Mc [3] MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI +0D4A..0D4C ; L # Mc [3] MALAYALAM VOWEL SIGN O..MALAYALAM VOWEL SIGN AU +0D4E ; L # Lo MALAYALAM LETTER DOT REPH +0D4F ; L # So MALAYALAM SIGN PARA +0D54..0D56 ; L # Lo [3] MALAYALAM LETTER CHILLU M..MALAYALAM LETTER CHILLU LLL +0D57 ; L # Mc MALAYALAM AU LENGTH MARK +0D58..0D5E ; L # No [7] MALAYALAM FRACTION ONE ONE-HUNDRED-AND-SIXTIETH..MALAYALAM FRACTION ONE FIFTH +0D5F..0D61 ; L # Lo [3] MALAYALAM LETTER ARCHAIC II..MALAYALAM 
LETTER VOCALIC LL +0D66..0D6F ; L # Nd [10] MALAYALAM DIGIT ZERO..MALAYALAM DIGIT NINE +0D70..0D78 ; L # No [9] MALAYALAM NUMBER TEN..MALAYALAM FRACTION THREE SIXTEENTHS +0D79 ; L # So MALAYALAM DATE MARK +0D7A..0D7F ; L # Lo [6] MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER CHILLU K +0D82..0D83 ; L # Mc [2] SINHALA SIGN ANUSVARAYA..SINHALA SIGN VISARGAYA +0D85..0D96 ; L # Lo [18] SINHALA LETTER AYANNA..SINHALA LETTER AUYANNA +0D9A..0DB1 ; L # Lo [24] SINHALA LETTER ALPAPRAANA KAYANNA..SINHALA LETTER DANTAJA NAYANNA +0DB3..0DBB ; L # Lo [9] SINHALA LETTER SANYAKA DAYANNA..SINHALA LETTER RAYANNA +0DBD ; L # Lo SINHALA LETTER DANTAJA LAYANNA +0DC0..0DC6 ; L # Lo [7] SINHALA LETTER VAYANNA..SINHALA LETTER FAYANNA +0DCF..0DD1 ; L # Mc [3] SINHALA VOWEL SIGN AELA-PILLA..SINHALA VOWEL SIGN DIGA AEDA-PILLA +0DD8..0DDF ; L # Mc [8] SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOWEL SIGN GAYANUKITTA +0DE6..0DEF ; L # Nd [10] SINHALA LITH DIGIT ZERO..SINHALA LITH DIGIT NINE +0DF2..0DF3 ; L # Mc [2] SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHALA VOWEL SIGN DIGA GAYANUKITTA +0DF4 ; L # Po SINHALA PUNCTUATION KUNDDALIYA +0E01..0E30 ; L # Lo [48] THAI CHARACTER KO KAI..THAI CHARACTER SARA A +0E32..0E33 ; L # Lo [2] THAI CHARACTER SARA AA..THAI CHARACTER SARA AM +0E40..0E45 ; L # Lo [6] THAI CHARACTER SARA E..THAI CHARACTER LAKKHANGYAO +0E46 ; L # Lm THAI CHARACTER MAIYAMOK +0E4F ; L # Po THAI CHARACTER FONGMAN +0E50..0E59 ; L # Nd [10] THAI DIGIT ZERO..THAI DIGIT NINE +0E5A..0E5B ; L # Po [2] THAI CHARACTER ANGKHANKHU..THAI CHARACTER KHOMUT +0E81..0E82 ; L # Lo [2] LAO LETTER KO..LAO LETTER KHO SUNG +0E84 ; L # Lo LAO LETTER KHO TAM +0E86..0E8A ; L # Lo [5] LAO LETTER PALI GHA..LAO LETTER SO TAM +0E8C..0EA3 ; L # Lo [24] LAO LETTER PALI JHA..LAO LETTER LO LING +0EA5 ; L # Lo LAO LETTER LO LOOT +0EA7..0EB0 ; L # Lo [10] LAO LETTER WO..LAO VOWEL SIGN A +0EB2..0EB3 ; L # Lo [2] LAO VOWEL SIGN AA..LAO VOWEL SIGN AM +0EBD ; L # Lo LAO SEMIVOWEL SIGN NYO +0EC0..0EC4 ; L # Lo [5] LAO VOWEL 
SIGN E..LAO VOWEL SIGN AI +0EC6 ; L # Lm LAO KO LA +0ED0..0ED9 ; L # Nd [10] LAO DIGIT ZERO..LAO DIGIT NINE +0EDC..0EDF ; L # Lo [4] LAO HO NO..LAO LETTER KHMU NYO +0F00 ; L # Lo TIBETAN SYLLABLE OM +0F01..0F03 ; L # So [3] TIBETAN MARK GTER YIG MGO TRUNCATED A..TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA +0F04..0F12 ; L # Po [15] TIBETAN MARK INITIAL YIG MGO MDUN MA..TIBETAN MARK RGYA GRAM SHAD +0F13 ; L # So TIBETAN MARK CARET -DZUD RTAGS ME LONG CAN +0F14 ; L # Po TIBETAN MARK GTER TSHEG +0F15..0F17 ; L # So [3] TIBETAN LOGOTYPE SIGN CHAD RTAGS..TIBETAN ASTROLOGICAL SIGN SGRA GCAN -CHAR RTAGS +0F1A..0F1F ; L # So [6] TIBETAN SIGN RDEL DKAR GCIG..TIBETAN SIGN RDEL DKAR RDEL NAG +0F20..0F29 ; L # Nd [10] TIBETAN DIGIT ZERO..TIBETAN DIGIT NINE +0F2A..0F33 ; L # No [10] TIBETAN DIGIT HALF ONE..TIBETAN DIGIT HALF ZERO +0F34 ; L # So TIBETAN MARK BSDUS RTAGS +0F36 ; L # So TIBETAN MARK CARET -DZUD RTAGS BZHI MIG CAN +0F38 ; L # So TIBETAN MARK CHE MGO +0F3E..0F3F ; L # Mc [2] TIBETAN SIGN YAR TSHES..TIBETAN SIGN MAR TSHES +0F40..0F47 ; L # Lo [8] TIBETAN LETTER KA..TIBETAN LETTER JA +0F49..0F6C ; L # Lo [36] TIBETAN LETTER NYA..TIBETAN LETTER RRA +0F7F ; L # Mc TIBETAN SIGN RNAM BCAD +0F85 ; L # Po TIBETAN MARK PALUTA +0F88..0F8C ; L # Lo [5] TIBETAN SIGN LCE TSA CAN..TIBETAN SIGN INVERTED MCHU CAN +0FBE..0FC5 ; L # So [8] TIBETAN KU RU KHA..TIBETAN SYMBOL RDO RJE +0FC7..0FCC ; L # So [6] TIBETAN SYMBOL RDO RJE RGYA GRAM..TIBETAN SYMBOL NOR BU BZHI -KHYIL +0FCE..0FCF ; L # So [2] TIBETAN SIGN RDEL NAG RDEL DKAR..TIBETAN SIGN RDEL NAG GSUM +0FD0..0FD4 ; L # Po [5] TIBETAN MARK BSKA- SHOG GI MGO RGYAN..TIBETAN MARK CLOSING BRDA RNYING YIG MGO SGAB MA +0FD5..0FD8 ; L # So [4] RIGHT-FACING SVASTI SIGN..LEFT-FACING SVASTI SIGN WITH DOTS +0FD9..0FDA ; L # Po [2] TIBETAN MARK LEADING MCHAN RTAGS..TIBETAN MARK TRAILING MCHAN RTAGS +1000..102A ; L # Lo [43] MYANMAR LETTER KA..MYANMAR LETTER AU +102B..102C ; L # Mc [2] MYANMAR VOWEL SIGN TALL AA..MYANMAR VOWEL SIGN AA +1031 ; L 
# Mc MYANMAR VOWEL SIGN E +1038 ; L # Mc MYANMAR SIGN VISARGA +103B..103C ; L # Mc [2] MYANMAR CONSONANT SIGN MEDIAL YA..MYANMAR CONSONANT SIGN MEDIAL RA +103F ; L # Lo MYANMAR LETTER GREAT SA +1040..1049 ; L # Nd [10] MYANMAR DIGIT ZERO..MYANMAR DIGIT NINE +104A..104F ; L # Po [6] MYANMAR SIGN LITTLE SECTION..MYANMAR SYMBOL GENITIVE +1050..1055 ; L # Lo [6] MYANMAR LETTER SHA..MYANMAR LETTER VOCALIC LL +1056..1057 ; L # Mc [2] MYANMAR VOWEL SIGN VOCALIC R..MYANMAR VOWEL SIGN VOCALIC RR +105A..105D ; L # Lo [4] MYANMAR LETTER MON NGA..MYANMAR LETTER MON BBE +1061 ; L # Lo MYANMAR LETTER SGAW KAREN SHA +1062..1064 ; L # Mc [3] MYANMAR VOWEL SIGN SGAW KAREN EU..MYANMAR TONE MARK SGAW KAREN KE PHO +1065..1066 ; L # Lo [2] MYANMAR LETTER WESTERN PWO KAREN THA..MYANMAR LETTER WESTERN PWO KAREN PWA +1067..106D ; L # Mc [7] MYANMAR VOWEL SIGN WESTERN PWO KAREN EU..MYANMAR SIGN WESTERN PWO KAREN TONE-5 +106E..1070 ; L # Lo [3] MYANMAR LETTER EASTERN PWO KAREN NNA..MYANMAR LETTER EASTERN PWO KAREN GHWA +1075..1081 ; L # Lo [13] MYANMAR LETTER SHAN KA..MYANMAR LETTER SHAN HA +1083..1084 ; L # Mc [2] MYANMAR VOWEL SIGN SHAN AA..MYANMAR VOWEL SIGN SHAN E +1087..108C ; L # Mc [6] MYANMAR SIGN SHAN TONE-2..MYANMAR SIGN SHAN COUNCIL TONE-3 +108E ; L # Lo MYANMAR LETTER RUMAI PALAUNG FA +108F ; L # Mc MYANMAR SIGN RUMAI PALAUNG TONE-5 +1090..1099 ; L # Nd [10] MYANMAR SHAN DIGIT ZERO..MYANMAR SHAN DIGIT NINE +109A..109C ; L # Mc [3] MYANMAR SIGN KHAMTI TONE-1..MYANMAR VOWEL SIGN AITON A +109E..109F ; L # So [2] MYANMAR SYMBOL SHAN ONE..MYANMAR SYMBOL SHAN EXCLAMATION +10A0..10C5 ; L # L& [38] GEORGIAN CAPITAL LETTER AN..GEORGIAN CAPITAL LETTER HOE +10C7 ; L # L& GEORGIAN CAPITAL LETTER YN +10CD ; L # L& GEORGIAN CAPITAL LETTER AEN +10D0..10FA ; L # L& [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN +10FB ; L # Po GEORGIAN PARAGRAPH SEPARATOR +10FC ; L # Lm MODIFIER LETTER GEORGIAN NAR +10FD..10FF ; L # L& [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN +1100..1248 ; L # Lo [329] 
HANGUL CHOSEONG KIYEOK..ETHIOPIC SYLLABLE QWA +124A..124D ; L # Lo [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE +1250..1256 ; L # Lo [7] ETHIOPIC SYLLABLE QHA..ETHIOPIC SYLLABLE QHO +1258 ; L # Lo ETHIOPIC SYLLABLE QHWA +125A..125D ; L # Lo [4] ETHIOPIC SYLLABLE QHWI..ETHIOPIC SYLLABLE QHWE +1260..1288 ; L # Lo [41] ETHIOPIC SYLLABLE BA..ETHIOPIC SYLLABLE XWA +128A..128D ; L # Lo [4] ETHIOPIC SYLLABLE XWI..ETHIOPIC SYLLABLE XWE +1290..12B0 ; L # Lo [33] ETHIOPIC SYLLABLE NA..ETHIOPIC SYLLABLE KWA +12B2..12B5 ; L # Lo [4] ETHIOPIC SYLLABLE KWI..ETHIOPIC SYLLABLE KWE +12B8..12BE ; L # Lo [7] ETHIOPIC SYLLABLE KXA..ETHIOPIC SYLLABLE KXO +12C0 ; L # Lo ETHIOPIC SYLLABLE KXWA +12C2..12C5 ; L # Lo [4] ETHIOPIC SYLLABLE KXWI..ETHIOPIC SYLLABLE KXWE +12C8..12D6 ; L # Lo [15] ETHIOPIC SYLLABLE WA..ETHIOPIC SYLLABLE PHARYNGEAL O +12D8..1310 ; L # Lo [57] ETHIOPIC SYLLABLE ZA..ETHIOPIC SYLLABLE GWA +1312..1315 ; L # Lo [4] ETHIOPIC SYLLABLE GWI..ETHIOPIC SYLLABLE GWE +1318..135A ; L # Lo [67] ETHIOPIC SYLLABLE GGA..ETHIOPIC SYLLABLE FYA +1360..1368 ; L # Po [9] ETHIOPIC SECTION MARK..ETHIOPIC PARAGRAPH SEPARATOR +1369..137C ; L # No [20] ETHIOPIC DIGIT ONE..ETHIOPIC NUMBER TEN THOUSAND +1380..138F ; L # Lo [16] ETHIOPIC SYLLABLE SEBATBEIT MWA..ETHIOPIC SYLLABLE PWE +13A0..13F5 ; L # L& [86] CHEROKEE LETTER A..CHEROKEE LETTER MV +13F8..13FD ; L # L& [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV +1401..166C ; L # Lo [620] CANADIAN SYLLABICS E..CANADIAN SYLLABICS CARRIER TTSA +166D ; L # So CANADIAN SYLLABICS CHI SIGN +166E ; L # Po CANADIAN SYLLABICS FULL STOP +166F..167F ; L # Lo [17] CANADIAN SYLLABICS QAI..CANADIAN SYLLABICS BLACKFOOT W +1681..169A ; L # Lo [26] OGHAM LETTER BEITH..OGHAM LETTER PEITH +16A0..16EA ; L # Lo [75] RUNIC LETTER FEHU FEOH FE F..RUNIC LETTER X +16EB..16ED ; L # Po [3] RUNIC SINGLE PUNCTUATION..RUNIC CROSS PUNCTUATION +16EE..16F0 ; L # Nl [3] RUNIC ARLAUG SYMBOL..RUNIC BELGTHOR SYMBOL +16F1..16F8 ; L # Lo [8] RUNIC LETTER K..RUNIC LETTER 
FRANKS CASKET AESC +1700..1711 ; L # Lo [18] TAGALOG LETTER A..TAGALOG LETTER HA +1715 ; L # Mc TAGALOG SIGN PAMUDPOD +171F..1731 ; L # Lo [19] TAGALOG LETTER ARCHAIC RA..HANUNOO LETTER HA +1734 ; L # Mc HANUNOO SIGN PAMUDPOD +1735..1736 ; L # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION +1740..1751 ; L # Lo [18] BUHID LETTER A..BUHID LETTER HA +1760..176C ; L # Lo [13] TAGBANWA LETTER A..TAGBANWA LETTER YA +176E..1770 ; L # Lo [3] TAGBANWA LETTER LA..TAGBANWA LETTER SA +1780..17B3 ; L # Lo [52] KHMER LETTER KA..KHMER INDEPENDENT VOWEL QAU +17B6 ; L # Mc KHMER VOWEL SIGN AA +17BE..17C5 ; L # Mc [8] KHMER VOWEL SIGN OE..KHMER VOWEL SIGN AU +17C7..17C8 ; L # Mc [2] KHMER SIGN REAHMUK..KHMER SIGN YUUKALEAPINTU +17D4..17D6 ; L # Po [3] KHMER SIGN KHAN..KHMER SIGN CAMNUC PII KUUH +17D7 ; L # Lm KHMER SIGN LEK TOO +17D8..17DA ; L # Po [3] KHMER SIGN BEYYAL..KHMER SIGN KOOMUUT +17DC ; L # Lo KHMER SIGN AVAKRAHASANYA +17E0..17E9 ; L # Nd [10] KHMER DIGIT ZERO..KHMER DIGIT NINE +1810..1819 ; L # Nd [10] MONGOLIAN DIGIT ZERO..MONGOLIAN DIGIT NINE +1820..1842 ; L # Lo [35] MONGOLIAN LETTER A..MONGOLIAN LETTER CHI +1843 ; L # Lm MONGOLIAN LETTER TODO LONG VOWEL SIGN +1844..1878 ; L # Lo [53] MONGOLIAN LETTER TODO E..MONGOLIAN LETTER CHA WITH TWO DOTS +1880..1884 ; L # Lo [5] MONGOLIAN LETTER ALI GALI ANUSVARA ONE..MONGOLIAN LETTER ALI GALI INVERTED UBADAMA +1887..18A8 ; L # Lo [34] MONGOLIAN LETTER ALI GALI A..MONGOLIAN LETTER MANCHU ALI GALI BHA +18AA ; L # Lo MONGOLIAN LETTER MANCHU ALI GALI LHA +18B0..18F5 ; L # Lo [70] CANADIAN SYLLABICS OY..CANADIAN SYLLABICS CARRIER DENTAL S +1900..191E ; L # Lo [31] LIMBU VOWEL-CARRIER LETTER..LIMBU LETTER TRA +1923..1926 ; L # Mc [4] LIMBU VOWEL SIGN EE..LIMBU VOWEL SIGN AU +1929..192B ; L # Mc [3] LIMBU SUBJOINED LETTER YA..LIMBU SUBJOINED LETTER WA +1930..1931 ; L # Mc [2] LIMBU SMALL LETTER KA..LIMBU SMALL LETTER NGA +1933..1938 ; L # Mc [6] LIMBU SMALL LETTER TA..LIMBU SMALL LETTER LA +1946..194F ; L # Nd 
[10] LIMBU DIGIT ZERO..LIMBU DIGIT NINE +1950..196D ; L # Lo [30] TAI LE LETTER KA..TAI LE LETTER AI +1970..1974 ; L # Lo [5] TAI LE LETTER TONE-2..TAI LE LETTER TONE-6 +1980..19AB ; L # Lo [44] NEW TAI LUE LETTER HIGH QA..NEW TAI LUE LETTER LOW SUA +19B0..19C9 ; L # Lo [26] NEW TAI LUE VOWEL SIGN VOWEL SHORTENER..NEW TAI LUE TONE MARK-2 +19D0..19D9 ; L # Nd [10] NEW TAI LUE DIGIT ZERO..NEW TAI LUE DIGIT NINE +19DA ; L # No NEW TAI LUE THAM DIGIT ONE +1A00..1A16 ; L # Lo [23] BUGINESE LETTER KA..BUGINESE LETTER HA +1A19..1A1A ; L # Mc [2] BUGINESE VOWEL SIGN E..BUGINESE VOWEL SIGN O +1A1E..1A1F ; L # Po [2] BUGINESE PALLAWA..BUGINESE END OF SECTION +1A20..1A54 ; L # Lo [53] TAI THAM LETTER HIGH KA..TAI THAM LETTER GREAT SA +1A55 ; L # Mc TAI THAM CONSONANT SIGN MEDIAL RA +1A57 ; L # Mc TAI THAM CONSONANT SIGN LA TANG LAI +1A61 ; L # Mc TAI THAM VOWEL SIGN A +1A63..1A64 ; L # Mc [2] TAI THAM VOWEL SIGN AA..TAI THAM VOWEL SIGN TALL AA +1A6D..1A72 ; L # Mc [6] TAI THAM VOWEL SIGN OY..TAI THAM VOWEL SIGN THAM AI +1A80..1A89 ; L # Nd [10] TAI THAM HORA DIGIT ZERO..TAI THAM HORA DIGIT NINE +1A90..1A99 ; L # Nd [10] TAI THAM THAM DIGIT ZERO..TAI THAM THAM DIGIT NINE +1AA0..1AA6 ; L # Po [7] TAI THAM SIGN WIANG..TAI THAM SIGN REVERSED ROTATED RANA +1AA7 ; L # Lm TAI THAM SIGN MAI YAMOK +1AA8..1AAD ; L # Po [6] TAI THAM SIGN KAAN..TAI THAM SIGN CAANG +1B04 ; L # Mc BALINESE SIGN BISAH +1B05..1B33 ; L # Lo [47] BALINESE LETTER AKARA..BALINESE LETTER HA +1B35 ; L # Mc BALINESE VOWEL SIGN TEDUNG +1B3B ; L # Mc BALINESE VOWEL SIGN RA REPA TEDUNG +1B3D..1B41 ; L # Mc [5] BALINESE VOWEL SIGN LA LENGA TEDUNG..BALINESE VOWEL SIGN TALING REPA TEDUNG +1B43..1B44 ; L # Mc [2] BALINESE VOWEL SIGN PEPET TEDUNG..BALINESE ADEG ADEG +1B45..1B4C ; L # Lo [8] BALINESE LETTER KAF SASAK..BALINESE LETTER ARCHAIC JNYA +1B50..1B59 ; L # Nd [10] BALINESE DIGIT ZERO..BALINESE DIGIT NINE +1B5A..1B60 ; L # Po [7] BALINESE PANTI..BALINESE PAMENENG +1B61..1B6A ; L # So [10] BALINESE MUSICAL SYMBOL 
DONG..BALINESE MUSICAL SYMBOL DANG GEDE +1B74..1B7C ; L # So [9] BALINESE MUSICAL SYMBOL RIGHT-HAND OPEN DUG..BALINESE MUSICAL SYMBOL LEFT-HAND OPEN PING +1B7D..1B7E ; L # Po [2] BALINESE PANTI LANTANG..BALINESE PAMADA LANTANG +1B82 ; L # Mc SUNDANESE SIGN PANGWISAD +1B83..1BA0 ; L # Lo [30] SUNDANESE LETTER A..SUNDANESE LETTER HA +1BA1 ; L # Mc SUNDANESE CONSONANT SIGN PAMINGKAL +1BA6..1BA7 ; L # Mc [2] SUNDANESE VOWEL SIGN PANAELAENG..SUNDANESE VOWEL SIGN PANOLONG +1BAA ; L # Mc SUNDANESE SIGN PAMAAEH +1BAE..1BAF ; L # Lo [2] SUNDANESE LETTER KHA..SUNDANESE LETTER SYA +1BB0..1BB9 ; L # Nd [10] SUNDANESE DIGIT ZERO..SUNDANESE DIGIT NINE +1BBA..1BE5 ; L # Lo [44] SUNDANESE AVAGRAHA..BATAK LETTER U +1BE7 ; L # Mc BATAK VOWEL SIGN E +1BEA..1BEC ; L # Mc [3] BATAK VOWEL SIGN I..BATAK VOWEL SIGN O +1BEE ; L # Mc BATAK VOWEL SIGN U +1BF2..1BF3 ; L # Mc [2] BATAK PANGOLAT..BATAK PANONGONAN +1BFC..1BFF ; L # Po [4] BATAK SYMBOL BINDU NA METEK..BATAK SYMBOL BINDU PANGOLAT +1C00..1C23 ; L # Lo [36] LEPCHA LETTER KA..LEPCHA LETTER A +1C24..1C2B ; L # Mc [8] LEPCHA SUBJOINED LETTER YA..LEPCHA VOWEL SIGN UU +1C34..1C35 ; L # Mc [2] LEPCHA CONSONANT SIGN NYIN-DO..LEPCHA CONSONANT SIGN KANG +1C3B..1C3F ; L # Po [5] LEPCHA PUNCTUATION TA-ROL..LEPCHA PUNCTUATION TSHOOK +1C40..1C49 ; L # Nd [10] LEPCHA DIGIT ZERO..LEPCHA DIGIT NINE +1C4D..1C4F ; L # Lo [3] LEPCHA LETTER TTA..LEPCHA LETTER DDA +1C50..1C59 ; L # Nd [10] OL CHIKI DIGIT ZERO..OL CHIKI DIGIT NINE +1C5A..1C77 ; L # Lo [30] OL CHIKI LETTER LA..OL CHIKI LETTER OH +1C78..1C7D ; L # Lm [6] OL CHIKI MU TTUDDAG..OL CHIKI AHAD +1C7E..1C7F ; L # Po [2] OL CHIKI PUNCTUATION MUCAAD..OL CHIKI PUNCTUATION DOUBLE MUCAAD +1C80..1C88 ; L # L& [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK +1C90..1CBA ; L # L& [43] GEORGIAN MTAVRULI CAPITAL LETTER AN..GEORGIAN MTAVRULI CAPITAL LETTER AIN +1CBD..1CBF ; L # L& [3] GEORGIAN MTAVRULI CAPITAL LETTER AEN..GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN +1CC0..1CC7 ; L # 
Po [8] SUNDANESE PUNCTUATION BINDU SURYA..SUNDANESE PUNCTUATION BINDU BA SATANGA +1CD3 ; L # Po VEDIC SIGN NIHSHVASA +1CE1 ; L # Mc VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA +1CE9..1CEC ; L # Lo [4] VEDIC SIGN ANUSVARA ANTARGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL +1CEE..1CF3 ; L # Lo [6] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ROTATED ARDHAVISARGA +1CF5..1CF6 ; L # Lo [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA +1CF7 ; L # Mc VEDIC SIGN ATIKRAMA +1CFA ; L # Lo VEDIC SIGN DOUBLE ANUSVARA ANTARGOMUKHA +1D00..1D2B ; L # L& [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL +1D2C..1D6A ; L # Lm [63] MODIFIER LETTER CAPITAL A..GREEK SUBSCRIPT SMALL LETTER CHI +1D6B..1D77 ; L # L& [13] LATIN SMALL LETTER UE..LATIN SMALL LETTER TURNED G +1D78 ; L # Lm MODIFIER LETTER CYRILLIC EN +1D79..1D9A ; L # L& [34] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK +1D9B..1DBF ; L # Lm [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA +1E00..1F15 ; L # L& [278] LATIN CAPITAL LETTER A WITH RING BELOW..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA +1F18..1F1D ; L # L& [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA +1F20..1F45 ; L # L& [38] GREEK SMALL LETTER ETA WITH PSILI..GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA +1F48..1F4D ; L # L& [6] GREEK CAPITAL LETTER OMICRON WITH PSILI..GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA +1F50..1F57 ; L # L& [8] GREEK SMALL LETTER UPSILON WITH PSILI..GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI +1F59 ; L # L& GREEK CAPITAL LETTER UPSILON WITH DASIA +1F5B ; L # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA +1F5D ; L # L& GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA +1F5F..1F7D ; L # L& [31] GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI..GREEK SMALL LETTER OMEGA WITH OXIA +1F80..1FB4 ; L # L& [53] GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI..GREEK SMALL LETTER ALPHA WITH 
OXIA AND YPOGEGRAMMENI +1FB6..1FBC ; L # L& [7] GREEK SMALL LETTER ALPHA WITH PERISPOMENI..GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI +1FBE ; L # L& GREEK PROSGEGRAMMENI +1FC2..1FC4 ; L # L& [3] GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI +1FC6..1FCC ; L # L& [7] GREEK SMALL LETTER ETA WITH PERISPOMENI..GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI +1FD0..1FD3 ; L # L& [4] GREEK SMALL LETTER IOTA WITH VRACHY..GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA +1FD6..1FDB ; L # L& [6] GREEK SMALL LETTER IOTA WITH PERISPOMENI..GREEK CAPITAL LETTER IOTA WITH OXIA +1FE0..1FEC ; L # L& [13] GREEK SMALL LETTER UPSILON WITH VRACHY..GREEK CAPITAL LETTER RHO WITH DASIA +1FF2..1FF4 ; L # L& [3] GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI..GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI +1FF6..1FFC ; L # L& [7] GREEK SMALL LETTER OMEGA WITH PERISPOMENI..GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI +200E ; L # Cf LEFT-TO-RIGHT MARK +2071 ; L # Lm SUPERSCRIPT LATIN SMALL LETTER I +207F ; L # Lm SUPERSCRIPT LATIN SMALL LETTER N +2090..209C ; L # Lm [13] LATIN SUBSCRIPT SMALL LETTER A..LATIN SUBSCRIPT SMALL LETTER T +2102 ; L # L& DOUBLE-STRUCK CAPITAL C +2107 ; L # L& EULER CONSTANT +210A..2113 ; L # L& [10] SCRIPT SMALL G..SCRIPT SMALL L +2115 ; L # L& DOUBLE-STRUCK CAPITAL N +2119..211D ; L # L& [5] DOUBLE-STRUCK CAPITAL P..DOUBLE-STRUCK CAPITAL R +2124 ; L # L& DOUBLE-STRUCK CAPITAL Z +2126 ; L # L& OHM SIGN +2128 ; L # L& BLACK-LETTER CAPITAL Z +212A..212D ; L # L& [4] KELVIN SIGN..BLACK-LETTER CAPITAL C +212F..2134 ; L # L& [6] SCRIPT SMALL E..SCRIPT SMALL O +2135..2138 ; L # Lo [4] ALEF SYMBOL..DALET SYMBOL +2139 ; L # L& INFORMATION SOURCE +213C..213F ; L # L& [4] DOUBLE-STRUCK SMALL PI..DOUBLE-STRUCK CAPITAL PI +2145..2149 ; L # L& [5] DOUBLE-STRUCK ITALIC CAPITAL D..DOUBLE-STRUCK ITALIC SMALL J +214E ; L # L& TURNED SMALL F +214F ; L # So SYMBOL FOR SAMARITAN SOURCE +2160..2182 ; L # Nl [35] 
ROMAN NUMERAL ONE..ROMAN NUMERAL TEN THOUSAND +2183..2184 ; L # L& [2] ROMAN NUMERAL REVERSED ONE HUNDRED..LATIN SMALL LETTER REVERSED C +2185..2188 ; L # Nl [4] ROMAN NUMERAL SIX LATE FORM..ROMAN NUMERAL ONE HUNDRED THOUSAND +2336..237A ; L # So [69] APL FUNCTIONAL SYMBOL I-BEAM..APL FUNCTIONAL SYMBOL ALPHA +2395 ; L # So APL FUNCTIONAL SYMBOL QUAD +249C..24E9 ; L # So [78] PARENTHESIZED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z +26AC ; L # So MEDIUM SMALL WHITE CIRCLE +2800..28FF ; L # So [256] BRAILLE PATTERN BLANK..BRAILLE PATTERN DOTS-12345678 +2C00..2C7B ; L # L& [124] GLAGOLITIC CAPITAL LETTER AZU..LATIN LETTER SMALL CAPITAL TURNED E +2C7C..2C7D ; L # Lm [2] LATIN SUBSCRIPT SMALL LETTER J..MODIFIER LETTER CAPITAL V +2C7E..2CE4 ; L # L& [103] LATIN CAPITAL LETTER S WITH SWASH TAIL..COPTIC SYMBOL KAI +2CEB..2CEE ; L # L& [4] COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI..COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA +2CF2..2CF3 ; L # L& [2] COPTIC CAPITAL LETTER BOHAIRIC KHEI..COPTIC SMALL LETTER BOHAIRIC KHEI +2D00..2D25 ; L # L& [38] GEORGIAN SMALL LETTER AN..GEORGIAN SMALL LETTER HOE +2D27 ; L # L& GEORGIAN SMALL LETTER YN +2D2D ; L # L& GEORGIAN SMALL LETTER AEN +2D30..2D67 ; L # Lo [56] TIFINAGH LETTER YA..TIFINAGH LETTER YO +2D6F ; L # Lm TIFINAGH MODIFIER LETTER LABIALIZATION MARK +2D70 ; L # Po TIFINAGH SEPARATOR MARK +2D80..2D96 ; L # Lo [23] ETHIOPIC SYLLABLE LOA..ETHIOPIC SYLLABLE GGWE +2DA0..2DA6 ; L # Lo [7] ETHIOPIC SYLLABLE SSA..ETHIOPIC SYLLABLE SSO +2DA8..2DAE ; L # Lo [7] ETHIOPIC SYLLABLE CCA..ETHIOPIC SYLLABLE CCO +2DB0..2DB6 ; L # Lo [7] ETHIOPIC SYLLABLE ZZA..ETHIOPIC SYLLABLE ZZO +2DB8..2DBE ; L # Lo [7] ETHIOPIC SYLLABLE CCHA..ETHIOPIC SYLLABLE CCHO +2DC0..2DC6 ; L # Lo [7] ETHIOPIC SYLLABLE QYA..ETHIOPIC SYLLABLE QYO +2DC8..2DCE ; L # Lo [7] ETHIOPIC SYLLABLE KYA..ETHIOPIC SYLLABLE KYO +2DD0..2DD6 ; L # Lo [7] ETHIOPIC SYLLABLE XYA..ETHIOPIC SYLLABLE XYO +2DD8..2DDE ; L # Lo [7] ETHIOPIC SYLLABLE GYA..ETHIOPIC SYLLABLE GYO +3005 ; L # 
Lm IDEOGRAPHIC ITERATION MARK +3006 ; L # Lo IDEOGRAPHIC CLOSING MARK +3007 ; L # Nl IDEOGRAPHIC NUMBER ZERO +3021..3029 ; L # Nl [9] HANGZHOU NUMERAL ONE..HANGZHOU NUMERAL NINE +302E..302F ; L # Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK +3031..3035 ; L # Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF +3038..303A ; L # Nl [3] HANGZHOU NUMERAL TEN..HANGZHOU NUMERAL THIRTY +303B ; L # Lm VERTICAL IDEOGRAPHIC ITERATION MARK +303C ; L # Lo MASU MARK +3041..3096 ; L # Lo [86] HIRAGANA LETTER SMALL A..HIRAGANA LETTER SMALL KE +309D..309E ; L # Lm [2] HIRAGANA ITERATION MARK..HIRAGANA VOICED ITERATION MARK +309F ; L # Lo HIRAGANA DIGRAPH YORI +30A1..30FA ; L # Lo [90] KATAKANA LETTER SMALL A..KATAKANA LETTER VO +30FC..30FE ; L # Lm [3] KATAKANA-HIRAGANA PROLONGED SOUND MARK..KATAKANA VOICED ITERATION MARK +30FF ; L # Lo KATAKANA DIGRAPH KOTO +3105..312F ; L # Lo [43] BOPOMOFO LETTER B..BOPOMOFO LETTER NN +3131..318E ; L # Lo [94] HANGUL LETTER KIYEOK..HANGUL LETTER ARAEAE +3190..3191 ; L # So [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK +3192..3195 ; L # No [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK +3196..319F ; L # So [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK +31A0..31BF ; L # Lo [32] BOPOMOFO LETTER BU..BOPOMOFO LETTER AH +31F0..31FF ; L # Lo [16] KATAKANA LETTER SMALL KU..KATAKANA LETTER SMALL RO +3200..321C ; L # So [29] PARENTHESIZED HANGUL KIYEOK..PARENTHESIZED HANGUL CIEUC U +3220..3229 ; L # No [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN +322A..3247 ; L # So [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO +3248..324F ; L # No [8] CIRCLED NUMBER TEN ON BLACK SQUARE..CIRCLED NUMBER EIGHTY ON BLACK SQUARE +3260..327B ; L # So [28] CIRCLED HANGUL KIYEOK..CIRCLED HANGUL HIEUH A +327F ; L # So KOREAN STANDARD SYMBOL +3280..3289 ; L # No [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN +328A..32B0 ; L # So [39] 
CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT +32C0..32CB ; L # So [12] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DECEMBER +32D0..3376 ; L # So [167] CIRCLED KATAKANA A..SQUARE PC +337B..33DD ; L # So [99] SQUARE ERA NAME HEISEI..SQUARE WB +33E0..33FE ; L # So [31] IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY ONE..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE +3400..4DBF ; L # Lo [6592] CJK UNIFIED IDEOGRAPH-3400..CJK UNIFIED IDEOGRAPH-4DBF +4E00..A014 ; L # Lo [21013] CJK UNIFIED IDEOGRAPH-4E00..YI SYLLABLE E +A015 ; L # Lm YI SYLLABLE WU +A016..A48C ; L # Lo [1143] YI SYLLABLE BIT..YI SYLLABLE YYR +A4D0..A4F7 ; L # Lo [40] LISU LETTER BA..LISU LETTER OE +A4F8..A4FD ; L # Lm [6] LISU LETTER TONE MYA TI..LISU LETTER TONE MYA JEU +A4FE..A4FF ; L # Po [2] LISU PUNCTUATION COMMA..LISU PUNCTUATION FULL STOP +A500..A60B ; L # Lo [268] VAI SYLLABLE EE..VAI SYLLABLE NG +A60C ; L # Lm VAI SYLLABLE LENGTHENER +A610..A61F ; L # Lo [16] VAI SYLLABLE NDOLE FA..VAI SYMBOL JONG +A620..A629 ; L # Nd [10] VAI DIGIT ZERO..VAI DIGIT NINE +A62A..A62B ; L # Lo [2] VAI SYLLABLE NDOLE MA..VAI SYLLABLE NDOLE DO +A640..A66D ; L # L& [46] CYRILLIC CAPITAL LETTER ZEMLYA..CYRILLIC SMALL LETTER DOUBLE MONOCULAR O +A66E ; L # Lo CYRILLIC LETTER MULTIOCULAR O +A680..A69B ; L # L& [28] CYRILLIC CAPITAL LETTER DWE..CYRILLIC SMALL LETTER CROSSED O +A69C..A69D ; L # Lm [2] MODIFIER LETTER CYRILLIC HARD SIGN..MODIFIER LETTER CYRILLIC SOFT SIGN +A6A0..A6E5 ; L # Lo [70] BAMUM LETTER A..BAMUM LETTER KI +A6E6..A6EF ; L # Nl [10] BAMUM LETTER MO..BAMUM LETTER KOGHOM +A6F2..A6F7 ; L # Po [6] BAMUM NJAEMLI..BAMUM QUESTION MARK +A722..A76F ; L # L& [78] LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF..LATIN SMALL LETTER CON +A770 ; L # Lm MODIFIER LETTER US +A771..A787 ; L # L& [23] LATIN SMALL LETTER DUM..LATIN SMALL LETTER INSULAR T +A789..A78A ; L # Sk [2] MODIFIER LETTER COLON..MODIFIER LETTER SHORT EQUALS SIGN +A78B..A78E ; L # L& [4] LATIN CAPITAL LETTER SALTILLO..LATIN SMALL 
LETTER L WITH RETROFLEX HOOK AND BELT +A78F ; L # Lo LATIN LETTER SINOLOGICAL DOT +A790..A7CA ; L # L& [59] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN SMALL LETTER S WITH SHORT STROKE OVERLAY +A7D0..A7D1 ; L # L& [2] LATIN CAPITAL LETTER CLOSED INSULAR G..LATIN SMALL LETTER CLOSED INSULAR G +A7D3 ; L # L& LATIN SMALL LETTER DOUBLE THORN +A7D5..A7D9 ; L # L& [5] LATIN SMALL LETTER DOUBLE WYNN..LATIN SMALL LETTER SIGMOID S +A7F2..A7F4 ; L # Lm [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q +A7F5..A7F6 ; L # L& [2] LATIN CAPITAL LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H +A7F7 ; L # Lo LATIN EPIGRAPHIC LETTER SIDEWAYS I +A7F8..A7F9 ; L # Lm [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE +A7FA ; L # L& LATIN LETTER SMALL CAPITAL TURNED M +A7FB..A801 ; L # Lo [7] LATIN EPIGRAPHIC LETTER REVERSED F..SYLOTI NAGRI LETTER I +A803..A805 ; L # Lo [3] SYLOTI NAGRI LETTER U..SYLOTI NAGRI LETTER O +A807..A80A ; L # Lo [4] SYLOTI NAGRI LETTER KO..SYLOTI NAGRI LETTER GHO +A80C..A822 ; L # Lo [23] SYLOTI NAGRI LETTER CO..SYLOTI NAGRI LETTER HO +A823..A824 ; L # Mc [2] SYLOTI NAGRI VOWEL SIGN A..SYLOTI NAGRI VOWEL SIGN I +A827 ; L # Mc SYLOTI NAGRI VOWEL SIGN OO +A830..A835 ; L # No [6] NORTH INDIC FRACTION ONE QUARTER..NORTH INDIC FRACTION THREE SIXTEENTHS +A836..A837 ; L # So [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK +A840..A873 ; L # Lo [52] PHAGS-PA LETTER KA..PHAGS-PA LETTER CANDRABINDU +A880..A881 ; L # Mc [2] SAURASHTRA SIGN ANUSVARA..SAURASHTRA SIGN VISARGA +A882..A8B3 ; L # Lo [50] SAURASHTRA LETTER A..SAURASHTRA LETTER LLA +A8B4..A8C3 ; L # Mc [16] SAURASHTRA CONSONANT SIGN HAARU..SAURASHTRA VOWEL SIGN AU +A8CE..A8CF ; L # Po [2] SAURASHTRA DANDA..SAURASHTRA DOUBLE DANDA +A8D0..A8D9 ; L # Nd [10] SAURASHTRA DIGIT ZERO..SAURASHTRA DIGIT NINE +A8F2..A8F7 ; L # Lo [6] DEVANAGARI SIGN SPACING CANDRABINDU..DEVANAGARI SIGN CANDRABINDU AVAGRAHA +A8F8..A8FA ; L # Po [3] DEVANAGARI SIGN PUSHPIKA..DEVANAGARI 
CARET +A8FB ; L # Lo DEVANAGARI HEADSTROKE +A8FC ; L # Po DEVANAGARI SIGN SIDDHAM +A8FD..A8FE ; L # Lo [2] DEVANAGARI JAIN OM..DEVANAGARI LETTER AY +A900..A909 ; L # Nd [10] KAYAH LI DIGIT ZERO..KAYAH LI DIGIT NINE +A90A..A925 ; L # Lo [28] KAYAH LI LETTER KA..KAYAH LI LETTER OO +A92E..A92F ; L # Po [2] KAYAH LI SIGN CWI..KAYAH LI SIGN SHYA +A930..A946 ; L # Lo [23] REJANG LETTER KA..REJANG LETTER A +A952..A953 ; L # Mc [2] REJANG CONSONANT SIGN H..REJANG VIRAMA +A95F ; L # Po REJANG SECTION MARK +A960..A97C ; L # Lo [29] HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH +A983 ; L # Mc JAVANESE SIGN WIGNYAN +A984..A9B2 ; L # Lo [47] JAVANESE LETTER A..JAVANESE LETTER HA +A9B4..A9B5 ; L # Mc [2] JAVANESE VOWEL SIGN TARUNG..JAVANESE VOWEL SIGN TOLONG +A9BA..A9BB ; L # Mc [2] JAVANESE VOWEL SIGN TALING..JAVANESE VOWEL SIGN DIRGA MURE +A9BE..A9C0 ; L # Mc [3] JAVANESE CONSONANT SIGN PENGKAL..JAVANESE PANGKON +A9C1..A9CD ; L # Po [13] JAVANESE LEFT RERENGGAN..JAVANESE TURNED PADA PISELEH +A9CF ; L # Lm JAVANESE PANGRANGKEP +A9D0..A9D9 ; L # Nd [10] JAVANESE DIGIT ZERO..JAVANESE DIGIT NINE +A9DE..A9DF ; L # Po [2] JAVANESE PADA TIRTA TUMETES..JAVANESE PADA ISEN-ISEN +A9E0..A9E4 ; L # Lo [5] MYANMAR LETTER SHAN GHA..MYANMAR LETTER SHAN BHA +A9E6 ; L # Lm MYANMAR MODIFIER LETTER SHAN REDUPLICATION +A9E7..A9EF ; L # Lo [9] MYANMAR LETTER TAI LAING NYA..MYANMAR LETTER TAI LAING NNA +A9F0..A9F9 ; L # Nd [10] MYANMAR TAI LAING DIGIT ZERO..MYANMAR TAI LAING DIGIT NINE +A9FA..A9FE ; L # Lo [5] MYANMAR LETTER TAI LAING LLA..MYANMAR LETTER TAI LAING BHA +AA00..AA28 ; L # Lo [41] CHAM LETTER A..CHAM LETTER HA +AA2F..AA30 ; L # Mc [2] CHAM VOWEL SIGN O..CHAM VOWEL SIGN AI +AA33..AA34 ; L # Mc [2] CHAM CONSONANT SIGN YA..CHAM CONSONANT SIGN RA +AA40..AA42 ; L # Lo [3] CHAM LETTER FINAL K..CHAM LETTER FINAL NG +AA44..AA4B ; L # Lo [8] CHAM LETTER FINAL CH..CHAM LETTER FINAL SS +AA4D ; L # Mc CHAM CONSONANT SIGN FINAL H +AA50..AA59 ; L # Nd [10] CHAM DIGIT ZERO..CHAM DIGIT 
NINE +AA5C..AA5F ; L # Po [4] CHAM PUNCTUATION SPIRAL..CHAM PUNCTUATION TRIPLE DANDA +AA60..AA6F ; L # Lo [16] MYANMAR LETTER KHAMTI GA..MYANMAR LETTER KHAMTI FA +AA70 ; L # Lm MYANMAR MODIFIER LETTER KHAMTI REDUPLICATION +AA71..AA76 ; L # Lo [6] MYANMAR LETTER KHAMTI XA..MYANMAR LOGOGRAM KHAMTI HM +AA77..AA79 ; L # So [3] MYANMAR SYMBOL AITON EXCLAMATION..MYANMAR SYMBOL AITON TWO +AA7A ; L # Lo MYANMAR LETTER AITON RA +AA7B ; L # Mc MYANMAR SIGN PAO KAREN TONE +AA7D ; L # Mc MYANMAR SIGN TAI LAING TONE-5 +AA7E..AAAF ; L # Lo [50] MYANMAR LETTER SHWE PALAUNG CHA..TAI VIET LETTER HIGH O +AAB1 ; L # Lo TAI VIET VOWEL AA +AAB5..AAB6 ; L # Lo [2] TAI VIET VOWEL E..TAI VIET VOWEL O +AAB9..AABD ; L # Lo [5] TAI VIET VOWEL UEA..TAI VIET VOWEL AN +AAC0 ; L # Lo TAI VIET TONE MAI NUENG +AAC2 ; L # Lo TAI VIET TONE MAI SONG +AADB..AADC ; L # Lo [2] TAI VIET SYMBOL KON..TAI VIET SYMBOL NUENG +AADD ; L # Lm TAI VIET SYMBOL SAM +AADE..AADF ; L # Po [2] TAI VIET SYMBOL HO HOI..TAI VIET SYMBOL KOI KOI +AAE0..AAEA ; L # Lo [11] MEETEI MAYEK LETTER E..MEETEI MAYEK LETTER SSA +AAEB ; L # Mc MEETEI MAYEK VOWEL SIGN II +AAEE..AAEF ; L # Mc [2] MEETEI MAYEK VOWEL SIGN AU..MEETEI MAYEK VOWEL SIGN AAU +AAF0..AAF1 ; L # Po [2] MEETEI MAYEK CHEIKHAN..MEETEI MAYEK AHANG KHUDAM +AAF2 ; L # Lo MEETEI MAYEK ANJI +AAF3..AAF4 ; L # Lm [2] MEETEI MAYEK SYLLABLE REPETITION MARK..MEETEI MAYEK WORD REPETITION MARK +AAF5 ; L # Mc MEETEI MAYEK VOWEL SIGN VISARGA +AB01..AB06 ; L # Lo [6] ETHIOPIC SYLLABLE TTHU..ETHIOPIC SYLLABLE TTHO +AB09..AB0E ; L # Lo [6] ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DDHO +AB11..AB16 ; L # Lo [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO +AB20..AB26 ; L # Lo [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO +AB28..AB2E ; L # Lo [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO +AB30..AB5A ; L # L& [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG +AB5B ; L # Sk MODIFIER BREVE WITH INVERTED BREVE +AB5C..AB5F ; L # Lm [4] MODIFIER LETTER 
SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK +AB60..AB68 ; L # L& [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE +AB69 ; L # Lm MODIFIER LETTER SMALL TURNED W +AB70..ABBF ; L # L& [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA +ABC0..ABE2 ; L # Lo [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM +ABE3..ABE4 ; L # Mc [2] MEETEI MAYEK VOWEL SIGN ONAP..MEETEI MAYEK VOWEL SIGN INAP +ABE6..ABE7 ; L # Mc [2] MEETEI MAYEK VOWEL SIGN YENAP..MEETEI MAYEK VOWEL SIGN SOUNAP +ABE9..ABEA ; L # Mc [2] MEETEI MAYEK VOWEL SIGN CHEINAP..MEETEI MAYEK VOWEL SIGN NUNG +ABEB ; L # Po MEETEI MAYEK CHEIKHEI +ABEC ; L # Mc MEETEI MAYEK LUM IYEK +ABF0..ABF9 ; L # Nd [10] MEETEI MAYEK DIGIT ZERO..MEETEI MAYEK DIGIT NINE +AC00..D7A3 ; L # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH +D7B0..D7C6 ; L # Lo [23] HANGUL JUNGSEONG O-YEO..HANGUL JUNGSEONG ARAEA-E +D7CB..D7FB ; L # Lo [49] HANGUL JONGSEONG NIEUN-RIEUL..HANGUL JONGSEONG PHIEUPH-THIEUTH +E000..F8FF ; L # Co [6400] .. 
+F900..FA6D ; L # Lo [366] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA6D +FA70..FAD9 ; L # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 +FB00..FB06 ; L # L& [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST +FB13..FB17 ; L # L& [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH +FF21..FF3A ; L # L& [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER Z +FF41..FF5A ; L # L& [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER Z +FF66..FF6F ; L # Lo [10] HALFWIDTH KATAKANA LETTER WO..HALFWIDTH KATAKANA LETTER SMALL TU +FF70 ; L # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK +FF71..FF9D ; L # Lo [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAKANA LETTER N +FF9E..FF9F ; L # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK +FFA0..FFBE ; L # Lo [31] HALFWIDTH HANGUL FILLER..HALFWIDTH HANGUL LETTER HIEUH +FFC2..FFC7 ; L # Lo [6] HALFWIDTH HANGUL LETTER A..HALFWIDTH HANGUL LETTER E +FFCA..FFCF ; L # Lo [6] HALFWIDTH HANGUL LETTER YEO..HALFWIDTH HANGUL LETTER OE +FFD2..FFD7 ; L # Lo [6] HALFWIDTH HANGUL LETTER YO..HALFWIDTH HANGUL LETTER YU +FFDA..FFDC ; L # Lo [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL LETTER I +10000..1000B ; L # Lo [12] LINEAR B SYLLABLE B008 A..LINEAR B SYLLABLE B046 JE +1000D..10026 ; L # Lo [26] LINEAR B SYLLABLE B036 JO..LINEAR B SYLLABLE B032 QO +10028..1003A ; L # Lo [19] LINEAR B SYLLABLE B060 RA..LINEAR B SYLLABLE B042 WO +1003C..1003D ; L # Lo [2] LINEAR B SYLLABLE B017 ZA..LINEAR B SYLLABLE B074 ZE +1003F..1004D ; L # Lo [15] LINEAR B SYLLABLE B020 ZO..LINEAR B SYLLABLE B091 TWO +10050..1005D ; L # Lo [14] LINEAR B SYMBOL B018..LINEAR B SYMBOL B089 +10080..100FA ; L # Lo [123] LINEAR B IDEOGRAM B100 MAN..LINEAR B IDEOGRAM VESSEL B305 +10100 ; L # Po AEGEAN WORD SEPARATOR LINE +10102 ; L # Po AEGEAN CHECK MARK +10107..10133 ; L # No [45] AEGEAN NUMBER ONE..AEGEAN NUMBER NINETY THOUSAND 
+10137..1013F ; L # So [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT +1018D..1018E ; L # So [2] GREEK INDICTION SIGN..NOMISMA SIGN +101D0..101FC ; L # So [45] PHAISTOS DISC SIGN PEDESTRIAN..PHAISTOS DISC SIGN WAVY BAND +10280..1029C ; L # Lo [29] LYCIAN LETTER A..LYCIAN LETTER X +102A0..102D0 ; L # Lo [49] CARIAN LETTER A..CARIAN LETTER UUU3 +10300..1031F ; L # Lo [32] OLD ITALIC LETTER A..OLD ITALIC LETTER ESS +10320..10323 ; L # No [4] OLD ITALIC NUMERAL ONE..OLD ITALIC NUMERAL FIFTY +1032D..10340 ; L # Lo [20] OLD ITALIC LETTER YE..GOTHIC LETTER PAIRTHRA +10341 ; L # Nl GOTHIC LETTER NINETY +10342..10349 ; L # Lo [8] GOTHIC LETTER RAIDA..GOTHIC LETTER OTHAL +1034A ; L # Nl GOTHIC LETTER NINE HUNDRED +10350..10375 ; L # Lo [38] OLD PERMIC LETTER AN..OLD PERMIC LETTER IA +10380..1039D ; L # Lo [30] UGARITIC LETTER ALPA..UGARITIC LETTER SSU +1039F ; L # Po UGARITIC WORD DIVIDER +103A0..103C3 ; L # Lo [36] OLD PERSIAN SIGN A..OLD PERSIAN SIGN HA +103C8..103CF ; L # Lo [8] OLD PERSIAN SIGN AURAMAZDAA..OLD PERSIAN SIGN BUUMISH +103D0 ; L # Po OLD PERSIAN WORD DIVIDER +103D1..103D5 ; L # Nl [5] OLD PERSIAN NUMBER ONE..OLD PERSIAN NUMBER HUNDRED +10400..1044F ; L # L& [80] DESERET CAPITAL LETTER LONG I..DESERET SMALL LETTER EW +10450..1049D ; L # Lo [78] SHAVIAN LETTER PEEP..OSMANYA LETTER OO +104A0..104A9 ; L # Nd [10] OSMANYA DIGIT ZERO..OSMANYA DIGIT NINE +104B0..104D3 ; L # L& [36] OSAGE CAPITAL LETTER A..OSAGE CAPITAL LETTER ZHA +104D8..104FB ; L # L& [36] OSAGE SMALL LETTER A..OSAGE SMALL LETTER ZHA +10500..10527 ; L # Lo [40] ELBASAN LETTER A..ELBASAN LETTER KHE +10530..10563 ; L # Lo [52] CAUCASIAN ALBANIAN LETTER ALT..CAUCASIAN ALBANIAN LETTER KIW +1056F ; L # Po CAUCASIAN ALBANIAN CITATION MARK +10570..1057A ; L # L& [11] VITHKUQI CAPITAL LETTER A..VITHKUQI CAPITAL LETTER GA +1057C..1058A ; L # L& [15] VITHKUQI CAPITAL LETTER HA..VITHKUQI CAPITAL LETTER RE +1058C..10592 ; L # L& [7] VITHKUQI CAPITAL LETTER SE..VITHKUQI CAPITAL LETTER XE 
+10594..10595 ; L # L& [2] VITHKUQI CAPITAL LETTER Y..VITHKUQI CAPITAL LETTER ZE +10597..105A1 ; L # L& [11] VITHKUQI SMALL LETTER A..VITHKUQI SMALL LETTER GA +105A3..105B1 ; L # L& [15] VITHKUQI SMALL LETTER HA..VITHKUQI SMALL LETTER RE +105B3..105B9 ; L # L& [7] VITHKUQI SMALL LETTER SE..VITHKUQI SMALL LETTER XE +105BB..105BC ; L # L& [2] VITHKUQI SMALL LETTER Y..VITHKUQI SMALL LETTER ZE +10600..10736 ; L # Lo [311] LINEAR A SIGN AB001..LINEAR A SIGN A664 +10740..10755 ; L # Lo [22] LINEAR A SIGN A701 A..LINEAR A SIGN A732 JE +10760..10767 ; L # Lo [8] LINEAR A SIGN A800..LINEAR A SIGN A807 +10780..10785 ; L # Lm [6] MODIFIER LETTER SMALL CAPITAL AA..MODIFIER LETTER SMALL B WITH HOOK +10787..107B0 ; L # Lm [42] MODIFIER LETTER SMALL DZ DIGRAPH..MODIFIER LETTER SMALL V WITH RIGHT HOOK +107B2..107BA ; L # Lm [9] MODIFIER LETTER SMALL CAPITAL Y..MODIFIER LETTER SMALL S WITH CURL +11000 ; L # Mc BRAHMI SIGN CANDRABINDU +11002 ; L # Mc BRAHMI SIGN VISARGA +11003..11037 ; L # Lo [53] BRAHMI SIGN JIHVAMULIYA..BRAHMI LETTER OLD TAMIL NNNA +11047..1104D ; L # Po [7] BRAHMI DANDA..BRAHMI PUNCTUATION LOTUS +11066..1106F ; L # Nd [10] BRAHMI DIGIT ZERO..BRAHMI DIGIT NINE +11071..11072 ; L # Lo [2] BRAHMI LETTER OLD TAMIL SHORT E..BRAHMI LETTER OLD TAMIL SHORT O +11075 ; L # Lo BRAHMI LETTER OLD TAMIL LLA +11082 ; L # Mc KAITHI SIGN VISARGA +11083..110AF ; L # Lo [45] KAITHI LETTER A..KAITHI LETTER HA +110B0..110B2 ; L # Mc [3] KAITHI VOWEL SIGN AA..KAITHI VOWEL SIGN II +110B7..110B8 ; L # Mc [2] KAITHI VOWEL SIGN O..KAITHI VOWEL SIGN AU +110BB..110BC ; L # Po [2] KAITHI ABBREVIATION SIGN..KAITHI ENUMERATION SIGN +110BD ; L # Cf KAITHI NUMBER SIGN +110BE..110C1 ; L # Po [4] KAITHI SECTION MARK..KAITHI DOUBLE DANDA +110CD ; L # Cf KAITHI NUMBER SIGN ABOVE +110D0..110E8 ; L # Lo [25] SORA SOMPENG LETTER SAH..SORA SOMPENG LETTER MAE +110F0..110F9 ; L # Nd [10] SORA SOMPENG DIGIT ZERO..SORA SOMPENG DIGIT NINE +11103..11126 ; L # Lo [36] CHAKMA LETTER AA..CHAKMA LETTER HAA +1112C 
; L # Mc CHAKMA VOWEL SIGN E +11136..1113F ; L # Nd [10] CHAKMA DIGIT ZERO..CHAKMA DIGIT NINE +11140..11143 ; L # Po [4] CHAKMA SECTION MARK..CHAKMA QUESTION MARK +11144 ; L # Lo CHAKMA LETTER LHAA +11145..11146 ; L # Mc [2] CHAKMA VOWEL SIGN AA..CHAKMA VOWEL SIGN EI +11147 ; L # Lo CHAKMA LETTER VAA +11150..11172 ; L # Lo [35] MAHAJANI LETTER A..MAHAJANI LETTER RRA +11174..11175 ; L # Po [2] MAHAJANI ABBREVIATION SIGN..MAHAJANI SECTION MARK +11176 ; L # Lo MAHAJANI LIGATURE SHRI +11182 ; L # Mc SHARADA SIGN VISARGA +11183..111B2 ; L # Lo [48] SHARADA LETTER A..SHARADA LETTER HA +111B3..111B5 ; L # Mc [3] SHARADA VOWEL SIGN AA..SHARADA VOWEL SIGN II +111BF..111C0 ; L # Mc [2] SHARADA VOWEL SIGN AU..SHARADA SIGN VIRAMA +111C1..111C4 ; L # Lo [4] SHARADA SIGN AVAGRAHA..SHARADA OM +111C5..111C8 ; L # Po [4] SHARADA DANDA..SHARADA SEPARATOR +111CD ; L # Po SHARADA SUTRA MARK +111CE ; L # Mc SHARADA VOWEL SIGN PRISHTHAMATRA E +111D0..111D9 ; L # Nd [10] SHARADA DIGIT ZERO..SHARADA DIGIT NINE +111DA ; L # Lo SHARADA EKAM +111DB ; L # Po SHARADA SIGN SIDDHAM +111DC ; L # Lo SHARADA HEADSTROKE +111DD..111DF ; L # Po [3] SHARADA CONTINUATION SIGN..SHARADA SECTION MARK-2 +111E1..111F4 ; L # No [20] SINHALA ARCHAIC DIGIT ONE..SINHALA ARCHAIC NUMBER ONE THOUSAND +11200..11211 ; L # Lo [18] KHOJKI LETTER A..KHOJKI LETTER JJA +11213..1122B ; L # Lo [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA +1122C..1122E ; L # Mc [3] KHOJKI VOWEL SIGN AA..KHOJKI VOWEL SIGN II +11232..11233 ; L # Mc [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU +11235 ; L # Mc KHOJKI SIGN VIRAMA +11238..1123D ; L # Po [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN +11280..11286 ; L # Lo [7] MULTANI LETTER A..MULTANI LETTER GA +11288 ; L # Lo MULTANI LETTER GHA +1128A..1128D ; L # Lo [4] MULTANI LETTER CA..MULTANI LETTER JJA +1128F..1129D ; L # Lo [15] MULTANI LETTER NYA..MULTANI LETTER BA +1129F..112A8 ; L # Lo [10] MULTANI LETTER BHA..MULTANI LETTER RHA +112A9 ; L # Po MULTANI SECTION MARK +112B0..112DE ; L # Lo [47] 
KHUDAWADI LETTER A..KHUDAWADI LETTER HA +112E0..112E2 ; L # Mc [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II +112F0..112F9 ; L # Nd [10] KHUDAWADI DIGIT ZERO..KHUDAWADI DIGIT NINE +11302..11303 ; L # Mc [2] GRANTHA SIGN ANUSVARA..GRANTHA SIGN VISARGA +11305..1130C ; L # Lo [8] GRANTHA LETTER A..GRANTHA LETTER VOCALIC L +1130F..11310 ; L # Lo [2] GRANTHA LETTER EE..GRANTHA LETTER AI +11313..11328 ; L # Lo [22] GRANTHA LETTER OO..GRANTHA LETTER NA +1132A..11330 ; L # Lo [7] GRANTHA LETTER PA..GRANTHA LETTER RA +11332..11333 ; L # Lo [2] GRANTHA LETTER LA..GRANTHA LETTER LLA +11335..11339 ; L # Lo [5] GRANTHA LETTER VA..GRANTHA LETTER HA +1133D ; L # Lo GRANTHA SIGN AVAGRAHA +1133E..1133F ; L # Mc [2] GRANTHA VOWEL SIGN AA..GRANTHA VOWEL SIGN I +11341..11344 ; L # Mc [4] GRANTHA VOWEL SIGN U..GRANTHA VOWEL SIGN VOCALIC RR +11347..11348 ; L # Mc [2] GRANTHA VOWEL SIGN EE..GRANTHA VOWEL SIGN AI +1134B..1134D ; L # Mc [3] GRANTHA VOWEL SIGN OO..GRANTHA SIGN VIRAMA +11350 ; L # Lo GRANTHA OM +11357 ; L # Mc GRANTHA AU LENGTH MARK +1135D..11361 ; L # Lo [5] GRANTHA SIGN PLUTA..GRANTHA LETTER VOCALIC LL +11362..11363 ; L # Mc [2] GRANTHA VOWEL SIGN VOCALIC L..GRANTHA VOWEL SIGN VOCALIC LL +11400..11434 ; L # Lo [53] NEWA LETTER A..NEWA LETTER HA +11435..11437 ; L # Mc [3] NEWA VOWEL SIGN AA..NEWA VOWEL SIGN II +11440..11441 ; L # Mc [2] NEWA VOWEL SIGN O..NEWA VOWEL SIGN AU +11445 ; L # Mc NEWA SIGN VISARGA +11447..1144A ; L # Lo [4] NEWA SIGN AVAGRAHA..NEWA SIDDHI +1144B..1144F ; L # Po [5] NEWA DANDA..NEWA ABBREVIATION SIGN +11450..11459 ; L # Nd [10] NEWA DIGIT ZERO..NEWA DIGIT NINE +1145A..1145B ; L # Po [2] NEWA DOUBLE COMMA..NEWA PLACEHOLDER MARK +1145D ; L # Po NEWA INSERTION SIGN +1145F..11461 ; L # Lo [3] NEWA LETTER VEDIC ANUSVARA..NEWA SIGN UPADHMANIYA +11480..114AF ; L # Lo [48] TIRHUTA ANJI..TIRHUTA LETTER HA +114B0..114B2 ; L # Mc [3] TIRHUTA VOWEL SIGN AA..TIRHUTA VOWEL SIGN II +114B9 ; L # Mc TIRHUTA VOWEL SIGN E +114BB..114BE ; L # Mc [4] TIRHUTA 
VOWEL SIGN AI..TIRHUTA VOWEL SIGN AU +114C1 ; L # Mc TIRHUTA SIGN VISARGA +114C4..114C5 ; L # Lo [2] TIRHUTA SIGN AVAGRAHA..TIRHUTA GVANG +114C6 ; L # Po TIRHUTA ABBREVIATION SIGN +114C7 ; L # Lo TIRHUTA OM +114D0..114D9 ; L # Nd [10] TIRHUTA DIGIT ZERO..TIRHUTA DIGIT NINE +11580..115AE ; L # Lo [47] SIDDHAM LETTER A..SIDDHAM LETTER HA +115AF..115B1 ; L # Mc [3] SIDDHAM VOWEL SIGN AA..SIDDHAM VOWEL SIGN II +115B8..115BB ; L # Mc [4] SIDDHAM VOWEL SIGN E..SIDDHAM VOWEL SIGN AU +115BE ; L # Mc SIDDHAM SIGN VISARGA +115C1..115D7 ; L # Po [23] SIDDHAM SIGN SIDDHAM..SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES +115D8..115DB ; L # Lo [4] SIDDHAM LETTER THREE-CIRCLE ALTERNATE I..SIDDHAM LETTER ALTERNATE U +11600..1162F ; L # Lo [48] MODI LETTER A..MODI LETTER LLA +11630..11632 ; L # Mc [3] MODI VOWEL SIGN AA..MODI VOWEL SIGN II +1163B..1163C ; L # Mc [2] MODI VOWEL SIGN O..MODI VOWEL SIGN AU +1163E ; L # Mc MODI SIGN VISARGA +11641..11643 ; L # Po [3] MODI DANDA..MODI ABBREVIATION SIGN +11644 ; L # Lo MODI SIGN HUVA +11650..11659 ; L # Nd [10] MODI DIGIT ZERO..MODI DIGIT NINE +11680..116AA ; L # Lo [43] TAKRI LETTER A..TAKRI LETTER RRA +116AC ; L # Mc TAKRI SIGN VISARGA +116AE..116AF ; L # Mc [2] TAKRI VOWEL SIGN I..TAKRI VOWEL SIGN II +116B6 ; L # Mc TAKRI SIGN VIRAMA +116B8 ; L # Lo TAKRI LETTER ARCHAIC KHA +116B9 ; L # Po TAKRI ABBREVIATION SIGN +116C0..116C9 ; L # Nd [10] TAKRI DIGIT ZERO..TAKRI DIGIT NINE +11700..1171A ; L # Lo [27] AHOM LETTER KA..AHOM LETTER ALTERNATE BA +11720..11721 ; L # Mc [2] AHOM VOWEL SIGN A..AHOM VOWEL SIGN AA +11726 ; L # Mc AHOM VOWEL SIGN E +11730..11739 ; L # Nd [10] AHOM DIGIT ZERO..AHOM DIGIT NINE +1173A..1173B ; L # No [2] AHOM NUMBER TEN..AHOM NUMBER TWENTY +1173C..1173E ; L # Po [3] AHOM SIGN SMALL SECTION..AHOM SIGN RULAI +1173F ; L # So AHOM SYMBOL VI +11740..11746 ; L # Lo [7] AHOM LETTER CA..AHOM LETTER LLA +11800..1182B ; L # Lo [44] DOGRA LETTER A..DOGRA LETTER RRA +1182C..1182E ; L # Mc [3] DOGRA VOWEL SIGN 
AA..DOGRA VOWEL SIGN II +11838 ; L # Mc DOGRA SIGN VISARGA +1183B ; L # Po DOGRA ABBREVIATION SIGN +118A0..118DF ; L # L& [64] WARANG CITI CAPITAL LETTER NGAA..WARANG CITI SMALL LETTER VIYO +118E0..118E9 ; L # Nd [10] WARANG CITI DIGIT ZERO..WARANG CITI DIGIT NINE +118EA..118F2 ; L # No [9] WARANG CITI NUMBER TEN..WARANG CITI NUMBER NINETY +118FF..11906 ; L # Lo [8] WARANG CITI OM..DIVES AKURU LETTER E +11909 ; L # Lo DIVES AKURU LETTER O +1190C..11913 ; L # Lo [8] DIVES AKURU LETTER KA..DIVES AKURU LETTER JA +11915..11916 ; L # Lo [2] DIVES AKURU LETTER NYA..DIVES AKURU LETTER TTA +11918..1192F ; L # Lo [24] DIVES AKURU LETTER DDA..DIVES AKURU LETTER ZA +11930..11935 ; L # Mc [6] DIVES AKURU VOWEL SIGN AA..DIVES AKURU VOWEL SIGN E +11937..11938 ; L # Mc [2] DIVES AKURU VOWEL SIGN AI..DIVES AKURU VOWEL SIGN O +1193D ; L # Mc DIVES AKURU SIGN HALANTA +1193F ; L # Lo DIVES AKURU PREFIXED NASAL SIGN +11940 ; L # Mc DIVES AKURU MEDIAL YA +11941 ; L # Lo DIVES AKURU INITIAL RA +11942 ; L # Mc DIVES AKURU MEDIAL RA +11944..11946 ; L # Po [3] DIVES AKURU DOUBLE DANDA..DIVES AKURU END OF TEXT MARK +11950..11959 ; L # Nd [10] DIVES AKURU DIGIT ZERO..DIVES AKURU DIGIT NINE +119A0..119A7 ; L # Lo [8] NANDINAGARI LETTER A..NANDINAGARI LETTER VOCALIC RR +119AA..119D0 ; L # Lo [39] NANDINAGARI LETTER E..NANDINAGARI LETTER RRA +119D1..119D3 ; L # Mc [3] NANDINAGARI VOWEL SIGN AA..NANDINAGARI VOWEL SIGN II +119DC..119DF ; L # Mc [4] NANDINAGARI VOWEL SIGN O..NANDINAGARI SIGN VISARGA +119E1 ; L # Lo NANDINAGARI SIGN AVAGRAHA +119E2 ; L # Po NANDINAGARI SIGN SIDDHAM +119E3 ; L # Lo NANDINAGARI HEADSTROKE +119E4 ; L # Mc NANDINAGARI VOWEL SIGN PRISHTHAMATRA E +11A00 ; L # Lo ZANABAZAR SQUARE LETTER A +11A07..11A08 ; L # Mn [2] ZANABAZAR SQUARE VOWEL SIGN AI..ZANABAZAR SQUARE VOWEL SIGN AU +11A0B..11A32 ; L # Lo [40] ZANABAZAR SQUARE LETTER KA..ZANABAZAR SQUARE LETTER KSSA +11A39 ; L # Mc ZANABAZAR SQUARE SIGN VISARGA +11A3A ; L # Lo ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA 
+11A3F..11A46 ; L # Po [8] ZANABAZAR SQUARE INITIAL HEAD MARK..ZANABAZAR SQUARE CLOSING DOUBLE-LINED HEAD MARK +11A50 ; L # Lo SOYOMBO LETTER A +11A57..11A58 ; L # Mc [2] SOYOMBO VOWEL SIGN AI..SOYOMBO VOWEL SIGN AU +11A5C..11A89 ; L # Lo [46] SOYOMBO LETTER KA..SOYOMBO CLUSTER-INITIAL LETTER SA +11A97 ; L # Mc SOYOMBO SIGN VISARGA +11A9A..11A9C ; L # Po [3] SOYOMBO MARK TSHEG..SOYOMBO MARK DOUBLE SHAD +11A9D ; L # Lo SOYOMBO MARK PLUTA +11A9E..11AA2 ; L # Po [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2 +11AB0..11AF8 ; L # Lo [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL +11C00..11C08 ; L # Lo [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L +11C0A..11C2E ; L # Lo [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA +11C2F ; L # Mc BHAIKSUKI VOWEL SIGN AA +11C3E ; L # Mc BHAIKSUKI SIGN VISARGA +11C3F ; L # Mn BHAIKSUKI SIGN VIRAMA +11C40 ; L # Lo BHAIKSUKI SIGN AVAGRAHA +11C41..11C45 ; L # Po [5] BHAIKSUKI DANDA..BHAIKSUKI GAP FILLER-2 +11C50..11C59 ; L # Nd [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE +11C5A..11C6C ; L # No [19] BHAIKSUKI NUMBER ONE..BHAIKSUKI HUNDREDS UNIT MARK +11C70..11C71 ; L # Po [2] MARCHEN HEAD MARK..MARCHEN MARK SHAD +11C72..11C8F ; L # Lo [30] MARCHEN LETTER KA..MARCHEN LETTER A +11CA9 ; L # Mc MARCHEN SUBJOINED LETTER YA +11CB1 ; L # Mc MARCHEN VOWEL SIGN I +11CB4 ; L # Mc MARCHEN VOWEL SIGN O +11D00..11D06 ; L # Lo [7] MASARAM GONDI LETTER A..MASARAM GONDI LETTER E +11D08..11D09 ; L # Lo [2] MASARAM GONDI LETTER AI..MASARAM GONDI LETTER O +11D0B..11D30 ; L # Lo [38] MASARAM GONDI LETTER AU..MASARAM GONDI LETTER TRA +11D46 ; L # Lo MASARAM GONDI REPHA +11D50..11D59 ; L # Nd [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE +11D60..11D65 ; L # Lo [6] GUNJALA GONDI LETTER A..GUNJALA GONDI LETTER UU +11D67..11D68 ; L # Lo [2] GUNJALA GONDI LETTER EE..GUNJALA GONDI LETTER AI +11D6A..11D89 ; L # Lo [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA +11D8A..11D8E ; L # Mc [5] 
GUNJALA GONDI VOWEL SIGN AA..GUNJALA GONDI VOWEL SIGN UU +11D93..11D94 ; L # Mc [2] GUNJALA GONDI VOWEL SIGN OO..GUNJALA GONDI VOWEL SIGN AU +11D96 ; L # Mc GUNJALA GONDI SIGN VISARGA +11D98 ; L # Lo GUNJALA GONDI OM +11DA0..11DA9 ; L # Nd [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE +11EE0..11EF2 ; L # Lo [19] MAKASAR LETTER KA..MAKASAR ANGKA +11EF5..11EF6 ; L # Mc [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O +11EF7..11EF8 ; L # Po [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION +11FB0 ; L # Lo LISU LETTER YHA +11FC0..11FD4 ; L # No [21] TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH..TAMIL FRACTION DOWNSCALING FACTOR KIIZH +11FFF ; L # Po TAMIL PUNCTUATION END OF TEXT +12000..12399 ; L # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U +12400..1246E ; L # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM +12470..12474 ; L # Po [5] CUNEIFORM PUNCTUATION SIGN OLD ASSYRIAN WORD DIVIDER..CUNEIFORM PUNCTUATION SIGN DIAGONAL QUADCOLON +12480..12543 ; L # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU +12F90..12FF0 ; L # Lo [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114 +12FF1..12FF2 ; L # Po [2] CYPRO-MINOAN SIGN CM301..CYPRO-MINOAN SIGN CM302 +13000..1342E ; L # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032 +13430..13438 ; L # Cf [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT +14400..14646 ; L # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530 +16800..16A38 ; L # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ +16A40..16A5E ; L # Lo [31] MRO LETTER TA..MRO LETTER TEK +16A60..16A69 ; L # Nd [10] MRO DIGIT ZERO..MRO DIGIT NINE +16A6E..16A6F ; L # Po [2] MRO DANDA..MRO DOUBLE DANDA +16A70..16ABE ; L # Lo [79] TANGSA LETTER OZ..TANGSA LETTER ZA +16AC0..16AC9 ; L # Nd [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE +16AD0..16AED ; L # Lo [30] BASSA VAH LETTER ENNI..BASSA VAH LETTER I +16AF5 ; L # Po BASSA VAH FULL STOP 
+16B00..16B2F ; L # Lo [48] PAHAWH HMONG VOWEL KEEB..PAHAWH HMONG CONSONANT CAU +16B37..16B3B ; L # Po [5] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS FEEM +16B3C..16B3F ; L # So [4] PAHAWH HMONG SIGN XYEEM NTXIV..PAHAWH HMONG SIGN XYEEM FAIB +16B40..16B43 ; L # Lm [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM +16B44 ; L # Po PAHAWH HMONG SIGN XAUS +16B45 ; L # So PAHAWH HMONG SIGN CIM TSOV ROG +16B50..16B59 ; L # Nd [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE +16B5B..16B61 ; L # No [7] PAHAWH HMONG NUMBER TENS..PAHAWH HMONG NUMBER TRILLIONS +16B63..16B77 ; L # Lo [21] PAHAWH HMONG SIGN VOS LUB..PAHAWH HMONG SIGN CIM NRES TOS +16B7D..16B8F ; L # Lo [19] PAHAWH HMONG CLAN SIGN TSHEEJ..PAHAWH HMONG CLAN SIGN VWJ +16E40..16E7F ; L # L& [64] MEDEFAIDRIN CAPITAL LETTER M..MEDEFAIDRIN SMALL LETTER Y +16E80..16E96 ; L # No [23] MEDEFAIDRIN DIGIT ZERO..MEDEFAIDRIN DIGIT THREE ALTERNATE FORM +16E97..16E9A ; L # Po [4] MEDEFAIDRIN COMMA..MEDEFAIDRIN EXCLAMATION OH +16F00..16F4A ; L # Lo [75] MIAO LETTER PA..MIAO LETTER RTE +16F50 ; L # Lo MIAO LETTER NASALIZATION +16F51..16F87 ; L # Mc [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI +16F93..16F9F ; L # Lm [13] MIAO LETTER TONE-2..MIAO LETTER REFORMED TONE-8 +16FE0..16FE1 ; L # Lm [2] TANGUT ITERATION MARK..NUSHU ITERATION MARK +16FE3 ; L # Lm OLD CHINESE ITERATION MARK +16FF0..16FF1 ; L # Mc [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY +17000..187F7 ; L # Lo [6136] TANGUT IDEOGRAPH-17000..TANGUT IDEOGRAPH-187F7 +18800..18CD5 ; L # Lo [1238] TANGUT COMPONENT-001..KHITAN SMALL SCRIPT CHARACTER-18CD5 +18D00..18D08 ; L # Lo [9] TANGUT IDEOGRAPH-18D00..TANGUT IDEOGRAPH-18D08 +1AFF0..1AFF3 ; L # Lm [4] KATAKANA LETTER MINNAN TONE-2..KATAKANA LETTER MINNAN TONE-5 +1AFF5..1AFFB ; L # Lm [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5 +1AFFD..1AFFE ; L # Lm [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8 
+1B000..1B122 ; L # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU +1B150..1B152 ; L # Lo [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO +1B164..1B167 ; L # Lo [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N +1B170..1B2FB ; L # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB +1BC00..1BC6A ; L # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M +1BC70..1BC7C ; L # Lo [13] DUPLOYAN AFFIX LEFT HORIZONTAL SECANT..DUPLOYAN AFFIX ATTACHED TANGENT HOOK +1BC80..1BC88 ; L # Lo [9] DUPLOYAN AFFIX HIGH ACUTE..DUPLOYAN AFFIX HIGH VERTICAL +1BC90..1BC99 ; L # Lo [10] DUPLOYAN AFFIX LOW ACUTE..DUPLOYAN AFFIX LOW ARROW +1BC9C ; L # So DUPLOYAN SIGN O WITH CROSS +1BC9F ; L # Po DUPLOYAN PUNCTUATION CHINOOK FULL STOP +1CF50..1CFC3 ; L # So [116] ZNAMENNY NEUME KRYUK..ZNAMENNY NEUME PAUK +1D000..1D0F5 ; L # So [246] BYZANTINE MUSICAL SYMBOL PSILI..BYZANTINE MUSICAL SYMBOL GORGON NEO KATO +1D100..1D126 ; L # So [39] MUSICAL SYMBOL SINGLE BARLINE..MUSICAL SYMBOL DRUM CLEF-2 +1D129..1D164 ; L # So [60] MUSICAL SYMBOL MULTIPLE MEASURE REST..MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE +1D165..1D166 ; L # Mc [2] MUSICAL SYMBOL COMBINING STEM..MUSICAL SYMBOL COMBINING SPRECHGESANG STEM +1D16A..1D16C ; L # So [3] MUSICAL SYMBOL FINGERED TREMOLO-1..MUSICAL SYMBOL FINGERED TREMOLO-3 +1D16D..1D172 ; L # Mc [6] MUSICAL SYMBOL COMBINING AUGMENTATION DOT..MUSICAL SYMBOL COMBINING FLAG-5 +1D183..1D184 ; L # So [2] MUSICAL SYMBOL ARPEGGIATO UP..MUSICAL SYMBOL ARPEGGIATO DOWN +1D18C..1D1A9 ; L # So [30] MUSICAL SYMBOL RINFORZANDO..MUSICAL SYMBOL DEGREE SLASH +1D1AE..1D1E8 ; L # So [59] MUSICAL SYMBOL PEDAL MARK..MUSICAL SYMBOL KIEVAN FLAT SIGN +1D2E0..1D2F3 ; L # No [20] MAYAN NUMERAL ZERO..MAYAN NUMERAL NINETEEN +1D360..1D378 ; L # No [25] COUNTING ROD UNIT DIGIT ONE..TALLY MARK FIVE +1D400..1D454 ; L # L& [85] MATHEMATICAL BOLD CAPITAL A..MATHEMATICAL ITALIC SMALL G +1D456..1D49C ; L # L& [71] MATHEMATICAL ITALIC SMALL I..MATHEMATICAL SCRIPT CAPITAL A 
+1D49E..1D49F ; L # L& [2] MATHEMATICAL SCRIPT CAPITAL C..MATHEMATICAL SCRIPT CAPITAL D +1D4A2 ; L # L& MATHEMATICAL SCRIPT CAPITAL G +1D4A5..1D4A6 ; L # L& [2] MATHEMATICAL SCRIPT CAPITAL J..MATHEMATICAL SCRIPT CAPITAL K +1D4A9..1D4AC ; L # L& [4] MATHEMATICAL SCRIPT CAPITAL N..MATHEMATICAL SCRIPT CAPITAL Q +1D4AE..1D4B9 ; L # L& [12] MATHEMATICAL SCRIPT CAPITAL S..MATHEMATICAL SCRIPT SMALL D +1D4BB ; L # L& MATHEMATICAL SCRIPT SMALL F +1D4BD..1D4C3 ; L # L& [7] MATHEMATICAL SCRIPT SMALL H..MATHEMATICAL SCRIPT SMALL N +1D4C5..1D505 ; L # L& [65] MATHEMATICAL SCRIPT SMALL P..MATHEMATICAL FRAKTUR CAPITAL B +1D507..1D50A ; L # L& [4] MATHEMATICAL FRAKTUR CAPITAL D..MATHEMATICAL FRAKTUR CAPITAL G +1D50D..1D514 ; L # L& [8] MATHEMATICAL FRAKTUR CAPITAL J..MATHEMATICAL FRAKTUR CAPITAL Q +1D516..1D51C ; L # L& [7] MATHEMATICAL FRAKTUR CAPITAL S..MATHEMATICAL FRAKTUR CAPITAL Y +1D51E..1D539 ; L # L& [28] MATHEMATICAL FRAKTUR SMALL A..MATHEMATICAL DOUBLE-STRUCK CAPITAL B +1D53B..1D53E ; L # L& [4] MATHEMATICAL DOUBLE-STRUCK CAPITAL D..MATHEMATICAL DOUBLE-STRUCK CAPITAL G +1D540..1D544 ; L # L& [5] MATHEMATICAL DOUBLE-STRUCK CAPITAL I..MATHEMATICAL DOUBLE-STRUCK CAPITAL M +1D546 ; L # L& MATHEMATICAL DOUBLE-STRUCK CAPITAL O +1D54A..1D550 ; L # L& [7] MATHEMATICAL DOUBLE-STRUCK CAPITAL S..MATHEMATICAL DOUBLE-STRUCK CAPITAL Y +1D552..1D6A5 ; L # L& [340] MATHEMATICAL DOUBLE-STRUCK SMALL A..MATHEMATICAL ITALIC SMALL DOTLESS J +1D6A8..1D6C0 ; L # L& [25] MATHEMATICAL BOLD CAPITAL ALPHA..MATHEMATICAL BOLD CAPITAL OMEGA +1D6C1 ; L # Sm MATHEMATICAL BOLD NABLA +1D6C2..1D6DA ; L # L& [25] MATHEMATICAL BOLD SMALL ALPHA..MATHEMATICAL BOLD SMALL OMEGA +1D6DC..1D6FA ; L # L& [31] MATHEMATICAL BOLD EPSILON SYMBOL..MATHEMATICAL ITALIC CAPITAL OMEGA +1D6FB ; L # Sm MATHEMATICAL ITALIC NABLA +1D6FC..1D714 ; L # L& [25] MATHEMATICAL ITALIC SMALL ALPHA..MATHEMATICAL ITALIC SMALL OMEGA +1D716..1D734 ; L # L& [31] MATHEMATICAL ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD ITALIC CAPITAL OMEGA 
+1D735 ; L # Sm MATHEMATICAL BOLD ITALIC NABLA +1D736..1D74E ; L # L& [25] MATHEMATICAL BOLD ITALIC SMALL ALPHA..MATHEMATICAL BOLD ITALIC SMALL OMEGA +1D750..1D76E ; L # L& [31] MATHEMATICAL BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD CAPITAL OMEGA +1D76F ; L # Sm MATHEMATICAL SANS-SERIF BOLD NABLA +1D770..1D788 ; L # L& [25] MATHEMATICAL SANS-SERIF BOLD SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD SMALL OMEGA +1D78A..1D7A8 ; L # L& [31] MATHEMATICAL SANS-SERIF BOLD EPSILON SYMBOL..MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL OMEGA +1D7A9 ; L # Sm MATHEMATICAL SANS-SERIF BOLD ITALIC NABLA +1D7AA..1D7C2 ; L # L& [25] MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL ALPHA..MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL OMEGA +1D7C4..1D7CB ; L # L& [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA +1D800..1D9FF ; L # So [512] SIGNWRITING HAND-FIST INDEX..SIGNWRITING HEAD +1DA37..1DA3A ; L # So [4] SIGNWRITING AIR BLOW SMALL ROTATIONS..SIGNWRITING BREATH EXHALE +1DA6D..1DA74 ; L # So [8] SIGNWRITING SHOULDER HIP SPINE..SIGNWRITING TORSO-FLOORPLANE TWISTING +1DA76..1DA83 ; L # So [14] SIGNWRITING LIMB COMBINATION..SIGNWRITING LOCATION DEPTH +1DA85..1DA86 ; L # So [2] SIGNWRITING LOCATION TORSO..SIGNWRITING LOCATION LIMBS DIGITS +1DA87..1DA8B ; L # Po [5] SIGNWRITING COMMA..SIGNWRITING PARENTHESIS +1DF00..1DF09 ; L # L& [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK +1DF0A ; L # Lo LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK +1DF0B..1DF1E ; L # L& [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL +1E100..1E12C ; L # Lo [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W +1E137..1E13D ; L # Lm [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER +1E140..1E149 ; L # Nd [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE +1E14E ; L # Lo NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ +1E14F ; L 
# So NYIAKENG PUACHUE HMONG CIRCLED CA +1E290..1E2AD ; L # Lo [30] TOTO LETTER PA..TOTO LETTER A +1E2C0..1E2EB ; L # Lo [44] WANCHO LETTER AA..WANCHO LETTER YIH +1E2F0..1E2F9 ; L # Nd [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE +1E7E0..1E7E6 ; L # Lo [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO +1E7E8..1E7EB ; L # Lo [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE +1E7ED..1E7EE ; L # Lo [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE +1E7F0..1E7FE ; L # Lo [15] ETHIOPIC SYLLABLE GURAGE QWI..ETHIOPIC SYLLABLE GURAGE PWEE +1F110..1F12E ; L # So [31] PARENTHESIZED LATIN CAPITAL LETTER A..CIRCLED WZ +1F130..1F169 ; L # So [58] SQUARED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z +1F170..1F1AC ; L # So [61] NEGATIVE SQUARED LATIN CAPITAL LETTER A..SQUARED VOD +1F1E6..1F202 ; L # So [29] REGIONAL INDICATOR SYMBOL LETTER A..SQUARED KATAKANA SA +1F210..1F23B ; L # So [44] SQUARED CJK UNIFIED IDEOGRAPH-624B..SQUARED CJK UNIFIED IDEOGRAPH-914D +1F240..1F248 ; L # So [9] TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C..TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557 +1F250..1F251 ; L # So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT +20000..2A6DF ; L # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF +2A700..2B738 ; L # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738 +2B740..2B81D ; L # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D +2B820..2CEA1 ; L # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1 +2CEB0..2EBE0 ; L # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0 +2F800..2FA1D ; L # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D +30000..3134A ; L # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A +F0000..FFFFD ; L # Co [65534] .. +100000..10FFFD; L # Co [65534] .. + +# The above property value applies to 825575 code points not listed here. 
+# Total code points: 1096333 + +# ================================================ + +# Bidi_Class=Right_To_Left + +0590 ; R # Cn +05BE ; R # Pd HEBREW PUNCTUATION MAQAF +05C0 ; R # Po HEBREW PUNCTUATION PASEQ +05C3 ; R # Po HEBREW PUNCTUATION SOF PASUQ +05C6 ; R # Po HEBREW PUNCTUATION NUN HAFUKHA +05C8..05CF ; R # Cn [8] .. +05D0..05EA ; R # Lo [27] HEBREW LETTER ALEF..HEBREW LETTER TAV +05EB..05EE ; R # Cn [4] .. +05EF..05F2 ; R # Lo [4] HEBREW YOD TRIANGLE..HEBREW LIGATURE YIDDISH DOUBLE YOD +05F3..05F4 ; R # Po [2] HEBREW PUNCTUATION GERESH..HEBREW PUNCTUATION GERSHAYIM +05F5..05FF ; R # Cn [11] .. +07C0..07C9 ; R # Nd [10] NKO DIGIT ZERO..NKO DIGIT NINE +07CA..07EA ; R # Lo [33] NKO LETTER A..NKO LETTER JONA RA +07F4..07F5 ; R # Lm [2] NKO HIGH TONE APOSTROPHE..NKO LOW TONE APOSTROPHE +07FA ; R # Lm NKO LAJANYALAN +07FB..07FC ; R # Cn [2] .. +07FE..07FF ; R # Sc [2] NKO DOROME SIGN..NKO TAMAN SIGN +0800..0815 ; R # Lo [22] SAMARITAN LETTER ALAF..SAMARITAN LETTER TAAF +081A ; R # Lm SAMARITAN MODIFIER LETTER EPENTHETIC YUT +0824 ; R # Lm SAMARITAN MODIFIER LETTER SHORT A +0828 ; R # Lm SAMARITAN MODIFIER LETTER I +082E..082F ; R # Cn [2] .. +0830..083E ; R # Po [15] SAMARITAN PUNCTUATION NEQUDAA..SAMARITAN PUNCTUATION ANNAAU +083F ; R # Cn +0840..0858 ; R # Lo [25] MANDAIC LETTER HALQA..MANDAIC LETTER AIN +085C..085D ; R # Cn [2] .. 
+085E ; R # Po MANDAIC PUNCTUATION +085F ; R # Cn +200F ; R # Cf RIGHT-TO-LEFT MARK +FB1D ; R # Lo HEBREW LETTER YOD WITH HIRIQ +FB1F..FB28 ; R # Lo [10] HEBREW LIGATURE YIDDISH YOD YOD PATAH..HEBREW LETTER WIDE TAV +FB2A..FB36 ; R # Lo [13] HEBREW LETTER SHIN WITH SHIN DOT..HEBREW LETTER ZAYIN WITH DAGESH +FB37 ; R # Cn +FB38..FB3C ; R # Lo [5] HEBREW LETTER TET WITH DAGESH..HEBREW LETTER LAMED WITH DAGESH +FB3D ; R # Cn +FB3E ; R # Lo HEBREW LETTER MEM WITH DAGESH +FB3F ; R # Cn +FB40..FB41 ; R # Lo [2] HEBREW LETTER NUN WITH DAGESH..HEBREW LETTER SAMEKH WITH DAGESH +FB42 ; R # Cn +FB43..FB44 ; R # Lo [2] HEBREW LETTER FINAL PE WITH DAGESH..HEBREW LETTER PE WITH DAGESH +FB45 ; R # Cn +FB46..FB4F ; R # Lo [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW LIGATURE ALEF LAMED +10800..10805 ; R # Lo [6] CYPRIOT SYLLABLE A..CYPRIOT SYLLABLE JA +10806..10807 ; R # Cn [2] .. +10808 ; R # Lo CYPRIOT SYLLABLE JO +10809 ; R # Cn +1080A..10835 ; R # Lo [44] CYPRIOT SYLLABLE KA..CYPRIOT SYLLABLE WO +10836 ; R # Cn +10837..10838 ; R # Lo [2] CYPRIOT SYLLABLE XA..CYPRIOT SYLLABLE XE +10839..1083B ; R # Cn [3] .. +1083C ; R # Lo CYPRIOT SYLLABLE ZA +1083D..1083E ; R # Cn [2] .. +1083F..10855 ; R # Lo [23] CYPRIOT SYLLABLE ZO..IMPERIAL ARAMAIC LETTER TAW +10856 ; R # Cn +10857 ; R # Po IMPERIAL ARAMAIC SECTION SIGN +10858..1085F ; R # No [8] IMPERIAL ARAMAIC NUMBER ONE..IMPERIAL ARAMAIC NUMBER TEN THOUSAND +10860..10876 ; R # Lo [23] PALMYRENE LETTER ALEPH..PALMYRENE LETTER TAW +10877..10878 ; R # So [2] PALMYRENE LEFT-POINTING FLEURON..PALMYRENE RIGHT-POINTING FLEURON +10879..1087F ; R # No [7] PALMYRENE NUMBER ONE..PALMYRENE NUMBER TWENTY +10880..1089E ; R # Lo [31] NABATAEAN LETTER FINAL ALEPH..NABATAEAN LETTER TAW +1089F..108A6 ; R # Cn [8] .. +108A7..108AF ; R # No [9] NABATAEAN NUMBER ONE..NABATAEAN NUMBER ONE HUNDRED +108B0..108DF ; R # Cn [48] .. 
+108E0..108F2 ; R # Lo [19] HATRAN LETTER ALEPH..HATRAN LETTER QOPH +108F3 ; R # Cn +108F4..108F5 ; R # Lo [2] HATRAN LETTER SHIN..HATRAN LETTER TAW +108F6..108FA ; R # Cn [5] .. +108FB..108FF ; R # No [5] HATRAN NUMBER ONE..HATRAN NUMBER ONE HUNDRED +10900..10915 ; R # Lo [22] PHOENICIAN LETTER ALF..PHOENICIAN LETTER TAU +10916..1091B ; R # No [6] PHOENICIAN NUMBER ONE..PHOENICIAN NUMBER THREE +1091C..1091E ; R # Cn [3] .. +10920..10939 ; R # Lo [26] LYDIAN LETTER A..LYDIAN LETTER C +1093A..1093E ; R # Cn [5] .. +1093F ; R # Po LYDIAN TRIANGULAR MARK +10940..1097F ; R # Cn [64] .. +10980..109B7 ; R # Lo [56] MEROITIC HIEROGLYPHIC LETTER A..MEROITIC CURSIVE LETTER DA +109B8..109BB ; R # Cn [4] .. +109BC..109BD ; R # No [2] MEROITIC CURSIVE FRACTION ELEVEN TWELFTHS..MEROITIC CURSIVE FRACTION ONE HALF +109BE..109BF ; R # Lo [2] MEROITIC CURSIVE LOGOGRAM RMT..MEROITIC CURSIVE LOGOGRAM IMN +109C0..109CF ; R # No [16] MEROITIC CURSIVE NUMBER ONE..MEROITIC CURSIVE NUMBER SEVENTY +109D0..109D1 ; R # Cn [2] .. +109D2..109FF ; R # No [46] MEROITIC CURSIVE NUMBER ONE HUNDRED..MEROITIC CURSIVE FRACTION TEN TWELFTHS +10A00 ; R # Lo KHAROSHTHI LETTER A +10A04 ; R # Cn +10A07..10A0B ; R # Cn [5] .. +10A10..10A13 ; R # Lo [4] KHAROSHTHI LETTER KA..KHAROSHTHI LETTER GHA +10A14 ; R # Cn +10A15..10A17 ; R # Lo [3] KHAROSHTHI LETTER CA..KHAROSHTHI LETTER JA +10A18 ; R # Cn +10A19..10A35 ; R # Lo [29] KHAROSHTHI LETTER NYA..KHAROSHTHI LETTER VHA +10A36..10A37 ; R # Cn [2] .. +10A3B..10A3E ; R # Cn [4] .. +10A40..10A48 ; R # No [9] KHAROSHTHI DIGIT ONE..KHAROSHTHI FRACTION ONE HALF +10A49..10A4F ; R # Cn [7] .. +10A50..10A58 ; R # Po [9] KHAROSHTHI PUNCTUATION DOT..KHAROSHTHI PUNCTUATION LINES +10A59..10A5F ; R # Cn [7] .. 
+10A60..10A7C ; R # Lo [29] OLD SOUTH ARABIAN LETTER HE..OLD SOUTH ARABIAN LETTER THETH +10A7D..10A7E ; R # No [2] OLD SOUTH ARABIAN NUMBER ONE..OLD SOUTH ARABIAN NUMBER FIFTY +10A7F ; R # Po OLD SOUTH ARABIAN NUMERIC INDICATOR +10A80..10A9C ; R # Lo [29] OLD NORTH ARABIAN LETTER HEH..OLD NORTH ARABIAN LETTER ZAH +10A9D..10A9F ; R # No [3] OLD NORTH ARABIAN NUMBER ONE..OLD NORTH ARABIAN NUMBER TWENTY +10AA0..10ABF ; R # Cn [32] .. +10AC0..10AC7 ; R # Lo [8] MANICHAEAN LETTER ALEPH..MANICHAEAN LETTER WAW +10AC8 ; R # So MANICHAEAN SIGN UD +10AC9..10AE4 ; R # Lo [28] MANICHAEAN LETTER ZAYIN..MANICHAEAN LETTER TAW +10AE7..10AEA ; R # Cn [4] .. +10AEB..10AEF ; R # No [5] MANICHAEAN NUMBER ONE..MANICHAEAN NUMBER ONE HUNDRED +10AF0..10AF6 ; R # Po [7] MANICHAEAN PUNCTUATION STAR..MANICHAEAN PUNCTUATION LINE FILLER +10AF7..10AFF ; R # Cn [9] .. +10B00..10B35 ; R # Lo [54] AVESTAN LETTER A..AVESTAN LETTER HE +10B36..10B38 ; R # Cn [3] .. +10B40..10B55 ; R # Lo [22] INSCRIPTIONAL PARTHIAN LETTER ALEPH..INSCRIPTIONAL PARTHIAN LETTER TAW +10B56..10B57 ; R # Cn [2] .. +10B58..10B5F ; R # No [8] INSCRIPTIONAL PARTHIAN NUMBER ONE..INSCRIPTIONAL PARTHIAN NUMBER ONE THOUSAND +10B60..10B72 ; R # Lo [19] INSCRIPTIONAL PAHLAVI LETTER ALEPH..INSCRIPTIONAL PAHLAVI LETTER TAW +10B73..10B77 ; R # Cn [5] .. +10B78..10B7F ; R # No [8] INSCRIPTIONAL PAHLAVI NUMBER ONE..INSCRIPTIONAL PAHLAVI NUMBER ONE THOUSAND +10B80..10B91 ; R # Lo [18] PSALTER PAHLAVI LETTER ALEPH..PSALTER PAHLAVI LETTER TAW +10B92..10B98 ; R # Cn [7] .. +10B99..10B9C ; R # Po [4] PSALTER PAHLAVI SECTION MARK..PSALTER PAHLAVI FOUR DOTS WITH DOT +10B9D..10BA8 ; R # Cn [12] .. +10BA9..10BAF ; R # No [7] PSALTER PAHLAVI NUMBER ONE..PSALTER PAHLAVI NUMBER ONE HUNDRED +10BB0..10BFF ; R # Cn [80] .. +10C00..10C48 ; R # Lo [73] OLD TURKIC LETTER ORKHON A..OLD TURKIC LETTER ORKHON BASH +10C49..10C7F ; R # Cn [55] .. 
+10C80..10CB2 ; R # L& [51] OLD HUNGARIAN CAPITAL LETTER A..OLD HUNGARIAN CAPITAL LETTER US +10CB3..10CBF ; R # Cn [13] .. +10CC0..10CF2 ; R # L& [51] OLD HUNGARIAN SMALL LETTER A..OLD HUNGARIAN SMALL LETTER US +10CF3..10CF9 ; R # Cn [7] .. +10CFA..10CFF ; R # No [6] OLD HUNGARIAN NUMBER ONE..OLD HUNGARIAN NUMBER ONE THOUSAND +10D40..10E5F ; R # Cn [288] .. +10E7F ; R # Cn +10E80..10EA9 ; R # Lo [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET +10EAA ; R # Cn +10EAD ; R # Pd YEZIDI HYPHENATION MARK +10EAE..10EAF ; R # Cn [2] .. +10EB0..10EB1 ; R # Lo [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE +10EB2..10EFF ; R # Cn [78] .. +10F00..10F1C ; R # Lo [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL +10F1D..10F26 ; R # No [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF +10F27 ; R # Lo OLD SOGDIAN LIGATURE AYIN-DALETH +10F28..10F2F ; R # Cn [8] .. +10F70..10F81 ; R # Lo [18] OLD UYGHUR LETTER ALEPH..OLD UYGHUR LETTER LESH +10F86..10F89 ; R # Po [4] OLD UYGHUR PUNCTUATION BAR..OLD UYGHUR PUNCTUATION FOUR DOTS +10F8A..10FAF ; R # Cn [38] .. +10FB0..10FC4 ; R # Lo [21] CHORASMIAN LETTER ALEPH..CHORASMIAN LETTER TAW +10FC5..10FCB ; R # No [7] CHORASMIAN NUMBER ONE..CHORASMIAN NUMBER ONE HUNDRED +10FCC..10FDF ; R # Cn [20] .. +10FE0..10FF6 ; R # Lo [23] ELYMAIC LETTER ALEPH..ELYMAIC LIGATURE ZAYIN-YODH +10FF7..10FFF ; R # Cn [9] .. +1E800..1E8C4 ; R # Lo [197] MENDE KIKAKUI SYLLABLE M001 KI..MENDE KIKAKUI SYLLABLE M060 NYON +1E8C5..1E8C6 ; R # Cn [2] .. +1E8C7..1E8CF ; R # No [9] MENDE KIKAKUI DIGIT ONE..MENDE KIKAKUI DIGIT NINE +1E8D7..1E8FF ; R # Cn [41] .. +1E900..1E943 ; R # L& [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA +1E94B ; R # Lm ADLAM NASALIZATION MARK +1E94C..1E94F ; R # Cn [4] .. +1E950..1E959 ; R # Nd [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE +1E95A..1E95D ; R # Cn [4] .. 
+1E95E..1E95F ; R # Po [2] ADLAM INITIAL EXCLAMATION MARK..ADLAM INITIAL QUESTION MARK +1E960..1EC6F ; R # Cn [784] .. +1ECC0..1ECFF ; R # Cn [64] .. +1ED50..1EDFF ; R # Cn [176] .. +1EF00..1EFFF ; R # Cn [256] .. + +# Total code points: 3711 + +# ================================================ + +# Bidi_Class=European_Number + +0030..0039 ; EN # Nd [10] DIGIT ZERO..DIGIT NINE +00B2..00B3 ; EN # No [2] SUPERSCRIPT TWO..SUPERSCRIPT THREE +00B9 ; EN # No SUPERSCRIPT ONE +06F0..06F9 ; EN # Nd [10] EXTENDED ARABIC-INDIC DIGIT ZERO..EXTENDED ARABIC-INDIC DIGIT NINE +2070 ; EN # No SUPERSCRIPT ZERO +2074..2079 ; EN # No [6] SUPERSCRIPT FOUR..SUPERSCRIPT NINE +2080..2089 ; EN # No [10] SUBSCRIPT ZERO..SUBSCRIPT NINE +2488..249B ; EN # No [20] DIGIT ONE FULL STOP..NUMBER TWENTY FULL STOP +FF10..FF19 ; EN # Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE +102E1..102FB ; EN # No [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED +1D7CE..1D7FF ; EN # Nd [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE +1F100..1F10A ; EN # No [11] DIGIT ZERO FULL STOP..DIGIT NINE COMMA +1FBF0..1FBF9 ; EN # Nd [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE + +# Total code points: 168 + +# ================================================ + +# Bidi_Class=European_Separator + +002B ; ES # Sm PLUS SIGN +002D ; ES # Pd HYPHEN-MINUS +207A..207B ; ES # Sm [2] SUPERSCRIPT PLUS SIGN..SUPERSCRIPT MINUS +208A..208B ; ES # Sm [2] SUBSCRIPT PLUS SIGN..SUBSCRIPT MINUS +2212 ; ES # Sm MINUS SIGN +FB29 ; ES # Sm HEBREW LETTER ALTERNATIVE PLUS SIGN +FE62 ; ES # Sm SMALL PLUS SIGN +FE63 ; ES # Pd SMALL HYPHEN-MINUS +FF0B ; ES # Sm FULLWIDTH PLUS SIGN +FF0D ; ES # Pd FULLWIDTH HYPHEN-MINUS + +# Total code points: 12 + +# ================================================ + +# Bidi_Class=European_Terminator + +0023 ; ET # Po NUMBER SIGN +0024 ; ET # Sc DOLLAR SIGN +0025 ; ET # Po PERCENT SIGN +00A2..00A5 ; ET # Sc [4] CENT SIGN..YEN SIGN +00B0 ; ET # So DEGREE SIGN +00B1 ; ET # Sm 
PLUS-MINUS SIGN +058F ; ET # Sc ARMENIAN DRAM SIGN +0609..060A ; ET # Po [2] ARABIC-INDIC PER MILLE SIGN..ARABIC-INDIC PER TEN THOUSAND SIGN +066A ; ET # Po ARABIC PERCENT SIGN +09F2..09F3 ; ET # Sc [2] BENGALI RUPEE MARK..BENGALI RUPEE SIGN +09FB ; ET # Sc BENGALI GANDA MARK +0AF1 ; ET # Sc GUJARATI RUPEE SIGN +0BF9 ; ET # Sc TAMIL RUPEE SIGN +0E3F ; ET # Sc THAI CURRENCY SYMBOL BAHT +17DB ; ET # Sc KHMER CURRENCY SYMBOL RIEL +2030..2034 ; ET # Po [5] PER MILLE SIGN..TRIPLE PRIME +20A0..20C0 ; ET # Sc [33] EURO-CURRENCY SIGN..SOM SIGN +20C1..20CF ; ET # Cn [15] .. +212E ; ET # So ESTIMATED SYMBOL +2213 ; ET # Sm MINUS-OR-PLUS SIGN +A838 ; ET # Sc NORTH INDIC RUPEE MARK +A839 ; ET # So NORTH INDIC QUANTITY MARK +FE5F ; ET # Po SMALL NUMBER SIGN +FE69 ; ET # Sc SMALL DOLLAR SIGN +FE6A ; ET # Po SMALL PERCENT SIGN +FF03 ; ET # Po FULLWIDTH NUMBER SIGN +FF04 ; ET # Sc FULLWIDTH DOLLAR SIGN +FF05 ; ET # Po FULLWIDTH PERCENT SIGN +FFE0..FFE1 ; ET # Sc [2] FULLWIDTH CENT SIGN..FULLWIDTH POUND SIGN +FFE5..FFE6 ; ET # Sc [2] FULLWIDTH YEN SIGN..FULLWIDTH WON SIGN +11FDD..11FE0 ; ET # Sc [4] TAMIL SIGN KAACU..TAMIL SIGN VARAAKAN +1E2FF ; ET # Sc WANCHO NGUN SIGN + +# Total code points: 92 + +# ================================================ + +# Bidi_Class=Arabic_Number + +0600..0605 ; AN # Cf [6] ARABIC NUMBER SIGN..ARABIC NUMBER MARK ABOVE +0660..0669 ; AN # Nd [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE +066B..066C ; AN # Po [2] ARABIC DECIMAL SEPARATOR..ARABIC THOUSANDS SEPARATOR +06DD ; AN # Cf ARABIC END OF AYAH +0890..0891 ; AN # Cf [2] ARABIC POUND MARK ABOVE..ARABIC PIASTRE MARK ABOVE +08E2 ; AN # Cf ARABIC DISPUTED END OF AYAH +10D30..10D39 ; AN # Nd [10] HANIFI ROHINGYA DIGIT ZERO..HANIFI ROHINGYA DIGIT NINE +10E60..10E7E ; AN # No [31] RUMI DIGIT ONE..RUMI FRACTION TWO THIRDS + +# Total code points: 63 + +# ================================================ + +# Bidi_Class=Common_Separator + +002C ; CS # Po COMMA +002E..002F ; CS # Po [2] FULL 
STOP..SOLIDUS +003A ; CS # Po COLON +00A0 ; CS # Zs NO-BREAK SPACE +060C ; CS # Po ARABIC COMMA +202F ; CS # Zs NARROW NO-BREAK SPACE +2044 ; CS # Sm FRACTION SLASH +FE50 ; CS # Po SMALL COMMA +FE52 ; CS # Po SMALL FULL STOP +FE55 ; CS # Po SMALL COLON +FF0C ; CS # Po FULLWIDTH COMMA +FF0E..FF0F ; CS # Po [2] FULLWIDTH FULL STOP..FULLWIDTH SOLIDUS +FF1A ; CS # Po FULLWIDTH COLON + +# Total code points: 15 + +# ================================================ + +# Bidi_Class=Paragraph_Separator + +000A ; B # Cc +000D ; B # Cc +001C..001E ; B # Cc [3] .. +0085 ; B # Cc +2029 ; B # Zp PARAGRAPH SEPARATOR + +# Total code points: 7 + +# ================================================ + +# Bidi_Class=Segment_Separator + +0009 ; S # Cc +000B ; S # Cc +001F ; S # Cc + +# Total code points: 3 + +# ================================================ + +# Bidi_Class=White_Space + +000C ; WS # Cc +0020 ; WS # Zs SPACE +1680 ; WS # Zs OGHAM SPACE MARK +2000..200A ; WS # Zs [11] EN QUAD..HAIR SPACE +2028 ; WS # Zl LINE SEPARATOR +205F ; WS # Zs MEDIUM MATHEMATICAL SPACE +3000 ; WS # Zs IDEOGRAPHIC SPACE + +# Total code points: 17 + +# ================================================ + +# Bidi_Class=Other_Neutral + +0021..0022 ; ON # Po [2] EXCLAMATION MARK..QUOTATION MARK +0026..0027 ; ON # Po [2] AMPERSAND..APOSTROPHE +0028 ; ON # Ps LEFT PARENTHESIS +0029 ; ON # Pe RIGHT PARENTHESIS +002A ; ON # Po ASTERISK +003B ; ON # Po SEMICOLON +003C..003E ; ON # Sm [3] LESS-THAN SIGN..GREATER-THAN SIGN +003F..0040 ; ON # Po [2] QUESTION MARK..COMMERCIAL AT +005B ; ON # Ps LEFT SQUARE BRACKET +005C ; ON # Po REVERSE SOLIDUS +005D ; ON # Pe RIGHT SQUARE BRACKET +005E ; ON # Sk CIRCUMFLEX ACCENT +005F ; ON # Pc LOW LINE +0060 ; ON # Sk GRAVE ACCENT +007B ; ON # Ps LEFT CURLY BRACKET +007C ; ON # Sm VERTICAL LINE +007D ; ON # Pe RIGHT CURLY BRACKET +007E ; ON # Sm TILDE +00A1 ; ON # Po INVERTED EXCLAMATION MARK +00A6 ; ON # So BROKEN BAR +00A7 ; ON # Po SECTION SIGN +00A8 ; ON # Sk DIAERESIS 
+00A9 ; ON # So COPYRIGHT SIGN +00AB ; ON # Pi LEFT-POINTING DOUBLE ANGLE QUOTATION MARK +00AC ; ON # Sm NOT SIGN +00AE ; ON # So REGISTERED SIGN +00AF ; ON # Sk MACRON +00B4 ; ON # Sk ACUTE ACCENT +00B6..00B7 ; ON # Po [2] PILCROW SIGN..MIDDLE DOT +00B8 ; ON # Sk CEDILLA +00BB ; ON # Pf RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK +00BC..00BE ; ON # No [3] VULGAR FRACTION ONE QUARTER..VULGAR FRACTION THREE QUARTERS +00BF ; ON # Po INVERTED QUESTION MARK +00D7 ; ON # Sm MULTIPLICATION SIGN +00F7 ; ON # Sm DIVISION SIGN +02B9..02BA ; ON # Lm [2] MODIFIER LETTER PRIME..MODIFIER LETTER DOUBLE PRIME +02C2..02C5 ; ON # Sk [4] MODIFIER LETTER LEFT ARROWHEAD..MODIFIER LETTER DOWN ARROWHEAD +02C6..02CF ; ON # Lm [10] MODIFIER LETTER CIRCUMFLEX ACCENT..MODIFIER LETTER LOW ACUTE ACCENT +02D2..02DF ; ON # Sk [14] MODIFIER LETTER CENTRED RIGHT HALF RING..MODIFIER LETTER CROSS ACCENT +02E5..02EB ; ON # Sk [7] MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER YANG DEPARTING TONE MARK +02EC ; ON # Lm MODIFIER LETTER VOICING +02ED ; ON # Sk MODIFIER LETTER UNASPIRATED +02EF..02FF ; ON # Sk [17] MODIFIER LETTER LOW DOWN ARROWHEAD..MODIFIER LETTER LOW LEFT ARROW +0374 ; ON # Lm GREEK NUMERAL SIGN +0375 ; ON # Sk GREEK LOWER NUMERAL SIGN +037E ; ON # Po GREEK QUESTION MARK +0384..0385 ; ON # Sk [2] GREEK TONOS..GREEK DIALYTIKA TONOS +0387 ; ON # Po GREEK ANO TELEIA +03F6 ; ON # Sm GREEK REVERSED LUNATE EPSILON SYMBOL +058A ; ON # Pd ARMENIAN HYPHEN +058D..058E ; ON # So [2] RIGHT-FACING ARMENIAN ETERNITY SIGN..LEFT-FACING ARMENIAN ETERNITY SIGN +0606..0607 ; ON # Sm [2] ARABIC-INDIC CUBE ROOT..ARABIC-INDIC FOURTH ROOT +060E..060F ; ON # So [2] ARABIC POETIC VERSE SIGN..ARABIC SIGN MISRA +06DE ; ON # So ARABIC START OF RUB EL HIZB +06E9 ; ON # So ARABIC PLACE OF SAJDAH +07F6 ; ON # So NKO SYMBOL OO DENNEN +07F7..07F9 ; ON # Po [3] NKO SYMBOL GBAKURUNEN..NKO EXCLAMATION MARK +0BF3..0BF8 ; ON # So [6] TAMIL DAY SIGN..TAMIL AS ABOVE SIGN +0BFA ; ON # So TAMIL NUMBER SIGN +0C78..0C7E ; 
ON # No [7] TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR..TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR +0F3A ; ON # Ps TIBETAN MARK GUG RTAGS GYON +0F3B ; ON # Pe TIBETAN MARK GUG RTAGS GYAS +0F3C ; ON # Ps TIBETAN MARK ANG KHANG GYON +0F3D ; ON # Pe TIBETAN MARK ANG KHANG GYAS +1390..1399 ; ON # So [10] ETHIOPIC TONAL MARK YIZET..ETHIOPIC TONAL MARK KURT +1400 ; ON # Pd CANADIAN SYLLABICS HYPHEN +169B ; ON # Ps OGHAM FEATHER MARK +169C ; ON # Pe OGHAM REVERSED FEATHER MARK +17F0..17F9 ; ON # No [10] KHMER SYMBOL LEK ATTAK SON..KHMER SYMBOL LEK ATTAK PRAM-BUON +1800..1805 ; ON # Po [6] MONGOLIAN BIRGA..MONGOLIAN FOUR DOTS +1806 ; ON # Pd MONGOLIAN TODO SOFT HYPHEN +1807..180A ; ON # Po [4] MONGOLIAN SIBE SYLLABLE BOUNDARY MARKER..MONGOLIAN NIRUGU +1940 ; ON # So LIMBU SIGN LOO +1944..1945 ; ON # Po [2] LIMBU EXCLAMATION MARK..LIMBU QUESTION MARK +19DE..19FF ; ON # So [34] NEW TAI LUE SIGN LAE..KHMER SYMBOL DAP-PRAM ROC +1FBD ; ON # Sk GREEK KORONIS +1FBF..1FC1 ; ON # Sk [3] GREEK PSILI..GREEK DIALYTIKA AND PERISPOMENI +1FCD..1FCF ; ON # Sk [3] GREEK PSILI AND VARIA..GREEK PSILI AND PERISPOMENI +1FDD..1FDF ; ON # Sk [3] GREEK DASIA AND VARIA..GREEK DASIA AND PERISPOMENI +1FED..1FEF ; ON # Sk [3] GREEK DIALYTIKA AND VARIA..GREEK VARIA +1FFD..1FFE ; ON # Sk [2] GREEK OXIA..GREEK DASIA +2010..2015 ; ON # Pd [6] HYPHEN..HORIZONTAL BAR +2016..2017 ; ON # Po [2] DOUBLE VERTICAL LINE..DOUBLE LOW LINE +2018 ; ON # Pi LEFT SINGLE QUOTATION MARK +2019 ; ON # Pf RIGHT SINGLE QUOTATION MARK +201A ; ON # Ps SINGLE LOW-9 QUOTATION MARK +201B..201C ; ON # Pi [2] SINGLE HIGH-REVERSED-9 QUOTATION MARK..LEFT DOUBLE QUOTATION MARK +201D ; ON # Pf RIGHT DOUBLE QUOTATION MARK +201E ; ON # Ps DOUBLE LOW-9 QUOTATION MARK +201F ; ON # Pi DOUBLE HIGH-REVERSED-9 QUOTATION MARK +2020..2027 ; ON # Po [8] DAGGER..HYPHENATION POINT +2035..2038 ; ON # Po [4] REVERSED PRIME..CARET +2039 ; ON # Pi SINGLE LEFT-POINTING ANGLE QUOTATION MARK +203A ; ON # Pf SINGLE RIGHT-POINTING ANGLE 
QUOTATION MARK +203B..203E ; ON # Po [4] REFERENCE MARK..OVERLINE +203F..2040 ; ON # Pc [2] UNDERTIE..CHARACTER TIE +2041..2043 ; ON # Po [3] CARET INSERTION POINT..HYPHEN BULLET +2045 ; ON # Ps LEFT SQUARE BRACKET WITH QUILL +2046 ; ON # Pe RIGHT SQUARE BRACKET WITH QUILL +2047..2051 ; ON # Po [11] DOUBLE QUESTION MARK..TWO ASTERISKS ALIGNED VERTICALLY +2052 ; ON # Sm COMMERCIAL MINUS SIGN +2053 ; ON # Po SWUNG DASH +2054 ; ON # Pc INVERTED UNDERTIE +2055..205E ; ON # Po [10] FLOWER PUNCTUATION MARK..VERTICAL FOUR DOTS +207C ; ON # Sm SUPERSCRIPT EQUALS SIGN +207D ; ON # Ps SUPERSCRIPT LEFT PARENTHESIS +207E ; ON # Pe SUPERSCRIPT RIGHT PARENTHESIS +208C ; ON # Sm SUBSCRIPT EQUALS SIGN +208D ; ON # Ps SUBSCRIPT LEFT PARENTHESIS +208E ; ON # Pe SUBSCRIPT RIGHT PARENTHESIS +2100..2101 ; ON # So [2] ACCOUNT OF..ADDRESSED TO THE SUBJECT +2103..2106 ; ON # So [4] DEGREE CELSIUS..CADA UNA +2108..2109 ; ON # So [2] SCRUPLE..DEGREE FAHRENHEIT +2114 ; ON # So L B BAR SYMBOL +2116..2117 ; ON # So [2] NUMERO SIGN..SOUND RECORDING COPYRIGHT +2118 ; ON # Sm SCRIPT CAPITAL P +211E..2123 ; ON # So [6] PRESCRIPTION TAKE..VERSICLE +2125 ; ON # So OUNCE SIGN +2127 ; ON # So INVERTED OHM SIGN +2129 ; ON # So TURNED GREEK SMALL LETTER IOTA +213A..213B ; ON # So [2] ROTATED CAPITAL Q..FACSIMILE SIGN +2140..2144 ; ON # Sm [5] DOUBLE-STRUCK N-ARY SUMMATION..TURNED SANS-SERIF CAPITAL Y +214A ; ON # So PROPERTY LINE +214B ; ON # Sm TURNED AMPERSAND +214C..214D ; ON # So [2] PER SIGN..AKTIESELSKAB +2150..215F ; ON # No [16] VULGAR FRACTION ONE SEVENTH..FRACTION NUMERATOR ONE +2189 ; ON # No VULGAR FRACTION ZERO THIRDS +218A..218B ; ON # So [2] TURNED DIGIT TWO..TURNED DIGIT THREE +2190..2194 ; ON # Sm [5] LEFTWARDS ARROW..LEFT RIGHT ARROW +2195..2199 ; ON # So [5] UP DOWN ARROW..SOUTH WEST ARROW +219A..219B ; ON # Sm [2] LEFTWARDS ARROW WITH STROKE..RIGHTWARDS ARROW WITH STROKE +219C..219F ; ON # So [4] LEFTWARDS WAVE ARROW..UPWARDS TWO HEADED ARROW +21A0 ; ON # Sm RIGHTWARDS TWO HEADED 
ARROW +21A1..21A2 ; ON # So [2] DOWNWARDS TWO HEADED ARROW..LEFTWARDS ARROW WITH TAIL +21A3 ; ON # Sm RIGHTWARDS ARROW WITH TAIL +21A4..21A5 ; ON # So [2] LEFTWARDS ARROW FROM BAR..UPWARDS ARROW FROM BAR +21A6 ; ON # Sm RIGHTWARDS ARROW FROM BAR +21A7..21AD ; ON # So [7] DOWNWARDS ARROW FROM BAR..LEFT RIGHT WAVE ARROW +21AE ; ON # Sm LEFT RIGHT ARROW WITH STROKE +21AF..21CD ; ON # So [31] DOWNWARDS ZIGZAG ARROW..LEFTWARDS DOUBLE ARROW WITH STROKE +21CE..21CF ; ON # Sm [2] LEFT RIGHT DOUBLE ARROW WITH STROKE..RIGHTWARDS DOUBLE ARROW WITH STROKE +21D0..21D1 ; ON # So [2] LEFTWARDS DOUBLE ARROW..UPWARDS DOUBLE ARROW +21D2 ; ON # Sm RIGHTWARDS DOUBLE ARROW +21D3 ; ON # So DOWNWARDS DOUBLE ARROW +21D4 ; ON # Sm LEFT RIGHT DOUBLE ARROW +21D5..21F3 ; ON # So [31] UP DOWN DOUBLE ARROW..UP DOWN WHITE ARROW +21F4..2211 ; ON # Sm [30] RIGHT ARROW WITH SMALL CIRCLE..N-ARY SUMMATION +2214..22FF ; ON # Sm [236] DOT PLUS..Z NOTATION BAG MEMBERSHIP +2300..2307 ; ON # So [8] DIAMETER SIGN..WAVY LINE +2308 ; ON # Ps LEFT CEILING +2309 ; ON # Pe RIGHT CEILING +230A ; ON # Ps LEFT FLOOR +230B ; ON # Pe RIGHT FLOOR +230C..231F ; ON # So [20] BOTTOM RIGHT CROP..BOTTOM RIGHT CORNER +2320..2321 ; ON # Sm [2] TOP HALF INTEGRAL..BOTTOM HALF INTEGRAL +2322..2328 ; ON # So [7] FROWN..KEYBOARD +2329 ; ON # Ps LEFT-POINTING ANGLE BRACKET +232A ; ON # Pe RIGHT-POINTING ANGLE BRACKET +232B..2335 ; ON # So [11] ERASE TO THE LEFT..COUNTERSINK +237B ; ON # So NOT CHECK MARK +237C ; ON # Sm RIGHT ANGLE WITH DOWNWARDS ZIGZAG ARROW +237D..2394 ; ON # So [24] SHOULDERED OPEN BOX..SOFTWARE-FUNCTION SYMBOL +2396..239A ; ON # So [5] DECIMAL SEPARATOR KEY SYMBOL..CLEAR SCREEN SYMBOL +239B..23B3 ; ON # Sm [25] LEFT PARENTHESIS UPPER HOOK..SUMMATION BOTTOM +23B4..23DB ; ON # So [40] TOP SQUARE BRACKET..FUSE +23DC..23E1 ; ON # Sm [6] TOP PARENTHESIS..BOTTOM TORTOISE SHELL BRACKET +23E2..2426 ; ON # So [69] WHITE TRAPEZIUM..SYMBOL FOR SUBSTITUTE FORM TWO +2440..244A ; ON # So [11] OCR HOOK..OCR DOUBLE BACKSLASH 
+2460..2487 ; ON # No [40] CIRCLED DIGIT ONE..PARENTHESIZED NUMBER TWENTY +24EA..24FF ; ON # No [22] CIRCLED DIGIT ZERO..NEGATIVE CIRCLED DIGIT ZERO +2500..25B6 ; ON # So [183] BOX DRAWINGS LIGHT HORIZONTAL..BLACK RIGHT-POINTING TRIANGLE +25B7 ; ON # Sm WHITE RIGHT-POINTING TRIANGLE +25B8..25C0 ; ON # So [9] BLACK RIGHT-POINTING SMALL TRIANGLE..BLACK LEFT-POINTING TRIANGLE +25C1 ; ON # Sm WHITE LEFT-POINTING TRIANGLE +25C2..25F7 ; ON # So [54] BLACK LEFT-POINTING SMALL TRIANGLE..WHITE CIRCLE WITH UPPER RIGHT QUADRANT +25F8..25FF ; ON # Sm [8] UPPER LEFT TRIANGLE..LOWER RIGHT TRIANGLE +2600..266E ; ON # So [111] BLACK SUN WITH RAYS..MUSIC NATURAL SIGN +266F ; ON # Sm MUSIC SHARP SIGN +2670..26AB ; ON # So [60] WEST SYRIAC CROSS..MEDIUM BLACK CIRCLE +26AD..2767 ; ON # So [187] MARRIAGE SYMBOL..ROTATED FLORAL HEART BULLET +2768 ; ON # Ps MEDIUM LEFT PARENTHESIS ORNAMENT +2769 ; ON # Pe MEDIUM RIGHT PARENTHESIS ORNAMENT +276A ; ON # Ps MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT +276B ; ON # Pe MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT +276C ; ON # Ps MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT +276D ; ON # Pe MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT +276E ; ON # Ps HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT +276F ; ON # Pe HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT +2770 ; ON # Ps HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT +2771 ; ON # Pe HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT +2772 ; ON # Ps LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT +2773 ; ON # Pe LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT +2774 ; ON # Ps MEDIUM LEFT CURLY BRACKET ORNAMENT +2775 ; ON # Pe MEDIUM RIGHT CURLY BRACKET ORNAMENT +2776..2793 ; ON # No [30] DINGBAT NEGATIVE CIRCLED DIGIT ONE..DINGBAT NEGATIVE CIRCLED SANS-SERIF NUMBER TEN +2794..27BF ; ON # So [44] HEAVY WIDE-HEADED RIGHTWARDS ARROW..DOUBLE CURLY LOOP +27C0..27C4 ; ON # Sm [5] THREE DIMENSIONAL ANGLE..OPEN SUPERSET +27C5 ; ON # Ps LEFT S-SHAPED BAG DELIMITER +27C6 ; ON # Pe RIGHT S-SHAPED BAG DELIMITER 
+27C7..27E5 ; ON # Sm [31] OR WITH DOT INSIDE..WHITE SQUARE WITH RIGHTWARDS TICK +27E6 ; ON # Ps MATHEMATICAL LEFT WHITE SQUARE BRACKET +27E7 ; ON # Pe MATHEMATICAL RIGHT WHITE SQUARE BRACKET +27E8 ; ON # Ps MATHEMATICAL LEFT ANGLE BRACKET +27E9 ; ON # Pe MATHEMATICAL RIGHT ANGLE BRACKET +27EA ; ON # Ps MATHEMATICAL LEFT DOUBLE ANGLE BRACKET +27EB ; ON # Pe MATHEMATICAL RIGHT DOUBLE ANGLE BRACKET +27EC ; ON # Ps MATHEMATICAL LEFT WHITE TORTOISE SHELL BRACKET +27ED ; ON # Pe MATHEMATICAL RIGHT WHITE TORTOISE SHELL BRACKET +27EE ; ON # Ps MATHEMATICAL LEFT FLATTENED PARENTHESIS +27EF ; ON # Pe MATHEMATICAL RIGHT FLATTENED PARENTHESIS +27F0..27FF ; ON # Sm [16] UPWARDS QUADRUPLE ARROW..LONG RIGHTWARDS SQUIGGLE ARROW +2900..2982 ; ON # Sm [131] RIGHTWARDS TWO-HEADED ARROW WITH VERTICAL STROKE..Z NOTATION TYPE COLON +2983 ; ON # Ps LEFT WHITE CURLY BRACKET +2984 ; ON # Pe RIGHT WHITE CURLY BRACKET +2985 ; ON # Ps LEFT WHITE PARENTHESIS +2986 ; ON # Pe RIGHT WHITE PARENTHESIS +2987 ; ON # Ps Z NOTATION LEFT IMAGE BRACKET +2988 ; ON # Pe Z NOTATION RIGHT IMAGE BRACKET +2989 ; ON # Ps Z NOTATION LEFT BINDING BRACKET +298A ; ON # Pe Z NOTATION RIGHT BINDING BRACKET +298B ; ON # Ps LEFT SQUARE BRACKET WITH UNDERBAR +298C ; ON # Pe RIGHT SQUARE BRACKET WITH UNDERBAR +298D ; ON # Ps LEFT SQUARE BRACKET WITH TICK IN TOP CORNER +298E ; ON # Pe RIGHT SQUARE BRACKET WITH TICK IN BOTTOM CORNER +298F ; ON # Ps LEFT SQUARE BRACKET WITH TICK IN BOTTOM CORNER +2990 ; ON # Pe RIGHT SQUARE BRACKET WITH TICK IN TOP CORNER +2991 ; ON # Ps LEFT ANGLE BRACKET WITH DOT +2992 ; ON # Pe RIGHT ANGLE BRACKET WITH DOT +2993 ; ON # Ps LEFT ARC LESS-THAN BRACKET +2994 ; ON # Pe RIGHT ARC GREATER-THAN BRACKET +2995 ; ON # Ps DOUBLE LEFT ARC GREATER-THAN BRACKET +2996 ; ON # Pe DOUBLE RIGHT ARC LESS-THAN BRACKET +2997 ; ON # Ps LEFT BLACK TORTOISE SHELL BRACKET +2998 ; ON # Pe RIGHT BLACK TORTOISE SHELL BRACKET +2999..29D7 ; ON # Sm [63] DOTTED FENCE..BLACK HOURGLASS +29D8 ; ON # Ps LEFT WIGGLY FENCE 
+29D9 ; ON # Pe RIGHT WIGGLY FENCE +29DA ; ON # Ps LEFT DOUBLE WIGGLY FENCE +29DB ; ON # Pe RIGHT DOUBLE WIGGLY FENCE +29DC..29FB ; ON # Sm [32] INCOMPLETE INFINITY..TRIPLE PLUS +29FC ; ON # Ps LEFT-POINTING CURVED ANGLE BRACKET +29FD ; ON # Pe RIGHT-POINTING CURVED ANGLE BRACKET +29FE..2AFF ; ON # Sm [258] TINY..N-ARY WHITE VERTICAL BAR +2B00..2B2F ; ON # So [48] NORTH EAST WHITE ARROW..WHITE VERTICAL ELLIPSE +2B30..2B44 ; ON # Sm [21] LEFT ARROW WITH SMALL CIRCLE..RIGHTWARDS ARROW THROUGH SUPERSET +2B45..2B46 ; ON # So [2] LEFTWARDS QUADRUPLE ARROW..RIGHTWARDS QUADRUPLE ARROW +2B47..2B4C ; ON # Sm [6] REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW..RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR +2B4D..2B73 ; ON # So [39] DOWNWARDS TRIANGLE-HEADED ZIGZAG ARROW..DOWNWARDS TRIANGLE-HEADED ARROW TO BAR +2B76..2B95 ; ON # So [32] NORTH WEST TRIANGLE-HEADED ARROW TO BAR..RIGHTWARDS BLACK ARROW +2B97..2BFF ; ON # So [105] SYMBOL FOR TYPE A ELECTRONICS..HELLSCHREIBER PAUSE SYMBOL +2CE5..2CEA ; ON # So [6] COPTIC SYMBOL MI RO..COPTIC SYMBOL SHIMA SIMA +2CF9..2CFC ; ON # Po [4] COPTIC OLD NUBIAN FULL STOP..COPTIC OLD NUBIAN VERSE DIVIDER +2CFD ; ON # No COPTIC FRACTION ONE HALF +2CFE..2CFF ; ON # Po [2] COPTIC FULL STOP..COPTIC MORPHOLOGICAL DIVIDER +2E00..2E01 ; ON # Po [2] RIGHT ANGLE SUBSTITUTION MARKER..RIGHT ANGLE DOTTED SUBSTITUTION MARKER +2E02 ; ON # Pi LEFT SUBSTITUTION BRACKET +2E03 ; ON # Pf RIGHT SUBSTITUTION BRACKET +2E04 ; ON # Pi LEFT DOTTED SUBSTITUTION BRACKET +2E05 ; ON # Pf RIGHT DOTTED SUBSTITUTION BRACKET +2E06..2E08 ; ON # Po [3] RAISED INTERPOLATION MARKER..DOTTED TRANSPOSITION MARKER +2E09 ; ON # Pi LEFT TRANSPOSITION BRACKET +2E0A ; ON # Pf RIGHT TRANSPOSITION BRACKET +2E0B ; ON # Po RAISED SQUARE +2E0C ; ON # Pi LEFT RAISED OMISSION BRACKET +2E0D ; ON # Pf RIGHT RAISED OMISSION BRACKET +2E0E..2E16 ; ON # Po [9] EDITORIAL CORONIS..DOTTED RIGHT-POINTING ANGLE +2E17 ; ON # Pd DOUBLE OBLIQUE HYPHEN +2E18..2E19 ; ON # Po [2] INVERTED INTERROBANG..PALM 
BRANCH +2E1A ; ON # Pd HYPHEN WITH DIAERESIS +2E1B ; ON # Po TILDE WITH RING ABOVE +2E1C ; ON # Pi LEFT LOW PARAPHRASE BRACKET +2E1D ; ON # Pf RIGHT LOW PARAPHRASE BRACKET +2E1E..2E1F ; ON # Po [2] TILDE WITH DOT ABOVE..TILDE WITH DOT BELOW +2E20 ; ON # Pi LEFT VERTICAL BAR WITH QUILL +2E21 ; ON # Pf RIGHT VERTICAL BAR WITH QUILL +2E22 ; ON # Ps TOP LEFT HALF BRACKET +2E23 ; ON # Pe TOP RIGHT HALF BRACKET +2E24 ; ON # Ps BOTTOM LEFT HALF BRACKET +2E25 ; ON # Pe BOTTOM RIGHT HALF BRACKET +2E26 ; ON # Ps LEFT SIDEWAYS U BRACKET +2E27 ; ON # Pe RIGHT SIDEWAYS U BRACKET +2E28 ; ON # Ps LEFT DOUBLE PARENTHESIS +2E29 ; ON # Pe RIGHT DOUBLE PARENTHESIS +2E2A..2E2E ; ON # Po [5] TWO DOTS OVER ONE DOT PUNCTUATION..REVERSED QUESTION MARK +2E2F ; ON # Lm VERTICAL TILDE +2E30..2E39 ; ON # Po [10] RING POINT..TOP HALF SECTION SIGN +2E3A..2E3B ; ON # Pd [2] TWO-EM DASH..THREE-EM DASH +2E3C..2E3F ; ON # Po [4] STENOGRAPHIC FULL STOP..CAPITULUM +2E40 ; ON # Pd DOUBLE HYPHEN +2E41 ; ON # Po REVERSED COMMA +2E42 ; ON # Ps DOUBLE LOW-REVERSED-9 QUOTATION MARK +2E43..2E4F ; ON # Po [13] DASH WITH LEFT UPTURN..CORNISH VERSE DIVIDER +2E50..2E51 ; ON # So [2] CROSS PATTY WITH RIGHT CROSSBAR..CROSS PATTY WITH LEFT CROSSBAR +2E52..2E54 ; ON # Po [3] TIRONIAN SIGN CAPITAL ET..MEDIEVAL QUESTION MARK +2E55 ; ON # Ps LEFT SQUARE BRACKET WITH STROKE +2E56 ; ON # Pe RIGHT SQUARE BRACKET WITH STROKE +2E57 ; ON # Ps LEFT SQUARE BRACKET WITH DOUBLE STROKE +2E58 ; ON # Pe RIGHT SQUARE BRACKET WITH DOUBLE STROKE +2E59 ; ON # Ps TOP HALF LEFT PARENTHESIS +2E5A ; ON # Pe TOP HALF RIGHT PARENTHESIS +2E5B ; ON # Ps BOTTOM HALF LEFT PARENTHESIS +2E5C ; ON # Pe BOTTOM HALF RIGHT PARENTHESIS +2E5D ; ON # Pd OBLIQUE HYPHEN +2E80..2E99 ; ON # So [26] CJK RADICAL REPEAT..CJK RADICAL RAP +2E9B..2EF3 ; ON # So [89] CJK RADICAL CHOKE..CJK RADICAL C-SIMPLIFIED TURTLE +2F00..2FD5 ; ON # So [214] KANGXI RADICAL ONE..KANGXI RADICAL FLUTE +2FF0..2FFB ; ON # So [12] IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO 
RIGHT..IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID +3001..3003 ; ON # Po [3] IDEOGRAPHIC COMMA..DITTO MARK +3004 ; ON # So JAPANESE INDUSTRIAL STANDARD SYMBOL +3008 ; ON # Ps LEFT ANGLE BRACKET +3009 ; ON # Pe RIGHT ANGLE BRACKET +300A ; ON # Ps LEFT DOUBLE ANGLE BRACKET +300B ; ON # Pe RIGHT DOUBLE ANGLE BRACKET +300C ; ON # Ps LEFT CORNER BRACKET +300D ; ON # Pe RIGHT CORNER BRACKET +300E ; ON # Ps LEFT WHITE CORNER BRACKET +300F ; ON # Pe RIGHT WHITE CORNER BRACKET +3010 ; ON # Ps LEFT BLACK LENTICULAR BRACKET +3011 ; ON # Pe RIGHT BLACK LENTICULAR BRACKET +3012..3013 ; ON # So [2] POSTAL MARK..GETA MARK +3014 ; ON # Ps LEFT TORTOISE SHELL BRACKET +3015 ; ON # Pe RIGHT TORTOISE SHELL BRACKET +3016 ; ON # Ps LEFT WHITE LENTICULAR BRACKET +3017 ; ON # Pe RIGHT WHITE LENTICULAR BRACKET +3018 ; ON # Ps LEFT WHITE TORTOISE SHELL BRACKET +3019 ; ON # Pe RIGHT WHITE TORTOISE SHELL BRACKET +301A ; ON # Ps LEFT WHITE SQUARE BRACKET +301B ; ON # Pe RIGHT WHITE SQUARE BRACKET +301C ; ON # Pd WAVE DASH +301D ; ON # Ps REVERSED DOUBLE PRIME QUOTATION MARK +301E..301F ; ON # Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK +3020 ; ON # So POSTAL MARK FACE +3030 ; ON # Pd WAVY DASH +3036..3037 ; ON # So [2] CIRCLED POSTAL MARK..IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL +303D ; ON # Po PART ALTERNATION MARK +303E..303F ; ON # So [2] IDEOGRAPHIC VARIATION INDICATOR..IDEOGRAPHIC HALF FILL SPACE +309B..309C ; ON # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK +30A0 ; ON # Pd KATAKANA-HIRAGANA DOUBLE HYPHEN +30FB ; ON # Po KATAKANA MIDDLE DOT +31C0..31E3 ; ON # So [36] CJK STROKE T..CJK STROKE Q +321D..321E ; ON # So [2] PARENTHESIZED KOREAN CHARACTER OJEON..PARENTHESIZED KOREAN CHARACTER O HU +3250 ; ON # So PARTNERSHIP SIGN +3251..325F ; ON # No [15] CIRCLED NUMBER TWENTY ONE..CIRCLED NUMBER THIRTY FIVE +327C..327E ; ON # So [3] CIRCLED KOREAN CHARACTER CHAMKO..CIRCLED HANGUL IEUNG U +32B1..32BF ; ON # No [15] 
CIRCLED NUMBER THIRTY SIX..CIRCLED NUMBER FIFTY +32CC..32CF ; ON # So [4] SQUARE HG..LIMITED LIABILITY SIGN +3377..337A ; ON # So [4] SQUARE DM..SQUARE IU +33DE..33DF ; ON # So [2] SQUARE V OVER M..SQUARE A OVER M +33FF ; ON # So SQUARE GAL +4DC0..4DFF ; ON # So [64] HEXAGRAM FOR THE CREATIVE HEAVEN..HEXAGRAM FOR BEFORE COMPLETION +A490..A4C6 ; ON # So [55] YI RADICAL QOT..YI RADICAL KE +A60D..A60F ; ON # Po [3] VAI COMMA..VAI QUESTION MARK +A673 ; ON # Po SLAVONIC ASTERISK +A67E ; ON # Po CYRILLIC KAVYKA +A67F ; ON # Lm CYRILLIC PAYEROK +A700..A716 ; ON # Sk [23] MODIFIER LETTER CHINESE TONE YIN PING..MODIFIER LETTER EXTRA-LOW LEFT-STEM TONE BAR +A717..A71F ; ON # Lm [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK +A720..A721 ; ON # Sk [2] MODIFIER LETTER STRESS AND HIGH TONE..MODIFIER LETTER STRESS AND LOW TONE +A788 ; ON # Lm MODIFIER LETTER LOW CIRCUMFLEX ACCENT +A828..A82B ; ON # So [4] SYLOTI NAGRI POETRY MARK-1..SYLOTI NAGRI POETRY MARK-4 +A874..A877 ; ON # Po [4] PHAGS-PA SINGLE HEAD MARK..PHAGS-PA MARK DOUBLE SHAD +AB6A..AB6B ; ON # Sk [2] MODIFIER LETTER LEFT TACK..MODIFIER LETTER RIGHT TACK +FD3E ; ON # Pe ORNATE LEFT PARENTHESIS +FD3F ; ON # Ps ORNATE RIGHT PARENTHESIS +FD40..FD4F ; ON # So [16] ARABIC LIGATURE RAHIMAHU ALLAAH..ARABIC LIGATURE RAHIMAHUM ALLAAH +FDCF ; ON # So ARABIC LIGATURE SALAAMUHU ALAYNAA +FDFD..FDFF ; ON # So [3] ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM..ARABIC LIGATURE AZZA WA JALL +FE10..FE16 ; ON # Po [7] PRESENTATION FORM FOR VERTICAL COMMA..PRESENTATION FORM FOR VERTICAL QUESTION MARK +FE17 ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET +FE18 ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET +FE19 ; ON # Po PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS +FE30 ; ON # Po PRESENTATION FORM FOR VERTICAL TWO DOT LEADER +FE31..FE32 ; ON # Pd [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH +FE33..FE34 ; ON # 
Pc [2] PRESENTATION FORM FOR VERTICAL LOW LINE..PRESENTATION FORM FOR VERTICAL WAVY LOW LINE +FE35 ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS +FE36 ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS +FE37 ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET +FE38 ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET +FE39 ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET +FE3A ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET +FE3B ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET +FE3C ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET +FE3D ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET +FE3E ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET +FE3F ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET +FE40 ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET +FE41 ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET +FE42 ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET +FE43 ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET +FE44 ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET +FE45..FE46 ; ON # Po [2] SESAME DOT..WHITE SESAME DOT +FE47 ; ON # Ps PRESENTATION FORM FOR VERTICAL LEFT SQUARE BRACKET +FE48 ; ON # Pe PRESENTATION FORM FOR VERTICAL RIGHT SQUARE BRACKET +FE49..FE4C ; ON # Po [4] DASHED OVERLINE..DOUBLE WAVY OVERLINE +FE4D..FE4F ; ON # Pc [3] DASHED LOW LINE..WAVY LOW LINE +FE51 ; ON # Po SMALL IDEOGRAPHIC COMMA +FE54 ; ON # Po SMALL SEMICOLON +FE56..FE57 ; ON # Po [2] SMALL QUESTION MARK..SMALL EXCLAMATION MARK +FE58 ; ON # Pd SMALL EM DASH +FE59 ; ON # Ps SMALL LEFT PARENTHESIS +FE5A ; ON # Pe SMALL RIGHT PARENTHESIS +FE5B ; ON # Ps SMALL LEFT CURLY BRACKET +FE5C ; ON # Pe SMALL RIGHT CURLY BRACKET +FE5D ; ON # Ps SMALL LEFT TORTOISE SHELL BRACKET +FE5E ; ON # Pe SMALL RIGHT TORTOISE SHELL BRACKET +FE60..FE61 ; ON # Po [2] 
SMALL AMPERSAND..SMALL ASTERISK +FE64..FE66 ; ON # Sm [3] SMALL LESS-THAN SIGN..SMALL EQUALS SIGN +FE68 ; ON # Po SMALL REVERSE SOLIDUS +FE6B ; ON # Po SMALL COMMERCIAL AT +FF01..FF02 ; ON # Po [2] FULLWIDTH EXCLAMATION MARK..FULLWIDTH QUOTATION MARK +FF06..FF07 ; ON # Po [2] FULLWIDTH AMPERSAND..FULLWIDTH APOSTROPHE +FF08 ; ON # Ps FULLWIDTH LEFT PARENTHESIS +FF09 ; ON # Pe FULLWIDTH RIGHT PARENTHESIS +FF0A ; ON # Po FULLWIDTH ASTERISK +FF1B ; ON # Po FULLWIDTH SEMICOLON +FF1C..FF1E ; ON # Sm [3] FULLWIDTH LESS-THAN SIGN..FULLWIDTH GREATER-THAN SIGN +FF1F..FF20 ; ON # Po [2] FULLWIDTH QUESTION MARK..FULLWIDTH COMMERCIAL AT +FF3B ; ON # Ps FULLWIDTH LEFT SQUARE BRACKET +FF3C ; ON # Po FULLWIDTH REVERSE SOLIDUS +FF3D ; ON # Pe FULLWIDTH RIGHT SQUARE BRACKET +FF3E ; ON # Sk FULLWIDTH CIRCUMFLEX ACCENT +FF3F ; ON # Pc FULLWIDTH LOW LINE +FF40 ; ON # Sk FULLWIDTH GRAVE ACCENT +FF5B ; ON # Ps FULLWIDTH LEFT CURLY BRACKET +FF5C ; ON # Sm FULLWIDTH VERTICAL LINE +FF5D ; ON # Pe FULLWIDTH RIGHT CURLY BRACKET +FF5E ; ON # Sm FULLWIDTH TILDE +FF5F ; ON # Ps FULLWIDTH LEFT WHITE PARENTHESIS +FF60 ; ON # Pe FULLWIDTH RIGHT WHITE PARENTHESIS +FF61 ; ON # Po HALFWIDTH IDEOGRAPHIC FULL STOP +FF62 ; ON # Ps HALFWIDTH LEFT CORNER BRACKET +FF63 ; ON # Pe HALFWIDTH RIGHT CORNER BRACKET +FF64..FF65 ; ON # Po [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDLE DOT +FFE2 ; ON # Sm FULLWIDTH NOT SIGN +FFE3 ; ON # Sk FULLWIDTH MACRON +FFE4 ; ON # So FULLWIDTH BROKEN BAR +FFE8 ; ON # So HALFWIDTH FORMS LIGHT VERTICAL +FFE9..FFEC ; ON # Sm [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS ARROW +FFED..FFEE ; ON # So [2] HALFWIDTH BLACK SQUARE..HALFWIDTH WHITE CIRCLE +FFF9..FFFB ; ON # Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION TERMINATOR +FFFC..FFFD ; ON # So [2] OBJECT REPLACEMENT CHARACTER..REPLACEMENT CHARACTER +10101 ; ON # Po AEGEAN WORD SEPARATOR DOT +10140..10174 ; ON # Nl [53] GREEK ACROPHONIC ATTIC ONE QUARTER..GREEK ACROPHONIC STRATIAN FIFTY MNAS 
+10175..10178 ; ON # No [4] GREEK ONE HALF SIGN..GREEK THREE QUARTERS SIGN +10179..10189 ; ON # So [17] GREEK YEAR SIGN..GREEK TRYBLION BASE SIGN +1018A..1018B ; ON # No [2] GREEK ZERO SIGN..GREEK ONE QUARTER SIGN +1018C ; ON # So GREEK SINUSOID SIGN +10190..1019C ; ON # So [13] ROMAN SEXTANS SIGN..ASCIA SYMBOL +101A0 ; ON # So GREEK SYMBOL TAU RHO +1091F ; ON # Po PHOENICIAN WORD SEPARATOR +10B39..10B3F ; ON # Po [7] AVESTAN ABBREVIATION MARK..LARGE ONE RING OVER TWO RINGS PUNCTUATION +11052..11065 ; ON # No [20] BRAHMI NUMBER ONE..BRAHMI NUMBER ONE THOUSAND +11660..1166C ; ON # Po [13] MONGOLIAN BIRGA WITH ORNAMENT..MONGOLIAN TURNED SWIRL BIRGA WITH DOUBLE ORNAMENT +11FD5..11FDC ; ON # So [8] TAMIL SIGN NEL..TAMIL SIGN MUKKURUNI +11FE1..11FF1 ; ON # So [17] TAMIL SIGN PAARAM..TAMIL SIGN VAKAIYARAA +16FE2 ; ON # Po OLD CHINESE HOOK MARK +1D1E9..1D1EA ; ON # So [2] MUSICAL SYMBOL SORI..MUSICAL SYMBOL KORON +1D200..1D241 ; ON # So [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54 +1D245 ; ON # So GREEK MUSICAL LEIMMA +1D300..1D356 ; ON # So [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING +1D6DB ; ON # Sm MATHEMATICAL BOLD PARTIAL DIFFERENTIAL +1D715 ; ON # Sm MATHEMATICAL ITALIC PARTIAL DIFFERENTIAL +1D74F ; ON # Sm MATHEMATICAL BOLD ITALIC PARTIAL DIFFERENTIAL +1D789 ; ON # Sm MATHEMATICAL SANS-SERIF BOLD PARTIAL DIFFERENTIAL +1D7C3 ; ON # Sm MATHEMATICAL SANS-SERIF BOLD ITALIC PARTIAL DIFFERENTIAL +1EEF0..1EEF1 ; ON # Sm [2] ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL..ARABIC MATHEMATICAL OPERATOR HAH WITH DAL +1F000..1F02B ; ON # So [44] MAHJONG TILE EAST WIND..MAHJONG TILE BACK +1F030..1F093 ; ON # So [100] DOMINO TILE HORIZONTAL BACK..DOMINO TILE VERTICAL-06-06 +1F0A0..1F0AE ; ON # So [15] PLAYING CARD BACK..PLAYING CARD KING OF SPADES +1F0B1..1F0BF ; ON # So [15] PLAYING CARD ACE OF HEARTS..PLAYING CARD RED JOKER +1F0C1..1F0CF ; ON # So [15] PLAYING CARD ACE OF DIAMONDS..PLAYING CARD BLACK JOKER +1F0D1..1F0F5 ; ON # So [37] 
PLAYING CARD ACE OF CLUBS..PLAYING CARD TRUMP-21 +1F10B..1F10C ; ON # No [2] DINGBAT CIRCLED SANS-SERIF DIGIT ZERO..DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ZERO +1F10D..1F10F ; ON # So [3] CIRCLED ZERO WITH SLASH..CIRCLED DOLLAR SIGN WITH OVERLAID BACKSLASH +1F12F ; ON # So COPYLEFT SYMBOL +1F16A..1F16F ; ON # So [6] RAISED MC SIGN..CIRCLED HUMAN FIGURE +1F1AD ; ON # So MASK WORK SYMBOL +1F260..1F265 ; ON # So [6] ROUNDED SYMBOL FOR FU..ROUNDED SYMBOL FOR CAI +1F300..1F3FA ; ON # So [251] CYCLONE..AMPHORA +1F3FB..1F3FF ; ON # Sk [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6 +1F400..1F6D7 ; ON # So [728] RAT..ELEVATOR +1F6DD..1F6EC ; ON # So [16] PLAYGROUND SLIDE..AIRPLANE ARRIVING +1F6F0..1F6FC ; ON # So [13] SATELLITE..ROLLER SKATE +1F700..1F773 ; ON # So [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE +1F780..1F7D8 ; ON # So [89] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..NEGATIVE CIRCLED SQUARE +1F7E0..1F7EB ; ON # So [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE +1F7F0 ; ON # So HEAVY EQUALS SIGN +1F800..1F80B ; ON # So [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD +1F810..1F847 ; ON # So [56] LEFTWARDS ARROW WITH SMALL EQUILATERAL ARROWHEAD..DOWNWARDS HEAVY ARROW +1F850..1F859 ; ON # So [10] LEFTWARDS SANS-SERIF ARROW..UP DOWN SANS-SERIF ARROW +1F860..1F887 ; ON # So [40] WIDE-HEADED LEFTWARDS LIGHT BARB ARROW..WIDE-HEADED SOUTH WEST VERY HEAVY BARB ARROW +1F890..1F8AD ; ON # So [30] LEFTWARDS TRIANGLE ARROWHEAD..WHITE ARROW SHAFT WIDTH TWO THIRDS +1F8B0..1F8B1 ; ON # So [2] ARROW POINTING UPWARDS THEN NORTH WEST..ARROW POINTING RIGHTWARDS THEN CURVING SOUTH WEST +1F900..1FA53 ; ON # So [340] CIRCLED CROSS FORMEE WITH FOUR DOTS..BLACK CHESS KNIGHT-BISHOP +1FA60..1FA6D ; ON # So [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER +1FA70..1FA74 ; ON # So [5] BALLET SHOES..THONG SANDAL +1FA78..1FA7C ; ON # So [5] DROP OF BLOOD..CRUTCH +1FA80..1FA86 ; ON # 
So [7] YO-YO..NESTING DOLLS +1FA90..1FAAC ; ON # So [29] RINGED PLANET..HAMSA +1FAB0..1FABA ; ON # So [11] FLY..NEST WITH EGGS +1FAC0..1FAC5 ; ON # So [6] ANATOMICAL HEART..PERSON WITH CROWN +1FAD0..1FAD9 ; ON # So [10] BLUEBERRIES..JAR +1FAE0..1FAE7 ; ON # So [8] MELTING FACE..BUBBLES +1FAF0..1FAF6 ; ON # So [7] HAND WITH INDEX FINGER AND THUMB CROSSED..HEART HANDS +1FB00..1FB92 ; ON # So [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK +1FB94..1FBCA ; ON # So [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON + +# Total code points: 6000 + +# ================================================ + +# Bidi_Class=Boundary_Neutral + +0000..0008 ; BN # Cc [9] .. +000E..001B ; BN # Cc [14] .. +007F..0084 ; BN # Cc [6] .. +0086..009F ; BN # Cc [26] .. +00AD ; BN # Cf SOFT HYPHEN +180E ; BN # Cf MONGOLIAN VOWEL SEPARATOR +200B..200D ; BN # Cf [3] ZERO WIDTH SPACE..ZERO WIDTH JOINER +2060..2064 ; BN # Cf [5] WORD JOINER..INVISIBLE PLUS +2065 ; BN # Cn +206A..206F ; BN # Cf [6] INHIBIT SYMMETRIC SWAPPING..NOMINAL DIGIT SHAPES +FDD0..FDEF ; BN # Cn [32] .. +FEFF ; BN # Cf ZERO WIDTH NO-BREAK SPACE +FFF0..FFF8 ; BN # Cn [9] .. +FFFE..FFFF ; BN # Cn [2] .. +1BCA0..1BCA3 ; BN # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP +1D173..1D17A ; BN # Cf [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE +1FFFE..1FFFF ; BN # Cn [2] .. +2FFFE..2FFFF ; BN # Cn [2] .. +3FFFE..3FFFF ; BN # Cn [2] .. +4FFFE..4FFFF ; BN # Cn [2] .. +5FFFE..5FFFF ; BN # Cn [2] .. +6FFFE..6FFFF ; BN # Cn [2] .. +7FFFE..7FFFF ; BN # Cn [2] .. +8FFFE..8FFFF ; BN # Cn [2] .. +9FFFE..9FFFF ; BN # Cn [2] .. +AFFFE..AFFFF ; BN # Cn [2] .. +BFFFE..BFFFF ; BN # Cn [2] .. +CFFFE..CFFFF ; BN # Cn [2] .. +DFFFE..E0000 ; BN # Cn [3] .. +E0001 ; BN # Cf LANGUAGE TAG +E0002..E001F ; BN # Cn [30] .. +E0020..E007F ; BN # Cf [96] TAG SPACE..CANCEL TAG +E0080..E00FF ; BN # Cn [128] .. +E01F0..E0FFF ; BN # Cn [3600] .. 
+EFFFE..EFFFF ; BN # Cn [2] .. +FFFFE..FFFFF ; BN # Cn [2] .. +10FFFE..10FFFF; BN # Cn [2] .. + +# Total code points: 4016 + +# ================================================ + +# Bidi_Class=Nonspacing_Mark + +0300..036F ; NSM # Mn [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X +0483..0487 ; NSM # Mn [5] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC POKRYTIE +0488..0489 ; NSM # Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN +0591..05BD ; NSM # Mn [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG +05BF ; NSM # Mn HEBREW POINT RAFE +05C1..05C2 ; NSM # Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT +05C4..05C5 ; NSM # Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT +05C7 ; NSM # Mn HEBREW POINT QAMATS QATAN +0610..061A ; NSM # Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA +064B..065F ; NSM # Mn [21] ARABIC FATHATAN..ARABIC WAVY HAMZA BELOW +0670 ; NSM # Mn ARABIC LETTER SUPERSCRIPT ALEF +06D6..06DC ; NSM # Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN +06DF..06E4 ; NSM # Mn [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA +06E7..06E8 ; NSM # Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON +06EA..06ED ; NSM # Mn [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM +0711 ; NSM # Mn SYRIAC LETTER SUPERSCRIPT ALAPH +0730..074A ; NSM # Mn [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH +07A6..07B0 ; NSM # Mn [11] THAANA ABAFILI..THAANA SUKUN +07EB..07F3 ; NSM # Mn [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE +07FD ; NSM # Mn NKO DANTAYALAN +0816..0819 ; NSM # Mn [4] SAMARITAN MARK IN..SAMARITAN MARK DAGESH +081B..0823 ; NSM # Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A +0825..0827 ; NSM # Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U +0829..082D ; NSM # Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA +0859..085B ; NSM # Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK 
+0898..089F ; NSM # Mn [8] ARABIC SMALL HIGH WORD AL-JUZ..ARABIC HALF MADDA OVER MADDA +08CA..08E1 ; NSM # Mn [24] ARABIC SMALL HIGH FARSI YEH..ARABIC SMALL HIGH SIGN SAFHA +08E3..0902 ; NSM # Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA +093A ; NSM # Mn DEVANAGARI VOWEL SIGN OE +093C ; NSM # Mn DEVANAGARI SIGN NUKTA +0941..0948 ; NSM # Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI +094D ; NSM # Mn DEVANAGARI SIGN VIRAMA +0951..0957 ; NSM # Mn [7] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN UUE +0962..0963 ; NSM # Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL +0981 ; NSM # Mn BENGALI SIGN CANDRABINDU +09BC ; NSM # Mn BENGALI SIGN NUKTA +09C1..09C4 ; NSM # Mn [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR +09CD ; NSM # Mn BENGALI SIGN VIRAMA +09E2..09E3 ; NSM # Mn [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL +09FE ; NSM # Mn BENGALI SANDHI MARK +0A01..0A02 ; NSM # Mn [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI +0A3C ; NSM # Mn GURMUKHI SIGN NUKTA +0A41..0A42 ; NSM # Mn [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU +0A47..0A48 ; NSM # Mn [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI +0A4B..0A4D ; NSM # Mn [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA +0A51 ; NSM # Mn GURMUKHI SIGN UDAAT +0A70..0A71 ; NSM # Mn [2] GURMUKHI TIPPI..GURMUKHI ADDAK +0A75 ; NSM # Mn GURMUKHI SIGN YAKASH +0A81..0A82 ; NSM # Mn [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA +0ABC ; NSM # Mn GUJARATI SIGN NUKTA +0AC1..0AC5 ; NSM # Mn [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E +0AC7..0AC8 ; NSM # Mn [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI +0ACD ; NSM # Mn GUJARATI SIGN VIRAMA +0AE2..0AE3 ; NSM # Mn [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL +0AFA..0AFF ; NSM # Mn [6] GUJARATI SIGN SUKUN..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE +0B01 ; NSM # Mn ORIYA SIGN CANDRABINDU +0B3C ; NSM # Mn ORIYA SIGN NUKTA +0B3F ; NSM # Mn ORIYA VOWEL SIGN I 
+0B41..0B44 ; NSM # Mn [4] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC RR +0B4D ; NSM # Mn ORIYA SIGN VIRAMA +0B55..0B56 ; NSM # Mn [2] ORIYA SIGN OVERLINE..ORIYA AI LENGTH MARK +0B62..0B63 ; NSM # Mn [2] ORIYA VOWEL SIGN VOCALIC L..ORIYA VOWEL SIGN VOCALIC LL +0B82 ; NSM # Mn TAMIL SIGN ANUSVARA +0BC0 ; NSM # Mn TAMIL VOWEL SIGN II +0BCD ; NSM # Mn TAMIL SIGN VIRAMA +0C00 ; NSM # Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE +0C04 ; NSM # Mn TELUGU SIGN COMBINING ANUSVARA ABOVE +0C3C ; NSM # Mn TELUGU SIGN NUKTA +0C3E..0C40 ; NSM # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II +0C46..0C48 ; NSM # Mn [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI +0C4A..0C4D ; NSM # Mn [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA +0C55..0C56 ; NSM # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK +0C62..0C63 ; NSM # Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL +0C81 ; NSM # Mn KANNADA SIGN CANDRABINDU +0CBC ; NSM # Mn KANNADA SIGN NUKTA +0CCC..0CCD ; NSM # Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA +0CE2..0CE3 ; NSM # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL +0D00..0D01 ; NSM # Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU +0D3B..0D3C ; NSM # Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA +0D41..0D44 ; NSM # Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR +0D4D ; NSM # Mn MALAYALAM SIGN VIRAMA +0D62..0D63 ; NSM # Mn [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL SIGN VOCALIC LL +0D81 ; NSM # Mn SINHALA SIGN CANDRABINDU +0DCA ; NSM # Mn SINHALA SIGN AL-LAKUNA +0DD2..0DD4 ; NSM # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA +0DD6 ; NSM # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA +0E31 ; NSM # Mn THAI CHARACTER MAI HAN-AKAT +0E34..0E3A ; NSM # Mn [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU +0E47..0E4E ; NSM # Mn [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN +0EB1 ; NSM # Mn LAO VOWEL SIGN MAI KAN +0EB4..0EBC ; NSM # 
Mn [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO +0EC8..0ECD ; NSM # Mn [6] LAO TONE MAI EK..LAO NIGGAHITA +0F18..0F19 ; NSM # Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS +0F35 ; NSM # Mn TIBETAN MARK NGAS BZUNG NYI ZLA +0F37 ; NSM # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS +0F39 ; NSM # Mn TIBETAN MARK TSA -PHRU +0F71..0F7E ; NSM # Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO +0F80..0F84 ; NSM # Mn [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA +0F86..0F87 ; NSM # Mn [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS +0F8D..0F97 ; NSM # Mn [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA +0F99..0FBC ; NSM # Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA +0FC6 ; NSM # Mn TIBETAN SYMBOL PADMA GDAN +102D..1030 ; NSM # Mn [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU +1032..1037 ; NSM # Mn [6] MYANMAR VOWEL SIGN AI..MYANMAR SIGN DOT BELOW +1039..103A ; NSM # Mn [2] MYANMAR SIGN VIRAMA..MYANMAR SIGN ASAT +103D..103E ; NSM # Mn [2] MYANMAR CONSONANT SIGN MEDIAL WA..MYANMAR CONSONANT SIGN MEDIAL HA +1058..1059 ; NSM # Mn [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL +105E..1060 ; NSM # Mn [3] MYANMAR CONSONANT SIGN MON MEDIAL NA..MYANMAR CONSONANT SIGN MON MEDIAL LA +1071..1074 ; NSM # Mn [4] MYANMAR VOWEL SIGN GEBA KAREN I..MYANMAR VOWEL SIGN KAYAH EE +1082 ; NSM # Mn MYANMAR CONSONANT SIGN SHAN MEDIAL WA +1085..1086 ; NSM # Mn [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y +108D ; NSM # Mn MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE +109D ; NSM # Mn MYANMAR VOWEL SIGN AITON AI +135D..135F ; NSM # Mn [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK +1712..1714 ; NSM # Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA +1732..1733 ; NSM # Mn [2] HANUNOO VOWEL SIGN I..HANUNOO VOWEL SIGN U +1752..1753 ; NSM # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U +1772..1773 ; NSM # Mn [2] TAGBANWA 
VOWEL SIGN I..TAGBANWA VOWEL SIGN U +17B4..17B5 ; NSM # Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA +17B7..17BD ; NSM # Mn [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA +17C6 ; NSM # Mn KHMER SIGN NIKAHIT +17C9..17D3 ; NSM # Mn [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT +17DD ; NSM # Mn KHMER SIGN ATTHACAN +180B..180D ; NSM # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE +180F ; NSM # Mn MONGOLIAN FREE VARIATION SELECTOR FOUR +1885..1886 ; NSM # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA +18A9 ; NSM # Mn MONGOLIAN LETTER ALI GALI DAGALGA +1920..1922 ; NSM # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U +1927..1928 ; NSM # Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O +1932 ; NSM # Mn LIMBU SMALL LETTER ANUSVARA +1939..193B ; NSM # Mn [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I +1A17..1A18 ; NSM # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U +1A1B ; NSM # Mn BUGINESE VOWEL SIGN AE +1A56 ; NSM # Mn TAI THAM CONSONANT SIGN MEDIAL LA +1A58..1A5E ; NSM # Mn [7] TAI THAM SIGN MAI KANG LAI..TAI THAM CONSONANT SIGN SA +1A60 ; NSM # Mn TAI THAM SIGN SAKOT +1A62 ; NSM # Mn TAI THAM VOWEL SIGN MAI SAT +1A65..1A6C ; NSM # Mn [8] TAI THAM VOWEL SIGN I..TAI THAM VOWEL SIGN OA BELOW +1A73..1A7C ; NSM # Mn [10] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN KHUEN-LUE KARAN +1A7F ; NSM # Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT +1AB0..1ABD ; NSM # Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW +1ABE ; NSM # Me COMBINING PARENTHESES OVERLAY +1ABF..1ACE ; NSM # Mn [16] COMBINING LATIN SMALL LETTER W BELOW..COMBINING LATIN SMALL LETTER INSULAR T +1B00..1B03 ; NSM # Mn [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG +1B34 ; NSM # Mn BALINESE SIGN REREKAN +1B36..1B3A ; NSM # Mn [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA +1B3C ; NSM # Mn BALINESE VOWEL SIGN LA LENGA +1B42 ; NSM # Mn BALINESE VOWEL SIGN PEPET +1B6B..1B73 ; NSM # Mn [9] BALINESE MUSICAL 
SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG +1B80..1B81 ; NSM # Mn [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR +1BA2..1BA5 ; NSM # Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU +1BA8..1BA9 ; NSM # Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG +1BAB..1BAD ; NSM # Mn [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA +1BE6 ; NSM # Mn BATAK SIGN TOMPI +1BE8..1BE9 ; NSM # Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE +1BED ; NSM # Mn BATAK VOWEL SIGN KARO O +1BEF..1BF1 ; NSM # Mn [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H +1C2C..1C33 ; NSM # Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T +1C36..1C37 ; NSM # Mn [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA +1CD0..1CD2 ; NSM # Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA +1CD4..1CE0 ; NSM # Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA +1CE2..1CE8 ; NSM # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL +1CED ; NSM # Mn VEDIC SIGN TIRYAK +1CF4 ; NSM # Mn VEDIC TONE CANDRA ABOVE +1CF8..1CF9 ; NSM # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE +1DC0..1DFF ; NSM # Mn [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW +20D0..20DC ; NSM # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE +20DD..20E0 ; NSM # Me [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH +20E1 ; NSM # Mn COMBINING LEFT RIGHT ARROW ABOVE +20E2..20E4 ; NSM # Me [3] COMBINING ENCLOSING SCREEN..COMBINING ENCLOSING UPWARD POINTING TRIANGLE +20E5..20F0 ; NSM # Mn [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE +2CEF..2CF1 ; NSM # Mn [3] COPTIC COMBINING NI ABOVE..COPTIC COMBINING SPIRITUS LENIS +2D7F ; NSM # Mn TIFINAGH CONSONANT JOINER +2DE0..2DFF ; NSM # Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS +302A..302D ; NSM # Mn [4] 
IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK +3099..309A ; NSM # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK +A66F ; NSM # Mn COMBINING CYRILLIC VZMET +A670..A672 ; NSM # Me [3] COMBINING CYRILLIC TEN MILLIONS SIGN..COMBINING CYRILLIC THOUSAND MILLIONS SIGN +A674..A67D ; NSM # Mn [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK +A69E..A69F ; NSM # Mn [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E +A6F0..A6F1 ; NSM # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS +A802 ; NSM # Mn SYLOTI NAGRI SIGN DVISVARA +A806 ; NSM # Mn SYLOTI NAGRI SIGN HASANTA +A80B ; NSM # Mn SYLOTI NAGRI SIGN ANUSVARA +A825..A826 ; NSM # Mn [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E +A82C ; NSM # Mn SYLOTI NAGRI SIGN ALTERNATE HASANTA +A8C4..A8C5 ; NSM # Mn [2] SAURASHTRA SIGN VIRAMA..SAURASHTRA SIGN CANDRABINDU +A8E0..A8F1 ; NSM # Mn [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA +A8FF ; NSM # Mn DEVANAGARI VOWEL SIGN AY +A926..A92D ; NSM # Mn [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU +A947..A951 ; NSM # Mn [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R +A980..A982 ; NSM # Mn [3] JAVANESE SIGN PANYANGGA..JAVANESE SIGN LAYAR +A9B3 ; NSM # Mn JAVANESE SIGN CECAK TELU +A9B6..A9B9 ; NSM # Mn [4] JAVANESE VOWEL SIGN WULU..JAVANESE VOWEL SIGN SUKU MENDUT +A9BC..A9BD ; NSM # Mn [2] JAVANESE VOWEL SIGN PEPET..JAVANESE CONSONANT SIGN KERET +A9E5 ; NSM # Mn MYANMAR SIGN SHAN SAW +AA29..AA2E ; NSM # Mn [6] CHAM VOWEL SIGN AA..CHAM VOWEL SIGN OE +AA31..AA32 ; NSM # Mn [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE +AA35..AA36 ; NSM # Mn [2] CHAM CONSONANT SIGN LA..CHAM CONSONANT SIGN WA +AA43 ; NSM # Mn CHAM CONSONANT SIGN FINAL NG +AA4C ; NSM # Mn CHAM CONSONANT SIGN FINAL M +AA7C ; NSM # Mn MYANMAR SIGN TAI LAING TONE-2 +AAB0 ; NSM # Mn TAI VIET MAI KANG +AAB2..AAB4 ; NSM # Mn [3] TAI VIET VOWEL I..TAI VIET VOWEL 
U +AAB7..AAB8 ; NSM # Mn [2] TAI VIET MAI KHIT..TAI VIET VOWEL IA +AABE..AABF ; NSM # Mn [2] TAI VIET VOWEL AM..TAI VIET TONE MAI EK +AAC1 ; NSM # Mn TAI VIET TONE MAI THO +AAEC..AAED ; NSM # Mn [2] MEETEI MAYEK VOWEL SIGN UU..MEETEI MAYEK VOWEL SIGN AAI +AAF6 ; NSM # Mn MEETEI MAYEK VIRAMA +ABE5 ; NSM # Mn MEETEI MAYEK VOWEL SIGN ANAP +ABE8 ; NSM # Mn MEETEI MAYEK VOWEL SIGN UNAP +ABED ; NSM # Mn MEETEI MAYEK APUN IYEK +FB1E ; NSM # Mn HEBREW POINT JUDEO-SPANISH VARIKA +FE00..FE0F ; NSM # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 +FE20..FE2F ; NSM # Mn [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITLO RIGHT HALF +101FD ; NSM # Mn PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE +102E0 ; NSM # Mn COPTIC EPACT THOUSANDS MARK +10376..1037A ; NSM # Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII +10A01..10A03 ; NSM # Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R +10A05..10A06 ; NSM # Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O +10A0C..10A0F ; NSM # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA +10A38..10A3A ; NSM # Mn [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW +10A3F ; NSM # Mn KHAROSHTHI VIRAMA +10AE5..10AE6 ; NSM # Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW +10D24..10D27 ; NSM # Mn [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI +10EAB..10EAC ; NSM # Mn [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK +10F46..10F50 ; NSM # Mn [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW +10F82..10F85 ; NSM # Mn [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW +11001 ; NSM # Mn BRAHMI SIGN ANUSVARA +11038..11046 ; NSM # Mn [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA +11070 ; NSM # Mn BRAHMI SIGN OLD TAMIL VIRAMA +11073..11074 ; NSM # Mn [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O +1107F..11081 ; NSM # Mn [3] BRAHMI NUMBER JOINER..KAITHI SIGN ANUSVARA 
+110B3..110B6 ; NSM # Mn [4] KAITHI VOWEL SIGN U..KAITHI VOWEL SIGN AI +110B9..110BA ; NSM # Mn [2] KAITHI SIGN VIRAMA..KAITHI SIGN NUKTA +110C2 ; NSM # Mn KAITHI VOWEL SIGN VOCALIC R +11100..11102 ; NSM # Mn [3] CHAKMA SIGN CANDRABINDU..CHAKMA SIGN VISARGA +11127..1112B ; NSM # Mn [5] CHAKMA VOWEL SIGN A..CHAKMA VOWEL SIGN UU +1112D..11134 ; NSM # Mn [8] CHAKMA VOWEL SIGN AI..CHAKMA MAAYYAA +11173 ; NSM # Mn MAHAJANI SIGN NUKTA +11180..11181 ; NSM # Mn [2] SHARADA SIGN CANDRABINDU..SHARADA SIGN ANUSVARA +111B6..111BE ; NSM # Mn [9] SHARADA VOWEL SIGN U..SHARADA VOWEL SIGN O +111C9..111CC ; NSM # Mn [4] SHARADA SANDHI MARK..SHARADA EXTRA SHORT VOWEL MARK +111CF ; NSM # Mn SHARADA SIGN INVERTED CANDRABINDU +1122F..11231 ; NSM # Mn [3] KHOJKI VOWEL SIGN U..KHOJKI VOWEL SIGN AI +11234 ; NSM # Mn KHOJKI SIGN ANUSVARA +11236..11237 ; NSM # Mn [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA +1123E ; NSM # Mn KHOJKI SIGN SUKUN +112DF ; NSM # Mn KHUDAWADI SIGN ANUSVARA +112E3..112EA ; NSM # Mn [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA +11300..11301 ; NSM # Mn [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU +1133B..1133C ; NSM # Mn [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA +11340 ; NSM # Mn GRANTHA VOWEL SIGN II +11366..1136C ; NSM # Mn [7] COMBINING GRANTHA DIGIT ZERO..COMBINING GRANTHA DIGIT SIX +11370..11374 ; NSM # Mn [5] COMBINING GRANTHA LETTER A..COMBINING GRANTHA LETTER PA +11438..1143F ; NSM # Mn [8] NEWA VOWEL SIGN U..NEWA VOWEL SIGN AI +11442..11444 ; NSM # Mn [3] NEWA SIGN VIRAMA..NEWA SIGN ANUSVARA +11446 ; NSM # Mn NEWA SIGN NUKTA +1145E ; NSM # Mn NEWA SANDHI MARK +114B3..114B8 ; NSM # Mn [6] TIRHUTA VOWEL SIGN U..TIRHUTA VOWEL SIGN VOCALIC LL +114BA ; NSM # Mn TIRHUTA VOWEL SIGN SHORT E +114BF..114C0 ; NSM # Mn [2] TIRHUTA SIGN CANDRABINDU..TIRHUTA SIGN ANUSVARA +114C2..114C3 ; NSM # Mn [2] TIRHUTA SIGN VIRAMA..TIRHUTA SIGN NUKTA +115B2..115B5 ; NSM # Mn [4] SIDDHAM VOWEL SIGN U..SIDDHAM VOWEL SIGN VOCALIC RR +115BC..115BD ; NSM # 
Mn [2] SIDDHAM SIGN CANDRABINDU..SIDDHAM SIGN ANUSVARA +115BF..115C0 ; NSM # Mn [2] SIDDHAM SIGN VIRAMA..SIDDHAM SIGN NUKTA +115DC..115DD ; NSM # Mn [2] SIDDHAM VOWEL SIGN ALTERNATE U..SIDDHAM VOWEL SIGN ALTERNATE UU +11633..1163A ; NSM # Mn [8] MODI VOWEL SIGN U..MODI VOWEL SIGN AI +1163D ; NSM # Mn MODI SIGN ANUSVARA +1163F..11640 ; NSM # Mn [2] MODI SIGN VIRAMA..MODI SIGN ARDHACANDRA +116AB ; NSM # Mn TAKRI SIGN ANUSVARA +116AD ; NSM # Mn TAKRI VOWEL SIGN AA +116B0..116B5 ; NSM # Mn [6] TAKRI VOWEL SIGN U..TAKRI VOWEL SIGN AU +116B7 ; NSM # Mn TAKRI SIGN NUKTA +1171D..1171F ; NSM # Mn [3] AHOM CONSONANT SIGN MEDIAL LA..AHOM CONSONANT SIGN MEDIAL LIGATING RA +11722..11725 ; NSM # Mn [4] AHOM VOWEL SIGN I..AHOM VOWEL SIGN UU +11727..1172B ; NSM # Mn [5] AHOM VOWEL SIGN AW..AHOM SIGN KILLER +1182F..11837 ; NSM # Mn [9] DOGRA VOWEL SIGN U..DOGRA SIGN ANUSVARA +11839..1183A ; NSM # Mn [2] DOGRA SIGN VIRAMA..DOGRA SIGN NUKTA +1193B..1193C ; NSM # Mn [2] DIVES AKURU SIGN ANUSVARA..DIVES AKURU SIGN CANDRABINDU +1193E ; NSM # Mn DIVES AKURU VIRAMA +11943 ; NSM # Mn DIVES AKURU SIGN NUKTA +119D4..119D7 ; NSM # Mn [4] NANDINAGARI VOWEL SIGN U..NANDINAGARI VOWEL SIGN VOCALIC RR +119DA..119DB ; NSM # Mn [2] NANDINAGARI VOWEL SIGN E..NANDINAGARI VOWEL SIGN AI +119E0 ; NSM # Mn NANDINAGARI SIGN VIRAMA +11A01..11A06 ; NSM # Mn [6] ZANABAZAR SQUARE VOWEL SIGN I..ZANABAZAR SQUARE VOWEL SIGN O +11A09..11A0A ; NSM # Mn [2] ZANABAZAR SQUARE VOWEL SIGN REVERSED I..ZANABAZAR SQUARE VOWEL LENGTH MARK +11A33..11A38 ; NSM # Mn [6] ZANABAZAR SQUARE FINAL CONSONANT MARK..ZANABAZAR SQUARE SIGN ANUSVARA +11A3B..11A3E ; NSM # Mn [4] ZANABAZAR SQUARE CLUSTER-FINAL LETTER YA..ZANABAZAR SQUARE CLUSTER-FINAL LETTER VA +11A47 ; NSM # Mn ZANABAZAR SQUARE SUBJOINER +11A51..11A56 ; NSM # Mn [6] SOYOMBO VOWEL SIGN I..SOYOMBO VOWEL SIGN OE +11A59..11A5B ; NSM # Mn [3] SOYOMBO VOWEL SIGN VOCALIC R..SOYOMBO VOWEL LENGTH MARK +11A8A..11A96 ; NSM # Mn [13] SOYOMBO FINAL CONSONANT SIGN G..SOYOMBO SIGN 
ANUSVARA +11A98..11A99 ; NSM # Mn [2] SOYOMBO GEMINATION MARK..SOYOMBO SUBJOINER +11C30..11C36 ; NSM # Mn [7] BHAIKSUKI VOWEL SIGN I..BHAIKSUKI VOWEL SIGN VOCALIC L +11C38..11C3D ; NSM # Mn [6] BHAIKSUKI VOWEL SIGN E..BHAIKSUKI SIGN ANUSVARA +11C92..11CA7 ; NSM # Mn [22] MARCHEN SUBJOINED LETTER KA..MARCHEN SUBJOINED LETTER ZA +11CAA..11CB0 ; NSM # Mn [7] MARCHEN SUBJOINED LETTER RA..MARCHEN VOWEL SIGN AA +11CB2..11CB3 ; NSM # Mn [2] MARCHEN VOWEL SIGN U..MARCHEN VOWEL SIGN E +11CB5..11CB6 ; NSM # Mn [2] MARCHEN SIGN ANUSVARA..MARCHEN SIGN CANDRABINDU +11D31..11D36 ; NSM # Mn [6] MASARAM GONDI VOWEL SIGN AA..MASARAM GONDI VOWEL SIGN VOCALIC R +11D3A ; NSM # Mn MASARAM GONDI VOWEL SIGN E +11D3C..11D3D ; NSM # Mn [2] MASARAM GONDI VOWEL SIGN AI..MASARAM GONDI VOWEL SIGN O +11D3F..11D45 ; NSM # Mn [7] MASARAM GONDI VOWEL SIGN AU..MASARAM GONDI VIRAMA +11D47 ; NSM # Mn MASARAM GONDI RA-KARA +11D90..11D91 ; NSM # Mn [2] GUNJALA GONDI VOWEL SIGN EE..GUNJALA GONDI VOWEL SIGN AI +11D95 ; NSM # Mn GUNJALA GONDI SIGN ANUSVARA +11D97 ; NSM # Mn GUNJALA GONDI VIRAMA +11EF3..11EF4 ; NSM # Mn [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U +16AF0..16AF4 ; NSM # Mn [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE +16B30..16B36 ; NSM # Mn [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM +16F4F ; NSM # Mn MIAO SIGN CONSONANT MODIFIER BAR +16F8F..16F92 ; NSM # Mn [4] MIAO TONE RIGHT..MIAO TONE BELOW +16FE4 ; NSM # Mn KHITAN SMALL SCRIPT FILLER +1BC9D..1BC9E ; NSM # Mn [2] DUPLOYAN THICK LETTER SELECTOR..DUPLOYAN DOUBLE MARK +1CF00..1CF2D ; NSM # Mn [46] ZNAMENNY COMBINING MARK GORAZDO NIZKO S KRYZHEM ON LEFT..ZNAMENNY COMBINING MARK KRYZH ON LEFT +1CF30..1CF46 ; NSM # Mn [23] ZNAMENNY COMBINING TONAL RANGE MARK MRACHNO..ZNAMENNY PRIZNAK MODIFIER ROG +1D167..1D169 ; NSM # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3 +1D17B..1D182 ; NSM # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE +1D185..1D18B ; 
NSM # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE +1D1AA..1D1AD ; NSM # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO +1D242..1D244 ; NSM # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME +1DA00..1DA36 ; NSM # Mn [55] SIGNWRITING HEAD RIM..SIGNWRITING AIR SUCKING IN +1DA3B..1DA6C ; NSM # Mn [50] SIGNWRITING MOUTH CLOSED NEUTRAL..SIGNWRITING EXCITEMENT +1DA75 ; NSM # Mn SIGNWRITING UPPER BODY TILTING FROM HIP JOINTS +1DA84 ; NSM # Mn SIGNWRITING LOCATION HEAD NECK +1DA9B..1DA9F ; NSM # Mn [5] SIGNWRITING FILL MODIFIER-2..SIGNWRITING FILL MODIFIER-6 +1DAA1..1DAAF ; NSM # Mn [15] SIGNWRITING ROTATION MODIFIER-2..SIGNWRITING ROTATION MODIFIER-16 +1E000..1E006 ; NSM # Mn [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE +1E008..1E018 ; NSM # Mn [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU +1E01B..1E021 ; NSM # Mn [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI +1E023..1E024 ; NSM # Mn [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS +1E026..1E02A ; NSM # Mn [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA +1E130..1E136 ; NSM # Mn [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D +1E2AE ; NSM # Mn TOTO SIGN RISING TONE +1E2EC..1E2EF ; NSM # Mn [4] WANCHO TONE TUP..WANCHO TONE KOINI +1E8D0..1E8D6 ; NSM # Mn [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS +1E944..1E94A ; NSM # Mn [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA +E0100..E01EF ; NSM # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 + +# Total code points: 1958 + +# ================================================ + +# Bidi_Class=Arabic_Letter + +0608 ; AL # Sm ARABIC RAY +060B ; AL # Sc AFGHANI SIGN +060D ; AL # Po ARABIC DATE SEPARATOR +061B ; AL # Po ARABIC SEMICOLON +061C ; AL # Cf ARABIC LETTER MARK +061D..061F ; AL # Po [3] ARABIC END OF TEXT 
MARK..ARABIC QUESTION MARK +0620..063F ; AL # Lo [32] ARABIC LETTER KASHMIRI YEH..ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE +0640 ; AL # Lm ARABIC TATWEEL +0641..064A ; AL # Lo [10] ARABIC LETTER FEH..ARABIC LETTER YEH +066D ; AL # Po ARABIC FIVE POINTED STAR +066E..066F ; AL # Lo [2] ARABIC LETTER DOTLESS BEH..ARABIC LETTER DOTLESS QAF +0671..06D3 ; AL # Lo [99] ARABIC LETTER ALEF WASLA..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE +06D4 ; AL # Po ARABIC FULL STOP +06D5 ; AL # Lo ARABIC LETTER AE +06E5..06E6 ; AL # Lm [2] ARABIC SMALL WAW..ARABIC SMALL YEH +06EE..06EF ; AL # Lo [2] ARABIC LETTER DAL WITH INVERTED V..ARABIC LETTER REH WITH INVERTED V +06FA..06FC ; AL # Lo [3] ARABIC LETTER SHEEN WITH DOT BELOW..ARABIC LETTER GHAIN WITH DOT BELOW +06FD..06FE ; AL # So [2] ARABIC SIGN SINDHI AMPERSAND..ARABIC SIGN SINDHI POSTPOSITION MEN +06FF ; AL # Lo ARABIC LETTER HEH WITH INVERTED V +0700..070D ; AL # Po [14] SYRIAC END OF PARAGRAPH..SYRIAC HARKLEAN ASTERISCUS +070E ; AL # Cn +070F ; AL # Cf SYRIAC ABBREVIATION MARK +0710 ; AL # Lo SYRIAC LETTER ALAPH +0712..072F ; AL # Lo [30] SYRIAC LETTER BETH..SYRIAC LETTER PERSIAN DHALATH +074B..074C ; AL # Cn [2] .. +074D..07A5 ; AL # Lo [89] SYRIAC LETTER SOGDIAN ZHAIN..THAANA LETTER WAAVU +07B1 ; AL # Lo THAANA LETTER NAA +07B2..07BF ; AL # Cn [14] .. +0860..086A ; AL # Lo [11] SYRIAC LETTER MALAYALAM NGA..SYRIAC LETTER MALAYALAM SSA +086B..086F ; AL # Cn [5] .. +0870..0887 ; AL # Lo [24] ARABIC LETTER ALEF WITH ATTACHED FATHA..ARABIC BASELINE ROUND DOT +0888 ; AL # Sk ARABIC RAISED ROUND DOT +0889..088E ; AL # Lo [6] ARABIC LETTER NOON WITH INVERTED SMALL V..ARABIC VERTICAL TAIL +088F ; AL # Cn +0892..0897 ; AL # Cn [6] .. 
+08A0..08C8 ; AL # Lo [41] ARABIC LETTER BEH WITH SMALL V BELOW..ARABIC LETTER GRAF +08C9 ; AL # Lm ARABIC SMALL FARSI YEH +FB50..FBB1 ; AL # Lo [98] ARABIC LETTER ALEF WASLA ISOLATED FORM..ARABIC LETTER YEH BARREE WITH HAMZA ABOVE FINAL FORM +FBB2..FBC2 ; AL # Sk [17] ARABIC SYMBOL DOT ABOVE..ARABIC SYMBOL WASLA ABOVE +FBC3..FBD2 ; AL # Cn [16] .. +FBD3..FD3D ; AL # Lo [363] ARABIC LETTER NG ISOLATED FORM..ARABIC LIGATURE ALEF WITH FATHATAN ISOLATED FORM +FD50..FD8F ; AL # Lo [64] ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM..ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM +FD90..FD91 ; AL # Cn [2] .. +FD92..FDC7 ; AL # Lo [54] ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM..ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM +FDC8..FDCE ; AL # Cn [7] .. +FDF0..FDFB ; AL # Lo [12] ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM..ARABIC LIGATURE JALLAJALALOUHOU +FDFC ; AL # Sc RIAL SIGN +FE70..FE74 ; AL # Lo [5] ARABIC FATHATAN ISOLATED FORM..ARABIC KASRATAN ISOLATED FORM +FE75 ; AL # Cn +FE76..FEFC ; AL # Lo [135] ARABIC FATHA ISOLATED FORM..ARABIC LIGATURE LAM WITH ALEF FINAL FORM +FEFD..FEFE ; AL # Cn [2]