feat: 9.5.9
This commit is contained in:
		
							parent
							
								
									cb1753732b
								
							
						
					
					
						commit
						35f43a7909
					
				
					 1084 changed files with 558985 additions and 0 deletions
				
			
		
							
								
								
									
										1624
									
								
								unicode/UNIDATA/CaseFolding.txt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										1624
									
								
								unicode/UNIDATA/CaseFolding.txt
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										221
									
								
								unicode/UNIDATA/CompositionExclusions.txt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										221
									
								
								unicode/UNIDATA/CompositionExclusions.txt
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,221 @@ | |||
| # CompositionExclusions-14.0.0.txt | ||||
| # Date: 2021-03-30, 23:59:00 GMT [KW, LI] | ||||
| # © 2021 Unicode®, Inc. | ||||
| # For terms of use, see https://www.unicode.org/terms_of_use.html | ||||
| # | ||||
| # Unicode Character Database | ||||
| # For documentation, see https://www.unicode.org/reports/tr44/ | ||||
| # | ||||
| # This file lists the characters for the Composition Exclusion Table | ||||
| # defined in UAX #15, Unicode Normalization Forms. | ||||
| # | ||||
| # This file is a normative contributory data file in the | ||||
| # Unicode Character Database. | ||||
| # | ||||
| # For more information, see | ||||
| # https://www.unicode.org/reports/tr15/#Primary_Exclusion_List_Table | ||||
| # | ||||
| # For a full derivation of composition exclusions, see the derived property | ||||
| # Full_Composition_Exclusion in DerivedNormalizationProps.txt | ||||
| # | ||||
| 
 | ||||
| # ================================================ | ||||
| # (1) Script Specifics | ||||
| # | ||||
| # This list of characters cannot be derived from the UnicodeData.txt file. | ||||
| # | ||||
| # Included are the following subcategories: | ||||
| # | ||||
| # - Many precomposed characters using a nukta diacritic in the Devanagari, | ||||
| #   Bangla/Bengali, Gurmukhi, or Odia/Oriya scripts. | ||||
| # - Tibetan letters and subjoined letters with decompositions including  | ||||
| #   U+0FB7 TIBETAN SUBJOINED LETTER HA or U+0FB5 TIBETAN SUBJOINED LETTER SSA. | ||||
| # - Two two-part Tibetan vowel signs involving top and bottom pieces. | ||||
| # - A large collection of compatibility precomposed characters for Hebrew | ||||
| #   involving dagesh and/or other combining marks. | ||||
| # | ||||
| # This list is unlikely to grow. | ||||
| # | ||||
| # ================================================ | ||||
| 
 | ||||
| 0958    #  DEVANAGARI LETTER QA | ||||
| 0959    #  DEVANAGARI LETTER KHHA | ||||
| 095A    #  DEVANAGARI LETTER GHHA | ||||
| 095B    #  DEVANAGARI LETTER ZA | ||||
| 095C    #  DEVANAGARI LETTER DDDHA | ||||
| 095D    #  DEVANAGARI LETTER RHA | ||||
| 095E    #  DEVANAGARI LETTER FA | ||||
| 095F    #  DEVANAGARI LETTER YYA | ||||
| 09DC    #  BENGALI LETTER RRA | ||||
| 09DD    #  BENGALI LETTER RHA | ||||
| 09DF    #  BENGALI LETTER YYA | ||||
| 0A33    #  GURMUKHI LETTER LLA | ||||
| 0A36    #  GURMUKHI LETTER SHA | ||||
| 0A59    #  GURMUKHI LETTER KHHA | ||||
| 0A5A    #  GURMUKHI LETTER GHHA | ||||
| 0A5B    #  GURMUKHI LETTER ZA | ||||
| 0A5E    #  GURMUKHI LETTER FA | ||||
| 0B5C    #  ORIYA LETTER RRA | ||||
| 0B5D    #  ORIYA LETTER RHA | ||||
| 0F43    #  TIBETAN LETTER GHA | ||||
| 0F4D    #  TIBETAN LETTER DDHA | ||||
| 0F52    #  TIBETAN LETTER DHA | ||||
| 0F57    #  TIBETAN LETTER BHA | ||||
| 0F5C    #  TIBETAN LETTER DZHA | ||||
| 0F69    #  TIBETAN LETTER KSSA | ||||
| 0F76    #  TIBETAN VOWEL SIGN VOCALIC R | ||||
| 0F78    #  TIBETAN VOWEL SIGN VOCALIC L | ||||
| 0F93    #  TIBETAN SUBJOINED LETTER GHA | ||||
| 0F9D    #  TIBETAN SUBJOINED LETTER DDHA | ||||
| 0FA2    #  TIBETAN SUBJOINED LETTER DHA | ||||
| 0FA7    #  TIBETAN SUBJOINED LETTER BHA | ||||
| 0FAC    #  TIBETAN SUBJOINED LETTER DZHA | ||||
| 0FB9    #  TIBETAN SUBJOINED LETTER KSSA | ||||
| FB1D    #  HEBREW LETTER YOD WITH HIRIQ | ||||
| FB1F    #  HEBREW LIGATURE YIDDISH YOD YOD PATAH | ||||
| FB2A    #  HEBREW LETTER SHIN WITH SHIN DOT | ||||
| FB2B    #  HEBREW LETTER SHIN WITH SIN DOT | ||||
| FB2C    #  HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT | ||||
| FB2D    #  HEBREW LETTER SHIN WITH DAGESH AND SIN DOT | ||||
| FB2E    #  HEBREW LETTER ALEF WITH PATAH | ||||
| FB2F    #  HEBREW LETTER ALEF WITH QAMATS | ||||
| FB30    #  HEBREW LETTER ALEF WITH MAPIQ | ||||
| FB31    #  HEBREW LETTER BET WITH DAGESH | ||||
| FB32    #  HEBREW LETTER GIMEL WITH DAGESH | ||||
| FB33    #  HEBREW LETTER DALET WITH DAGESH | ||||
| FB34    #  HEBREW LETTER HE WITH MAPIQ | ||||
| FB35    #  HEBREW LETTER VAV WITH DAGESH | ||||
| FB36    #  HEBREW LETTER ZAYIN WITH DAGESH | ||||
| FB38    #  HEBREW LETTER TET WITH DAGESH | ||||
| FB39    #  HEBREW LETTER YOD WITH DAGESH | ||||
| FB3A    #  HEBREW LETTER FINAL KAF WITH DAGESH | ||||
| FB3B    #  HEBREW LETTER KAF WITH DAGESH | ||||
| FB3C    #  HEBREW LETTER LAMED WITH DAGESH | ||||
| FB3E    #  HEBREW LETTER MEM WITH DAGESH | ||||
| FB40    #  HEBREW LETTER NUN WITH DAGESH | ||||
| FB41    #  HEBREW LETTER SAMEKH WITH DAGESH | ||||
| FB43    #  HEBREW LETTER FINAL PE WITH DAGESH | ||||
| FB44    #  HEBREW LETTER PE WITH DAGESH | ||||
| FB46    #  HEBREW LETTER TSADI WITH DAGESH | ||||
| FB47    #  HEBREW LETTER QOF WITH DAGESH | ||||
| FB48    #  HEBREW LETTER RESH WITH DAGESH | ||||
| FB49    #  HEBREW LETTER SHIN WITH DAGESH | ||||
| FB4A    #  HEBREW LETTER TAV WITH DAGESH | ||||
| FB4B    #  HEBREW LETTER VAV WITH HOLAM | ||||
| FB4C    #  HEBREW LETTER BET WITH RAFE | ||||
| FB4D    #  HEBREW LETTER KAF WITH RAFE | ||||
| FB4E    #  HEBREW LETTER PE WITH RAFE | ||||
| 
 | ||||
| # Total code points: 67 | ||||
| 
 | ||||
| # ================================================ | ||||
| # (2) Post Composition Version precomposed characters | ||||
| # | ||||
| # These characters cannot be derived solely from the UnicodeData.txt file | ||||
| # in this version of Unicode. | ||||
| # | ||||
| # Note that characters added to the standard after the | ||||
| # Composition Version and which have canonical decomposition mappings | ||||
| # are not automatically added to this list of Post Composition | ||||
| # Version precomposed characters. | ||||
| # ================================================ | ||||
| 
 | ||||
| 2ADC    #  FORKING | ||||
| 1D15E   #  MUSICAL SYMBOL HALF NOTE | ||||
| 1D15F   #  MUSICAL SYMBOL QUARTER NOTE | ||||
| 1D160   #  MUSICAL SYMBOL EIGHTH NOTE | ||||
| 1D161   #  MUSICAL SYMBOL SIXTEENTH NOTE | ||||
| 1D162   #  MUSICAL SYMBOL THIRTY-SECOND NOTE | ||||
| 1D163   #  MUSICAL SYMBOL SIXTY-FOURTH NOTE | ||||
| 1D164   #  MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE | ||||
| 1D1BB   #  MUSICAL SYMBOL MINIMA | ||||
| 1D1BC   #  MUSICAL SYMBOL MINIMA BLACK | ||||
| 1D1BD   #  MUSICAL SYMBOL SEMIMINIMA WHITE | ||||
| 1D1BE   #  MUSICAL SYMBOL SEMIMINIMA BLACK | ||||
| 1D1BF   #  MUSICAL SYMBOL FUSA WHITE | ||||
| 1D1C0   #  MUSICAL SYMBOL FUSA BLACK | ||||
| 
 | ||||
| # Total code points: 14 | ||||
| 
 | ||||
| # ================================================ | ||||
| # (3) Singleton Decompositions | ||||
| # | ||||
| # These characters can be derived from the UnicodeData.txt file | ||||
| # by including all canonically decomposable characters whose | ||||
| # canonical decomposition consists of a single character. | ||||
| # | ||||
| # These characters are simply quoted here for reference. | ||||
| # See also Full_Composition_Exclusion in DerivedNormalizationProps.txt | ||||
| # ================================================ | ||||
| 
 | ||||
| # 0340..0341       [2] COMBINING GRAVE TONE MARK..COMBINING ACUTE TONE MARK | ||||
| # 0343                 COMBINING GREEK KORONIS | ||||
| # 0374                 GREEK NUMERAL SIGN | ||||
| # 037E                 GREEK QUESTION MARK | ||||
| # 0387                 GREEK ANO TELEIA | ||||
| # 1F71                 GREEK SMALL LETTER ALPHA WITH OXIA | ||||
| # 1F73                 GREEK SMALL LETTER EPSILON WITH OXIA | ||||
| # 1F75                 GREEK SMALL LETTER ETA WITH OXIA | ||||
| # 1F77                 GREEK SMALL LETTER IOTA WITH OXIA | ||||
| # 1F79                 GREEK SMALL LETTER OMICRON WITH OXIA | ||||
| # 1F7B                 GREEK SMALL LETTER UPSILON WITH OXIA | ||||
| # 1F7D                 GREEK SMALL LETTER OMEGA WITH OXIA | ||||
| # 1FBB                 GREEK CAPITAL LETTER ALPHA WITH OXIA | ||||
| # 1FBE                 GREEK PROSGEGRAMMENI | ||||
| # 1FC9                 GREEK CAPITAL LETTER EPSILON WITH OXIA | ||||
| # 1FCB                 GREEK CAPITAL LETTER ETA WITH OXIA | ||||
| # 1FD3                 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA | ||||
| # 1FDB                 GREEK CAPITAL LETTER IOTA WITH OXIA | ||||
| # 1FE3                 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA | ||||
| # 1FEB                 GREEK CAPITAL LETTER UPSILON WITH OXIA | ||||
| # 1FEE..1FEF       [2] GREEK DIALYTIKA AND OXIA..GREEK VARIA | ||||
| # 1FF9                 GREEK CAPITAL LETTER OMICRON WITH OXIA | ||||
| # 1FFB                 GREEK CAPITAL LETTER OMEGA WITH OXIA | ||||
| # 1FFD                 GREEK OXIA | ||||
| # 2000..2001       [2] EN QUAD..EM QUAD | ||||
| # 2126                 OHM SIGN | ||||
| # 212A..212B       [2] KELVIN SIGN..ANGSTROM SIGN | ||||
| # 2329                 LEFT-POINTING ANGLE BRACKET | ||||
| # 232A                 RIGHT-POINTING ANGLE BRACKET | ||||
| # F900..FA0D     [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D | ||||
| # FA10                 CJK COMPATIBILITY IDEOGRAPH-FA10 | ||||
| # FA12                 CJK COMPATIBILITY IDEOGRAPH-FA12 | ||||
| # FA15..FA1E      [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E | ||||
| # FA20                 CJK COMPATIBILITY IDEOGRAPH-FA20 | ||||
| # FA22                 CJK COMPATIBILITY IDEOGRAPH-FA22 | ||||
| # FA25..FA26       [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26 | ||||
| # FA2A..FA6D      [68] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA6D | ||||
| # FA70..FAD9     [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 | ||||
| # 2F800..2FA1D   [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D | ||||
| 
 | ||||
| # Total code points: 1035 | ||||
| 
 | ||||
| # ================================================ | ||||
| # (4) Non-Starter Decompositions | ||||
| # | ||||
| # These characters can be derived from the UnicodeData.txt file | ||||
| # by including each expanding canonical decomposition | ||||
| # (i.e., those which canonically decompose to a sequence | ||||
| # of characters instead of a single character), such that: | ||||
| # | ||||
| # A. The character is not a Starter. | ||||
| # | ||||
| # OR (inclusive) | ||||
| # | ||||
| # B. The character's canonical decomposition begins | ||||
| # with a character that is not a Starter. | ||||
| # | ||||
| # Note that a "Starter" is any character with a zero combining class. | ||||
| # | ||||
| # These characters are simply quoted here for reference. | ||||
| # See also Full_Composition_Exclusion in DerivedNormalizationProps.txt | ||||
| # ================================================ | ||||
| 
 | ||||
| # 0344                 COMBINING GREEK DIALYTIKA TONOS | ||||
| # 0F73                 TIBETAN VOWEL SIGN II | ||||
| # 0F75                 TIBETAN VOWEL SIGN UU | ||||
| # 0F81                 TIBETAN VOWEL SIGN REVERSED II | ||||
| 
 | ||||
| # Total code points: 4 | ||||
| 
 | ||||
| # EOF | ||||
							
								
								
									
										19047
									
								
								unicode/UNIDATA/NormalizationTest.txt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										19047
									
								
								unicode/UNIDATA/NormalizationTest.txt
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										1743
									
								
								unicode/UNIDATA/PropList.txt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										1743
									
								
								unicode/UNIDATA/PropList.txt
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										281
									
								
								unicode/UNIDATA/SpecialCasing.txt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										281
									
								
								unicode/UNIDATA/SpecialCasing.txt
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,281 @@ | |||
| # SpecialCasing-14.0.0.txt | ||||
| # Date: 2021-03-08, 19:35:55 GMT | ||||
| # © 2021 Unicode®, Inc. | ||||
| # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. | ||||
| # For terms of use, see http://www.unicode.org/terms_of_use.html | ||||
| # | ||||
| # Unicode Character Database | ||||
| #   For documentation, see http://www.unicode.org/reports/tr44/ | ||||
| # | ||||
| # Special Casing | ||||
| # | ||||
| # This file is a supplement to the UnicodeData.txt file. It does not define any | ||||
| # properties, but rather provides additional information about the casing of | ||||
| # Unicode characters, for situations when casing incurs a change in string length | ||||
| # or is dependent on context or locale. For compatibility, the UnicodeData.txt | ||||
| # file only contains simple case mappings for characters where they are one-to-one | ||||
| # and independent of context and language. The data in this file, combined with | ||||
| # the simple case mappings in UnicodeData.txt, defines the full case mappings | ||||
| # Lowercase_Mapping (lc), Titlecase_Mapping (tc), and Uppercase_Mapping (uc). | ||||
| # | ||||
| # Note that the preferred mechanism for defining tailored casing operations is | ||||
| # the Unicode Common Locale Data Repository (CLDR). For more information, see the | ||||
| # discussion of case mappings and case algorithms in the Unicode Standard. | ||||
| # | ||||
| # All code points not listed in this file that do not have a simple case mappings | ||||
| # in UnicodeData.txt map to themselves. | ||||
| # ================================================================================ | ||||
| # Format | ||||
| # ================================================================================ | ||||
| # The entries in this file are in the following machine-readable format: | ||||
| # | ||||
| # <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment> | ||||
| # | ||||
| # <code>, <lower>, <title>, and <upper> provide the respective full case mappings | ||||
| # of <code>, expressed as character values in hex. If there is more than one character, | ||||
| # they are separated by spaces. Other than as used to separate elements, spaces are | ||||
| # to be ignored. | ||||
| # | ||||
| # The <condition_list> is optional. Where present, it consists of one or more language IDs | ||||
| # or casing contexts, separated by spaces. In these conditions: | ||||
| # - A condition list overrides the normal behavior if all of the listed conditions are true. | ||||
| # - The casing context is always the context of the characters in the original string, | ||||
| #   NOT in the resulting string. | ||||
| # - Case distinctions in the condition list are not significant. | ||||
| # - Conditions preceded by "Not_" represent the negation of the condition. | ||||
| # The condition list is not represented in the UCD as a formal property. | ||||
| # | ||||
| # A language ID is defined by BCP 47, with '-' and '_' treated equivalently. | ||||
| # | ||||
| # A casing context for a character is defined by Section 3.13 Default Case Algorithms | ||||
| # of The Unicode Standard. | ||||
| # | ||||
| # Parsers of this file must be prepared to deal with future additions to this format: | ||||
| #  * Additional contexts | ||||
| #  * Additional fields | ||||
| # ================================================================================ | ||||
| 
 | ||||
| # ================================================================================ | ||||
| # Unconditional mappings | ||||
| # ================================================================================ | ||||
| 
 | ||||
| # The German es-zed is special--the normal mapping is to SS. | ||||
| # Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>)) | ||||
| 
 | ||||
| 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S | ||||
| 
 | ||||
| # Preserve canonical equivalence for I with dot. Turkic is handled below. | ||||
| 
 | ||||
| 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE | ||||
| 
 | ||||
| # Ligatures | ||||
| 
 | ||||
| FB00; FB00; 0046 0066; 0046 0046; # LATIN SMALL LIGATURE FF | ||||
| FB01; FB01; 0046 0069; 0046 0049; # LATIN SMALL LIGATURE FI | ||||
| FB02; FB02; 0046 006C; 0046 004C; # LATIN SMALL LIGATURE FL | ||||
| FB03; FB03; 0046 0066 0069; 0046 0046 0049; # LATIN SMALL LIGATURE FFI | ||||
| FB04; FB04; 0046 0066 006C; 0046 0046 004C; # LATIN SMALL LIGATURE FFL | ||||
| FB05; FB05; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE LONG S T | ||||
| FB06; FB06; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE ST | ||||
| 
 | ||||
| 0587; 0587; 0535 0582; 0535 0552; # ARMENIAN SMALL LIGATURE ECH YIWN | ||||
| FB13; FB13; 0544 0576; 0544 0546; # ARMENIAN SMALL LIGATURE MEN NOW | ||||
| FB14; FB14; 0544 0565; 0544 0535; # ARMENIAN SMALL LIGATURE MEN ECH | ||||
| FB15; FB15; 0544 056B; 0544 053B; # ARMENIAN SMALL LIGATURE MEN INI | ||||
| FB16; FB16; 054E 0576; 054E 0546; # ARMENIAN SMALL LIGATURE VEW NOW | ||||
| FB17; FB17; 0544 056D; 0544 053D; # ARMENIAN SMALL LIGATURE MEN XEH | ||||
| 
 | ||||
| # No corresponding uppercase precomposed character | ||||
| 
 | ||||
| 0149; 0149; 02BC 004E; 02BC 004E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE | ||||
| 0390; 0390; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS | ||||
| 03B0; 03B0; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS | ||||
| 01F0; 01F0; 004A 030C; 004A 030C; # LATIN SMALL LETTER J WITH CARON | ||||
| 1E96; 1E96; 0048 0331; 0048 0331; # LATIN SMALL LETTER H WITH LINE BELOW | ||||
| 1E97; 1E97; 0054 0308; 0054 0308; # LATIN SMALL LETTER T WITH DIAERESIS | ||||
| 1E98; 1E98; 0057 030A; 0057 030A; # LATIN SMALL LETTER W WITH RING ABOVE | ||||
| 1E99; 1E99; 0059 030A; 0059 030A; # LATIN SMALL LETTER Y WITH RING ABOVE | ||||
| 1E9A; 1E9A; 0041 02BE; 0041 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING | ||||
| 1F50; 1F50; 03A5 0313; 03A5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI | ||||
| 1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA | ||||
| 1F54; 1F54; 03A5 0313 0301; 03A5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA | ||||
| 1F56; 1F56; 03A5 0313 0342; 03A5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI | ||||
| 1FB6; 1FB6; 0391 0342; 0391 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI | ||||
| 1FC6; 1FC6; 0397 0342; 0397 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI | ||||
| 1FD2; 1FD2; 0399 0308 0300; 0399 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA | ||||
| 1FD3; 1FD3; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA | ||||
| 1FD6; 1FD6; 0399 0342; 0399 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI | ||||
| 1FD7; 1FD7; 0399 0308 0342; 0399 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI | ||||
| 1FE2; 1FE2; 03A5 0308 0300; 03A5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA | ||||
| 1FE3; 1FE3; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA | ||||
| 1FE4; 1FE4; 03A1 0313; 03A1 0313; # GREEK SMALL LETTER RHO WITH PSILI | ||||
| 1FE6; 1FE6; 03A5 0342; 03A5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI | ||||
| 1FE7; 1FE7; 03A5 0308 0342; 03A5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI | ||||
| 1FF6; 1FF6; 03A9 0342; 03A9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI | ||||
| 
 | ||||
| # IMPORTANT-when iota-subscript (0345) is uppercased or titlecased, | ||||
| #  the result will be incorrect unless the iota-subscript is moved to the end | ||||
| #  of any sequence of combining marks. Otherwise, the accents will go on the capital iota. | ||||
| #  This process can be achieved by first transforming the text to NFC before casing. | ||||
| #  E.g. <alpha><iota_subscript><acute> is uppercased to <ALPHA><acute><IOTA> | ||||
| 
 | ||||
| # The following cases are already in the UnicodeData.txt file, so are only commented here. | ||||
| 
 | ||||
| # 0345; 0345; 0399; 0399; # COMBINING GREEK YPOGEGRAMMENI | ||||
| 
 | ||||
| # All letters with YPOGEGRAMMENI (iota-subscript) or PROSGEGRAMMENI (iota adscript) | ||||
| # have special uppercases. | ||||
| # Note: characters with PROSGEGRAMMENI are actually titlecase, not uppercase! | ||||
| 
 | ||||
| 1F80; 1F80; 1F88; 1F08 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI | ||||
| 1F81; 1F81; 1F89; 1F09 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI | ||||
| 1F82; 1F82; 1F8A; 1F0A 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI | ||||
| 1F83; 1F83; 1F8B; 1F0B 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI | ||||
| 1F84; 1F84; 1F8C; 1F0C 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI | ||||
| 1F85; 1F85; 1F8D; 1F0D 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI | ||||
| 1F86; 1F86; 1F8E; 1F0E 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI | ||||
| 1F87; 1F87; 1F8F; 1F0F 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI | ||||
| 1F88; 1F80; 1F88; 1F08 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI | ||||
| 1F89; 1F81; 1F89; 1F09 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI | ||||
| 1F8A; 1F82; 1F8A; 1F0A 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI | ||||
| 1F8B; 1F83; 1F8B; 1F0B 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI | ||||
| 1F8C; 1F84; 1F8C; 1F0C 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI | ||||
| 1F8D; 1F85; 1F8D; 1F0D 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI | ||||
| 1F8E; 1F86; 1F8E; 1F0E 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI | ||||
| 1F8F; 1F87; 1F8F; 1F0F 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI | ||||
| 1F90; 1F90; 1F98; 1F28 0399; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI | ||||
| 1F91; 1F91; 1F99; 1F29 0399; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI | ||||
| 1F92; 1F92; 1F9A; 1F2A 0399; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI | ||||
| 1F93; 1F93; 1F9B; 1F2B 0399; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI | ||||
| 1F94; 1F94; 1F9C; 1F2C 0399; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI | ||||
| 1F95; 1F95; 1F9D; 1F2D 0399; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI | ||||
| 1F96; 1F96; 1F9E; 1F2E 0399; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI | ||||
| 1F97; 1F97; 1F9F; 1F2F 0399; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI | ||||
| 1F98; 1F90; 1F98; 1F28 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI | ||||
| 1F99; 1F91; 1F99; 1F29 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI | ||||
| 1F9A; 1F92; 1F9A; 1F2A 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI | ||||
| 1F9B; 1F93; 1F9B; 1F2B 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI | ||||
| 1F9C; 1F94; 1F9C; 1F2C 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI | ||||
| 1F9D; 1F95; 1F9D; 1F2D 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI | ||||
| 1F9E; 1F96; 1F9E; 1F2E 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI | ||||
| 1F9F; 1F97; 1F9F; 1F2F 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI | ||||
| 1FA0; 1FA0; 1FA8; 1F68 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI | ||||
| 1FA1; 1FA1; 1FA9; 1F69 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI | ||||
| 1FA2; 1FA2; 1FAA; 1F6A 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI | ||||
| 1FA3; 1FA3; 1FAB; 1F6B 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI | ||||
| 1FA4; 1FA4; 1FAC; 1F6C 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI | ||||
| 1FA5; 1FA5; 1FAD; 1F6D 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI | ||||
| 1FA6; 1FA6; 1FAE; 1F6E 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI | ||||
| 1FA7; 1FA7; 1FAF; 1F6F 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI | ||||
| 1FA8; 1FA0; 1FA8; 1F68 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI | ||||
| 1FA9; 1FA1; 1FA9; 1F69 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI | ||||
| 1FAA; 1FA2; 1FAA; 1F6A 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI | ||||
| 1FAB; 1FA3; 1FAB; 1F6B 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI | ||||
| 1FAC; 1FA4; 1FAC; 1F6C 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI | ||||
| 1FAD; 1FA5; 1FAD; 1F6D 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI | ||||
| 1FAE; 1FA6; 1FAE; 1F6E 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI | ||||
| 1FAF; 1FA7; 1FAF; 1F6F 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI | ||||
| 1FB3; 1FB3; 1FBC; 0391 0399; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI | ||||
| 1FBC; 1FB3; 1FBC; 0391 0399; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI | ||||
| 1FC3; 1FC3; 1FCC; 0397 0399; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI | ||||
| 1FCC; 1FC3; 1FCC; 0397 0399; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI | ||||
| 1FF3; 1FF3; 1FFC; 03A9 0399; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI | ||||
| 1FFC; 1FF3; 1FFC; 03A9 0399; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI | ||||
| 
 | ||||
| # Some characters with YPOGEGRAMMENI also have no corresponding titlecases | ||||
| 
 | ||||
| 1FB2; 1FB2; 1FBA 0345; 1FBA 0399; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI | ||||
| 1FB4; 1FB4; 0386 0345; 0386 0399; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI | ||||
| 1FC2; 1FC2; 1FCA 0345; 1FCA 0399; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI | ||||
| 1FC4; 1FC4; 0389 0345; 0389 0399; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI | ||||
| 1FF2; 1FF2; 1FFA 0345; 1FFA 0399; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI | ||||
| 1FF4; 1FF4; 038F 0345; 038F 0399; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI | ||||
| 
 | ||||
| 1FB7; 1FB7; 0391 0342 0345; 0391 0342 0399; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI | ||||
| 1FC7; 1FC7; 0397 0342 0345; 0397 0342 0399; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI | ||||
| 1FF7; 1FF7; 03A9 0342 0345; 03A9 0342 0399; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI | ||||
| 
 | ||||
| # ================================================================================ | ||||
| # Conditional Mappings | ||||
| # The remainder of this file provides conditional casing data used to produce | ||||
| # full case mappings. | ||||
| # ================================================================================ | ||||
| # Language-Insensitive Mappings | ||||
| # These are characters whose full case mappings do not depend on language, but do | ||||
| # depend on context (which characters come before or after). For more information | ||||
| # see the header of this file and the Unicode Standard. | ||||
| # ================================================================================ | ||||
| 
 | ||||
| # Special case for final form of sigma | ||||
| 
 | ||||
| 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA | ||||
| 
 | ||||
| # Note: the following cases for non-final are already in the UnicodeData.txt file. | ||||
| 
 | ||||
| # 03A3; 03C3; 03A3; 03A3; # GREEK CAPITAL LETTER SIGMA | ||||
| # 03C3; 03C3; 03A3; 03A3; # GREEK SMALL LETTER SIGMA | ||||
| # 03C2; 03C2; 03A3; 03A3; # GREEK SMALL LETTER FINAL SIGMA | ||||
| 
 | ||||
| # Note: the following cases are not included, since they would case-fold in lowercasing | ||||
| 
 | ||||
| # 03C3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK SMALL LETTER SIGMA | ||||
| # 03C2; 03C3; 03A3; 03A3; Not_Final_Sigma; # GREEK SMALL LETTER FINAL SIGMA | ||||
| 
 | ||||
| # ================================================================================ | ||||
| # Language-Sensitive Mappings | ||||
| # These are characters whose full case mappings depend on language and perhaps also | ||||
| # context (which characters come before or after). For more information | ||||
| # see the header of this file and the Unicode Standard. | ||||
| # ================================================================================ | ||||
| 
 | ||||
| # Lithuanian | ||||
| 
 | ||||
| # Lithuanian retains the dot in a lowercase i when followed by accents. | ||||
| 
 | ||||
| # Remove DOT ABOVE after "i" with upper or titlecase | ||||
| 
 | ||||
| 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE | ||||
| 
 | ||||
| # Introduce an explicit dot above when lowercasing capital I's and J's | ||||
| # whenever there are more accents above. | ||||
| # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) | ||||
| 
 | ||||
| 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I | ||||
| 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J | ||||
| 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK | ||||
| 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE | ||||
| 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE | ||||
| 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE | ||||
| 
 | ||||
| # ================================================================================ | ||||
| 
 | ||||
| # Turkish and Azeri | ||||
| 
 | ||||
| # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri | ||||
| # The following rules handle those cases. | ||||
| 
 | ||||
| 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE | ||||
| 0130; 0069; 0130; 0130; az; # LATIN CAPITAL LETTER I WITH DOT ABOVE | ||||
| 
 | ||||
| # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. | ||||
| # This matches the behavior of the canonically equivalent I-dot_above | ||||
| 
 | ||||
| 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE | ||||
| 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE | ||||
| 
 | ||||
| # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. | ||||
| 
 | ||||
| 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I | ||||
| 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I | ||||
| 
 | ||||
| # When uppercasing, i turns into a dotted capital I | ||||
| 
 | ||||
| 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I | ||||
| 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I | ||||
| 
 | ||||
| # Note: the following case is already in the UnicodeData.txt file. | ||||
| 
 | ||||
| # 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I | ||||
| 
 | ||||
| # EOF | ||||
| 
 | ||||
							
								
								
									
										34626
									
								
								unicode/UNIDATA/UnicodeData.txt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										34626
									
								
								unicode/UNIDATA/UnicodeData.txt
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										1441
									
								
								unicode/UNIDATA/WordBreakProperty.txt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										1441
									
								
								unicode/UNIDATA/WordBreakProperty.txt
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
		Reference in a new issue
	
	 tmtt
						tmtt