feat: 9.5.9
This commit is contained in:
		
							parent
							
								
									cb1753732b
								
							
						
					
					
						commit
						35f43a7909
					
				
					 1084 changed files with 558985 additions and 0 deletions
				
			
		
							
								
								
									
										21
									
								
								unicode/Makefile
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								unicode/Makefile
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,21 @@ | |||
| Scheme=../bin/scheme | ||||
| 
 | ||||
| doit: unicode-char-cases.ss unicode-charinfo.ss | ||||
| 
 | ||||
| unicode-char-cases.ss: extract-char-cases.ss unicode-data.ss | ||||
| 	echo | $(Scheme) -q extract-char-cases.ss | ||||
| 
 | ||||
| unicode-charinfo.ss: extract-info.ss unicode-data.ss | ||||
| 	echo | $(Scheme) -q extract-info.ss | ||||
| 
 | ||||
| unicode-char-cases.ss: \ | ||||
|  UNIDATA/CompositionExclusions.txt\
 | ||||
|  UNIDATA/UnicodeData.txt\
 | ||||
|  UNIDATA/CaseFolding.txt\
 | ||||
|  UNIDATA/SpecialCasing.txt | ||||
| 
 | ||||
| unicode-charinfo.ss: \ | ||||
|  UNIDATA/UnicodeData.txt\
 | ||||
|  UNIDATA/WordBreakProperty.txt\
 | ||||
|  UNIDATA/PropList.txt | ||||
| 
 | ||||
							
								
								
									
										12
									
								
								unicode/ReadMe
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										12
									
								
								unicode/ReadMe
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,12 @@ | |||
| To rebuild unicode-char-cases.ss and unicode-charinfo.ss, download into | ||||
| ./UNIDATA the following files: | ||||
| 
 | ||||
| http://www.unicode.org/Public/UCD/latest/ucd/CompositionExclusions.txt | ||||
| http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt | ||||
| http://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt | ||||
| http://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt | ||||
| http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt | ||||
| http://www.unicode.org/Public/UCD/latest/ucd/PropList.txt | ||||
| http://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt | ||||
| 
 | ||||
| Then run 'make'. | ||||
							
								
								
									
										1624
									
								
								unicode/UNIDATA/CaseFolding.txt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										1624
									
								
								unicode/UNIDATA/CaseFolding.txt
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										221
									
								
								unicode/UNIDATA/CompositionExclusions.txt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										221
									
								
								unicode/UNIDATA/CompositionExclusions.txt
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,221 @@ | |||
| # CompositionExclusions-14.0.0.txt | ||||
| # Date: 2021-03-30, 23:59:00 GMT [KW, LI] | ||||
| # © 2021 Unicode®, Inc. | ||||
| # For terms of use, see https://www.unicode.org/terms_of_use.html | ||||
| # | ||||
| # Unicode Character Database | ||||
| # For documentation, see https://www.unicode.org/reports/tr44/ | ||||
| # | ||||
| # This file lists the characters for the Composition Exclusion Table | ||||
| # defined in UAX #15, Unicode Normalization Forms. | ||||
| # | ||||
| # This file is a normative contributory data file in the | ||||
| # Unicode Character Database. | ||||
| # | ||||
| # For more information, see | ||||
| # https://www.unicode.org/reports/tr15/#Primary_Exclusion_List_Table | ||||
| # | ||||
| # For a full derivation of composition exclusions, see the derived property | ||||
| # Full_Composition_Exclusion in DerivedNormalizationProps.txt | ||||
| # | ||||
| 
 | ||||
| # ================================================ | ||||
| # (1) Script Specifics | ||||
| # | ||||
| # This list of characters cannot be derived from the UnicodeData.txt file. | ||||
| # | ||||
| # Included are the following subcategories: | ||||
| # | ||||
| # - Many precomposed characters using a nukta diacritic in the Devanagari, | ||||
| #   Bangla/Bengali, Gurmukhi, or Odia/Oriya scripts. | ||||
| # - Tibetan letters and subjoined letters with decompositions including  | ||||
| #   U+0FB7 TIBETAN SUBJOINED LETTER HA or U+0FB5 TIBETAN SUBJOINED LETTER SSA. | ||||
| # - Two two-part Tibetan vowel signs involving top and bottom pieces. | ||||
| # - A large collection of compatibility precomposed characters for Hebrew | ||||
| #   involving dagesh and/or other combining marks. | ||||
| # | ||||
| # This list is unlikely to grow. | ||||
| # | ||||
| # ================================================ | ||||
| 
 | ||||
| 0958    #  DEVANAGARI LETTER QA | ||||
| 0959    #  DEVANAGARI LETTER KHHA | ||||
| 095A    #  DEVANAGARI LETTER GHHA | ||||
| 095B    #  DEVANAGARI LETTER ZA | ||||
| 095C    #  DEVANAGARI LETTER DDDHA | ||||
| 095D    #  DEVANAGARI LETTER RHA | ||||
| 095E    #  DEVANAGARI LETTER FA | ||||
| 095F    #  DEVANAGARI LETTER YYA | ||||
| 09DC    #  BENGALI LETTER RRA | ||||
| 09DD    #  BENGALI LETTER RHA | ||||
| 09DF    #  BENGALI LETTER YYA | ||||
| 0A33    #  GURMUKHI LETTER LLA | ||||
| 0A36    #  GURMUKHI LETTER SHA | ||||
| 0A59    #  GURMUKHI LETTER KHHA | ||||
| 0A5A    #  GURMUKHI LETTER GHHA | ||||
| 0A5B    #  GURMUKHI LETTER ZA | ||||
| 0A5E    #  GURMUKHI LETTER FA | ||||
| 0B5C    #  ORIYA LETTER RRA | ||||
| 0B5D    #  ORIYA LETTER RHA | ||||
| 0F43    #  TIBETAN LETTER GHA | ||||
| 0F4D    #  TIBETAN LETTER DDHA | ||||
| 0F52    #  TIBETAN LETTER DHA | ||||
| 0F57    #  TIBETAN LETTER BHA | ||||
| 0F5C    #  TIBETAN LETTER DZHA | ||||
| 0F69    #  TIBETAN LETTER KSSA | ||||
| 0F76    #  TIBETAN VOWEL SIGN VOCALIC R | ||||
| 0F78    #  TIBETAN VOWEL SIGN VOCALIC L | ||||
| 0F93    #  TIBETAN SUBJOINED LETTER GHA | ||||
| 0F9D    #  TIBETAN SUBJOINED LETTER DDHA | ||||
| 0FA2    #  TIBETAN SUBJOINED LETTER DHA | ||||
| 0FA7    #  TIBETAN SUBJOINED LETTER BHA | ||||
| 0FAC    #  TIBETAN SUBJOINED LETTER DZHA | ||||
| 0FB9    #  TIBETAN SUBJOINED LETTER KSSA | ||||
| FB1D    #  HEBREW LETTER YOD WITH HIRIQ | ||||
| FB1F    #  HEBREW LIGATURE YIDDISH YOD YOD PATAH | ||||
| FB2A    #  HEBREW LETTER SHIN WITH SHIN DOT | ||||
| FB2B    #  HEBREW LETTER SHIN WITH SIN DOT | ||||
| FB2C    #  HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT | ||||
| FB2D    #  HEBREW LETTER SHIN WITH DAGESH AND SIN DOT | ||||
| FB2E    #  HEBREW LETTER ALEF WITH PATAH | ||||
| FB2F    #  HEBREW LETTER ALEF WITH QAMATS | ||||
| FB30    #  HEBREW LETTER ALEF WITH MAPIQ | ||||
| FB31    #  HEBREW LETTER BET WITH DAGESH | ||||
| FB32    #  HEBREW LETTER GIMEL WITH DAGESH | ||||
| FB33    #  HEBREW LETTER DALET WITH DAGESH | ||||
| FB34    #  HEBREW LETTER HE WITH MAPIQ | ||||
| FB35    #  HEBREW LETTER VAV WITH DAGESH | ||||
| FB36    #  HEBREW LETTER ZAYIN WITH DAGESH | ||||
| FB38    #  HEBREW LETTER TET WITH DAGESH | ||||
| FB39    #  HEBREW LETTER YOD WITH DAGESH | ||||
| FB3A    #  HEBREW LETTER FINAL KAF WITH DAGESH | ||||
| FB3B    #  HEBREW LETTER KAF WITH DAGESH | ||||
| FB3C    #  HEBREW LETTER LAMED WITH DAGESH | ||||
| FB3E    #  HEBREW LETTER MEM WITH DAGESH | ||||
| FB40    #  HEBREW LETTER NUN WITH DAGESH | ||||
| FB41    #  HEBREW LETTER SAMEKH WITH DAGESH | ||||
| FB43    #  HEBREW LETTER FINAL PE WITH DAGESH | ||||
| FB44    #  HEBREW LETTER PE WITH DAGESH | ||||
| FB46    #  HEBREW LETTER TSADI WITH DAGESH | ||||
| FB47    #  HEBREW LETTER QOF WITH DAGESH | ||||
| FB48    #  HEBREW LETTER RESH WITH DAGESH | ||||
| FB49    #  HEBREW LETTER SHIN WITH DAGESH | ||||
| FB4A    #  HEBREW LETTER TAV WITH DAGESH | ||||
| FB4B    #  HEBREW LETTER VAV WITH HOLAM | ||||
| FB4C    #  HEBREW LETTER BET WITH RAFE | ||||
| FB4D    #  HEBREW LETTER KAF WITH RAFE | ||||
| FB4E    #  HEBREW LETTER PE WITH RAFE | ||||
| 
 | ||||
| # Total code points: 67 | ||||
| 
 | ||||
| # ================================================ | ||||
| # (2) Post Composition Version precomposed characters | ||||
| # | ||||
| # These characters cannot be derived solely from the UnicodeData.txt file | ||||
| # in this version of Unicode. | ||||
| # | ||||
| # Note that characters added to the standard after the | ||||
| # Composition Version and which have canonical decomposition mappings | ||||
| # are not automatically added to this list of Post Composition | ||||
| # Version precomposed characters. | ||||
| # ================================================ | ||||
| 
 | ||||
| 2ADC    #  FORKING | ||||
| 1D15E   #  MUSICAL SYMBOL HALF NOTE | ||||
| 1D15F   #  MUSICAL SYMBOL QUARTER NOTE | ||||
| 1D160   #  MUSICAL SYMBOL EIGHTH NOTE | ||||
| 1D161   #  MUSICAL SYMBOL SIXTEENTH NOTE | ||||
| 1D162   #  MUSICAL SYMBOL THIRTY-SECOND NOTE | ||||
| 1D163   #  MUSICAL SYMBOL SIXTY-FOURTH NOTE | ||||
| 1D164   #  MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE | ||||
| 1D1BB   #  MUSICAL SYMBOL MINIMA | ||||
| 1D1BC   #  MUSICAL SYMBOL MINIMA BLACK | ||||
| 1D1BD   #  MUSICAL SYMBOL SEMIMINIMA WHITE | ||||
| 1D1BE   #  MUSICAL SYMBOL SEMIMINIMA BLACK | ||||
| 1D1BF   #  MUSICAL SYMBOL FUSA WHITE | ||||
| 1D1C0   #  MUSICAL SYMBOL FUSA BLACK | ||||
| 
 | ||||
| # Total code points: 14 | ||||
| 
 | ||||
| # ================================================ | ||||
| # (3) Singleton Decompositions | ||||
| # | ||||
| # These characters can be derived from the UnicodeData.txt file | ||||
| # by including all canonically decomposable characters whose | ||||
| # canonical decomposition consists of a single character. | ||||
| # | ||||
| # These characters are simply quoted here for reference. | ||||
| # See also Full_Composition_Exclusion in DerivedNormalizationProps.txt | ||||
| # ================================================ | ||||
| 
 | ||||
| # 0340..0341       [2] COMBINING GRAVE TONE MARK..COMBINING ACUTE TONE MARK | ||||
| # 0343                 COMBINING GREEK KORONIS | ||||
| # 0374                 GREEK NUMERAL SIGN | ||||
| # 037E                 GREEK QUESTION MARK | ||||
| # 0387                 GREEK ANO TELEIA | ||||
| # 1F71                 GREEK SMALL LETTER ALPHA WITH OXIA | ||||
| # 1F73                 GREEK SMALL LETTER EPSILON WITH OXIA | ||||
| # 1F75                 GREEK SMALL LETTER ETA WITH OXIA | ||||
| # 1F77                 GREEK SMALL LETTER IOTA WITH OXIA | ||||
| # 1F79                 GREEK SMALL LETTER OMICRON WITH OXIA | ||||
| # 1F7B                 GREEK SMALL LETTER UPSILON WITH OXIA | ||||
| # 1F7D                 GREEK SMALL LETTER OMEGA WITH OXIA | ||||
| # 1FBB                 GREEK CAPITAL LETTER ALPHA WITH OXIA | ||||
| # 1FBE                 GREEK PROSGEGRAMMENI | ||||
| # 1FC9                 GREEK CAPITAL LETTER EPSILON WITH OXIA | ||||
| # 1FCB                 GREEK CAPITAL LETTER ETA WITH OXIA | ||||
| # 1FD3                 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA | ||||
| # 1FDB                 GREEK CAPITAL LETTER IOTA WITH OXIA | ||||
| # 1FE3                 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA | ||||
| # 1FEB                 GREEK CAPITAL LETTER UPSILON WITH OXIA | ||||
| # 1FEE..1FEF       [2] GREEK DIALYTIKA AND OXIA..GREEK VARIA | ||||
| # 1FF9                 GREEK CAPITAL LETTER OMICRON WITH OXIA | ||||
| # 1FFB                 GREEK CAPITAL LETTER OMEGA WITH OXIA | ||||
| # 1FFD                 GREEK OXIA | ||||
| # 2000..2001       [2] EN QUAD..EM QUAD | ||||
| # 2126                 OHM SIGN | ||||
| # 212A..212B       [2] KELVIN SIGN..ANGSTROM SIGN | ||||
| # 2329                 LEFT-POINTING ANGLE BRACKET | ||||
| # 232A                 RIGHT-POINTING ANGLE BRACKET | ||||
| # F900..FA0D     [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D | ||||
| # FA10                 CJK COMPATIBILITY IDEOGRAPH-FA10 | ||||
| # FA12                 CJK COMPATIBILITY IDEOGRAPH-FA12 | ||||
| # FA15..FA1E      [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E | ||||
| # FA20                 CJK COMPATIBILITY IDEOGRAPH-FA20 | ||||
| # FA22                 CJK COMPATIBILITY IDEOGRAPH-FA22 | ||||
| # FA25..FA26       [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26 | ||||
| # FA2A..FA6D      [68] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA6D | ||||
| # FA70..FAD9     [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9 | ||||
| # 2F800..2FA1D   [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D | ||||
| 
 | ||||
| # Total code points: 1035 | ||||
| 
 | ||||
| # ================================================ | ||||
| # (4) Non-Starter Decompositions | ||||
| # | ||||
| # These characters can be derived from the UnicodeData.txt file | ||||
| # by including each expanding canonical decomposition | ||||
| # (i.e., those which canonically decompose to a sequence | ||||
| # of characters instead of a single character), such that: | ||||
| # | ||||
| # A. The character is not a Starter. | ||||
| # | ||||
| # OR (inclusive) | ||||
| # | ||||
| # B. The character's canonical decomposition begins | ||||
| # with a character that is not a Starter. | ||||
| # | ||||
| # Note that a "Starter" is any character with a zero combining class. | ||||
| # | ||||
| # These characters are simply quoted here for reference. | ||||
| # See also Full_Composition_Exclusion in DerivedNormalizationProps.txt | ||||
| # ================================================ | ||||
| 
 | ||||
| # 0344                 COMBINING GREEK DIALYTIKA TONOS | ||||
| # 0F73                 TIBETAN VOWEL SIGN II | ||||
| # 0F75                 TIBETAN VOWEL SIGN UU | ||||
| # 0F81                 TIBETAN VOWEL SIGN REVERSED II | ||||
| 
 | ||||
| # Total code points: 4 | ||||
| 
 | ||||
| # EOF | ||||
							
								
								
									
										19047
									
								
								unicode/UNIDATA/NormalizationTest.txt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										19047
									
								
								unicode/UNIDATA/NormalizationTest.txt
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										1743
									
								
								unicode/UNIDATA/PropList.txt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										1743
									
								
								unicode/UNIDATA/PropList.txt
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										281
									
								
								unicode/UNIDATA/SpecialCasing.txt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										281
									
								
								unicode/UNIDATA/SpecialCasing.txt
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,281 @@ | |||
| # SpecialCasing-14.0.0.txt | ||||
| # Date: 2021-03-08, 19:35:55 GMT | ||||
| # © 2021 Unicode®, Inc. | ||||
| # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries. | ||||
| # For terms of use, see http://www.unicode.org/terms_of_use.html | ||||
| # | ||||
| # Unicode Character Database | ||||
| #   For documentation, see http://www.unicode.org/reports/tr44/ | ||||
| # | ||||
| # Special Casing | ||||
| # | ||||
| # This file is a supplement to the UnicodeData.txt file. It does not define any | ||||
| # properties, but rather provides additional information about the casing of | ||||
| # Unicode characters, for situations when casing incurs a change in string length | ||||
| # or is dependent on context or locale. For compatibility, the UnicodeData.txt | ||||
| # file only contains simple case mappings for characters where they are one-to-one | ||||
| # and independent of context and language. The data in this file, combined with | ||||
| # the simple case mappings in UnicodeData.txt, defines the full case mappings | ||||
| # Lowercase_Mapping (lc), Titlecase_Mapping (tc), and Uppercase_Mapping (uc). | ||||
| # | ||||
| # Note that the preferred mechanism for defining tailored casing operations is | ||||
| # the Unicode Common Locale Data Repository (CLDR). For more information, see the | ||||
| # discussion of case mappings and case algorithms in the Unicode Standard. | ||||
| # | ||||
| # All code points not listed in this file that do not have a simple case mappings | ||||
| # in UnicodeData.txt map to themselves. | ||||
| # ================================================================================ | ||||
| # Format | ||||
| # ================================================================================ | ||||
| # The entries in this file are in the following machine-readable format: | ||||
| # | ||||
| # <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment> | ||||
| # | ||||
| # <code>, <lower>, <title>, and <upper> provide the respective full case mappings | ||||
| # of <code>, expressed as character values in hex. If there is more than one character, | ||||
| # they are separated by spaces. Other than as used to separate elements, spaces are | ||||
| # to be ignored. | ||||
| # | ||||
| # The <condition_list> is optional. Where present, it consists of one or more language IDs | ||||
| # or casing contexts, separated by spaces. In these conditions: | ||||
| # - A condition list overrides the normal behavior if all of the listed conditions are true. | ||||
| # - The casing context is always the context of the characters in the original string, | ||||
| #   NOT in the resulting string. | ||||
| # - Case distinctions in the condition list are not significant. | ||||
| # - Conditions preceded by "Not_" represent the negation of the condition. | ||||
| # The condition list is not represented in the UCD as a formal property. | ||||
| # | ||||
| # A language ID is defined by BCP 47, with '-' and '_' treated equivalently. | ||||
| # | ||||
| # A casing context for a character is defined by Section 3.13 Default Case Algorithms | ||||
| # of The Unicode Standard. | ||||
| # | ||||
| # Parsers of this file must be prepared to deal with future additions to this format: | ||||
| #  * Additional contexts | ||||
| #  * Additional fields | ||||
| # ================================================================================ | ||||
| 
 | ||||
| # ================================================================================ | ||||
| # Unconditional mappings | ||||
| # ================================================================================ | ||||
| 
 | ||||
| # The German es-zed is special--the normal mapping is to SS. | ||||
| # Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>)) | ||||
| 
 | ||||
| 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S | ||||
| 
 | ||||
| # Preserve canonical equivalence for I with dot. Turkic is handled below. | ||||
| 
 | ||||
| 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE | ||||
| 
 | ||||
| # Ligatures | ||||
| 
 | ||||
| FB00; FB00; 0046 0066; 0046 0046; # LATIN SMALL LIGATURE FF | ||||
| FB01; FB01; 0046 0069; 0046 0049; # LATIN SMALL LIGATURE FI | ||||
| FB02; FB02; 0046 006C; 0046 004C; # LATIN SMALL LIGATURE FL | ||||
| FB03; FB03; 0046 0066 0069; 0046 0046 0049; # LATIN SMALL LIGATURE FFI | ||||
| FB04; FB04; 0046 0066 006C; 0046 0046 004C; # LATIN SMALL LIGATURE FFL | ||||
| FB05; FB05; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE LONG S T | ||||
| FB06; FB06; 0053 0074; 0053 0054; # LATIN SMALL LIGATURE ST | ||||
| 
 | ||||
| 0587; 0587; 0535 0582; 0535 0552; # ARMENIAN SMALL LIGATURE ECH YIWN | ||||
| FB13; FB13; 0544 0576; 0544 0546; # ARMENIAN SMALL LIGATURE MEN NOW | ||||
| FB14; FB14; 0544 0565; 0544 0535; # ARMENIAN SMALL LIGATURE MEN ECH | ||||
| FB15; FB15; 0544 056B; 0544 053B; # ARMENIAN SMALL LIGATURE MEN INI | ||||
| FB16; FB16; 054E 0576; 054E 0546; # ARMENIAN SMALL LIGATURE VEW NOW | ||||
| FB17; FB17; 0544 056D; 0544 053D; # ARMENIAN SMALL LIGATURE MEN XEH | ||||
| 
 | ||||
| # No corresponding uppercase precomposed character | ||||
| 
 | ||||
| 0149; 0149; 02BC 004E; 02BC 004E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE | ||||
| 0390; 0390; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS | ||||
| 03B0; 03B0; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS | ||||
| 01F0; 01F0; 004A 030C; 004A 030C; # LATIN SMALL LETTER J WITH CARON | ||||
| 1E96; 1E96; 0048 0331; 0048 0331; # LATIN SMALL LETTER H WITH LINE BELOW | ||||
| 1E97; 1E97; 0054 0308; 0054 0308; # LATIN SMALL LETTER T WITH DIAERESIS | ||||
| 1E98; 1E98; 0057 030A; 0057 030A; # LATIN SMALL LETTER W WITH RING ABOVE | ||||
| 1E99; 1E99; 0059 030A; 0059 030A; # LATIN SMALL LETTER Y WITH RING ABOVE | ||||
| 1E9A; 1E9A; 0041 02BE; 0041 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING | ||||
| 1F50; 1F50; 03A5 0313; 03A5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI | ||||
| 1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA | ||||
| 1F54; 1F54; 03A5 0313 0301; 03A5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA | ||||
| 1F56; 1F56; 03A5 0313 0342; 03A5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI | ||||
| 1FB6; 1FB6; 0391 0342; 0391 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI | ||||
| 1FC6; 1FC6; 0397 0342; 0397 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI | ||||
| 1FD2; 1FD2; 0399 0308 0300; 0399 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA | ||||
| 1FD3; 1FD3; 0399 0308 0301; 0399 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA | ||||
| 1FD6; 1FD6; 0399 0342; 0399 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI | ||||
| 1FD7; 1FD7; 0399 0308 0342; 0399 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI | ||||
| 1FE2; 1FE2; 03A5 0308 0300; 03A5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA | ||||
| 1FE3; 1FE3; 03A5 0308 0301; 03A5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA | ||||
| 1FE4; 1FE4; 03A1 0313; 03A1 0313; # GREEK SMALL LETTER RHO WITH PSILI | ||||
| 1FE6; 1FE6; 03A5 0342; 03A5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI | ||||
| 1FE7; 1FE7; 03A5 0308 0342; 03A5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI | ||||
| 1FF6; 1FF6; 03A9 0342; 03A9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI | ||||
| 
 | ||||
| # IMPORTANT-when iota-subscript (0345) is uppercased or titlecased, | ||||
| #  the result will be incorrect unless the iota-subscript is moved to the end | ||||
| #  of any sequence of combining marks. Otherwise, the accents will go on the capital iota. | ||||
| #  This process can be achieved by first transforming the text to NFC before casing. | ||||
| #  E.g. <alpha><iota_subscript><acute> is uppercased to <ALPHA><acute><IOTA> | ||||
| 
 | ||||
| # The following cases are already in the UnicodeData.txt file, so are only commented here. | ||||
| 
 | ||||
| # 0345; 0345; 0399; 0399; # COMBINING GREEK YPOGEGRAMMENI | ||||
| 
 | ||||
| # All letters with YPOGEGRAMMENI (iota-subscript) or PROSGEGRAMMENI (iota adscript) | ||||
| # have special uppercases. | ||||
| # Note: characters with PROSGEGRAMMENI are actually titlecase, not uppercase! | ||||
| 
 | ||||
| 1F80; 1F80; 1F88; 1F08 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI | ||||
| 1F81; 1F81; 1F89; 1F09 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI | ||||
| 1F82; 1F82; 1F8A; 1F0A 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI | ||||
| 1F83; 1F83; 1F8B; 1F0B 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI | ||||
| 1F84; 1F84; 1F8C; 1F0C 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI | ||||
| 1F85; 1F85; 1F8D; 1F0D 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI | ||||
| 1F86; 1F86; 1F8E; 1F0E 0399; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI | ||||
| 1F87; 1F87; 1F8F; 1F0F 0399; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI | ||||
| 1F88; 1F80; 1F88; 1F08 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI | ||||
| 1F89; 1F81; 1F89; 1F09 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI | ||||
| 1F8A; 1F82; 1F8A; 1F0A 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI | ||||
| 1F8B; 1F83; 1F8B; 1F0B 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI | ||||
| 1F8C; 1F84; 1F8C; 1F0C 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI | ||||
| 1F8D; 1F85; 1F8D; 1F0D 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI | ||||
| 1F8E; 1F86; 1F8E; 1F0E 0399; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI | ||||
| 1F8F; 1F87; 1F8F; 1F0F 0399; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI | ||||
| 1F90; 1F90; 1F98; 1F28 0399; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI | ||||
| 1F91; 1F91; 1F99; 1F29 0399; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI | ||||
| 1F92; 1F92; 1F9A; 1F2A 0399; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI | ||||
| 1F93; 1F93; 1F9B; 1F2B 0399; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI | ||||
| 1F94; 1F94; 1F9C; 1F2C 0399; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI | ||||
| 1F95; 1F95; 1F9D; 1F2D 0399; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI | ||||
| 1F96; 1F96; 1F9E; 1F2E 0399; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI | ||||
| 1F97; 1F97; 1F9F; 1F2F 0399; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI | ||||
| 1F98; 1F90; 1F98; 1F28 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI | ||||
| 1F99; 1F91; 1F99; 1F29 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI | ||||
| 1F9A; 1F92; 1F9A; 1F2A 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI | ||||
| 1F9B; 1F93; 1F9B; 1F2B 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI | ||||
| 1F9C; 1F94; 1F9C; 1F2C 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI | ||||
| 1F9D; 1F95; 1F9D; 1F2D 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI | ||||
| 1F9E; 1F96; 1F9E; 1F2E 0399; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI | ||||
| 1F9F; 1F97; 1F9F; 1F2F 0399; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI | ||||
| 1FA0; 1FA0; 1FA8; 1F68 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI | ||||
| 1FA1; 1FA1; 1FA9; 1F69 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI | ||||
| 1FA2; 1FA2; 1FAA; 1F6A 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI | ||||
| 1FA3; 1FA3; 1FAB; 1F6B 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI | ||||
| 1FA4; 1FA4; 1FAC; 1F6C 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI | ||||
| 1FA5; 1FA5; 1FAD; 1F6D 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI | ||||
| 1FA6; 1FA6; 1FAE; 1F6E 0399; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI | ||||
| 1FA7; 1FA7; 1FAF; 1F6F 0399; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI | ||||
| 1FA8; 1FA0; 1FA8; 1F68 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI | ||||
| 1FA9; 1FA1; 1FA9; 1F69 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI | ||||
| 1FAA; 1FA2; 1FAA; 1F6A 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI | ||||
| 1FAB; 1FA3; 1FAB; 1F6B 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI | ||||
| 1FAC; 1FA4; 1FAC; 1F6C 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI | ||||
| 1FAD; 1FA5; 1FAD; 1F6D 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI | ||||
| 1FAE; 1FA6; 1FAE; 1F6E 0399; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI | ||||
| 1FAF; 1FA7; 1FAF; 1F6F 0399; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI | ||||
| 1FB3; 1FB3; 1FBC; 0391 0399; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI | ||||
| 1FBC; 1FB3; 1FBC; 0391 0399; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI | ||||
| 1FC3; 1FC3; 1FCC; 0397 0399; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI | ||||
| 1FCC; 1FC3; 1FCC; 0397 0399; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI | ||||
| 1FF3; 1FF3; 1FFC; 03A9 0399; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI | ||||
| 1FFC; 1FF3; 1FFC; 03A9 0399; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI | ||||
| 
 | ||||
| # Some characters with YPOGEGRAMMENI also have no corresponding titlecases | ||||
| 
 | ||||
| 1FB2; 1FB2; 1FBA 0345; 1FBA 0399; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI | ||||
| 1FB4; 1FB4; 0386 0345; 0386 0399; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI | ||||
| 1FC2; 1FC2; 1FCA 0345; 1FCA 0399; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI | ||||
| 1FC4; 1FC4; 0389 0345; 0389 0399; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI | ||||
| 1FF2; 1FF2; 1FFA 0345; 1FFA 0399; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI | ||||
| 1FF4; 1FF4; 038F 0345; 038F 0399; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI | ||||
| 
 | ||||
| 1FB7; 1FB7; 0391 0342 0345; 0391 0342 0399; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI | ||||
| 1FC7; 1FC7; 0397 0342 0345; 0397 0342 0399; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI | ||||
| 1FF7; 1FF7; 03A9 0342 0345; 03A9 0342 0399; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI | ||||
| 
 | ||||
| # ================================================================================ | ||||
| # Conditional Mappings | ||||
| # The remainder of this file provides conditional casing data used to produce | ||||
| # full case mappings. | ||||
| # ================================================================================ | ||||
| # Language-Insensitive Mappings | ||||
| # These are characters whose full case mappings do not depend on language, but do | ||||
| # depend on context (which characters come before or after). For more information | ||||
| # see the header of this file and the Unicode Standard. | ||||
| # ================================================================================ | ||||
| 
 | ||||
| # Special case for final form of sigma | ||||
| 
 | ||||
| 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA | ||||
| 
 | ||||
| # Note: the following cases for non-final are already in the UnicodeData.txt file. | ||||
| 
 | ||||
| # 03A3; 03C3; 03A3; 03A3; # GREEK CAPITAL LETTER SIGMA | ||||
| # 03C3; 03C3; 03A3; 03A3; # GREEK SMALL LETTER SIGMA | ||||
| # 03C2; 03C2; 03A3; 03A3; # GREEK SMALL LETTER FINAL SIGMA | ||||
| 
 | ||||
| # Note: the following cases are not included, since they would case-fold in lowercasing | ||||
| 
 | ||||
| # 03C3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK SMALL LETTER SIGMA | ||||
| # 03C2; 03C3; 03A3; 03A3; Not_Final_Sigma; # GREEK SMALL LETTER FINAL SIGMA | ||||
| 
 | ||||
| # ================================================================================ | ||||
| # Language-Sensitive Mappings | ||||
| # These are characters whose full case mappings depend on language and perhaps also | ||||
| # context (which characters come before or after). For more information | ||||
| # see the header of this file and the Unicode Standard. | ||||
| # ================================================================================ | ||||
| 
 | ||||
| # Lithuanian | ||||
| 
 | ||||
| # Lithuanian retains the dot in a lowercase i when followed by accents. | ||||
| 
 | ||||
| # Remove DOT ABOVE after "i" with upper or titlecase | ||||
| 
 | ||||
| 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE | ||||
| 
 | ||||
| # Introduce an explicit dot above when lowercasing capital I's and J's | ||||
| # whenever there are more accents above. | ||||
| # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) | ||||
| 
 | ||||
| 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I | ||||
| 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J | ||||
| 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK | ||||
| 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE | ||||
| 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE | ||||
| 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE | ||||
| 
 | ||||
| # ================================================================================ | ||||
| 
 | ||||
| # Turkish and Azeri | ||||
| 
 | ||||
| # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri | ||||
| # The following rules handle those cases. | ||||
| 
 | ||||
| 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE | ||||
| 0130; 0069; 0130; 0130; az; # LATIN CAPITAL LETTER I WITH DOT ABOVE | ||||
| 
 | ||||
| # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. | ||||
| # This matches the behavior of the canonically equivalent I-dot_above | ||||
| 
 | ||||
| 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE | ||||
| 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE | ||||
| 
 | ||||
| # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. | ||||
| 
 | ||||
| 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I | ||||
| 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I | ||||
| 
 | ||||
| # When uppercasing, i turns into a dotted capital I | ||||
| 
 | ||||
| 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I | ||||
| 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I | ||||
| 
 | ||||
| # Note: the following case is already in the UnicodeData.txt file. | ||||
| 
 | ||||
| # 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I | ||||
| 
 | ||||
| # EOF | ||||
| 
 | ||||
							
								
								
									
										34626
									
								
								unicode/UNIDATA/UnicodeData.txt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										34626
									
								
								unicode/UNIDATA/UnicodeData.txt
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										1441
									
								
								unicode/UNIDATA/WordBreakProperty.txt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										1441
									
								
								unicode/UNIDATA/WordBreakProperty.txt
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										249
									
								
								unicode/extract-char-cases.ss
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										249
									
								
								unicode/extract-char-cases.ss
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,249 @@ | |||
| ;;; Copyright (C) 2008  Abdulaziz Ghuloum, R. Kent Dybvig | ||||
| ;;; Copyright (C) 2006,2007  Abdulaziz Ghuloum | ||||
| ;;;  | ||||
| ;;; Permission is hereby granted, free of charge, to any person obtaining a | ||||
| ;;; copy of this software and associated documentation files (the "Software"), | ||||
| ;;; to deal in the Software without restriction, including without limitation | ||||
| ;;; the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||||
| ;;; and/or sell copies of the Software, and to permit persons to whom the | ||||
| ;;; Software is furnished to do so, subject to the following conditions: | ||||
| ;;;  | ||||
| ;;; The above copyright notice and this permission notice shall be included in | ||||
| ;;; all copies or substantial portions of the Software. | ||||
| ;;;  | ||||
| ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||||
| ;;; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||||
| ;;; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL | ||||
| ;;; THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||||
| ;;; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||||
| ;;; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | ||||
| ;;; DEALINGS IN THE SOFTWARE.  | ||||
| 
 | ||||
| (import (scheme) (unicode-data)) | ||||
| 
 | ||||
| ; dropping support for s16 inner vectors for now | ||||
| (include "extract-common.ss") | ||||
| 
 | ||||
| (define code-point-limit #x110000) ; as of Unicode 5.1 | ||||
| #;(define table-limit #x30000) | ||||
| (define table-limit code-point-limit) | ||||
| (define-table (make-table table-ref table-set! table-ref-code) | ||||
|   (make-vector vector-ref vector-set!) | ||||
|   table-limit #x40 #x40) | ||||
| 
 | ||||
| (define-record-type chardata | ||||
|   (fields (immutable ucchar) | ||||
|           (immutable lcchar) | ||||
|           (immutable tcchar) | ||||
|           (mutable fcchar) | ||||
|           (mutable ucstr) | ||||
|           (mutable lcstr) | ||||
|           (mutable tcstr) | ||||
|           (mutable fcstr) | ||||
|           (immutable decomp-canon) | ||||
|           (immutable decomp-compat)) | ||||
|   (protocol | ||||
|     (lambda (new) | ||||
|       (lambda (ucchar lcchar tcchar decomp-canon decomp-compat) | ||||
|         (new ucchar lcchar tcchar 0 ucchar lcchar tcchar 0 | ||||
|              decomp-canon decomp-compat))))) | ||||
| 
 | ||||
| (define (find-cdrec idx ls) | ||||
|   (cond | ||||
|     [(assq idx ls) => cdr] | ||||
|     [else (error 'find-cdrec "~s is missing" idx)])) | ||||
| 
 | ||||
| (define data-case | ||||
|   (lambda (fields) | ||||
|     (let ([n (hex->num (car fields))] | ||||
|           [uc (list-ref fields 12)] | ||||
|           [lc (list-ref fields 13)] | ||||
|           [tc (list-ref fields 14)]) | ||||
|       (define (f x) (if (string=? x "") 0 (- (hex->num x) n))) | ||||
|       (cons n (make-chardata (f uc) (f lc) (f tc) | ||||
|                 (parse-decomp n (list-ref fields 5) #f) | ||||
|                 (parse-decomp n (list-ref fields 5) #t)))))) | ||||
| 
 | ||||
| (define (split str) | ||||
|   (remove "" | ||||
|     (let f ([i 0] [n (string-length str)]) | ||||
|       (cond | ||||
|         [(= i n) (list (substring str 0 n))] | ||||
|         [(char=? (string-ref str i) #\space) | ||||
|          (cons (substring str 0 i)  | ||||
|                (split (substring str (+ i 1) n)))] | ||||
|         [else (f (add1 i) n)])))) | ||||
| 
 | ||||
| (define (improperize ls) | ||||
|   (cond | ||||
|     [(null? (cdr ls)) (car ls)] | ||||
|     [else (cons (car ls) (improperize (cdr ls)))])) | ||||
| 
 | ||||
| (define (c*->off c* n) | ||||
|   (if (= (length c*) 1) | ||||
|       (- (car c*) n) | ||||
|       (improperize (map integer->char c*)))) | ||||
| 
 | ||||
| (define (codes->off str n) | ||||
|   (c*->off (map hex->num (split str)) n)) | ||||
| 
 | ||||
| ;;; decomposition field looks like: | ||||
| ;;;    hex-value* | ||||
| ;;;    <tag> hex-value* | ||||
| ;;; latter appear to be for compatibility decomposition only | ||||
| (define (parse-decomp n str compat?) | ||||
|   (let f ([ls (split str)]) | ||||
|     (cond | ||||
|       [(null? ls) 0] | ||||
|       [(char=? (string-ref (car ls) 0) #\<) | ||||
|        (if compat? (c*->off (map hex->num (cdr ls)) n) 0)] | ||||
|       [else (c*->off (map hex->num ls) n)]))) | ||||
| 
 | ||||
| (define (insert-foldcase-data! ls data) | ||||
|   (for-each  | ||||
|     (lambda (fields) | ||||
|       (let ([n (hex->num (car fields))]) | ||||
|         (let ([cdrec (find-cdrec n ls)] | ||||
|               [offset (codes->off (caddr fields) n)]) | ||||
|           (chardata-fcchar-set! cdrec offset) | ||||
|           (chardata-fcstr-set! cdrec offset)))) | ||||
|     (filter (lambda (fields) (equal? (cadr fields) "C")) data)) | ||||
|   (for-each  | ||||
|     (lambda (fields) | ||||
|       (let ([n (hex->num (car fields))]) | ||||
|         (chardata-fcstr-set! | ||||
|           (find-cdrec n ls) | ||||
|           (codes->off (caddr fields) n)))) | ||||
|     (filter (lambda (fields) (equal? (cadr fields) "F")) data))) | ||||
| 
 | ||||
| (define (insert-specialcase-data! ls data) | ||||
|   (for-each | ||||
|     (lambda (fields) | ||||
|       (let ([n (hex->num (car fields))]) | ||||
|         (let ([cdrec (find-cdrec n ls)]) | ||||
|           (chardata-lcstr-set! cdrec (codes->off (list-ref fields 1) n)) | ||||
|           (chardata-tcstr-set! cdrec (codes->off (list-ref fields 2) n)) | ||||
|           (chardata-ucstr-set! cdrec (codes->off (list-ref fields 3) n))))) | ||||
|     (filter | ||||
|       (lambda (fields) (= 0 (string-length (list-ref fields 4)))) | ||||
|       data))) | ||||
| 
 | ||||
| (define verify-identity! | ||||
|   (lambda (n cdrec) | ||||
|     (define (zeros? . args) (andmap (lambda (x) (eqv? x 0)) args)) | ||||
|     (unless (zeros? (chardata-ucchar cdrec) | ||||
|                     (chardata-lcchar cdrec) | ||||
|                     (chardata-tcchar cdrec) | ||||
|                     (chardata-fcchar cdrec) | ||||
|                     (chardata-ucstr cdrec) | ||||
|                     (chardata-lcstr cdrec) | ||||
|                     (chardata-tcstr cdrec) | ||||
|                     (chardata-fcstr cdrec) | ||||
|                     (chardata-decomp-canon cdrec) | ||||
|                     (chardata-decomp-compat cdrec)) | ||||
|       (error 'verify-identity "failed for ~x, ~s" n cdrec)))) | ||||
| 
 | ||||
| (define build-uncommonized-table | ||||
|   (lambda (acc ls) | ||||
|     (let ([table (make-table 0)]) | ||||
|       (for-each  | ||||
|         (lambda (x) | ||||
|           (let ([n (car x)] [cdrec (cdr x)]) | ||||
|             (unless (< n code-point-limit) | ||||
|               (error 'build-table | ||||
|                 "code point value ~s is at or above declared limit ~s" | ||||
|                 n code-point-limit)) | ||||
|             (if (>= n table-limit) | ||||
|                 (verify-identity! n cdrec) | ||||
|                 (table-set! table n (acc cdrec))))) | ||||
|         ls) | ||||
|       table))) | ||||
| 
 | ||||
| (define build-table | ||||
|   (lambda (acc ls) | ||||
|     (commonize* (build-uncommonized-table acc ls)))) | ||||
| 
 | ||||
| (define (get-composition-pairs decomp-canon-table) | ||||
|   (define ($str-decomp-canon c)  | ||||
|     (define (strop tbl c) | ||||
|       (let ([n (char->integer c)]) | ||||
|         (if (and (fx< table-limit code-point-limit) | ||||
|                  (fx>= n table-limit)) | ||||
|             c | ||||
|             (let ([x (table-ref tbl n)]) | ||||
|               (if (fixnum? x) | ||||
|                   (integer->char (fx+ x n)) | ||||
|                   x))))) | ||||
|     (strop decomp-canon-table c)) | ||||
|   (let ([exclusions  | ||||
|          (map hex->num | ||||
|            (map car (get-unicode-data | ||||
|                       "UNIDATA/CompositionExclusions.txt")))] | ||||
|         [from* '()] | ||||
|         [to* '()]) | ||||
|     (define (enter i) | ||||
|       (unless (memv i exclusions) | ||||
|         (let* ([c (integer->char i)]  [c* ($str-decomp-canon c)]) | ||||
|           (when (pair? c*) | ||||
|             (set! from* (cons c* from*)) | ||||
|             (set! to* (cons c to*)))))) | ||||
|     (do ([i 0 (fx+ i 1)]) ((fx= i #xD800)) (enter i)) | ||||
|     (do ([i #xE000 (fx+ i 1)]) ((fx= i code-point-limit)) (enter i)) | ||||
|     (commonize* (cons (list->vector from*) (list->vector to*))))) | ||||
| 
 | ||||
| (let ([ls (map data-case (get-unicode-data "UNIDATA/UnicodeData.txt"))]) | ||||
|   (insert-foldcase-data! ls (get-unicode-data "UNIDATA/CaseFolding.txt")) | ||||
|   (insert-specialcase-data! ls (get-unicode-data "UNIDATA/SpecialCasing.txt")) | ||||
|  ; insert final sigma flag for char-downcase conversion | ||||
|   (chardata-lcstr-set! (find-cdrec #x3a3 ls) 'sigma) | ||||
|   (with-output-to-file* "unicode-char-cases.ss" | ||||
|     (lambda () | ||||
|       (parameterize ([print-graph #t] [print-vector-length #f] [print-unicode #f]) | ||||
|         (pretty-print | ||||
|           `(module ($char-upcase $char-downcase $char-titlecase $char-foldcase | ||||
|                     $str-upcase $str-downcase $str-titlecase $str-foldcase | ||||
|                     $str-decomp-canon $str-decomp-compat | ||||
|                     $composition-pairs) | ||||
|              (define char-upcase-table ',(build-table chardata-ucchar ls)) | ||||
|              (define char-downcase-table ',(build-table chardata-lcchar ls)) | ||||
|              (define char-titlecase-table ',(build-table chardata-tcchar ls)) | ||||
|              (define char-foldcase-table ',(build-table chardata-fcchar ls)) | ||||
|              (define string-upcase-table ',(build-table chardata-ucstr ls)) | ||||
|              (define string-downcase-table ',(build-table chardata-lcstr ls)) | ||||
|              (define string-titlecase-table ',(build-table chardata-tcstr ls)) | ||||
|              (define string-foldcase-table ',(build-table chardata-fcstr ls)) | ||||
|              (define decomp-canon-table ',(build-table chardata-decomp-canon ls)) | ||||
|              (define decomp-compat-table ',(build-table chardata-decomp-compat ls)) | ||||
|              (define table-limit ,table-limit) | ||||
|              (define code-point-limit ,code-point-limit) | ||||
|              (define table-ref ,table-ref-code) | ||||
|              (define (charop tbl c) | ||||
|                (let ([n (char->integer c)]) | ||||
|                  (if (and (fx< table-limit code-point-limit) | ||||
|                           (fx>= n table-limit)) | ||||
|                      c | ||||
|                      (integer->char (fx+ (table-ref tbl n) n))))) | ||||
|              (define (strop tbl c) | ||||
|                (let ([n (char->integer c)]) | ||||
|                  (if (and (fx< table-limit code-point-limit) | ||||
|                           (fx>= n table-limit)) | ||||
|                      c | ||||
|                      (let ([x (table-ref tbl n)]) | ||||
|                        (if (fixnum? x) | ||||
|                            (integer->char (fx+ x n)) | ||||
|                            x))))) | ||||
|              (define ($char-upcase c) (charop char-upcase-table c)) | ||||
|              (define ($char-downcase c) (charop char-downcase-table c)) | ||||
|              (define ($char-titlecase c) (charop char-titlecase-table c)) | ||||
|              (define ($char-foldcase c) (charop char-foldcase-table c)) | ||||
|              (define ($str-upcase c) (strop string-upcase-table c)) | ||||
|              (define ($str-downcase c) (strop string-downcase-table c)) | ||||
|              (define ($str-titlecase c) (strop string-titlecase-table c)) | ||||
|              (define ($str-foldcase c) (strop string-foldcase-table c)) | ||||
|              (define ($str-decomp-canon c) (strop decomp-canon-table c)) | ||||
|              (define ($str-decomp-compat c) (strop decomp-compat-table c)) | ||||
|              (define ($composition-pairs) | ||||
|                ',(get-composition-pairs | ||||
|                    (build-uncommonized-table chardata-decomp-canon ls))))))))) | ||||
| 
 | ||||
| (printf "Happy Happy Joy Joy ~a\n" (sizeof cache)) | ||||
							
								
								
									
										136
									
								
								unicode/extract-common.ss
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										136
									
								
								unicode/extract-common.ss
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,136 @@ | |||
| ;;; Copyright (C) 2008  Abdulaziz Ghuloum, R. Kent Dybvig | ||||
| ;;; Copyright (C) 2006,2007  Abdulaziz Ghuloum | ||||
| ;;;  | ||||
| ;;; Permission is hereby granted, free of charge, to any person obtaining a | ||||
| ;;; copy of this software and associated documentation files (the "Software"), | ||||
| ;;; to deal in the Software without restriction, including without limitation | ||||
| ;;; the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||||
| ;;; and/or sell copies of the Software, and to permit persons to whom the | ||||
| ;;; Software is furnished to do so, subject to the following conditions: | ||||
| ;;;  | ||||
| ;;; The above copyright notice and this permission notice shall be included in | ||||
| ;;; all copies or substantial portions of the Software. | ||||
| ;;;  | ||||
| ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||||
| ;;; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||||
| ;;; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL | ||||
| ;;; THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||||
| ;;; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||||
| ;;; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | ||||
| ;;; DEALINGS IN THE SOFTWARE.  | ||||
| 
 | ||||
| (define-syntax assert | ||||
|   (syntax-rules () | ||||
|     [(_ e) (unless e (syntax-error #'e "assertion failed"))])) | ||||
| 
 | ||||
| (module ((define-table maker accessor mutator)) | ||||
|   (define-syntax maker | ||||
|     (syntax-rules () | ||||
|       [(_ make-inner x t) (make-inner t x)] | ||||
|       [(_ make-inner x t1 t2 ...) | ||||
|        (let ([v (make-vector t1)]) | ||||
|          (do ([i 0 (fx+ i 1)]) | ||||
|              ((fx= i t1)) | ||||
|            (vector-set! v i (maker make-inner x t2 ...))) | ||||
|          v)])) | ||||
|   (define-syntax accessor | ||||
|     (syntax-rules () | ||||
|       [(_ inner-ref tbl i t) (inner-ref tbl i)] | ||||
|       [(_ inner-ref tbl i t1 t2 ...) | ||||
|        (let ([d (* t2 ...)]) | ||||
|          (accessor inner-ref (vector-ref tbl (fxdiv i d)) | ||||
|            (fxmod i d) t2 ...))])) | ||||
|   (define-syntax mutator | ||||
|     (syntax-rules () | ||||
|       [(_ inner-set! tbl i x t) (inner-set! tbl i x)] | ||||
|       [(_ inner-set! tbl i x t1 t2 ...) | ||||
|        (let ([d (* t2 ...)]) | ||||
|          (mutator inner-set! (vector-ref tbl (fxdiv i d)) | ||||
|            (fxmod i d) x t2 ...))])) | ||||
|   (define-syntax define-table | ||||
|     (lambda (x) | ||||
|       (define accessor-code | ||||
|         (lambda (x) | ||||
|           (syntax-case x () | ||||
|             [(inner-ref tbl i t) #'(inner-ref tbl i)] | ||||
|             [(inner-ref tbl i t1 t2 ...) | ||||
|              (with-syntax ([(d) (generate-temporaries '(d))]) | ||||
|                (with-syntax ([body (accessor-code  | ||||
|                                      #'(inner-ref | ||||
|                                          (vector-ref tbl (fxdiv i d)) | ||||
|                                          (fxmod i d) t2 ...))]) | ||||
|                  #'(let ([d (* t2 ...)]) body)))]))) | ||||
|       (syntax-case x () | ||||
|         [(_ (make-table table-ref table-set! table-ref-code) | ||||
|             (make-inner inner-ref inner-set!) size dim ...) | ||||
|          (with-syntax ([(t1 t2 ...) (generate-temporaries #'(size dim ...))] | ||||
|                        [code (accessor-code  | ||||
|                                #'(inner-ref tbl i | ||||
|                                             (/ size (* dim ...)) | ||||
|                                             dim ...))]) | ||||
|            #'(module (make-table table-ref table-set! table-ref-code) | ||||
|                (define t2 dim) ... | ||||
|                (define t1 (/ size (* t2 ...))) | ||||
|                (define make-table (lambda (x) (maker make-inner x t1 t2 ...))) | ||||
|                (define table-ref-code '(lambda (tbl i) code)) | ||||
|                (define table-ref (lambda (tbl i) (accessor inner-ref tbl i t1 t2 ...))) | ||||
|                (define table-set! (lambda (tbl i x) (mutator inner-set! tbl i x t1 t2 ...)))))])))) | ||||
| 
 | ||||
| (define (with-output-to-file* file thunk) | ||||
|   (when (file-exists? file) (delete-file file)) | ||||
|   (with-output-to-file file thunk)) | ||||
| 
 | ||||
| (define common-equal? | ||||
|   (lambda (x y) | ||||
|     (cond | ||||
|       [(eq? x y) #t] | ||||
|       [(vector? x) | ||||
|        (and (vector? y) | ||||
|          (let ([n (vector-length x)]) | ||||
|            (and (fx= (vector-length y) n) | ||||
|              (let f ([i 0]) | ||||
|                (or (fx= i n) | ||||
|                    (and (eq? (vector-ref x i) (vector-ref y i)) | ||||
|                         (f (fx+ i 1))))))))] | ||||
|       [(pair? x) (and (pair? y) (eq? (car x) (car y)) (eq? (cdr x) (cdr y)))] | ||||
|       [else (equal? x y)]))) | ||||
| 
 | ||||
| (define cache '()) | ||||
| #;(define commonize ; 5.8s | ||||
|   (lambda (x) | ||||
|     (or (find (lambda (y) (common-equal? y x)) cache) | ||||
|         (begin (set! cache (cons x cache)) x)))) | ||||
| #;(define commonize ; 2.6s | ||||
|   (let ([cache-table (make-hashtable equal-hash common-equal?)]) | ||||
|     (lambda (x) | ||||
|       (or (hashtable-ref cache-table x #f) | ||||
|           (begin | ||||
|             (set! cache (cons x cache)) ; for sizeof | ||||
|             (hashtable-set! cache-table x x) | ||||
|             x))))) | ||||
| (define commonize ; 1.9s | ||||
|   (lambda (x) | ||||
|     (let ([v (find (lambda (y) (common-equal? y x)) cache)]) | ||||
|       (if v | ||||
|           (begin (set! cache (cons v (remq v cache))) v) | ||||
|           (begin (set! cache (cons x cache)) x))))) | ||||
| 
 | ||||
| (define commonize* | ||||
|   (lambda (x) | ||||
|     (cond | ||||
|       [(vector? x) | ||||
|        (let ([n (vector-length x)]) | ||||
|          (do ([i 0 (fx+ i 1)]) | ||||
|              ((fx= i n)) | ||||
|            (vector-set! x i (commonize* (vector-ref x i))))) | ||||
|        (commonize x)] | ||||
|       [(bytevector? x) (commonize x)] | ||||
|       [(pair? x) | ||||
|        (set-car! x (commonize* (car x))) | ||||
|        (set-cdr! x (commonize* (cdr x))) | ||||
|        (commonize x)] | ||||
|       [else x]))) | ||||
| 
 | ||||
| (define (sizeof ls) (compute-size ls)) | ||||
| 
 | ||||
| (define (hex->num x) (string->number x 16)) | ||||
							
								
								
									
										403
									
								
								unicode/extract-info.ss
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										403
									
								
								unicode/extract-info.ss
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,403 @@ | |||
| ;;; Copyright (C) 2008  Abdulaziz Ghuloum, R. Kent Dybvig | ||||
| ;;; Copyright (C) 2006,2007  Abdulaziz Ghuloum | ||||
| ;;;  | ||||
| ;;; Permission is hereby granted, free of charge, to any person obtaining a | ||||
| ;;; copy of this software and associated documentation files (the "Software"), | ||||
| ;;; to deal in the Software without restriction, including without limitation | ||||
| ;;; the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||||
| ;;; and/or sell copies of the Software, and to permit persons to whom the | ||||
| ;;; Software is furnished to do so, subject to the following conditions: | ||||
| ;;;  | ||||
| ;;; The above copyright notice and this permission notice shall be included in | ||||
| ;;; all copies or substantial portions of the Software. | ||||
| ;;;  | ||||
| ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||||
| ;;; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||||
| ;;; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL | ||||
| ;;; THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||||
| ;;; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||||
| ;;; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | ||||
| ;;; DEALINGS IN THE SOFTWARE.  | ||||
| 
 | ||||
| (import (scheme) (unicode-data)) | ||||
| 
 | ||||
| (include "extract-common.ss") | ||||
| 
 | ||||
| (define code-point-limit #x110000) | ||||
| (define-table (make-table table-ref table-set! table-ref-code) | ||||
|   (make-vector vector-ref vector-set!) | ||||
|   code-point-limit #x40 #x40) | ||||
| 
 | ||||
| (define (string-suffix? str suffix) | ||||
|   (let ([n (string-length str)] [m (string-length suffix)]) | ||||
|     (and (fx>= n m) (string=? (substring str (fx- n m) n) suffix)))) | ||||
|         | ||||
| (define (extract-range str) | ||||
|   (define (find-char c s) | ||||
|     (let f ([i 0] [n (string-length s)]) | ||||
|       (cond | ||||
|         [(= i n) #f] | ||||
|         [(char=? (string-ref s i) c) i] | ||||
|         [else (f (+ i 1) n)]))) | ||||
|   (cond | ||||
|     [(find-char #\. str) => | ||||
|      (lambda (i) | ||||
|        (cons | ||||
|          (hex->num (substring str 0 i)) | ||||
|          (hex->num (substring str (+ i 2) (string-length str)))))] | ||||
|     [else (let ([n (hex->num str)]) (cons n n))])) | ||||
| 
 | ||||
| ; fixnum field laid out as follows: | ||||
| ;   bits 0-5: category number | ||||
| ;   bits 6-9: wordbreak property | ||||
| ;   bits 10-17: combining class | ||||
| ;   bits 18-29: case/type property bits | ||||
| 
 | ||||
| (define-syntax define-bitfields | ||||
|   (lambda (x) | ||||
|     (define construct-name | ||||
|       (lambda (template-identifier . args) | ||||
|         (datum->syntax template-identifier | ||||
|           (string->symbol | ||||
|             (apply string-append | ||||
|                    (map (lambda (x) (format "~a" (syntax->datum x))) | ||||
|                         args)))))) | ||||
|     (define extract | ||||
|       (lambda (fld* bit def*) | ||||
|         (assert (< bit (fixnum-width))) | ||||
|         (if (null? fld*) | ||||
|             def* | ||||
|             (syntax-case (car fld*) (flag enumeration integer) | ||||
|               [(flag name) (identifier? #'name) | ||||
|                (extract (cdr fld*) (+ bit 1) | ||||
|                  #`((define name #,(fxsll 1 bit)) #,@def*))] | ||||
|               [(enumeration name id ...) | ||||
|                (and (identifier? #'name) (for-all identifier? #'(id ...))) | ||||
|                (let ([width (bitwise-length (length #'(id ...)))]) | ||||
|                  (with-syntax ([name-shift (construct-name #'name #'name "-shift")] | ||||
|                                [name-mask (construct-name #'name #'name "-mask")]) | ||||
|                    (extract (cdr fld*) (+ bit width) | ||||
|                      #`((define name-shift #,bit) | ||||
|                         (define name-mask #,(fx- (fxsll 1 width) 1)) | ||||
|                         #,@(map (lambda (id val) #`(define #,id #,val)) | ||||
|                                 #'(id ...) | ||||
|                                 (enumerate #'(id ...))) | ||||
|                         #,@def*))))] | ||||
|               [(integer name width) (identifier? #'name) | ||||
|                (let ([width (syntax->datum #'width)]) | ||||
|                  (with-syntax ([name-shift (construct-name #'name #'name "-shift")] | ||||
|                                [name-mask (construct-name #'name #'name "-mask")]) | ||||
|                    (extract (cdr fld*) (+ bit width) | ||||
|                      #`((define name-shift #,bit) | ||||
|                         (define name-mask #,(fx- (fxsll 1 width) 1)) | ||||
|                         #,@def*))))])))) | ||||
|     (syntax-case x () | ||||
|       [(_ fld ...) | ||||
|        #`(begin #,@(extract #'(fld ...) 0 #'()))]))) | ||||
| 
 | ||||
| (define-bitfields | ||||
|   (flag cased-property) | ||||
|   (flag case-ignorable-property) | ||||
|   (flag constituent-property) | ||||
|   (flag subsequent-property) | ||||
|   (flag uppercase-property) | ||||
|   (flag lowercase-property) | ||||
|   (flag titlecase-property) | ||||
|   (flag alphabetic-property) | ||||
|   (flag numeric-property) | ||||
|   (flag whitespace-property) | ||||
|   (enumeration category Lu-cat Ll-cat Lt-cat Lm-cat Lo-cat | ||||
|     Mn-cat Mc-cat Me-cat Nd-cat Nl-cat No-cat Pc-cat Pd-cat | ||||
|     Ps-cat Pe-cat Pi-cat Pf-cat Po-cat Sm-cat Sc-cat Sk-cat | ||||
|     So-cat Zs-cat Zl-cat Zp-cat Cc-cat Cf-cat Cs-cat Co-cat | ||||
|     Cn-cat) | ||||
|  ; default wb-other-property must be zero, so must be listed first | ||||
|   (enumeration wbproperty wb-other-property wb-aletter-property | ||||
|     wb-numeric-property wb-katakana-property | ||||
|     wb-extend-property wb-format-property wb-midnum-property | ||||
|     wb-midletter-property wb-midnumlet-property | ||||
|     wb-extendnumlet-property wb-cr-property wb-lf-property | ||||
|     wb-newline-property | ||||
|     ; UNICODE 7.0.0 | ||||
|     wb-double-quote-property wb-single-quote-property | ||||
|     wb-hebrew-letter-property wb-regional-indicator-property | ||||
|     ; UNICODE 14.0 | ||||
|     wb-zwj-property wb-wsegspace-property) | ||||
|   (integer combining-class 8)) | ||||
| 
 | ||||
| ;;; Uppercase = Lu + Other_Uppercase | ||||
| ;;; Lowercase = Ll + Other_Lowercase | ||||
| ;;; Titlecase = Lt | ||||
| ;;; Alphabetic = Lu + Ll + Lt + Lm + Lo + Nl + Other_Alphabetic  | ||||
| ;;; Numeric = ??? | ||||
| ;;; White_Space =  | ||||
| 
 | ||||
| ;;; cased property: | ||||
| ;;;   D135: A character C is defined to be cased if and only if C has the | ||||
| ;;;   Lowercase or Uppercase property or has a General_Category value of | ||||
| ;;;   Titlecase_Letter. | ||||
| ;;; | ||||
| ;;; case-ignorable property: | ||||
| ;;;   D136 A character C is defined to be case-ignorable if C has the | ||||
| ;;;   value MidLetter, MidNumLet, or Single_Quote for the Word_Break property | ||||
| ;;;   or its General_Category is one of Nonspacing_Mark (Mn), | ||||
| ;;;   Enclosing_Mark (Me), Format (Cf), Modifier_Letter (Lm), or | ||||
| ;;;   Modifier_Symbol (Sk). | ||||
| 
 | ||||
| (define name->wbprop | ||||
|   (lambda (name) | ||||
|     (case (string->symbol name) | ||||
|       [(ALetter) (fxsll wb-aletter-property wbproperty-shift)] | ||||
|       [(Numeric) (fxsll wb-numeric-property wbproperty-shift)] | ||||
|       [(Katakana) (fxsll wb-katakana-property wbproperty-shift)] | ||||
|       [(MidLetter) (fxior (fxsll wb-midletter-property wbproperty-shift) case-ignorable-property)] | ||||
|       [(Extend) (fxsll wb-extend-property wbproperty-shift)] | ||||
|       [(Format) (fxsll wb-format-property wbproperty-shift)] | ||||
|       [(MidNum) (fxsll wb-midnum-property wbproperty-shift)] | ||||
|       [(MidNumLet) (fxior (fxsll wb-midnumlet-property wbproperty-shift) case-ignorable-property)] | ||||
|       [(ExtendNumLet) (fxsll wb-extendnumlet-property wbproperty-shift)] | ||||
|       [(CR) (fxsll wb-cr-property wbproperty-shift)] | ||||
|       [(LF) (fxsll wb-lf-property wbproperty-shift)] | ||||
|       [(Newline) (fxsll wb-newline-property wbproperty-shift)] | ||||
|       [(Double_Quote) (fxsll wb-double-quote-property wbproperty-shift)] | ||||
|       [(Single_Quote) (fxior (fxsll wb-single-quote-property wbproperty-shift) case-ignorable-property)] | ||||
|       [(Hebrew_Letter) (fxsll wb-hebrew-letter-property wbproperty-shift)] | ||||
|       [(Regional_Indicator) (fxsll wb-regional-indicator-property wbproperty-shift)] | ||||
|       [(ZWJ) (fxsll wb-zwj-property wbproperty-shift)] | ||||
|       [(WSegSpace) (fxsll wb-wsegspace-property wbproperty-shift)] | ||||
|       [else (errorf 'name->wbprop "unexpected property ~a" name)]))) | ||||
| 
 | ||||
| (define proplist-properties | ||||
|   `(["Other_Uppercase"  ,uppercase-property] | ||||
|     ["Other_Lowercase"  ,lowercase-property] | ||||
|     ["Other_Alphabetic" ,alphabetic-property] | ||||
|     ["White_Space"      ,whitespace-property])) | ||||
| 
 | ||||
| (define categories | ||||
|   ;;; 30 categories | ||||
|   `([Lu ,(+ (fxsll Lu-cat category-shift) constituent-property uppercase-property alphabetic-property) "Letter, Uppercase"] | ||||
|     [Ll ,(+ (fxsll Ll-cat category-shift) constituent-property lowercase-property alphabetic-property) "Letter, Lowercase"] | ||||
|     [Lt ,(+ (fxsll Lt-cat category-shift) constituent-property titlecase-property alphabetic-property cased-property) "Letter, Titlecase"] | ||||
|     [Lm ,(+ (fxsll Lm-cat category-shift) constituent-property alphabetic-property case-ignorable-property)  "Letter, Modifier"] | ||||
|     [Lo ,(+ (fxsll Lo-cat category-shift) constituent-property alphabetic-property)  "Letter, Other"] | ||||
|     [Mn ,(+ (fxsll Mn-cat category-shift) constituent-property case-ignorable-property)  "Mark, Nonspacing"] | ||||
|     [Mc ,(+ (fxsll Mc-cat category-shift) subsequent-property)   "Mark, Spacing Combining"] | ||||
|     [Me ,(+ (fxsll Me-cat category-shift) subsequent-property case-ignorable-property) "Mark, Enclosing"] | ||||
|     [Nd ,(+ (fxsll Nd-cat category-shift) subsequent-property)   "Number, Decimal Digit"] | ||||
|     [Nl ,(+ (fxsll Nl-cat category-shift) constituent-property alphabetic-property)  "Number, Letter"] | ||||
|     [No ,(+ (fxsll No-cat category-shift) constituent-property)  "Number, Other"] | ||||
|     [Pc ,(+ (fxsll Pc-cat category-shift) constituent-property)  "Punctuation, Connector"] | ||||
|     [Pd ,(+ (fxsll Pd-cat category-shift) constituent-property)  "Punctuation, Dash"] | ||||
|     [Ps ,(+ (fxsll Ps-cat category-shift) )                      "Punctuation, Open"] | ||||
|     [Pe ,(+ (fxsll Pe-cat category-shift) )                      "Punctuation, Close"] | ||||
|     [Pi ,(+ (fxsll Pi-cat category-shift) )                      "Punctuation, Initial quote"] | ||||
|     [Pf ,(+ (fxsll Pf-cat category-shift) )                      "Punctuation, Final quote"] | ||||
|     [Po ,(+ (fxsll Po-cat category-shift) constituent-property)  "Punctuation, Other"] | ||||
|     [Sm ,(+ (fxsll Sm-cat category-shift) constituent-property)  "Symbol, Math"] | ||||
|     [Sc ,(+ (fxsll Sc-cat category-shift) constituent-property)  "Symbol, Currency"] | ||||
|     [Sk ,(+ (fxsll Sk-cat category-shift) constituent-property case-ignorable-property)  "Symbol, Modifier"] | ||||
|     [So ,(+ (fxsll So-cat category-shift) constituent-property)  "Symbol, Other"] | ||||
|     [Zs ,(+ (fxsll Zs-cat category-shift) )                      "Separator, Space"] | ||||
|     [Zl ,(+ (fxsll Zl-cat category-shift) )                      "Separator, Line"] | ||||
|     [Zp ,(+ (fxsll Zp-cat category-shift) )                      "Separator, Paragraph"] | ||||
|     [Cc ,(+ (fxsll Cc-cat category-shift) )                      "Other, Control"] | ||||
|     [Cf ,(+ (fxsll Cf-cat category-shift) case-ignorable-property)                      "Other, Format"] | ||||
|     [Cs ,(+ (fxsll Cs-cat category-shift) )                      "Other, Surrogate"] | ||||
|     [Co ,(+ (fxsll Co-cat category-shift) constituent-property)  "Other, Private Use"] | ||||
|     [Cn ,(+ (fxsll Cn-cat category-shift) )                      "Other, Not Assigned"] | ||||
|   )) | ||||
| 
 | ||||
| (define (category/flags x) | ||||
|   (cond | ||||
|     [(assq x categories) => cadr] | ||||
|     [else (errorf 'category/flags "invalid cat ~s" x)])) | ||||
| 
 | ||||
| (define (make-cats-table ls) | ||||
|   (let f ([i 1] [st (car ls)] [ls (cdr ls)] [ac '()]) | ||||
|     (cond | ||||
|       [(null? ls) (reverse (cons (cons i st) ac))] | ||||
|       [(equal? (cdar ls) (cdr st)) (f (+ i 1) st (cdr ls) ac)] | ||||
|       [else (f 1 (car ls) (cdr ls) (cons (cons i st) ac))]))) | ||||
| 
 | ||||
| ; create table, placing all in category Cn until proven otherwise | ||||
| (let ([tbl (make-table (category/flags 'Cn))]) | ||||
|   (define (setprop n prop) (table-set! tbl n prop)) | ||||
|   (define (getprop n) (table-ref tbl n)) | ||||
|   ;;; interesting parts of each element in UnicodeData.txt are: | ||||
|   ;;; field0: the character index, numeric | ||||
|   ;;; field1: the description, possibly with First or Last marker | ||||
|   ;;; field2: the category, symbolic | ||||
|   ;;; field3: the combining class (0-255) | ||||
|   ;;; field8: if set, then the char has the numeric property | ||||
|   ;;; field12: if set, then the char has upper-case mapping and is thus cased | ||||
|   ;;; field13: if set, then the char has lower-case mapping and is thus cased | ||||
|   (let f ([ls (get-unicode-data "UNIDATA/UnicodeData.txt")]) | ||||
|     (unless (null? ls) | ||||
|       (let ([x (car ls)] [ls (cdr ls)]) | ||||
|         (let ([n (hex->num (list-ref x 0))] | ||||
|               [cclass (string->number (list-ref x 3))] | ||||
|               [cat/flags (category/flags (string->symbol (list-ref x 2)))] | ||||
|               [num (if (string=? (list-ref x 8) "") 0 numeric-property)] | ||||
|               [cased (if (and (string=? (list-ref x 12) "") (string=? (list-ref x 13) "")) | ||||
|                          0 cased-property)]) | ||||
|           (let ([props (fxior num cased | ||||
|                          (fxsll cclass combining-class-shift) | ||||
|                          cat/flags)]) | ||||
|             (if (string-suffix? (list-ref x 1) "First>") | ||||
|                 (let ([y (car ls)] [ls (cdr ls)]) | ||||
|                   (unless (string-suffix? (list-ref y 1) "Last>") | ||||
|                     (errorf #f "expected entry marked Last following entry marked First for ~x" n)) | ||||
|                   (let ([m (hex->num (list-ref y 0))]) | ||||
|                     (do ([n n (fx+ n 1)]) | ||||
|                         ((fx> n m)) | ||||
|                       (setprop n props))) | ||||
|                   (f ls)) | ||||
|                 (begin (setprop n props) (f ls)))))))) | ||||
|   ;;; interesting parts of each element in WordBreakProperty.txt are: | ||||
|   ;;; field0: the character index, numeric | ||||
|   ;;; field1: the word-break property | ||||
|   (for-each | ||||
|     (lambda (x) | ||||
|       (let ([range (extract-range (list-ref x 0))]) | ||||
|         (let f ([i (car range)] [j (cdr range)]) | ||||
|           (unless (> i j) | ||||
|             (let ([prop (getprop i)]) | ||||
|               (unless (fx= (fxand (fxsrl prop wbproperty-shift) wbproperty-mask) 0) | ||||
|                 (errorf #f "multiple word break properties found for ~x" i)) | ||||
|               (setprop i (fxior prop (name->wbprop (list-ref x 1)))) | ||||
|               (f (+ i 1) j)))))) | ||||
|     (get-unicode-data "UNIDATA/WordBreakProperty.txt")) | ||||
|   ;;; interesting parts of each element in PropList.txt are: | ||||
|   ;;; field0: range of character indices | ||||
|   ;;; field1: property name | ||||
|   (for-each | ||||
|     (lambda (x) | ||||
|       (let ([range (extract-range (list-ref x 0))] | ||||
|             [name (list-ref x 1)]) | ||||
|         (cond | ||||
|           [(assoc name proplist-properties) => | ||||
|            (lambda (a)  | ||||
|              (let ([n (cadr a)]) | ||||
|                (let f ([i (car range)] [j (cdr range)]) | ||||
|                  (unless (> i j)  | ||||
|                    (setprop i (fxlogor (getprop i) n)) | ||||
|                    (f (+ i 1) j)))))]))) | ||||
|     (get-unicode-data "UNIDATA/PropList.txt")) | ||||
|   ;;; clear constituent property for first 128 characters | ||||
|   (do ([i 0 (fx+ i 1)]) | ||||
|       ((fx= i 128)) | ||||
|     (setprop i (fxand (getprop i) (fxnot constituent-property)))) | ||||
|   (commonize* tbl) | ||||
|   (with-output-to-file* "unicode-charinfo.ss" | ||||
|     (lambda ()  | ||||
|       (parameterize ([print-graph #t] [print-vector-length #f]) | ||||
|         (pretty-print | ||||
|           `(module ($char-constituent? $char-subsequent? $char-upper-case? $char-lower-case? $char-title-case? $char-alphabetic? | ||||
|                     $char-numeric? $char-whitespace? $char-cased? $char-case-ignorable? $char-category | ||||
|                     $wb-aletter? $wb-numeric? $wb-katakana? $wb-extend? $wb-format? $wb-midnum? $wb-midletter? | ||||
|                     $wb-midnumlet? $wb-extendnumlet? $char-combining-class $char-dump | ||||
|                     ; UNICODE 7.0.0 | ||||
|                     $wb-hebrew-letter? $wb-single-quote? $wb-double-quote? $wb-regional-indicator? | ||||
|                     ; UNICODE 14.0 | ||||
|                     $wb-zwj? $wb-wsegspace?) | ||||
|              (define category-mask ,category-mask) | ||||
|              (define unicode-category-table ',tbl) | ||||
|              (define unicode-category-names | ||||
|                ',(list->vector (map car categories))) | ||||
|              (define table-ref ,table-ref-code) | ||||
|              (define (getprop n) (table-ref unicode-category-table n)) | ||||
|              (define $char-constituent? | ||||
|                (lambda (c) | ||||
|                  (fxlogtest (getprop (char->integer c)) ,constituent-property))) | ||||
|              (define $char-subsequent? | ||||
|                (lambda (c) | ||||
|                  (fxlogtest (getprop (char->integer c)) ,subsequent-property))) | ||||
|              (define $char-upper-case? | ||||
|                (lambda (c) | ||||
|                  (fxlogtest (getprop (char->integer c)) ,uppercase-property))) | ||||
|              (define $char-lower-case? | ||||
|                (lambda (c) | ||||
|                  (fxlogtest (getprop (char->integer c)) ,lowercase-property))) | ||||
|              (define $char-title-case? | ||||
|                (lambda (c) | ||||
|                  (fxlogtest (getprop (char->integer c)) ,titlecase-property))) | ||||
|              (define $char-alphabetic? | ||||
|                (lambda (c) | ||||
|                  (fxlogtest (getprop (char->integer c)) ,alphabetic-property))) | ||||
|              (define $char-numeric? | ||||
|                (lambda (c) | ||||
|                  (fxlogtest (getprop (char->integer c)) ,numeric-property))) | ||||
|              (define $char-whitespace? | ||||
|                (lambda (c) | ||||
|                  (fxlogtest (getprop (char->integer c)) ,whitespace-property))) | ||||
|              (define $char-cased? | ||||
|                (lambda (c) | ||||
|                  (fxlogtest (getprop (char->integer c)) ,cased-property))) | ||||
|              (define $char-case-ignorable? | ||||
|                (lambda (c) | ||||
|                  (fxlogtest (getprop (char->integer c)) ,case-ignorable-property))) | ||||
|              (define (wb prop) | ||||
|                (lambda (c) | ||||
|                  (fx= (fxand | ||||
|                         (fxsrl | ||||
|                           (getprop (char->integer c)) | ||||
|                           ,wbproperty-shift) | ||||
|                         ,wbproperty-mask) | ||||
|                       prop))) | ||||
|              (define $wb-aletter? (wb ,wb-aletter-property)) | ||||
|              (define $wb-numeric? (wb ,wb-numeric-property)) | ||||
|              (define $wb-katakana? (wb ,wb-katakana-property)) | ||||
|              (define $wb-extend? (wb ,wb-extend-property)) | ||||
|              (define $wb-format? (wb ,wb-format-property)) | ||||
|              (define $wb-midnum? (wb ,wb-midnum-property)) | ||||
|              (define $wb-midletter? (wb ,wb-midletter-property)) | ||||
|              (define $wb-midnumlet? (wb ,wb-midnumlet-property)) | ||||
|              (define $wb-extendnumlet? (wb ,wb-extendnumlet-property)) | ||||
|              (define $wb-hebrew-letter? (wb ,wb-hebrew-letter-property)) | ||||
|              (define $wb-double-quote? (wb ,wb-double-quote-property)) | ||||
|              (define $wb-single-quote? (wb ,wb-single-quote-property)) | ||||
|              (define $wb-regional-indicator? (wb ,wb-regional-indicator-property)) | ||||
|              (define $wb-zwj? (wb ,wb-zwj-property)) | ||||
|              (define $wb-wsegspace? (wb ,wb-wsegspace-property)) | ||||
|              (define $char-combining-class | ||||
|                (lambda (c) | ||||
|                  (fxand (fxsrl (getprop (char->integer c)) ,combining-class-shift) | ||||
|                         ,combining-class-mask))) | ||||
|              (define $char-category | ||||
|                (lambda (c) | ||||
|                  (vector-ref unicode-category-names | ||||
|                    (fxand (fxsrl (getprop (char->integer c)) ,category-shift) | ||||
|                           ,category-mask)))) | ||||
|              (define $char-dump | ||||
|                (lambda (c) | ||||
|                  (define (list-true . args) (remq #f args)) | ||||
|                  (list-true | ||||
|                    (and ($char-constituent? c) 'constituent) | ||||
|                    (and ($char-subsequent? c) 'subsequent) | ||||
|                    (and ($char-upper-case? c) 'upper-case) | ||||
|                    (and ($char-lower-case? c) 'lower-case) | ||||
|                    (and ($char-title-case? c) 'title-case) | ||||
|                    (and ($char-alphabetic? c) 'alphabetic) | ||||
|                    (and ($char-numeric? c) 'whitespace) | ||||
|                    (and ($char-whitespace? c) 'whitespace) | ||||
|                    (and ($char-cased? c) 'cased) | ||||
|                    (and ($char-case-ignorable? c) 'case-ignorable) | ||||
|                    (and ($wb-aletter? c) 'aletter) | ||||
|                    (and ($wb-numeric? c) 'numeric) | ||||
|                    (and ($wb-katakana? c) 'katakana) | ||||
|                    (and ($wb-extend? c) 'extend) | ||||
|                    (and ($wb-format? c) 'format) | ||||
|                    (and ($wb-midnum? c) 'midnum) | ||||
|                    (and ($wb-midletter? c) 'midletter) | ||||
|                    (and ($wb-midnumlet? c) 'midnumlet) | ||||
|                    (and ($wb-extendnumlet? c) 'extendnumlet) | ||||
|                    (and ($wb-hebrew-letter? c) 'hebrew-letter) | ||||
|                    (and ($wb-double-quote? c) 'double-quote) | ||||
|                    (and ($wb-single-quote? c) 'single-quote) | ||||
|                    (and ($wb-regional-indicator? c) 'regional-indicator) | ||||
|                    (and ($wb-zwj? c) 'zwj) | ||||
|                    (and ($wb-wsegspace? c) 'wsegspace) | ||||
|                    `(combining-class ,($char-combining-class c)) | ||||
|                    ($char-category c)))))))))) | ||||
| 
 | ||||
| (printf "Happy Happy Joy Joy ~s\n" (sizeof cache)) | ||||
							
								
								
									
										3016
									
								
								unicode/unicode-char-cases.ss
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										3016
									
								
								unicode/unicode-char-cases.ss
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										3318
									
								
								unicode/unicode-charinfo.ss
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										3318
									
								
								unicode/unicode-charinfo.ss
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										79
									
								
								unicode/unicode-data.ss
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										79
									
								
								unicode/unicode-data.ss
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,79 @@ | |||
| ;;; Ikarus Scheme -- A compiler for R6RS Scheme. | ||||
| ;;; Copyright (C) 2006,2007,2008  Abdulaziz Ghuloum | ||||
| 
 | ||||
| ;;; Permission is hereby granted, free of charge, to any person obtaining a | ||||
| ;;; copy of this software and associated documentation files (the "Software"), | ||||
| ;;; to deal in the Software without restriction, including without limitation | ||||
| ;;; the rights to use, copy, modify, merge, publish, distribute, sublicense, | ||||
| ;;; and/or sell copies of the Software, and to permit persons to whom the | ||||
| ;;; Software is furnished to do so, subject to the following conditions: | ||||
| ;;;  | ||||
| ;;; The above copyright notice and this permission notice shall be included in | ||||
| ;;; all copies or substantial portions of the Software. | ||||
| ;;;  | ||||
| ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||||
| ;;; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||||
| ;;; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL | ||||
| ;;; THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||||
| ;;; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||||
| ;;; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | ||||
| ;;; DEALINGS IN THE SOFTWARE.  | ||||
| 
 | ||||
| (library (unicode-data) | ||||
|   (export get-unicode-data) | ||||
|   (import (rnrs)) | ||||
|    | ||||
|   (define (find-semi/hash str i n) | ||||
|     (cond | ||||
|       [(or (fx=? i n) (memv (string-ref str i) '(#\; #\#))) i] | ||||
|       [else (find-semi/hash str (+ i 1) n)])) | ||||
|    | ||||
|   (define (cleanup str) | ||||
|     (let ([lo  | ||||
|            (let f ([i 0] [n (string-length str)]) | ||||
|              (cond | ||||
|                [(= i n) n] | ||||
|                [(char=? #\space (string-ref str i))  | ||||
|                 (f (+ i 1) n)] | ||||
|                [else i]))] | ||||
|           [hi | ||||
|            (let f ([i (- (string-length str) 1)]) | ||||
|              (cond | ||||
|                [(< i 0) i] | ||||
|                [(char=? #\space (string-ref str i))  | ||||
|                 (f (- i 1))] | ||||
|                [else (+ i 1)]))]) | ||||
|       (if (> hi lo) | ||||
|           (substring str lo hi) | ||||
|           ""))) | ||||
| 
 | ||||
|   (define (split str) | ||||
|     (let f ([i 0] [n (string-length str)]) | ||||
|       (cond | ||||
|         [(or (= i n) (memv (string-ref str i) '(#\#))) | ||||
|          '("")] | ||||
|         [else | ||||
|          (let ([j (find-semi/hash str i n)]) | ||||
|            (cond | ||||
|              [(or (= j n) (memv (string-ref str i) '(#\#))) | ||||
|               (list (cleanup (substring str i j)))] | ||||
|              [else | ||||
|               (cons (cleanup (substring str i j)) | ||||
|                     (f (+ j 1) n))]))]))) | ||||
|    | ||||
|   (define (extract-uni-data) | ||||
|     (let f ([ls '()]) | ||||
|       (let ([line (get-line (current-input-port))]) | ||||
|         (cond | ||||
|           [(eof-object? line) | ||||
|            (reverse ls)] | ||||
|           [else | ||||
|            (let ([fields (split line)]) | ||||
|              (if (or (null? fields) (equal? fields '(""))) | ||||
|                  (f ls) | ||||
|                  (f (cons fields ls))))])))) | ||||
| 
 | ||||
|   (define (get-unicode-data filename) | ||||
|     (with-input-from-file  | ||||
|        filename | ||||
|        extract-uni-data))) | ||||
		Reference in a new issue
	
	 tmtt
						tmtt