// CharExtensions.cs

  1. using System;
  namespace NLangDetect.Core.Extensions
  {
      /// <summary>
      /// Maps characters and Unicode code points to the <see cref="UnicodeBlock"/> that contains them.
      /// </summary>
      /// <remarks>
      /// The lookup is driven by two parallel tables: <c>_unicodeBlockStarts</c> holds the first code point
      /// of every range in ascending order, and <c>_unicodeBlocks</c> holds the block for the range at the
      /// same index (<c>null</c> for ranges the table marks as unassigned). The two tables must always have
      /// the same length and ordering.
      /// </remarks>
      public static class CharExtensions
      {
          // Inclusive bounds of the Unicode code space.
          private const int MinCodePoint = 0x000000;
          private const int MaxCodePoint = 0x10FFFF;

          // First code point of each range, sorted ascending. The first entry must be 0 so that every
          // valid code point falls into some range.
          private static readonly int[] _unicodeBlockStarts =
          {
              #region Unicode block starts
              0x0000, // Basic Latin
              0x0080, // Latin-1 Supplement
              0x0100, // Latin Extended-A
              0x0180, // Latin Extended-B
              0x0250, // IPA Extensions
              0x02B0, // Spacing Modifier Letters
              0x0300, // Combining Diacritical Marks
              0x0370, // Greek and Coptic
              0x0400, // Cyrillic
              0x0500, // Cyrillic Supplementary
              0x0530, // Armenian
              0x0590, // Hebrew
              0x0600, // Arabic
              0x0700, // Syriac
              0x0750, // unassigned
              0x0780, // Thaana
              0x07C0, // unassigned
              0x0900, // Devanagari
              0x0980, // Bengali
              0x0A00, // Gurmukhi
              0x0A80, // Gujarati
              0x0B00, // Oriya
              0x0B80, // Tamil
              0x0C00, // Telugu
              0x0C80, // Kannada
              0x0D00, // Malayalam
              0x0D80, // Sinhala
              0x0E00, // Thai
              0x0E80, // Lao
              0x0F00, // Tibetan
              0x1000, // Myanmar
              0x10A0, // Georgian
              0x1100, // Hangul Jamo
              0x1200, // Ethiopic
              0x1380, // unassigned
              0x13A0, // Cherokee
              0x1400, // Unified Canadian Aboriginal Syllabics
              0x1680, // Ogham
              0x16A0, // Runic
              0x1700, // Tagalog
              0x1720, // Hanunoo
              0x1740, // Buhid
              0x1760, // Tagbanwa
              0x1780, // Khmer
              0x1800, // Mongolian
              0x18B0, // unassigned
              0x1900, // Limbu
              0x1950, // Tai Le
              0x1980, // unassigned
              0x19E0, // Khmer Symbols
              0x1A00, // unassigned
              0x1D00, // Phonetic Extensions
              0x1D80, // unassigned
              0x1E00, // Latin Extended Additional
              0x1F00, // Greek Extended
              0x2000, // General Punctuation
              0x2070, // Superscripts and Subscripts
              0x20A0, // Currency Symbols
              0x20D0, // Combining Diacritical Marks for Symbols
              0x2100, // Letterlike Symbols
              0x2150, // Number Forms
              0x2190, // Arrows
              0x2200, // Mathematical Operators
              0x2300, // Miscellaneous Technical
              0x2400, // Control Pictures
              0x2440, // Optical Character Recognition
              0x2460, // Enclosed Alphanumerics
              0x2500, // Box Drawing
              0x2580, // Block Elements
              0x25A0, // Geometric Shapes
              0x2600, // Miscellaneous Symbols
              0x2700, // Dingbats
              0x27C0, // Miscellaneous Mathematical Symbols-A
              0x27F0, // Supplemental Arrows-A
              0x2800, // Braille Patterns
              0x2900, // Supplemental Arrows-B
              0x2980, // Miscellaneous Mathematical Symbols-B
              0x2A00, // Supplemental Mathematical Operators
              0x2B00, // Miscellaneous Symbols and Arrows
              0x2C00, // unassigned
              0x2E80, // CJK Radicals Supplement
              0x2F00, // Kangxi Radicals
              0x2FE0, // unassigned
              0x2FF0, // Ideographic Description Characters
              0x3000, // CJK Symbols and Punctuation
              0x3040, // Hiragana
              0x30A0, // Katakana
              0x3100, // Bopomofo
              0x3130, // Hangul Compatibility Jamo
              0x3190, // Kanbun
              0x31A0, // Bopomofo Extended
              0x31C0, // unassigned
              0x31F0, // Katakana Phonetic Extensions
              0x3200, // Enclosed CJK Letters and Months
              0x3300, // CJK Compatibility
              0x3400, // CJK Unified Ideographs Extension A
              0x4DC0, // Yijing Hexagram Symbols
              0x4E00, // CJK Unified Ideographs
              0xA000, // Yi Syllables
              0xA490, // Yi Radicals
              0xA4D0, // unassigned
              0xAC00, // Hangul Syllables
              0xD7B0, // unassigned
              0xD800, // High Surrogates
              0xDB80, // High Private Use Surrogates
              0xDC00, // Low Surrogates
              0xE000, // Private Use
              0xF900, // CJK Compatibility Ideographs
              0xFB00, // Alphabetic Presentation Forms
              0xFB50, // Arabic Presentation Forms-A
              0xFE00, // Variation Selectors
              0xFE10, // unassigned
              0xFE20, // Combining Half Marks
              0xFE30, // CJK Compatibility Forms
              0xFE50, // Small Form Variants
              0xFE70, // Arabic Presentation Forms-B
              0xFF00, // Halfwidth and Fullwidth Forms
              0xFFF0, // Specials
              0x10000, // Linear B Syllabary
              0x10080, // Linear B Ideograms
              0x10100, // Aegean Numbers
              0x10140, // unassigned
              0x10300, // Old Italic
              0x10330, // Gothic
              0x10350, // unassigned
              0x10380, // Ugaritic
              0x103A0, // unassigned
              0x10400, // Deseret
              0x10450, // Shavian
              0x10480, // Osmanya
              0x104B0, // unassigned
              0x10800, // Cypriot Syllabary
              0x10840, // unassigned
              0x1D000, // Byzantine Musical Symbols
              0x1D100, // Musical Symbols
              0x1D200, // unassigned
              0x1D300, // Tai Xuan Jing Symbols
              0x1D360, // unassigned
              0x1D400, // Mathematical Alphanumeric Symbols
              0x1D800, // unassigned
              0x20000, // CJK Unified Ideographs Extension B
              0x2A6E0, // unassigned
              0x2F800, // CJK Compatibility Ideographs Supplement
              0x2FA20, // unassigned
              0xE0000, // Tags
              0xE0080, // unassigned
              0xE0100, // Variation Selectors Supplement
              0xE01F0, // unassigned
              0xF0000, // Supplementary Private Use Area-A
              0x100000, // Supplementary Private Use Area-B
              #endregion
          };

          // Block for the range starting at the same index in _unicodeBlockStarts;
          // null marks a range the table treats as unassigned.
          private static readonly UnicodeBlock?[] _unicodeBlocks =
          {
              #region Unicode blocks
              UnicodeBlock.BasicLatin,
              UnicodeBlock.Latin1Supplement,
              UnicodeBlock.LatinExtendedA,
              UnicodeBlock.LatinExtendedB,
              UnicodeBlock.IpaExtensions,
              UnicodeBlock.SpacingModifierLetters,
              UnicodeBlock.CombiningDiacriticalMarks,
              UnicodeBlock.Greek,
              UnicodeBlock.Cyrillic,
              UnicodeBlock.CyrillicSupplementary,
              UnicodeBlock.Armenian,
              UnicodeBlock.Hebrew,
              UnicodeBlock.Arabic,
              UnicodeBlock.Syriac,
              null,
              UnicodeBlock.Thaana,
              null,
              UnicodeBlock.Devanagari,
              UnicodeBlock.Bengali,
              UnicodeBlock.Gurmukhi,
              UnicodeBlock.Gujarati,
              UnicodeBlock.Oriya,
              UnicodeBlock.Tamil,
              UnicodeBlock.Telugu,
              UnicodeBlock.Kannada,
              UnicodeBlock.Malayalam,
              UnicodeBlock.Sinhala,
              UnicodeBlock.Thai,
              UnicodeBlock.Lao,
              UnicodeBlock.Tibetan,
              UnicodeBlock.Myanmar,
              UnicodeBlock.Georgian,
              UnicodeBlock.HangulJamo,
              UnicodeBlock.Ethiopic,
              null,
              UnicodeBlock.Cherokee,
              UnicodeBlock.UnifiedCanadianAboriginalSyllabics,
              UnicodeBlock.Ogham,
              UnicodeBlock.Runic,
              UnicodeBlock.Tagalog,
              UnicodeBlock.Hanunoo,
              UnicodeBlock.Buhid,
              UnicodeBlock.Tagbanwa,
              UnicodeBlock.Khmer,
              UnicodeBlock.Mongolian,
              null,
              UnicodeBlock.Limbu,
              UnicodeBlock.TaiLe,
              null,
              UnicodeBlock.KhmerSymbols,
              null,
              UnicodeBlock.PhoneticExtensions,
              null,
              UnicodeBlock.LatinExtendedAdditional,
              UnicodeBlock.GreekExtended,
              UnicodeBlock.GeneralPunctuation,
              UnicodeBlock.SuperscriptsAndSubscripts,
              UnicodeBlock.CurrencySymbols,
              UnicodeBlock.CombiningMarksForSymbols,
              UnicodeBlock.LetterlikeSymbols,
              UnicodeBlock.NumberForms,
              UnicodeBlock.Arrows,
              UnicodeBlock.MathematicalOperators,
              UnicodeBlock.MiscellaneousTechnical,
              UnicodeBlock.ControlPictures,
              UnicodeBlock.OpticalCharacterRecognition,
              UnicodeBlock.EnclosedAlphanumerics,
              UnicodeBlock.BoxDrawing,
              UnicodeBlock.BlockElements,
              UnicodeBlock.GeometricShapes,
              UnicodeBlock.MiscellaneousSymbols,
              UnicodeBlock.Dingbats,
              UnicodeBlock.MiscellaneousMathematicalSymbolsA,
              UnicodeBlock.SupplementalArrowsA,
              UnicodeBlock.BraillePatterns,
              UnicodeBlock.SupplementalArrowsB,
              UnicodeBlock.MiscellaneousMathematicalSymbolsB,
              UnicodeBlock.SupplementalMathematicalOperators,
              UnicodeBlock.MiscellaneousSymbolsAndArrows,
              null,
              UnicodeBlock.CjkRadicalsSupplement,
              UnicodeBlock.KangxiRadicals,
              null,
              UnicodeBlock.IdeographicDescriptionCharacters,
              UnicodeBlock.CjkSymbolsAndPunctuation,
              UnicodeBlock.Hiragana,
              UnicodeBlock.Katakana,
              UnicodeBlock.Bopomofo,
              UnicodeBlock.HangulCompatibilityJamo,
              UnicodeBlock.Kanbun,
              UnicodeBlock.BopomofoExtended,
              null,
              UnicodeBlock.KatakanaPhoneticExtensions,
              UnicodeBlock.EnclosedCjkLettersAndMonths,
              UnicodeBlock.CjkCompatibility,
              UnicodeBlock.CjkUnifiedIdeographsExtensionA,
              UnicodeBlock.YijingHexagramSymbols,
              UnicodeBlock.CjkUnifiedIdeographs,
              UnicodeBlock.YiSyllables,
              UnicodeBlock.YiRadicals,
              null,
              UnicodeBlock.HangulSyllables,
              null,
              UnicodeBlock.HighSurrogates,
              UnicodeBlock.HighPrivateUseSurrogates,
              UnicodeBlock.LowSurrogates,
              UnicodeBlock.PrivateUseArea,
              UnicodeBlock.CjkCompatibilityIdeographs,
              UnicodeBlock.AlphabeticPresentationForms,
              UnicodeBlock.ArabicPresentationFormsA,
              UnicodeBlock.VariationSelectors,
              null,
              UnicodeBlock.CombiningHalfMarks,
              UnicodeBlock.CjkCompatibilityForms,
              UnicodeBlock.SmallFormVariants,
              UnicodeBlock.ArabicPresentationFormsB,
              UnicodeBlock.HalfwidthAndFullwidthForms,
              UnicodeBlock.Specials,
              UnicodeBlock.LinearBSyllabary,
              UnicodeBlock.LinearBIdeograms,
              UnicodeBlock.AegeanNumbers,
              null,
              UnicodeBlock.OldItalic,
              UnicodeBlock.Gothic,
              null,
              UnicodeBlock.Ugaritic,
              null,
              UnicodeBlock.Deseret,
              UnicodeBlock.Shavian,
              UnicodeBlock.Osmanya,
              null,
              UnicodeBlock.CypriotSyllabary,
              null,
              UnicodeBlock.ByzantineMusicalSymbols,
              UnicodeBlock.MusicalSymbols,
              null,
              UnicodeBlock.TaiXuanJingSymbols,
              null,
              UnicodeBlock.MathematicalAlphanumericSymbols,
              null,
              UnicodeBlock.CjkUnifiedIdeographsExtensionB,
              null,
              UnicodeBlock.CjkCompatibilityIdeographsSupplement,
              null,
              UnicodeBlock.Tags,
              null,
              UnicodeBlock.VariationSelectorsSupplement,
              null,
              UnicodeBlock.SupplementaryPrivateUseAreaA,
              UnicodeBlock.SupplementaryPrivateUseAreaB,
              #endregion
          };

          #region Public methods

          /// <summary>
          /// Returns the Unicode block containing the given UTF-16 code unit.
          /// </summary>
          /// <param name="ch">The character to classify. Surrogate code units map to the surrogate blocks.</param>
          /// <returns>
          /// The containing <see cref="UnicodeBlock"/>, or <c>null</c> when the character lies in a range
          /// the table marks as unassigned.
          /// </returns>
          /// <remarks>
          /// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
          /// </remarks>
          public static UnicodeBlock? GetUnicodeBlock(this char ch)
          {
              // A char is always within 0x0000..0xFFFF, so the range check in the
              // code point overload can never fail for this call.
              return GetUnicodeBlock((int)ch);
          }

          /// <summary>
          /// Returns the Unicode block containing the given code point, including supplementary
          /// code points above 0xFFFF that cannot be represented by a single <see cref="char"/>.
          /// </summary>
          /// <param name="codePoint">A Unicode code point in the range 0x000000..0x10FFFF.</param>
          /// <returns>
          /// The containing <see cref="UnicodeBlock"/>, or <c>null</c> when the code point lies in a range
          /// the table marks as unassigned.
          /// </returns>
          /// <exception cref="ArgumentOutOfRangeException">
          /// <paramref name="codePoint"/> is outside the Unicode code space.
          /// </exception>
          public static UnicodeBlock? GetUnicodeBlock(int codePoint)
          {
              if (!IsValidCodePoint(codePoint))
              {
                  throw new ArgumentOutOfRangeException(
                      nameof(codePoint), codePoint, "Argument is not a valid code point.");
              }

              // An exact hit means codePoint is the first code point of a range. A miss returns the
              // bitwise complement of the index of the first larger start, so the containing range
              // is the one just before it. Because the table starts at 0 and codePoint >= 0, the
              // resulting index is never negative.
              int index = Array.BinarySearch(_unicodeBlockStarts, codePoint);
              if (index < 0)
              {
                  index = ~index - 1;
              }

              return _unicodeBlocks[index];
          }

          #endregion

          #region Private helper methods

          /// <summary>
          /// Determines whether <paramref name="codePoint"/> lies within the Unicode code space.
          /// </summary>
          private static bool IsValidCodePoint(int codePoint) =>
              codePoint >= MinCodePoint && codePoint <= MaxCodePoint;

          #endregion
      }
  }