123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374 |
- using System;
- namespace NLangDetect.Core.Extensions
- {
- public static class CharExtensions
- {
/// <summary>Lowest value accepted as a Unicode code point (U+0000).</summary>
private const int MIN_CODE_POINT = 0x000000;

/// <summary>Highest value accepted as a Unicode code point (U+10FFFF).</summary>
private const int MAX_CODE_POINT = 0x10FFFF;
/// <summary>
/// First code point of every range known to the lookup, in strictly ascending order.
/// </summary>
/// <remarks>
/// This table is index-parallel to <c>_unicodeBlocks</c>: the entry at position <c>i</c>
/// here is the start of the range whose block is stored at position <c>i</c> there.
/// Each range runs up to (but not including) the next entry. Ranges commented as
/// "unassigned" are the ones paired with a <c>null</c> block.
/// </remarks>
private static readonly int[] _unicodeBlockStarts =
{
    // ---- Basic Multilingual Plane (U+0000 - U+FFFF) ----
    0x0000, // Basic Latin
    0x0080, // Latin-1 Supplement
    0x0100, // Latin Extended-A
    0x0180, // Latin Extended-B
    0x0250, // IPA Extensions
    0x02B0, // Spacing Modifier Letters
    0x0300, // Combining Diacritical Marks
    0x0370, // Greek and Coptic
    0x0400, // Cyrillic
    0x0500, // Cyrillic Supplementary
    0x0530, // Armenian
    0x0590, // Hebrew
    0x0600, // Arabic
    0x0700, // Syriac
    0x0750, // unassigned
    0x0780, // Thaana
    0x07C0, // unassigned
    0x0900, // Devanagari
    0x0980, // Bengali
    0x0A00, // Gurmukhi
    0x0A80, // Gujarati
    0x0B00, // Oriya
    0x0B80, // Tamil
    0x0C00, // Telugu
    0x0C80, // Kannada
    0x0D00, // Malayalam
    0x0D80, // Sinhala
    0x0E00, // Thai
    0x0E80, // Lao
    0x0F00, // Tibetan
    0x1000, // Myanmar
    0x10A0, // Georgian
    0x1100, // Hangul Jamo
    0x1200, // Ethiopic
    0x1380, // unassigned
    0x13A0, // Cherokee
    0x1400, // Unified Canadian Aboriginal Syllabics
    0x1680, // Ogham
    0x16A0, // Runic
    0x1700, // Tagalog
    0x1720, // Hanunoo
    0x1740, // Buhid
    0x1760, // Tagbanwa
    0x1780, // Khmer
    0x1800, // Mongolian
    0x18B0, // unassigned
    0x1900, // Limbu
    0x1950, // Tai Le
    0x1980, // unassigned
    0x19E0, // Khmer Symbols
    0x1A00, // unassigned
    0x1D00, // Phonetic Extensions
    0x1D80, // unassigned
    0x1E00, // Latin Extended Additional
    0x1F00, // Greek Extended
    0x2000, // General Punctuation
    0x2070, // Superscripts and Subscripts
    0x20A0, // Currency Symbols
    0x20D0, // Combining Diacritical Marks for Symbols
    0x2100, // Letterlike Symbols
    0x2150, // Number Forms
    0x2190, // Arrows
    0x2200, // Mathematical Operators
    0x2300, // Miscellaneous Technical
    0x2400, // Control Pictures
    0x2440, // Optical Character Recognition
    0x2460, // Enclosed Alphanumerics
    0x2500, // Box Drawing
    0x2580, // Block Elements
    0x25A0, // Geometric Shapes
    0x2600, // Miscellaneous Symbols
    0x2700, // Dingbats
    0x27C0, // Miscellaneous Mathematical Symbols-A
    0x27F0, // Supplemental Arrows-A
    0x2800, // Braille Patterns
    0x2900, // Supplemental Arrows-B
    0x2980, // Miscellaneous Mathematical Symbols-B
    0x2A00, // Supplemental Mathematical Operators
    0x2B00, // Miscellaneous Symbols and Arrows
    0x2C00, // unassigned
    0x2E80, // CJK Radicals Supplement
    0x2F00, // Kangxi Radicals
    0x2FE0, // unassigned
    0x2FF0, // Ideographic Description Characters
    0x3000, // CJK Symbols and Punctuation
    0x3040, // Hiragana
    0x30A0, // Katakana
    0x3100, // Bopomofo
    0x3130, // Hangul Compatibility Jamo
    0x3190, // Kanbun
    0x31A0, // Bopomofo Extended
    0x31C0, // unassigned
    0x31F0, // Katakana Phonetic Extensions
    0x3200, // Enclosed CJK Letters and Months
    0x3300, // CJK Compatibility
    0x3400, // CJK Unified Ideographs Extension A
    0x4DC0, // Yijing Hexagram Symbols
    0x4E00, // CJK Unified Ideographs
    0xA000, // Yi Syllables
    0xA490, // Yi Radicals
    0xA4D0, // unassigned
    0xAC00, // Hangul Syllables
    0xD7B0, // unassigned
    0xD800, // High Surrogates
    0xDB80, // High Private Use Surrogates
    0xDC00, // Low Surrogates
    0xE000, // Private Use
    0xF900, // CJK Compatibility Ideographs
    0xFB00, // Alphabetic Presentation Forms
    0xFB50, // Arabic Presentation Forms-A
    0xFE00, // Variation Selectors
    0xFE10, // unassigned
    0xFE20, // Combining Half Marks
    0xFE30, // CJK Compatibility Forms
    0xFE50, // Small Form Variants
    0xFE70, // Arabic Presentation Forms-B
    0xFF00, // Halfwidth and Fullwidth Forms
    0xFFF0, // Specials

    // ---- Supplementary planes (U+10000 and above) ----
    0x10000, // Linear B Syllabary
    0x10080, // Linear B Ideograms
    0x10100, // Aegean Numbers
    0x10140, // unassigned
    0x10300, // Old Italic
    0x10330, // Gothic
    0x10350, // unassigned
    0x10380, // Ugaritic
    0x103A0, // unassigned
    0x10400, // Deseret
    0x10450, // Shavian
    0x10480, // Osmanya
    0x104B0, // unassigned
    0x10800, // Cypriot Syllabary
    0x10840, // unassigned
    0x1D000, // Byzantine Musical Symbols
    0x1D100, // Musical Symbols
    0x1D200, // unassigned
    0x1D300, // Tai Xuan Jing Symbols
    0x1D360, // unassigned
    0x1D400, // Mathematical Alphanumeric Symbols
    0x1D800, // unassigned
    0x20000, // CJK Unified Ideographs Extension B
    0x2A6E0, // unassigned
    0x2F800, // CJK Compatibility Ideographs Supplement
    0x2FA20, // unassigned
    0xE0000, // Tags
    0xE0080, // unassigned
    0xE0100, // Variation Selectors Supplement
    0xE01F0, // unassigned
    0xF0000, // Supplementary Private Use Area-A
    0x100000, // Supplementary Private Use Area-B
};
/// <summary>
/// Block assigned to each range of the lookup, or <c>null</c> for an unassigned range.
/// </summary>
/// <remarks>
/// This table is index-parallel to <c>_unicodeBlockStarts</c>; the trailing comment on
/// each entry repeats the start code point found at the same index in that table.
/// The two tables must keep the same length and ordering.
/// </remarks>
private static readonly UnicodeBlock?[] _unicodeBlocks =
{
    UnicodeBlock.BasicLatin,                            // 0x0000
    UnicodeBlock.Latin1Supplement,                      // 0x0080
    UnicodeBlock.LatinExtendedA,                        // 0x0100
    UnicodeBlock.LatinExtendedB,                        // 0x0180
    UnicodeBlock.IpaExtensions,                         // 0x0250
    UnicodeBlock.SpacingModifierLetters,                // 0x02B0
    UnicodeBlock.CombiningDiacriticalMarks,             // 0x0300
    UnicodeBlock.Greek,                                 // 0x0370
    UnicodeBlock.Cyrillic,                              // 0x0400
    UnicodeBlock.CyrillicSupplementary,                 // 0x0500
    UnicodeBlock.Armenian,                              // 0x0530
    UnicodeBlock.Hebrew,                                // 0x0590
    UnicodeBlock.Arabic,                                // 0x0600
    UnicodeBlock.Syriac,                                // 0x0700
    null,                                               // 0x0750 (unassigned)
    UnicodeBlock.Thaana,                                // 0x0780
    null,                                               // 0x07C0 (unassigned)
    UnicodeBlock.Devanagari,                            // 0x0900
    UnicodeBlock.Bengali,                               // 0x0980
    UnicodeBlock.Gurmukhi,                              // 0x0A00
    UnicodeBlock.Gujarati,                              // 0x0A80
    UnicodeBlock.Oriya,                                 // 0x0B00
    UnicodeBlock.Tamil,                                 // 0x0B80
    UnicodeBlock.Telugu,                                // 0x0C00
    UnicodeBlock.Kannada,                               // 0x0C80
    UnicodeBlock.Malayalam,                             // 0x0D00
    UnicodeBlock.Sinhala,                               // 0x0D80
    UnicodeBlock.Thai,                                  // 0x0E00
    UnicodeBlock.Lao,                                   // 0x0E80
    UnicodeBlock.Tibetan,                               // 0x0F00
    UnicodeBlock.Myanmar,                               // 0x1000
    UnicodeBlock.Georgian,                              // 0x10A0
    UnicodeBlock.HangulJamo,                            // 0x1100
    UnicodeBlock.Ethiopic,                              // 0x1200
    null,                                               // 0x1380 (unassigned)
    UnicodeBlock.Cherokee,                              // 0x13A0
    UnicodeBlock.UnifiedCanadianAboriginalSyllabics,    // 0x1400
    UnicodeBlock.Ogham,                                 // 0x1680
    UnicodeBlock.Runic,                                 // 0x16A0
    UnicodeBlock.Tagalog,                               // 0x1700
    UnicodeBlock.Hanunoo,                               // 0x1720
    UnicodeBlock.Buhid,                                 // 0x1740
    UnicodeBlock.Tagbanwa,                              // 0x1760
    UnicodeBlock.Khmer,                                 // 0x1780
    UnicodeBlock.Mongolian,                             // 0x1800
    null,                                               // 0x18B0 (unassigned)
    UnicodeBlock.Limbu,                                 // 0x1900
    UnicodeBlock.TaiLe,                                 // 0x1950
    null,                                               // 0x1980 (unassigned)
    UnicodeBlock.KhmerSymbols,                          // 0x19E0
    null,                                               // 0x1A00 (unassigned)
    UnicodeBlock.PhoneticExtensions,                    // 0x1D00
    null,                                               // 0x1D80 (unassigned)
    UnicodeBlock.LatinExtendedAdditional,               // 0x1E00
    UnicodeBlock.GreekExtended,                         // 0x1F00
    UnicodeBlock.GeneralPunctuation,                    // 0x2000
    UnicodeBlock.SuperscriptsAndSubscripts,             // 0x2070
    UnicodeBlock.CurrencySymbols,                       // 0x20A0
    UnicodeBlock.CombiningMarksForSymbols,              // 0x20D0
    UnicodeBlock.LetterlikeSymbols,                     // 0x2100
    UnicodeBlock.NumberForms,                           // 0x2150
    UnicodeBlock.Arrows,                                // 0x2190
    UnicodeBlock.MathematicalOperators,                 // 0x2200
    UnicodeBlock.MiscellaneousTechnical,                // 0x2300
    UnicodeBlock.ControlPictures,                       // 0x2400
    UnicodeBlock.OpticalCharacterRecognition,           // 0x2440
    UnicodeBlock.EnclosedAlphanumerics,                 // 0x2460
    UnicodeBlock.BoxDrawing,                            // 0x2500
    UnicodeBlock.BlockElements,                         // 0x2580
    UnicodeBlock.GeometricShapes,                       // 0x25A0
    UnicodeBlock.MiscellaneousSymbols,                  // 0x2600
    UnicodeBlock.Dingbats,                              // 0x2700
    UnicodeBlock.MiscellaneousMathematicalSymbolsA,     // 0x27C0
    UnicodeBlock.SupplementalArrowsA,                   // 0x27F0
    UnicodeBlock.BraillePatterns,                       // 0x2800
    UnicodeBlock.SupplementalArrowsB,                   // 0x2900
    UnicodeBlock.MiscellaneousMathematicalSymbolsB,     // 0x2980
    UnicodeBlock.SupplementalMathematicalOperators,     // 0x2A00
    UnicodeBlock.MiscellaneousSymbolsAndArrows,         // 0x2B00
    null,                                               // 0x2C00 (unassigned)
    UnicodeBlock.CjkRadicalsSupplement,                 // 0x2E80
    UnicodeBlock.KangxiRadicals,                        // 0x2F00
    null,                                               // 0x2FE0 (unassigned)
    UnicodeBlock.IdeographicDescriptionCharacters,      // 0x2FF0
    UnicodeBlock.CjkSymbolsAndPunctuation,              // 0x3000
    UnicodeBlock.Hiragana,                              // 0x3040
    UnicodeBlock.Katakana,                              // 0x30A0
    UnicodeBlock.Bopomofo,                              // 0x3100
    UnicodeBlock.HangulCompatibilityJamo,               // 0x3130
    UnicodeBlock.Kanbun,                                // 0x3190
    UnicodeBlock.BopomofoExtended,                      // 0x31A0
    null,                                               // 0x31C0 (unassigned)
    UnicodeBlock.KatakanaPhoneticExtensions,            // 0x31F0
    UnicodeBlock.EnclosedCjkLettersAndMonths,           // 0x3200
    UnicodeBlock.CjkCompatibility,                      // 0x3300
    UnicodeBlock.CjkUnifiedIdeographsExtensionA,        // 0x3400
    UnicodeBlock.YijingHexagramSymbols,                 // 0x4DC0
    UnicodeBlock.CjkUnifiedIdeographs,                  // 0x4E00
    UnicodeBlock.YiSyllables,                           // 0xA000
    UnicodeBlock.YiRadicals,                            // 0xA490
    null,                                               // 0xA4D0 (unassigned)
    UnicodeBlock.HangulSyllables,                       // 0xAC00
    null,                                               // 0xD7B0 (unassigned)
    UnicodeBlock.HighSurrogates,                        // 0xD800
    UnicodeBlock.HighPrivateUseSurrogates,              // 0xDB80
    UnicodeBlock.LowSurrogates,                         // 0xDC00
    UnicodeBlock.PrivateUseArea,                        // 0xE000
    UnicodeBlock.CjkCompatibilityIdeographs,            // 0xF900
    UnicodeBlock.AlphabeticPresentationForms,           // 0xFB00
    UnicodeBlock.ArabicPresentationFormsA,              // 0xFB50
    UnicodeBlock.VariationSelectors,                    // 0xFE00
    null,                                               // 0xFE10 (unassigned)
    UnicodeBlock.CombiningHalfMarks,                    // 0xFE20
    UnicodeBlock.CjkCompatibilityForms,                 // 0xFE30
    UnicodeBlock.SmallFormVariants,                     // 0xFE50
    UnicodeBlock.ArabicPresentationFormsB,              // 0xFE70
    UnicodeBlock.HalfwidthAndFullwidthForms,            // 0xFF00
    UnicodeBlock.Specials,                              // 0xFFF0
    UnicodeBlock.LinearBSyllabary,                      // 0x10000
    UnicodeBlock.LinearBIdeograms,                      // 0x10080
    UnicodeBlock.AegeanNumbers,                         // 0x10100
    null,                                               // 0x10140 (unassigned)
    UnicodeBlock.OldItalic,                             // 0x10300
    UnicodeBlock.Gothic,                                // 0x10330
    null,                                               // 0x10350 (unassigned)
    UnicodeBlock.Ugaritic,                              // 0x10380
    null,                                               // 0x103A0 (unassigned)
    UnicodeBlock.Deseret,                               // 0x10400
    UnicodeBlock.Shavian,                               // 0x10450
    UnicodeBlock.Osmanya,                               // 0x10480
    null,                                               // 0x104B0 (unassigned)
    UnicodeBlock.CypriotSyllabary,                      // 0x10800
    null,                                               // 0x10840 (unassigned)
    UnicodeBlock.ByzantineMusicalSymbols,               // 0x1D000
    UnicodeBlock.MusicalSymbols,                        // 0x1D100
    null,                                               // 0x1D200 (unassigned)
    UnicodeBlock.TaiXuanJingSymbols,                    // 0x1D300
    null,                                               // 0x1D360 (unassigned)
    UnicodeBlock.MathematicalAlphanumericSymbols,       // 0x1D400
    null,                                               // 0x1D800 (unassigned)
    UnicodeBlock.CjkUnifiedIdeographsExtensionB,        // 0x20000
    null,                                               // 0x2A6E0 (unassigned)
    UnicodeBlock.CjkCompatibilityIdeographsSupplement,  // 0x2F800
    null,                                               // 0x2FA20 (unassigned)
    UnicodeBlock.Tags,                                  // 0xE0000
    null,                                               // 0xE0080 (unassigned)
    UnicodeBlock.VariationSelectorsSupplement,          // 0xE0100
    null,                                               // 0xE01F0 (unassigned)
    UnicodeBlock.SupplementaryPrivateUseAreaA,          // 0xF0000
    UnicodeBlock.SupplementaryPrivateUseAreaB,          // 0x100000
};
- #region Public methods
/// <summary>
/// Looks up the Unicode block whose range contains the given character.
/// </summary>
/// <param name="ch">The UTF-16 code unit to classify.</param>
/// <returns>
/// The block containing <paramref name="ch"/>, or <c>null</c> when the character
/// falls in a range that the lookup table records as unassigned.
/// </returns>
/// <remarks>
/// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
/// </remarks>
public static UnicodeBlock? GetUnicodeBlock(this char ch)
{
    // A char is a 16-bit value (0x0000-0xFFFF), so it can never fall outside the
    // valid code point range 0x000000-0x10FFFF. The range check the JDK original
    // performs on an int code point could therefore never fail here and is omitted.
    int codePoint = ch;

    // _unicodeBlockStarts is sorted ascending. BinarySearch returns the index of an
    // exact match, or the bitwise complement of the index of the first larger entry
    // (the array length if there is none).
    int index = Array.BinarySearch(_unicodeBlockStarts, codePoint);
    if (index < 0)
    {
        // No exact match: step back to the last start that is <= codePoint.
        // The table begins at 0x0000, so ~index is at least 1 and this stays >= 0.
        index = ~index - 1;
    }

    return _unicodeBlocks[index];
}
- #endregion
- #region Private helper methods
/// <summary>
/// Tells whether a value lies within the inclusive range
/// <c>MIN_CODE_POINT</c>..<c>MAX_CODE_POINT</c>.
/// </summary>
/// <param name="codePoint">The candidate code point value.</param>
/// <returns><c>true</c> when the value is inside the range; otherwise <c>false</c>.</returns>
private static bool IsValidCodePoint(int codePoint) =>
    MIN_CODE_POINT <= codePoint && codePoint <= MAX_CODE_POINT;
- #endregion
- }
- }
|