using System;

namespace NLangDetect.Core.Extensions
{
  /// <summary>
  /// Maps characters and Unicode code points to the <see cref="UnicodeBlock"/> that contains them.
  /// </summary>
  /// <remarks>
  /// The lookup is driven by two parallel tables: <c>_unicodeBlockStarts</c> holds the first code point
  /// of every range in ascending order, and <c>_unicodeBlocks</c> holds the block for the range at the
  /// same index (<c>null</c> for ranges commented as unassigned).
  /// </remarks>
  public static class CharExtensions
  {
    private const int MinCodePoint = 0x000000;
    private const int MaxCodePoint = 0x10ffff;

    // Must stay sorted ascending and index-aligned with _unicodeBlocks.
    private static readonly int[] _unicodeBlockStarts =
      {
        #region Unicode block starts

        0x0000, // Basic Latin
        0x0080, // Latin-1 Supplement
        0x0100, // Latin Extended-A
        0x0180, // Latin Extended-B
        0x0250, // IPA Extensions
        0x02B0, // Spacing Modifier Letters
        0x0300, // Combining Diacritical Marks
        0x0370, // Greek and Coptic
        0x0400, // Cyrillic
        0x0500, // Cyrillic Supplementary
        0x0530, // Armenian
        0x0590, // Hebrew
        0x0600, // Arabic
        0x0700, // Syriac
        0x0750, // unassigned
        0x0780, // Thaana
        0x07C0, // unassigned
        0x0900, // Devanagari
        0x0980, // Bengali
        0x0A00, // Gurmukhi
        0x0A80, // Gujarati
        0x0B00, // Oriya
        0x0B80, // Tamil
        0x0C00, // Telugu
        0x0C80, // Kannada
        0x0D00, // Malayalam
        0x0D80, // Sinhala
        0x0E00, // Thai
        0x0E80, // Lao
        0x0F00, // Tibetan
        0x1000, // Myanmar
        0x10A0, // Georgian
        0x1100, // Hangul Jamo
        0x1200, // Ethiopic
        0x1380, // unassigned
        0x13A0, // Cherokee
        0x1400, // Unified Canadian Aboriginal Syllabics
        0x1680, // Ogham
        0x16A0, // Runic
        0x1700, // Tagalog
        0x1720, // Hanunoo
        0x1740, // Buhid
        0x1760, // Tagbanwa
        0x1780, // Khmer
        0x1800, // Mongolian
        0x18B0, // unassigned
        0x1900, // Limbu
        0x1950, // Tai Le
        0x1980, // unassigned
        0x19E0, // Khmer Symbols
        0x1A00, // unassigned
        0x1D00, // Phonetic Extensions
        0x1D80, // unassigned
        0x1E00, // Latin Extended Additional
        0x1F00, // Greek Extended
        0x2000, // General Punctuation
        0x2070, // Superscripts and Subscripts
        0x20A0, // Currency Symbols
        0x20D0, // Combining Diacritical Marks for Symbols
        0x2100, // Letterlike Symbols
        0x2150, // Number Forms
        0x2190, // Arrows
        0x2200, // Mathematical Operators
        0x2300, // Miscellaneous Technical
        0x2400, // Control Pictures
        0x2440, // Optical Character Recognition
        0x2460, // Enclosed Alphanumerics
        0x2500, // Box Drawing
        0x2580, // Block Elements
        0x25A0, // Geometric Shapes
        0x2600, // Miscellaneous Symbols
        0x2700, // Dingbats
        0x27C0, // Miscellaneous Mathematical Symbols-A
        0x27F0, // Supplemental Arrows-A
        0x2800, // Braille Patterns
        0x2900, // Supplemental Arrows-B
        0x2980, // Miscellaneous Mathematical Symbols-B
        0x2A00, // Supplemental Mathematical Operators
        0x2B00, // Miscellaneous Symbols and Arrows
        0x2C00, // unassigned
        0x2E80, // CJK Radicals Supplement
        0x2F00, // Kangxi Radicals
        0x2FE0, // unassigned
        0x2FF0, // Ideographic Description Characters
        0x3000, // CJK Symbols and Punctuation
        0x3040, // Hiragana
        0x30A0, // Katakana
        0x3100, // Bopomofo
        0x3130, // Hangul Compatibility Jamo
        0x3190, // Kanbun
        0x31A0, // Bopomofo Extended
        0x31C0, // unassigned
        0x31F0, // Katakana Phonetic Extensions
        0x3200, // Enclosed CJK Letters and Months
        0x3300, // CJK Compatibility
        0x3400, // CJK Unified Ideographs Extension A
        0x4DC0, // Yijing Hexagram Symbols
        0x4E00, // CJK Unified Ideographs
        0xA000, // Yi Syllables
        0xA490, // Yi Radicals
        0xA4D0, // unassigned
        0xAC00, // Hangul Syllables
        0xD7B0, // unassigned
        0xD800, // High Surrogates
        0xDB80, // High Private Use Surrogates
        0xDC00, // Low Surrogates
        0xE000, // Private Use
        0xF900, // CJK Compatibility Ideographs
        0xFB00, // Alphabetic Presentation Forms
        0xFB50, // Arabic Presentation Forms-A
        0xFE00, // Variation Selectors
        0xFE10, // unassigned
        0xFE20, // Combining Half Marks
        0xFE30, // CJK Compatibility Forms
        0xFE50, // Small Form Variants
        0xFE70, // Arabic Presentation Forms-B
        0xFF00, // Halfwidth and Fullwidth Forms
        0xFFF0, // Specials
        0x10000, // Linear B Syllabary
        0x10080, // Linear B Ideograms
        0x10100, // Aegean Numbers
        0x10140, // unassigned
        0x10300, // Old Italic
        0x10330, // Gothic
        0x10350, // unassigned
        0x10380, // Ugaritic
        0x103A0, // unassigned
        0x10400, // Deseret
        0x10450, // Shavian
        0x10480, // Osmanya
        0x104B0, // unassigned
        0x10800, // Cypriot Syllabary
        0x10840, // unassigned
        0x1D000, // Byzantine Musical Symbols
        0x1D100, // Musical Symbols
        0x1D200, // unassigned
        0x1D300, // Tai Xuan Jing Symbols
        0x1D360, // unassigned
        0x1D400, // Mathematical Alphanumeric Symbols
        0x1D800, // unassigned
        0x20000, // CJK Unified Ideographs Extension B
        0x2A6E0, // unassigned
        0x2F800, // CJK Compatibility Ideographs Supplement
        0x2FA20, // unassigned
        0xE0000, // Tags
        0xE0080, // unassigned
        0xE0100, // Variation Selectors Supplement
        0xE01F0, // unassigned
        0xF0000, // Supplementary Private Use Area-A
        0x100000, // Supplementary Private Use Area-B

        #endregion
      };

    // Entry i is the block for the range that starts at _unicodeBlockStarts[i]; null marks an unassigned range.
    private static readonly UnicodeBlock?[] _unicodeBlocks =
      {
        #region Unicode blocks

        UnicodeBlock.BasicLatin,
        UnicodeBlock.Latin1Supplement,
        UnicodeBlock.LatinExtendedA,
        UnicodeBlock.LatinExtendedB,
        UnicodeBlock.IpaExtensions,
        UnicodeBlock.SpacingModifierLetters,
        UnicodeBlock.CombiningDiacriticalMarks,
        UnicodeBlock.Greek,
        UnicodeBlock.Cyrillic,
        UnicodeBlock.CyrillicSupplementary,
        UnicodeBlock.Armenian,
        UnicodeBlock.Hebrew,
        UnicodeBlock.Arabic,
        UnicodeBlock.Syriac,
        null,
        UnicodeBlock.Thaana,
        null,
        UnicodeBlock.Devanagari,
        UnicodeBlock.Bengali,
        UnicodeBlock.Gurmukhi,
        UnicodeBlock.Gujarati,
        UnicodeBlock.Oriya,
        UnicodeBlock.Tamil,
        UnicodeBlock.Telugu,
        UnicodeBlock.Kannada,
        UnicodeBlock.Malayalam,
        UnicodeBlock.Sinhala,
        UnicodeBlock.Thai,
        UnicodeBlock.Lao,
        UnicodeBlock.Tibetan,
        UnicodeBlock.Myanmar,
        UnicodeBlock.Georgian,
        UnicodeBlock.HangulJamo,
        UnicodeBlock.Ethiopic,
        null,
        UnicodeBlock.Cherokee,
        UnicodeBlock.UnifiedCanadianAboriginalSyllabics,
        UnicodeBlock.Ogham,
        UnicodeBlock.Runic,
        UnicodeBlock.Tagalog,
        UnicodeBlock.Hanunoo,
        UnicodeBlock.Buhid,
        UnicodeBlock.Tagbanwa,
        UnicodeBlock.Khmer,
        UnicodeBlock.Mongolian,
        null,
        UnicodeBlock.Limbu,
        UnicodeBlock.TaiLe,
        null,
        UnicodeBlock.KhmerSymbols,
        null,
        UnicodeBlock.PhoneticExtensions,
        null,
        UnicodeBlock.LatinExtendedAdditional,
        UnicodeBlock.GreekExtended,
        UnicodeBlock.GeneralPunctuation,
        UnicodeBlock.SuperscriptsAndSubscripts,
        UnicodeBlock.CurrencySymbols,
        UnicodeBlock.CombiningMarksForSymbols,
        UnicodeBlock.LetterlikeSymbols,
        UnicodeBlock.NumberForms,
        UnicodeBlock.Arrows,
        UnicodeBlock.MathematicalOperators,
        UnicodeBlock.MiscellaneousTechnical,
        UnicodeBlock.ControlPictures,
        UnicodeBlock.OpticalCharacterRecognition,
        UnicodeBlock.EnclosedAlphanumerics,
        UnicodeBlock.BoxDrawing,
        UnicodeBlock.BlockElements,
        UnicodeBlock.GeometricShapes,
        UnicodeBlock.MiscellaneousSymbols,
        UnicodeBlock.Dingbats,
        UnicodeBlock.MiscellaneousMathematicalSymbolsA,
        UnicodeBlock.SupplementalArrowsA,
        UnicodeBlock.BraillePatterns,
        UnicodeBlock.SupplementalArrowsB,
        UnicodeBlock.MiscellaneousMathematicalSymbolsB,
        UnicodeBlock.SupplementalMathematicalOperators,
        UnicodeBlock.MiscellaneousSymbolsAndArrows,
        null,
        UnicodeBlock.CjkRadicalsSupplement,
        UnicodeBlock.KangxiRadicals,
        null,
        UnicodeBlock.IdeographicDescriptionCharacters,
        UnicodeBlock.CjkSymbolsAndPunctuation,
        UnicodeBlock.Hiragana,
        UnicodeBlock.Katakana,
        UnicodeBlock.Bopomofo,
        UnicodeBlock.HangulCompatibilityJamo,
        UnicodeBlock.Kanbun,
        UnicodeBlock.BopomofoExtended,
        null,
        UnicodeBlock.KatakanaPhoneticExtensions,
        UnicodeBlock.EnclosedCjkLettersAndMonths,
        UnicodeBlock.CjkCompatibility,
        UnicodeBlock.CjkUnifiedIdeographsExtensionA,
        UnicodeBlock.YijingHexagramSymbols,
        UnicodeBlock.CjkUnifiedIdeographs,
        UnicodeBlock.YiSyllables,
        UnicodeBlock.YiRadicals,
        null,
        UnicodeBlock.HangulSyllables,
        null,
        UnicodeBlock.HighSurrogates,
        UnicodeBlock.HighPrivateUseSurrogates,
        UnicodeBlock.LowSurrogates,
        UnicodeBlock.PrivateUseArea,
        UnicodeBlock.CjkCompatibilityIdeographs,
        UnicodeBlock.AlphabeticPresentationForms,
        UnicodeBlock.ArabicPresentationFormsA,
        UnicodeBlock.VariationSelectors,
        null,
        UnicodeBlock.CombiningHalfMarks,
        UnicodeBlock.CjkCompatibilityForms,
        UnicodeBlock.SmallFormVariants,
        UnicodeBlock.ArabicPresentationFormsB,
        UnicodeBlock.HalfwidthAndFullwidthForms,
        UnicodeBlock.Specials,
        UnicodeBlock.LinearBSyllabary,
        UnicodeBlock.LinearBIdeograms,
        UnicodeBlock.AegeanNumbers,
        null,
        UnicodeBlock.OldItalic,
        UnicodeBlock.Gothic,
        null,
        UnicodeBlock.Ugaritic,
        null,
        UnicodeBlock.Deseret,
        UnicodeBlock.Shavian,
        UnicodeBlock.Osmanya,
        null,
        UnicodeBlock.CypriotSyllabary,
        null,
        UnicodeBlock.ByzantineMusicalSymbols,
        UnicodeBlock.MusicalSymbols,
        null,
        UnicodeBlock.TaiXuanJingSymbols,
        null,
        UnicodeBlock.MathematicalAlphanumericSymbols,
        null,
        UnicodeBlock.CjkUnifiedIdeographsExtensionB,
        null,
        UnicodeBlock.CjkCompatibilityIdeographsSupplement,
        null,
        UnicodeBlock.Tags,
        null,
        UnicodeBlock.VariationSelectorsSupplement,
        null,
        UnicodeBlock.SupplementaryPrivateUseAreaA,
        UnicodeBlock.SupplementaryPrivateUseAreaB,

        #endregion
      };

    #region Public methods

    /// <summary>
    /// Returns the Unicode block containing the given UTF-16 code unit.
    /// </summary>
    /// <param name="ch">The character to classify.</param>
    /// <returns>
    /// The block containing <paramref name="ch"/>, or <c>null</c> when it falls in a range the table
    /// marks as unassigned.
    /// </returns>
    /// <remarks>
    /// <para>
    /// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
    /// </para>
    /// <para>
    /// A <see cref="char"/> is always within the valid code point range, so this overload never throws.
    /// Surrogate code units are reported as the surrogate blocks; to classify a supplementary character,
    /// combine the pair (for example with <see cref="char.ConvertToUtf32(char, char)"/>) and call
    /// <see cref="GetUnicodeBlock(int)"/>.
    /// </para>
    /// </remarks>
    public static UnicodeBlock? GetUnicodeBlock(this char ch)
    {
      return GetUnicodeBlock((int)ch);
    }

    /// <summary>
    /// Returns the Unicode block containing the given code point, including supplementary-plane
    /// code points above U+FFFF.
    /// </summary>
    /// <param name="codePoint">A Unicode code point in the range 0x000000..0x10FFFF.</param>
    /// <returns>
    /// The block containing <paramref name="codePoint"/>, or <c>null</c> when it falls in a range the
    /// table marks as unassigned.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="codePoint"/> is negative or greater than 0x10FFFF.
    /// </exception>
    public static UnicodeBlock? GetUnicodeBlock(int codePoint)
    {
      if (!IsValidCodePoint(codePoint))
      {
        throw new ArgumentOutOfRangeException("codePoint", codePoint, "Argument is not a valid code point.");
      }

      int index = Array.BinarySearch(_unicodeBlockStarts, codePoint);

      if (index < 0)
      {
        // No exact hit: ~index is the position of the first start greater than codePoint,
        // so the containing range is the one just before it. The table begins at 0 and
        // codePoint is non-negative here, so the result is never below zero.
        index = ~index - 1;
      }

      return _unicodeBlocks[index];
    }

    #endregion

    #region Private helper methods

    /// <summary>
    /// Tells whether <paramref name="codePoint"/> lies within the Unicode code space (0..0x10FFFF).
    /// </summary>
    private static bool IsValidCodePoint(int codePoint)
    {
      return codePoint >= MinCodePoint && codePoint <= MaxCodePoint;
    }

    #endregion
  }
}