| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374 | 
							- using System;
 
namespace NLangDetect.Core.Extensions
{
  /// <summary>
  /// Maps characters and Unicode code points to the <see cref="UnicodeBlock"/> that contains them.
  /// </summary>
  /// <remarks>
  /// The lookup uses two parallel tables: <c>_unicodeBlockStarts[i]</c> is the first code point of a
  /// range and <c>_unicodeBlocks[i]</c> is the block for that range (<c>null</c> for ranges the table
  /// marks as unassigned). A range extends up to, but not including, the next start value.
  /// </remarks>
  public static class CharExtensions
  {
    private const int MinCodePoint = 0x000000;
    private const int MaxCodePoint = 0x10ffff;

    // Must stay sorted in ascending order and begin at 0: the binary search in LookupBlock relies on both.
    private static readonly int[] _unicodeBlockStarts =
      {
        #region Unicode block starts

        0x0000, // Basic Latin
        0x0080, // Latin-1 Supplement
        0x0100, // Latin Extended-A
        0x0180, // Latin Extended-B
        0x0250, // IPA Extensions
        0x02B0, // Spacing Modifier Letters
        0x0300, // Combining Diacritical Marks
        0x0370, // Greek and Coptic
        0x0400, // Cyrillic
        0x0500, // Cyrillic Supplementary
        0x0530, // Armenian
        0x0590, // Hebrew
        0x0600, // Arabic
        0x0700, // Syriac
        0x0750, // unassigned
        0x0780, // Thaana
        0x07C0, // unassigned
        0x0900, // Devanagari
        0x0980, // Bengali
        0x0A00, // Gurmukhi
        0x0A80, // Gujarati
        0x0B00, // Oriya
        0x0B80, // Tamil
        0x0C00, // Telugu
        0x0C80, // Kannada
        0x0D00, // Malayalam
        0x0D80, // Sinhala
        0x0E00, // Thai
        0x0E80, // Lao
        0x0F00, // Tibetan
        0x1000, // Myanmar
        0x10A0, // Georgian
        0x1100, // Hangul Jamo
        0x1200, // Ethiopic
        0x1380, // unassigned
        0x13A0, // Cherokee
        0x1400, // Unified Canadian Aboriginal Syllabics
        0x1680, // Ogham
        0x16A0, // Runic
        0x1700, // Tagalog
        0x1720, // Hanunoo
        0x1740, // Buhid
        0x1760, // Tagbanwa
        0x1780, // Khmer
        0x1800, // Mongolian
        0x18B0, // unassigned
        0x1900, // Limbu
        0x1950, // Tai Le
        0x1980, // unassigned
        0x19E0, // Khmer Symbols
        0x1A00, // unassigned
        0x1D00, // Phonetic Extensions
        0x1D80, // unassigned
        0x1E00, // Latin Extended Additional
        0x1F00, // Greek Extended
        0x2000, // General Punctuation
        0x2070, // Superscripts and Subscripts
        0x20A0, // Currency Symbols
        0x20D0, // Combining Diacritical Marks for Symbols
        0x2100, // Letterlike Symbols
        0x2150, // Number Forms
        0x2190, // Arrows
        0x2200, // Mathematical Operators
        0x2300, // Miscellaneous Technical
        0x2400, // Control Pictures
        0x2440, // Optical Character Recognition
        0x2460, // Enclosed Alphanumerics
        0x2500, // Box Drawing
        0x2580, // Block Elements
        0x25A0, // Geometric Shapes
        0x2600, // Miscellaneous Symbols
        0x2700, // Dingbats
        0x27C0, // Miscellaneous Mathematical Symbols-A
        0x27F0, // Supplemental Arrows-A
        0x2800, // Braille Patterns
        0x2900, // Supplemental Arrows-B
        0x2980, // Miscellaneous Mathematical Symbols-B
        0x2A00, // Supplemental Mathematical Operators
        0x2B00, // Miscellaneous Symbols and Arrows
        0x2C00, // unassigned
        0x2E80, // CJK Radicals Supplement
        0x2F00, // Kangxi Radicals
        0x2FE0, // unassigned
        0x2FF0, // Ideographic Description Characters
        0x3000, // CJK Symbols and Punctuation
        0x3040, // Hiragana
        0x30A0, // Katakana
        0x3100, // Bopomofo
        0x3130, // Hangul Compatibility Jamo
        0x3190, // Kanbun
        0x31A0, // Bopomofo Extended
        0x31C0, // unassigned
        0x31F0, // Katakana Phonetic Extensions
        0x3200, // Enclosed CJK Letters and Months
        0x3300, // CJK Compatibility
        0x3400, // CJK Unified Ideographs Extension A
        0x4DC0, // Yijing Hexagram Symbols
        0x4E00, // CJK Unified Ideographs
        0xA000, // Yi Syllables
        0xA490, // Yi Radicals
        0xA4D0, // unassigned
        0xAC00, // Hangul Syllables
        0xD7B0, // unassigned
        0xD800, // High Surrogates
        0xDB80, // High Private Use Surrogates
        0xDC00, // Low Surrogates
        0xE000, // Private Use
        0xF900, // CJK Compatibility Ideographs
        0xFB00, // Alphabetic Presentation Forms
        0xFB50, // Arabic Presentation Forms-A
        0xFE00, // Variation Selectors
        0xFE10, // unassigned
        0xFE20, // Combining Half Marks
        0xFE30, // CJK Compatibility Forms
        0xFE50, // Small Form Variants
        0xFE70, // Arabic Presentation Forms-B
        0xFF00, // Halfwidth and Fullwidth Forms
        0xFFF0, // Specials
        0x10000, // Linear B Syllabary
        0x10080, // Linear B Ideograms
        0x10100, // Aegean Numbers
        0x10140, // unassigned
        0x10300, // Old Italic
        0x10330, // Gothic
        0x10350, // unassigned
        0x10380, // Ugaritic
        0x103A0, // unassigned
        0x10400, // Deseret
        0x10450, // Shavian
        0x10480, // Osmanya
        0x104B0, // unassigned
        0x10800, // Cypriot Syllabary
        0x10840, // unassigned
        0x1D000, // Byzantine Musical Symbols
        0x1D100, // Musical Symbols
        0x1D200, // unassigned
        0x1D300, // Tai Xuan Jing Symbols
        0x1D360, // unassigned
        0x1D400, // Mathematical Alphanumeric Symbols
        0x1D800, // unassigned
        0x20000, // CJK Unified Ideographs Extension B
        0x2A6E0, // unassigned
        0x2F800, // CJK Compatibility Ideographs Supplement
        0x2FA20, // unassigned
        0xE0000, // Tags
        0xE0080, // unassigned
        0xE0100, // Variation Selectors Supplement
        0xE01F0, // unassigned
        0xF0000, // Supplementary Private Use Area-A
        0x100000, // Supplementary Private Use Area-B

        #endregion
      };

    // Parallel to _unicodeBlockStarts: entry i names the block starting at _unicodeBlockStarts[i].
    // A null entry corresponds to a range commented as "unassigned" in the start table.
    private static readonly UnicodeBlock?[] _unicodeBlocks =
      {
        #region Unicode blocks

        UnicodeBlock.BasicLatin,
        UnicodeBlock.Latin1Supplement,
        UnicodeBlock.LatinExtendedA,
        UnicodeBlock.LatinExtendedB,
        UnicodeBlock.IpaExtensions,
        UnicodeBlock.SpacingModifierLetters,
        UnicodeBlock.CombiningDiacriticalMarks,
        UnicodeBlock.Greek,
        UnicodeBlock.Cyrillic,
        UnicodeBlock.CyrillicSupplementary,
        UnicodeBlock.Armenian,
        UnicodeBlock.Hebrew,
        UnicodeBlock.Arabic,
        UnicodeBlock.Syriac,
        null,
        UnicodeBlock.Thaana,
        null,
        UnicodeBlock.Devanagari,
        UnicodeBlock.Bengali,
        UnicodeBlock.Gurmukhi,
        UnicodeBlock.Gujarati,
        UnicodeBlock.Oriya,
        UnicodeBlock.Tamil,
        UnicodeBlock.Telugu,
        UnicodeBlock.Kannada,
        UnicodeBlock.Malayalam,
        UnicodeBlock.Sinhala,
        UnicodeBlock.Thai,
        UnicodeBlock.Lao,
        UnicodeBlock.Tibetan,
        UnicodeBlock.Myanmar,
        UnicodeBlock.Georgian,
        UnicodeBlock.HangulJamo,
        UnicodeBlock.Ethiopic,
        null,
        UnicodeBlock.Cherokee,
        UnicodeBlock.UnifiedCanadianAboriginalSyllabics,
        UnicodeBlock.Ogham,
        UnicodeBlock.Runic,
        UnicodeBlock.Tagalog,
        UnicodeBlock.Hanunoo,
        UnicodeBlock.Buhid,
        UnicodeBlock.Tagbanwa,
        UnicodeBlock.Khmer,
        UnicodeBlock.Mongolian,
        null,
        UnicodeBlock.Limbu,
        UnicodeBlock.TaiLe,
        null,
        UnicodeBlock.KhmerSymbols,
        null,
        UnicodeBlock.PhoneticExtensions,
        null,
        UnicodeBlock.LatinExtendedAdditional,
        UnicodeBlock.GreekExtended,
        UnicodeBlock.GeneralPunctuation,
        UnicodeBlock.SuperscriptsAndSubscripts,
        UnicodeBlock.CurrencySymbols,
        UnicodeBlock.CombiningMarksForSymbols,
        UnicodeBlock.LetterlikeSymbols,
        UnicodeBlock.NumberForms,
        UnicodeBlock.Arrows,
        UnicodeBlock.MathematicalOperators,
        UnicodeBlock.MiscellaneousTechnical,
        UnicodeBlock.ControlPictures,
        UnicodeBlock.OpticalCharacterRecognition,
        UnicodeBlock.EnclosedAlphanumerics,
        UnicodeBlock.BoxDrawing,
        UnicodeBlock.BlockElements,
        UnicodeBlock.GeometricShapes,
        UnicodeBlock.MiscellaneousSymbols,
        UnicodeBlock.Dingbats,
        UnicodeBlock.MiscellaneousMathematicalSymbolsA,
        UnicodeBlock.SupplementalArrowsA,
        UnicodeBlock.BraillePatterns,
        UnicodeBlock.SupplementalArrowsB,
        UnicodeBlock.MiscellaneousMathematicalSymbolsB,
        UnicodeBlock.SupplementalMathematicalOperators,
        UnicodeBlock.MiscellaneousSymbolsAndArrows,
        null,
        UnicodeBlock.CjkRadicalsSupplement,
        UnicodeBlock.KangxiRadicals,
        null,
        UnicodeBlock.IdeographicDescriptionCharacters,
        UnicodeBlock.CjkSymbolsAndPunctuation,
        UnicodeBlock.Hiragana,
        UnicodeBlock.Katakana,
        UnicodeBlock.Bopomofo,
        UnicodeBlock.HangulCompatibilityJamo,
        UnicodeBlock.Kanbun,
        UnicodeBlock.BopomofoExtended,
        null,
        UnicodeBlock.KatakanaPhoneticExtensions,
        UnicodeBlock.EnclosedCjkLettersAndMonths,
        UnicodeBlock.CjkCompatibility,
        UnicodeBlock.CjkUnifiedIdeographsExtensionA,
        UnicodeBlock.YijingHexagramSymbols,
        UnicodeBlock.CjkUnifiedIdeographs,
        UnicodeBlock.YiSyllables,
        UnicodeBlock.YiRadicals,
        null,
        UnicodeBlock.HangulSyllables,
        null,
        UnicodeBlock.HighSurrogates,
        UnicodeBlock.HighPrivateUseSurrogates,
        UnicodeBlock.LowSurrogates,
        UnicodeBlock.PrivateUseArea,
        UnicodeBlock.CjkCompatibilityIdeographs,
        UnicodeBlock.AlphabeticPresentationForms,
        UnicodeBlock.ArabicPresentationFormsA,
        UnicodeBlock.VariationSelectors,
        null,
        UnicodeBlock.CombiningHalfMarks,
        UnicodeBlock.CjkCompatibilityForms,
        UnicodeBlock.SmallFormVariants,
        UnicodeBlock.ArabicPresentationFormsB,
        UnicodeBlock.HalfwidthAndFullwidthForms,
        UnicodeBlock.Specials,
        UnicodeBlock.LinearBSyllabary,
        UnicodeBlock.LinearBIdeograms,
        UnicodeBlock.AegeanNumbers,
        null,
        UnicodeBlock.OldItalic,
        UnicodeBlock.Gothic,
        null,
        UnicodeBlock.Ugaritic,
        null,
        UnicodeBlock.Deseret,
        UnicodeBlock.Shavian,
        UnicodeBlock.Osmanya,
        null,
        UnicodeBlock.CypriotSyllabary,
        null,
        UnicodeBlock.ByzantineMusicalSymbols,
        UnicodeBlock.MusicalSymbols,
        null,
        UnicodeBlock.TaiXuanJingSymbols,
        null,
        UnicodeBlock.MathematicalAlphanumericSymbols,
        null,
        UnicodeBlock.CjkUnifiedIdeographsExtensionB,
        null,
        UnicodeBlock.CjkCompatibilityIdeographsSupplement,
        null,
        UnicodeBlock.Tags,
        null,
        UnicodeBlock.VariationSelectorsSupplement,
        null,
        UnicodeBlock.SupplementaryPrivateUseAreaA,
        UnicodeBlock.SupplementaryPrivateUseAreaB,

        #endregion
      };

    #region Public methods

    /// <summary>
    /// Returns the Unicode block containing the given UTF-16 code unit.
    /// </summary>
    /// <param name="ch">The character to classify. Its numeric value is looked up as-is.</param>
    /// <returns>
    /// The matching <see cref="UnicodeBlock"/>, or <c>null</c> when the value falls in a range the
    /// table lists as unassigned.
    /// </returns>
    /// <remarks>
    /// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
    /// </remarks>
    public static UnicodeBlock? GetUnicodeBlock(this char ch)
    {
      // A char is always within 0x0000..0xFFFF, which lies inside the valid code point range,
      // so range validation can never fail here and is omitted.
      return LookupBlock(ch);
    }

    /// <summary>
    /// Returns the Unicode block containing the given code point, including values above 0xFFFF
    /// that cannot be represented by a single <see cref="char"/>.
    /// </summary>
    /// <param name="codePoint">A code point in the range 0x000000 to 0x10FFFF inclusive.</param>
    /// <returns>
    /// The matching <see cref="UnicodeBlock"/>, or <c>null</c> when the code point falls in a range
    /// the table lists as unassigned.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="codePoint"/> is negative or greater than 0x10FFFF.
    /// </exception>
    public static UnicodeBlock? GetUnicodeBlockOfCodePoint(int codePoint)
    {
      if (!IsValidCodePoint(codePoint))
      {
        throw new ArgumentOutOfRangeException(nameof(codePoint), codePoint, "Argument is not a valid code point.");
      }

      return LookupBlock(codePoint);
    }

    #endregion

    #region Private helper methods

    /// <summary>
    /// Finds the table entry with the greatest start value not exceeding <paramref name="codePoint"/>.
    /// Callers must pass a value of at least 0 so that the first table entry always qualifies.
    /// </summary>
    private static UnicodeBlock? LookupBlock(int codePoint)
    {
      int index = Array.BinarySearch(_unicodeBlockStarts, codePoint);

      if (index < 0)
      {
        // No exact match: ~index is the position of the first start greater than codePoint
        // (or the array length), so the containing range is the entry just before it.
        index = ~index - 1;
      }

      return _unicodeBlocks[index];
    }

    private static bool IsValidCodePoint(int codePoint)
    {
      return codePoint >= MinCodePoint && codePoint <= MaxCodePoint;
    }

    #endregion
  }
}
 
 
  |