// TODO IMM HI: check which classes can be made internal?

using System.Collections.Generic;
using System.Text;
using NLangDetect.Core.Extensions;

namespace NLangDetect.Core.Utils
{
    /// <summary>
    /// Accumulates normalized characters one at a time and exposes the trailing
    /// 1-, 2- and 3-character grams of the text seen so far.
    /// </summary>
    public class NGram
    {
        /// <summary>Maximum gram length tracked by an instance.</summary>
        public const int GramsCount = 3;

        // Latin-1 Supplement characters that Normalize() collapses to a space.
        private static readonly string Latin1Excluded = Messages.getString("NGram.LATIN1_EXCLUDE");

        // Each entry is a string of CJK characters; every character of an entry is
        // mapped to the entry's first character (see the static constructor).
        private static readonly string[] CjkClass =
        {
            #region CJK classes

            Messages.getString("NGram.KANJI_1_0"),
            Messages.getString("NGram.KANJI_1_2"),
            Messages.getString("NGram.KANJI_1_4"),
            Messages.getString("NGram.KANJI_1_8"),
            Messages.getString("NGram.KANJI_1_11"),
            Messages.getString("NGram.KANJI_1_12"),
            Messages.getString("NGram.KANJI_1_13"),
            Messages.getString("NGram.KANJI_1_14"),
            Messages.getString("NGram.KANJI_1_16"),
            Messages.getString("NGram.KANJI_1_18"),
            Messages.getString("NGram.KANJI_1_22"),
            Messages.getString("NGram.KANJI_1_27"),
            Messages.getString("NGram.KANJI_1_29"),
            Messages.getString("NGram.KANJI_1_31"),
            Messages.getString("NGram.KANJI_1_35"),
            Messages.getString("NGram.KANJI_2_0"),
            Messages.getString("NGram.KANJI_2_1"),
            Messages.getString("NGram.KANJI_2_4"),
            Messages.getString("NGram.KANJI_2_9"),
            Messages.getString("NGram.KANJI_2_10"),
            Messages.getString("NGram.KANJI_2_11"),
            Messages.getString("NGram.KANJI_2_12"),
            Messages.getString("NGram.KANJI_2_13"),
            Messages.getString("NGram.KANJI_2_15"),
            Messages.getString("NGram.KANJI_2_16"),
            Messages.getString("NGram.KANJI_2_18"),
            Messages.getString("NGram.KANJI_2_21"),
            Messages.getString("NGram.KANJI_2_22"),
            Messages.getString("NGram.KANJI_2_23"),
            Messages.getString("NGram.KANJI_2_28"),
            Messages.getString("NGram.KANJI_2_29"),
            Messages.getString("NGram.KANJI_2_30"),
            Messages.getString("NGram.KANJI_2_31"),
            Messages.getString("NGram.KANJI_2_32"),
            Messages.getString("NGram.KANJI_2_35"),
            Messages.getString("NGram.KANJI_2_36"),
            Messages.getString("NGram.KANJI_2_37"),
            Messages.getString("NGram.KANJI_2_38"),
            Messages.getString("NGram.KANJI_3_1"),
            Messages.getString("NGram.KANJI_3_2"),
            Messages.getString("NGram.KANJI_3_3"),
            Messages.getString("NGram.KANJI_3_4"),
            Messages.getString("NGram.KANJI_3_5"),
            Messages.getString("NGram.KANJI_3_8"),
            Messages.getString("NGram.KANJI_3_9"),
            Messages.getString("NGram.KANJI_3_11"),
            Messages.getString("NGram.KANJI_3_12"),
            Messages.getString("NGram.KANJI_3_13"),
            Messages.getString("NGram.KANJI_3_15"),
            Messages.getString("NGram.KANJI_3_16"),
            Messages.getString("NGram.KANJI_3_18"),
            Messages.getString("NGram.KANJI_3_19"),
            Messages.getString("NGram.KANJI_3_22"),
            Messages.getString("NGram.KANJI_3_23"),
            Messages.getString("NGram.KANJI_3_27"),
            Messages.getString("NGram.KANJI_3_29"),
            Messages.getString("NGram.KANJI_3_30"),
            Messages.getString("NGram.KANJI_3_31"),
            Messages.getString("NGram.KANJI_3_32"),
            Messages.getString("NGram.KANJI_3_35"),
            Messages.getString("NGram.KANJI_3_36"),
            Messages.getString("NGram.KANJI_3_37"),
            Messages.getString("NGram.KANJI_3_38"),
            Messages.getString("NGram.KANJI_4_0"),
            Messages.getString("NGram.KANJI_4_9"),
            Messages.getString("NGram.KANJI_4_10"),
            Messages.getString("NGram.KANJI_4_16"),
            Messages.getString("NGram.KANJI_4_17"),
            Messages.getString("NGram.KANJI_4_18"),
            Messages.getString("NGram.KANJI_4_22"),
            Messages.getString("NGram.KANJI_4_24"),
            Messages.getString("NGram.KANJI_4_28"),
            Messages.getString("NGram.KANJI_4_34"),
            Messages.getString("NGram.KANJI_4_39"),
            Messages.getString("NGram.KANJI_5_10"),
            Messages.getString("NGram.KANJI_5_11"),
            Messages.getString("NGram.KANJI_5_12"),
            Messages.getString("NGram.KANJI_5_13"),
            Messages.getString("NGram.KANJI_5_14"),
            Messages.getString("NGram.KANJI_5_18"),
            Messages.getString("NGram.KANJI_5_26"),
            Messages.getString("NGram.KANJI_5_29"),
            Messages.getString("NGram.KANJI_5_34"),
            Messages.getString("NGram.KANJI_5_39"),
            Messages.getString("NGram.KANJI_6_0"),
            Messages.getString("NGram.KANJI_6_3"),
            Messages.getString("NGram.KANJI_6_9"),
            Messages.getString("NGram.KANJI_6_10"),
            Messages.getString("NGram.KANJI_6_11"),
            Messages.getString("NGram.KANJI_6_12"),
            Messages.getString("NGram.KANJI_6_16"),
            Messages.getString("NGram.KANJI_6_18"),
            Messages.getString("NGram.KANJI_6_20"),
            Messages.getString("NGram.KANJI_6_21"),
            Messages.getString("NGram.KANJI_6_22"),
            Messages.getString("NGram.KANJI_6_23"),
            Messages.getString("NGram.KANJI_6_25"),
            Messages.getString("NGram.KANJI_6_28"),
            Messages.getString("NGram.KANJI_6_29"),
            Messages.getString("NGram.KANJI_6_30"),
            Messages.getString("NGram.KANJI_6_32"),
            Messages.getString("NGram.KANJI_6_34"),
            Messages.getString("NGram.KANJI_6_35"),
            Messages.getString("NGram.KANJI_6_37"),
            Messages.getString("NGram.KANJI_6_39"),
            Messages.getString("NGram.KANJI_7_0"),
            Messages.getString("NGram.KANJI_7_3"),
            Messages.getString("NGram.KANJI_7_6"),
            Messages.getString("NGram.KANJI_7_7"),
            Messages.getString("NGram.KANJI_7_9"),
            Messages.getString("NGram.KANJI_7_11"),
            Messages.getString("NGram.KANJI_7_12"),
            Messages.getString("NGram.KANJI_7_13"),
            Messages.getString("NGram.KANJI_7_16"),
            Messages.getString("NGram.KANJI_7_18"),
            Messages.getString("NGram.KANJI_7_19"),
            Messages.getString("NGram.KANJI_7_20"),
            Messages.getString("NGram.KANJI_7_21"),
            Messages.getString("NGram.KANJI_7_23"),
            Messages.getString("NGram.KANJI_7_25"),
            Messages.getString("NGram.KANJI_7_28"),
            Messages.getString("NGram.KANJI_7_29"),
            Messages.getString("NGram.KANJI_7_32"),
            Messages.getString("NGram.KANJI_7_33"),
            Messages.getString("NGram.KANJI_7_35"),
            Messages.getString("NGram.KANJI_7_37"),

            #endregion
        };

        // Maps every character listed in CjkClass to its class representative.
        private static readonly Dictionary<char, char> _cjkMap;

        // Sliding window of the most recent normalized characters (at most GramsCount).
        private StringBuilder _grams;

        // True while the two most recently appended characters were both upper-case.
        private bool _capitalword;

        #region Constructor(s)

        /// <summary>
        /// Builds the CJK lookup table from <see cref="CjkClass"/>: each character of
        /// a class string is mapped to the first character of that string.
        /// </summary>
        static NGram()
        {
            _cjkMap = new Dictionary<char, char>();

            foreach (string cjkList in CjkClass)
            {
                // A missing or empty resource has no representative; skip it rather
                // than throwing from the type initializer.
                if (string.IsNullOrEmpty(cjkList))
                {
                    continue;
                }

                char representative = cjkList[0];

                foreach (char member in cjkList)
                {
                    // Indexer assignment instead of Add(): Add() throws on a repeated
                    // key, and an exception here would surface as a
                    // TypeInitializationException and make the whole class unusable.
                    _cjkMap[member] = representative;
                }
            }
        }

        /// <summary>
        /// Creates an empty accumulator whose window holds a single leading space.
        /// </summary>
        public NGram()
        {
            _grams = new StringBuilder(" ");
            _capitalword = false;
        }

        #endregion

        #region Public methods

        /// <summary>
        /// Maps a character to its normalized form based on its Unicode block.
        /// </summary>
        /// <param name="ch">Character to normalize.</param>
        /// <returns>
        /// A space for Basic Latin characters outside A-Z / a-z, for Latin-1 Supplement
        /// characters listed in the exclusion string, and for General Punctuation;
        /// a fixed representative for Hiragana, Katakana, Bopomofo, Hangul syllables,
        /// Latin Extended Additional characters at or above U+1EA0, and mapped CJK
        /// ideographs; U+064A for Arabic U+06CC; otherwise <paramref name="ch"/> itself.
        /// </returns>
        public static char Normalize(char ch)
        {
            UnicodeBlock? unicodeBlock = ch.GetUnicodeBlock();

            if (!unicodeBlock.HasValue)
            {
                return ch;
            }

            switch (unicodeBlock.Value)
            {
                case UnicodeBlock.BasicLatin:
                    {
                        // Anything that is not an ASCII letter becomes a separator.
                        if (ch < 'A' || (ch < 'a' && ch > 'Z') || ch > 'z')
                        {
                            return ' ';
                        }

                        break;
                    }

                case UnicodeBlock.Latin1Supplement:
                    {
                        if (Latin1Excluded.IndexOf(ch) >= 0)
                        {
                            return ' ';
                        }

                        break;
                    }

                case UnicodeBlock.GeneralPunctuation:
                    {
                        return ' ';
                    }

                case UnicodeBlock.Arabic:
                    {
                        if (ch == '\u06cc')
                        {
                            return '\u064a';
                        }

                        break;
                    }

                case UnicodeBlock.LatinExtendedAdditional:
                    {
                        if (ch >= '\u1ea0')
                        {
                            return '\u1ec3';
                        }

                        break;
                    }

                case UnicodeBlock.Hiragana:
                    {
                        return '\u3042';
                    }

                case UnicodeBlock.Katakana:
                    {
                        return '\u30a2';
                    }

                case UnicodeBlock.Bopomofo:
                case UnicodeBlock.BopomofoExtended:
                    {
                        return '\u3105';
                    }

                case UnicodeBlock.CjkUnifiedIdeographs:
                    {
                        // Single lookup instead of ContainsKey + indexer.
                        char representative;

                        if (_cjkMap.TryGetValue(ch, out representative))
                        {
                            return representative;
                        }

                        break;
                    }

                case UnicodeBlock.HangulSyllables:
                    {
                        return '\uac00';
                    }
            }

            return ch;
        }

        /// <summary>
        /// Normalizes <paramref name="ch"/> and appends it to the sliding window.
        /// </summary>
        /// <remarks>
        /// When the previous character was a space the window is reset to a single
        /// space first, and a space following a space is ignored. Otherwise the oldest
        /// character is dropped once the window already holds
        /// <see cref="GramsCount"/> characters.
        /// </remarks>
        /// <param name="ch">Raw character to add.</param>
        public void AddChar(char ch)
        {
            ch = Normalize(ch);

            char lastchar = _grams[_grams.Length - 1];

            if (lastchar == ' ')
            {
                // Word boundary: start a fresh window.
                _grams = new StringBuilder(" ");
                _capitalword = false;

                if (ch == ' ')
                {
                    return;
                }
            }
            else if (_grams.Length >= GramsCount)
            {
                _grams.Remove(0, 1);
            }

            _grams.Append(ch);

            if (char.IsUpper(ch))
            {
                // Two consecutive upper-case characters flag the word as capitalized.
                if (char.IsUpper(lastchar))
                {
                    _capitalword = true;
                }
            }
            else
            {
                _capitalword = false;
            }
        }

        /// <summary>
        /// Returns the trailing gram of length <paramref name="n"/> from the window.
        /// </summary>
        /// <param name="n">Gram length, from 1 to <see cref="GramsCount"/>.</param>
        /// <returns>
        /// The last <paramref name="n"/> characters of the window, or <c>null</c> when
        /// the capital-word flag is set, <paramref name="n"/> is out of range, the
        /// window is shorter than <paramref name="n"/>, or <paramref name="n"/> is 1
        /// and the last character is a space.
        /// </returns>
        public string Get(int n)
        {
            if (_capitalword)
            {
                return null;
            }

            int len = _grams.Length;

            if (n < 1 || n > GramsCount || len < n)
            {
                return null;
            }

            if (n == 1)
            {
                char ch = _grams[len - 1];

                if (ch == ' ')
                {
                    return null;
                }

                return ch.ToString();
            }

            // TODO IMM HI: is ToString() here effective?
            // NOTE(review): SubSequence is a project extension not visible here; it
            // presumably takes (start, end) with an exclusive end - confirm before
            // replacing this with StringBuilder.ToString(startIndex, length).
            return _grams.ToString().SubSequence(len - n, len);
        }

        #endregion
    }
}