// TODO IMM HI: check which classes can be made internal?
- using System.Collections.Generic;
- using System.Text;
- using NLangDetect.Core.Extensions;
- namespace NLangDetect.Core.Utils
- {
/// <summary>
/// Keeps a sliding window over the most recently added (normalized) characters
/// and hands out the trailing n-gram of that window, for n between 1 and
/// <see cref="GramsCount"/>.
/// </summary>
public class NGram
{
    /// <summary>Largest n-gram length that <see cref="Get"/> will return.</summary>
    public const int GramsCount = 3;

    // Characters of the Latin-1 Supplement block that Normalize replaces with a space.
    private static readonly string Latin1Excluded = Messages.getString("NGram.LATIN1_EXCLUDE");

    // Each entry is one class of CJK characters, looked up via Messages.getString.
    // The first character of an entry is the representative every member of that
    // entry is mapped to (see the static constructor).
    private static readonly string[] CjkClass =
    {
        #region CJK classes
        Messages.getString("NGram.KANJI_1_0"),
        Messages.getString("NGram.KANJI_1_2"),
        Messages.getString("NGram.KANJI_1_4"),
        Messages.getString("NGram.KANJI_1_8"),
        Messages.getString("NGram.KANJI_1_11"),
        Messages.getString("NGram.KANJI_1_12"),
        Messages.getString("NGram.KANJI_1_13"),
        Messages.getString("NGram.KANJI_1_14"),
        Messages.getString("NGram.KANJI_1_16"),
        Messages.getString("NGram.KANJI_1_18"),
        Messages.getString("NGram.KANJI_1_22"),
        Messages.getString("NGram.KANJI_1_27"),
        Messages.getString("NGram.KANJI_1_29"),
        Messages.getString("NGram.KANJI_1_31"),
        Messages.getString("NGram.KANJI_1_35"),
        Messages.getString("NGram.KANJI_2_0"),
        Messages.getString("NGram.KANJI_2_1"),
        Messages.getString("NGram.KANJI_2_4"),
        Messages.getString("NGram.KANJI_2_9"),
        Messages.getString("NGram.KANJI_2_10"),
        Messages.getString("NGram.KANJI_2_11"),
        Messages.getString("NGram.KANJI_2_12"),
        Messages.getString("NGram.KANJI_2_13"),
        Messages.getString("NGram.KANJI_2_15"),
        Messages.getString("NGram.KANJI_2_16"),
        Messages.getString("NGram.KANJI_2_18"),
        Messages.getString("NGram.KANJI_2_21"),
        Messages.getString("NGram.KANJI_2_22"),
        Messages.getString("NGram.KANJI_2_23"),
        Messages.getString("NGram.KANJI_2_28"),
        Messages.getString("NGram.KANJI_2_29"),
        Messages.getString("NGram.KANJI_2_30"),
        Messages.getString("NGram.KANJI_2_31"),
        Messages.getString("NGram.KANJI_2_32"),
        Messages.getString("NGram.KANJI_2_35"),
        Messages.getString("NGram.KANJI_2_36"),
        Messages.getString("NGram.KANJI_2_37"),
        Messages.getString("NGram.KANJI_2_38"),
        Messages.getString("NGram.KANJI_3_1"),
        Messages.getString("NGram.KANJI_3_2"),
        Messages.getString("NGram.KANJI_3_3"),
        Messages.getString("NGram.KANJI_3_4"),
        Messages.getString("NGram.KANJI_3_5"),
        Messages.getString("NGram.KANJI_3_8"),
        Messages.getString("NGram.KANJI_3_9"),
        Messages.getString("NGram.KANJI_3_11"),
        Messages.getString("NGram.KANJI_3_12"),
        Messages.getString("NGram.KANJI_3_13"),
        Messages.getString("NGram.KANJI_3_15"),
        Messages.getString("NGram.KANJI_3_16"),
        Messages.getString("NGram.KANJI_3_18"),
        Messages.getString("NGram.KANJI_3_19"),
        Messages.getString("NGram.KANJI_3_22"),
        Messages.getString("NGram.KANJI_3_23"),
        Messages.getString("NGram.KANJI_3_27"),
        Messages.getString("NGram.KANJI_3_29"),
        Messages.getString("NGram.KANJI_3_30"),
        Messages.getString("NGram.KANJI_3_31"),
        Messages.getString("NGram.KANJI_3_32"),
        Messages.getString("NGram.KANJI_3_35"),
        Messages.getString("NGram.KANJI_3_36"),
        Messages.getString("NGram.KANJI_3_37"),
        Messages.getString("NGram.KANJI_3_38"),
        Messages.getString("NGram.KANJI_4_0"),
        Messages.getString("NGram.KANJI_4_9"),
        Messages.getString("NGram.KANJI_4_10"),
        Messages.getString("NGram.KANJI_4_16"),
        Messages.getString("NGram.KANJI_4_17"),
        Messages.getString("NGram.KANJI_4_18"),
        Messages.getString("NGram.KANJI_4_22"),
        Messages.getString("NGram.KANJI_4_24"),
        Messages.getString("NGram.KANJI_4_28"),
        Messages.getString("NGram.KANJI_4_34"),
        Messages.getString("NGram.KANJI_4_39"),
        Messages.getString("NGram.KANJI_5_10"),
        Messages.getString("NGram.KANJI_5_11"),
        Messages.getString("NGram.KANJI_5_12"),
        Messages.getString("NGram.KANJI_5_13"),
        Messages.getString("NGram.KANJI_5_14"),
        Messages.getString("NGram.KANJI_5_18"),
        Messages.getString("NGram.KANJI_5_26"),
        Messages.getString("NGram.KANJI_5_29"),
        Messages.getString("NGram.KANJI_5_34"),
        Messages.getString("NGram.KANJI_5_39"),
        Messages.getString("NGram.KANJI_6_0"),
        Messages.getString("NGram.KANJI_6_3"),
        Messages.getString("NGram.KANJI_6_9"),
        Messages.getString("NGram.KANJI_6_10"),
        Messages.getString("NGram.KANJI_6_11"),
        Messages.getString("NGram.KANJI_6_12"),
        Messages.getString("NGram.KANJI_6_16"),
        Messages.getString("NGram.KANJI_6_18"),
        Messages.getString("NGram.KANJI_6_20"),
        Messages.getString("NGram.KANJI_6_21"),
        Messages.getString("NGram.KANJI_6_22"),
        Messages.getString("NGram.KANJI_6_23"),
        Messages.getString("NGram.KANJI_6_25"),
        Messages.getString("NGram.KANJI_6_28"),
        Messages.getString("NGram.KANJI_6_29"),
        Messages.getString("NGram.KANJI_6_30"),
        Messages.getString("NGram.KANJI_6_32"),
        Messages.getString("NGram.KANJI_6_34"),
        Messages.getString("NGram.KANJI_6_35"),
        Messages.getString("NGram.KANJI_6_37"),
        Messages.getString("NGram.KANJI_6_39"),
        Messages.getString("NGram.KANJI_7_0"),
        Messages.getString("NGram.KANJI_7_3"),
        Messages.getString("NGram.KANJI_7_6"),
        Messages.getString("NGram.KANJI_7_7"),
        Messages.getString("NGram.KANJI_7_9"),
        Messages.getString("NGram.KANJI_7_11"),
        Messages.getString("NGram.KANJI_7_12"),
        Messages.getString("NGram.KANJI_7_13"),
        Messages.getString("NGram.KANJI_7_16"),
        Messages.getString("NGram.KANJI_7_18"),
        Messages.getString("NGram.KANJI_7_19"),
        Messages.getString("NGram.KANJI_7_20"),
        Messages.getString("NGram.KANJI_7_21"),
        Messages.getString("NGram.KANJI_7_23"),
        Messages.getString("NGram.KANJI_7_25"),
        Messages.getString("NGram.KANJI_7_28"),
        Messages.getString("NGram.KANJI_7_29"),
        Messages.getString("NGram.KANJI_7_32"),
        Messages.getString("NGram.KANJI_7_33"),
        Messages.getString("NGram.KANJI_7_35"),
        Messages.getString("NGram.KANJI_7_37"),
        #endregion
    };

    // Maps every character listed in CjkClass to the representative of its class.
    private static readonly Dictionary<char, char> _cjkMap;

    // Sliding window of normalized characters; always holds at least one character.
    private StringBuilder _grams;

    // True once two consecutive upper-case characters have been appended;
    // while set, Get returns null.
    private bool _capitalword;

    #region Constructor(s)

    /// <summary>
    /// Builds the CJK lookup table from <c>CjkClass</c>.
    /// </summary>
    static NGram()
    {
        _cjkMap = new Dictionary<char, char>();

        foreach (string cjkList in CjkClass)
        {
            // An entry without characters has no representative and nothing to map.
            // Skipping it keeps a bad resource entry from failing type initialization.
            if (string.IsNullOrEmpty(cjkList))
            {
                continue;
            }

            char representative = cjkList[0];

            foreach (char member in cjkList)
            {
                // Indexer assignment instead of Add: a character that appears in
                // more than one entry must not throw from the type initializer.
                // The last entry listing it wins.
                _cjkMap[member] = representative;
            }
        }
    }

    /// <summary>
    /// Creates an empty n-gram window. The window starts as a single space.
    /// </summary>
    public NGram()
    {
        _capitalword = false;
        _grams = new StringBuilder(" ");
    }

    #endregion

    #region Public methods

    /// <summary>
    /// Maps a character to the form used inside n-grams.
    /// </summary>
    /// <param name="ch">The character to normalize.</param>
    /// <returns>
    /// A space, a fixed representative character for the character's Unicode
    /// block or CJK class, or <paramref name="ch"/> itself when no rule applies.
    /// </returns>
    public static char Normalize(char ch)
    {
        UnicodeBlock? unicodeBlock = ch.GetUnicodeBlock();
        if (!unicodeBlock.HasValue)
        {
            return ch;
        }

        switch (unicodeBlock.Value)
        {
            case UnicodeBlock.BasicLatin:
            {
                // Everything in this block except A-Z and a-z becomes a space.
                bool isAsciiLetter = (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
                if (!isAsciiLetter)
                {
                    return ' ';
                }

                break;
            }

            case UnicodeBlock.Latin1Supplement:
            {
                if (Latin1Excluded.IndexOf(ch) >= 0)
                {
                    return ' ';
                }

                break;
            }

            case UnicodeBlock.GeneralPunctuation:
            {
                return ' ';
            }

            case UnicodeBlock.Arabic:
            {
                // U+06CC is folded into U+064A.
                if (ch == '\u06cc')
                {
                    return '\u064a';
                }

                break;
            }

            case UnicodeBlock.LatinExtendedAdditional:
            {
                // Everything from U+1EA0 upwards in this block collapses to U+1EC3.
                if (ch >= '\u1ea0')
                {
                    return '\u1ec3';
                }

                break;
            }

            case UnicodeBlock.Hiragana:
            {
                return '\u3042';
            }

            case UnicodeBlock.Katakana:
            {
                return '\u30a2';
            }

            case UnicodeBlock.Bopomofo:
            case UnicodeBlock.BopomofoExtended:
            {
                return '\u3105';
            }

            case UnicodeBlock.CjkUnifiedIdeographs:
            {
                // Single lookup instead of ContainsKey followed by the indexer.
                char representative;
                if (_cjkMap.TryGetValue(ch, out representative))
                {
                    return representative;
                }

                break;
            }

            case UnicodeBlock.HangulSyllables:
            {
                return '\uac00';
            }
        }

        return ch;
    }

    /// <summary>
    /// Normalizes <paramref name="ch"/> and appends it to the window.
    /// </summary>
    /// <param name="ch">The raw character to add.</param>
    public void AddChar(char ch)
    {
        char normalized = Normalize(ch);
        char previous = _grams[_grams.Length - 1];

        if (previous == ' ')
        {
            // The previous character was a space: restart the window.
            _grams = new StringBuilder(" ");
            _capitalword = false;

            // A space after a space adds nothing.
            if (normalized == ' ')
            {
                return;
            }
        }
        else if (_grams.Length >= GramsCount)
        {
            // Window is full: drop the oldest character to make room.
            _grams.Remove(0, 1);
        }

        _grams.Append(normalized);

        // The flag turns on when an upper-case character follows an upper-case
        // character, stays on while upper-case characters keep coming, and is
        // cleared by any character that is not upper-case.
        _capitalword = char.IsUpper(normalized) && (_capitalword || char.IsUpper(previous));
    }

    /// <summary>
    /// Returns the n-gram formed by the last <paramref name="n"/> characters of the window.
    /// </summary>
    /// <param name="n">Requested n-gram length, from 1 to <see cref="GramsCount"/>.</param>
    /// <returns>
    /// The trailing <paramref name="n"/> characters, or <c>null</c> when the
    /// capital-word flag is set, when <paramref name="n"/> is out of range, when
    /// fewer than <paramref name="n"/> characters are buffered, or when
    /// <paramref name="n"/> is 1 and the last character is a space.
    /// </returns>
    public string Get(int n)
    {
        if (_capitalword)
        {
            return null;
        }

        int len = _grams.Length;
        if (n < 1 || n > GramsCount || len < n)
        {
            return null;
        }

        if (n == 1)
        {
            char last = _grams[len - 1];
            if (last == ' ')
            {
                return null;
            }

            return last.ToString();
        }

        // Copy only the trailing n characters rather than converting the whole
        // buffer to a string first and slicing that.
        return _grams.ToString(len - n, n);
    }

    #endregion
}
- }
|