2
0

NGram.cs 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330
  1. // TODO IMM HI: check which classes can be made internal?
  2. using System.Collections.Generic;
  3. using System.Text;
  4. using NLangDetect.Core.Extensions;
  5. namespace NLangDetect.Core.Utils
  6. {
  7. public class NGram
  8. {
  // Maximum n-gram length: AddChar drops the oldest character once the buffer already holds this many.
  9. public const int GramsCount = 3;
  // Characters of the Latin-1 Supplement block that Normalize replaces with a space.
  // The list itself is obtained from Messages.getString("NGram.LATIN1_EXCLUDE").
  10. private static readonly string Latin1Excluded = Messages.getString("NGram.LATIN1_EXCLUDE");
  // Table of CJK character classes, one string per class, each obtained from Messages.getString.
  // The static constructor maps every character of a string to that string's first character,
  // and Normalize uses the resulting map for the CJK Unified Ideographs block.
  // NOTE(review): the actual contents of each string are not visible here; presumably each one
  // lists ideographs that should be treated as equivalent - confirm against the message resource.
  11. private static readonly string[] CjkClass =
  12. {
  13. #region CJK classes
  14. Messages.getString("NGram.KANJI_1_0"),
  15. Messages.getString("NGram.KANJI_1_2"),
  16. Messages.getString("NGram.KANJI_1_4"),
  17. Messages.getString("NGram.KANJI_1_8"),
  18. Messages.getString("NGram.KANJI_1_11"),
  19. Messages.getString("NGram.KANJI_1_12"),
  20. Messages.getString("NGram.KANJI_1_13"),
  21. Messages.getString("NGram.KANJI_1_14"),
  22. Messages.getString("NGram.KANJI_1_16"),
  23. Messages.getString("NGram.KANJI_1_18"),
  24. Messages.getString("NGram.KANJI_1_22"),
  25. Messages.getString("NGram.KANJI_1_27"),
  26. Messages.getString("NGram.KANJI_1_29"),
  27. Messages.getString("NGram.KANJI_1_31"),
  28. Messages.getString("NGram.KANJI_1_35"),
  29. Messages.getString("NGram.KANJI_2_0"),
  30. Messages.getString("NGram.KANJI_2_1"),
  31. Messages.getString("NGram.KANJI_2_4"),
  32. Messages.getString("NGram.KANJI_2_9"),
  33. Messages.getString("NGram.KANJI_2_10"),
  34. Messages.getString("NGram.KANJI_2_11"),
  35. Messages.getString("NGram.KANJI_2_12"),
  36. Messages.getString("NGram.KANJI_2_13"),
  37. Messages.getString("NGram.KANJI_2_15"),
  38. Messages.getString("NGram.KANJI_2_16"),
  39. Messages.getString("NGram.KANJI_2_18"),
  40. Messages.getString("NGram.KANJI_2_21"),
  41. Messages.getString("NGram.KANJI_2_22"),
  42. Messages.getString("NGram.KANJI_2_23"),
  43. Messages.getString("NGram.KANJI_2_28"),
  44. Messages.getString("NGram.KANJI_2_29"),
  45. Messages.getString("NGram.KANJI_2_30"),
  46. Messages.getString("NGram.KANJI_2_31"),
  47. Messages.getString("NGram.KANJI_2_32"),
  48. Messages.getString("NGram.KANJI_2_35"),
  49. Messages.getString("NGram.KANJI_2_36"),
  50. Messages.getString("NGram.KANJI_2_37"),
  51. Messages.getString("NGram.KANJI_2_38"),
  52. Messages.getString("NGram.KANJI_3_1"),
  53. Messages.getString("NGram.KANJI_3_2"),
  54. Messages.getString("NGram.KANJI_3_3"),
  55. Messages.getString("NGram.KANJI_3_4"),
  56. Messages.getString("NGram.KANJI_3_5"),
  57. Messages.getString("NGram.KANJI_3_8"),
  58. Messages.getString("NGram.KANJI_3_9"),
  59. Messages.getString("NGram.KANJI_3_11"),
  60. Messages.getString("NGram.KANJI_3_12"),
  61. Messages.getString("NGram.KANJI_3_13"),
  62. Messages.getString("NGram.KANJI_3_15"),
  63. Messages.getString("NGram.KANJI_3_16"),
  64. Messages.getString("NGram.KANJI_3_18"),
  65. Messages.getString("NGram.KANJI_3_19"),
  66. Messages.getString("NGram.KANJI_3_22"),
  67. Messages.getString("NGram.KANJI_3_23"),
  68. Messages.getString("NGram.KANJI_3_27"),
  69. Messages.getString("NGram.KANJI_3_29"),
  70. Messages.getString("NGram.KANJI_3_30"),
  71. Messages.getString("NGram.KANJI_3_31"),
  72. Messages.getString("NGram.KANJI_3_32"),
  73. Messages.getString("NGram.KANJI_3_35"),
  74. Messages.getString("NGram.KANJI_3_36"),
  75. Messages.getString("NGram.KANJI_3_37"),
  76. Messages.getString("NGram.KANJI_3_38"),
  77. Messages.getString("NGram.KANJI_4_0"),
  78. Messages.getString("NGram.KANJI_4_9"),
  79. Messages.getString("NGram.KANJI_4_10"),
  80. Messages.getString("NGram.KANJI_4_16"),
  81. Messages.getString("NGram.KANJI_4_17"),
  82. Messages.getString("NGram.KANJI_4_18"),
  83. Messages.getString("NGram.KANJI_4_22"),
  84. Messages.getString("NGram.KANJI_4_24"),
  85. Messages.getString("NGram.KANJI_4_28"),
  86. Messages.getString("NGram.KANJI_4_34"),
  87. Messages.getString("NGram.KANJI_4_39"),
  88. Messages.getString("NGram.KANJI_5_10"),
  89. Messages.getString("NGram.KANJI_5_11"),
  90. Messages.getString("NGram.KANJI_5_12"),
  91. Messages.getString("NGram.KANJI_5_13"),
  92. Messages.getString("NGram.KANJI_5_14"),
  93. Messages.getString("NGram.KANJI_5_18"),
  94. Messages.getString("NGram.KANJI_5_26"),
  95. Messages.getString("NGram.KANJI_5_29"),
  96. Messages.getString("NGram.KANJI_5_34"),
  97. Messages.getString("NGram.KANJI_5_39"),
  98. Messages.getString("NGram.KANJI_6_0"),
  99. Messages.getString("NGram.KANJI_6_3"),
  100. Messages.getString("NGram.KANJI_6_9"),
  101. Messages.getString("NGram.KANJI_6_10"),
  102. Messages.getString("NGram.KANJI_6_11"),
  103. Messages.getString("NGram.KANJI_6_12"),
  104. Messages.getString("NGram.KANJI_6_16"),
  105. Messages.getString("NGram.KANJI_6_18"),
  106. Messages.getString("NGram.KANJI_6_20"),
  107. Messages.getString("NGram.KANJI_6_21"),
  108. Messages.getString("NGram.KANJI_6_22"),
  109. Messages.getString("NGram.KANJI_6_23"),
  110. Messages.getString("NGram.KANJI_6_25"),
  111. Messages.getString("NGram.KANJI_6_28"),
  112. Messages.getString("NGram.KANJI_6_29"),
  113. Messages.getString("NGram.KANJI_6_30"),
  114. Messages.getString("NGram.KANJI_6_32"),
  115. Messages.getString("NGram.KANJI_6_34"),
  116. Messages.getString("NGram.KANJI_6_35"),
  117. Messages.getString("NGram.KANJI_6_37"),
  118. Messages.getString("NGram.KANJI_6_39"),
  119. Messages.getString("NGram.KANJI_7_0"),
  120. Messages.getString("NGram.KANJI_7_3"),
  121. Messages.getString("NGram.KANJI_7_6"),
  122. Messages.getString("NGram.KANJI_7_7"),
  123. Messages.getString("NGram.KANJI_7_9"),
  124. Messages.getString("NGram.KANJI_7_11"),
  125. Messages.getString("NGram.KANJI_7_12"),
  126. Messages.getString("NGram.KANJI_7_13"),
  127. Messages.getString("NGram.KANJI_7_16"),
  128. Messages.getString("NGram.KANJI_7_18"),
  129. Messages.getString("NGram.KANJI_7_19"),
  130. Messages.getString("NGram.KANJI_7_20"),
  131. Messages.getString("NGram.KANJI_7_21"),
  132. Messages.getString("NGram.KANJI_7_23"),
  133. Messages.getString("NGram.KANJI_7_25"),
  134. Messages.getString("NGram.KANJI_7_28"),
  135. Messages.getString("NGram.KANJI_7_29"),
  136. Messages.getString("NGram.KANJI_7_32"),
  137. Messages.getString("NGram.KANJI_7_33"),
  138. Messages.getString("NGram.KANJI_7_35"),
  139. Messages.getString("NGram.KANJI_7_37"),
  140. #endregion
  141. };
  // Lookup from a character listed in CjkClass to the first character of its list.
  // Filled once by the static constructor and read by Normalize.
  142. private static readonly Dictionary<char, char> _cjkMap;
  // Rolling buffer of the most recently added (normalized) characters. It starts as a single
  // space and AddChar resets it to a single space when the previous character was a space.
  143. private StringBuilder _grams;
  // Set by AddChar when two consecutive upper-case characters are seen and cleared on any
  // non-upper-case character; while it is true, Get returns null.
  144. private bool _capitalword;
  145. #region Constructor(s)
  /// <summary>
  /// Builds the CJK normalization table: every character of each <see cref="CjkClass"/> entry
  /// is mapped to the first character of that entry.
  /// </summary>
  static NGram()
  {
      _cjkMap = new Dictionary<char, char>();

      foreach (string cjkList in CjkClass)
      {
          // An entry without characters has no representative. Skip it rather than letting
          // cjkList[0] throw: an exception escaping a static constructor surfaces as
          // TypeInitializationException and makes the whole type unusable.
          if (string.IsNullOrEmpty(cjkList))
          {
              continue;
          }

          char representative = cjkList[0];

          foreach (char member in cjkList)
          {
              // Indexer assignment instead of Add: Add throws ArgumentException when the same
              // character appears twice (within one entry or across entries). With the indexer
              // the last mapping wins and type initialization cannot fail on duplicates.
              _cjkMap[member] = representative;
          }
      }
  }
  /// <summary>
  /// Creates an n-gram buffer in its initial state: the buffer holds a single space and the
  /// capital-word flag is cleared.
  /// </summary>
  public NGram()
  {
      _capitalword = false;
      _grams = new StringBuilder(" ");
  }
  163. #endregion
  164. #region Public methods
  /// <summary>
  /// Maps a character to the representative character used when building n-grams.
  /// </summary>
  /// <param name="ch">The character to normalize.</param>
  /// <returns>
  /// A space for characters that are ignored, a fixed representative for blocks and classes
  /// that are collapsed to one character, or <paramref name="ch"/> itself when no rule applies
  /// (including when <c>GetUnicodeBlock</c> reports no block).
  /// </returns>
  public static char Normalize(char ch)
  {
      UnicodeBlock? unicodeBlock = ch.GetUnicodeBlock();

      if (!unicodeBlock.HasValue)
      {
          return ch;
      }

      switch (unicodeBlock.Value)
      {
          case UnicodeBlock.BasicLatin:
          {
              // Only A-Z and a-z are kept; every other Basic Latin character becomes a space.
              if (ch < 'A' || (ch < 'a' && ch > 'Z') || ch > 'z')
              {
                  return ' ';
              }

              break;
          }

          case UnicodeBlock.Latin1Supplement:
          {
              if (Latin1Excluded.IndexOf(ch) >= 0)
              {
                  return ' ';
              }

              break;
          }

          case UnicodeBlock.GeneralPunctuation:
          {
              return ' ';
          }

          case UnicodeBlock.Arabic:
          {
              // U+06CC (Farsi yeh) is folded into U+064A (Arabic yeh).
              if (ch == '\u06cc')
              {
                  return '\u064a';
              }

              break;
          }

          case UnicodeBlock.LatinExtendedAdditional:
          {
              // Everything from U+1EA0 upward in this block collapses to U+1EC3.
              if (ch >= '\u1ea0')
              {
                  return '\u1ec3';
              }

              break;
          }

          case UnicodeBlock.Hiragana:
          {
              return '\u3042';
          }

          case UnicodeBlock.Katakana:
          {
              return '\u30a2';
          }

          case UnicodeBlock.Bopomofo:
          case UnicodeBlock.BopomofoExtended:
          {
              return '\u3105';
          }

          case UnicodeBlock.CjkUnifiedIdeographs:
          {
              // Single lookup via TryGetValue instead of ContainsKey followed by the indexer.
              char representative;
              if (_cjkMap.TryGetValue(ch, out representative))
              {
                  return representative;
              }

              break;
          }

          case UnicodeBlock.HangulSyllables:
          {
              return '\uac00';
          }
      }

      return ch;
  }
  /// <summary>
  /// Normalizes <paramref name="ch"/> and appends it to the rolling n-gram buffer.
  /// </summary>
  /// <param name="ch">The raw character to add; it is passed through <see cref="Normalize"/> first.</param>
  public void AddChar(char ch)
  {
      char normalized = Normalize(ch);
      char previous = _grams[_grams.Length - 1];

      if (previous == ' ')
      {
          // The previous character was a space: start over with a single-space buffer.
          _grams = new StringBuilder(" ");
          _capitalword = false;

          if (normalized == ' ')
          {
              // Consecutive spaces are not accumulated.
              return;
          }
      }
      else if (_grams.Length >= GramsCount)
      {
          // Buffer is full: drop the oldest character to make room.
          _grams.Remove(0, 1);
      }

      _grams.Append(normalized);

      // Two upper-case characters in a row raise the capital-word flag;
      // any non-upper-case character clears it.
      if (!char.IsUpper(normalized))
      {
          _capitalword = false;
      }
      else if (char.IsUpper(previous))
      {
          _capitalword = true;
      }
  }
  /// <summary>
  /// Returns the n-gram formed by the last <paramref name="n"/> characters of the buffer.
  /// </summary>
  /// <param name="n">Requested n-gram length, from 1 to <see cref="GramsCount"/>.</param>
  /// <returns>
  /// The trailing <paramref name="n"/> buffered characters, or <c>null</c> when the capital-word
  /// flag is set, <paramref name="n"/> is out of range, fewer than <paramref name="n"/>
  /// characters are buffered, or <paramref name="n"/> is 1 and the last character is a space.
  /// </returns>
  public string Get(int n)
  {
      if (_capitalword)
      {
          return null;
      }

      int len = _grams.Length;

      // Upper bound is GramsCount rather than a repeated literal 3, so the two cannot drift apart.
      if (n < 1 || n > GramsCount || len < n)
      {
          return null;
      }

      if (n == 1)
      {
          char last = _grams[len - 1];
          return last == ' ' ? null : last.ToString();
      }

      // Copy only the trailing n characters out of the builder instead of materializing the
      // whole buffer as a string and slicing it afterwards.
      // NOTE(review): assumes the SubSequence(start, end) extension used previously treats
      // 'end' as exclusive, i.e. returned exactly these n characters - confirm.
      return _grams.ToString(len - n, n);
  }
  285. #endregion
  286. }
  287. }