Detector.cs 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  1. using System;
  2. using System.Collections.Generic;
  3. using System.IO;
  4. using System.Linq;
  5. using System.Text;
  6. using System.Text.RegularExpressions;
  7. using NLangDetect.Core.Extensions;
  8. using NLangDetect.Core.Utils;
  9. namespace NLangDetect.Core
  10. {
  /// <summary>
  /// Accumulates text through <c>Append</c> and estimates which of the languages
  /// known to the originating <see cref="DetectorFactory"/> it is written in.
  /// </summary>
  /// <remarks>
  /// Detection runs a fixed number of randomized trials. Each trial repeatedly
  /// picks a random n-gram found in the text, multiplies the per-language
  /// probabilities by that n-gram's language profile and stops once one language
  /// dominates or the iteration limit is reached. The trial results are averaged.
  /// The result is computed once and cached; text appended afterwards does not
  /// trigger a new computation.
  /// </remarks>
  public class Detector
  {
      // Default value of the additive term applied to every n-gram probability.
      private const double _AlphaDefault = 0.5;

      // Scale of the random per-trial perturbation applied to alpha.
      private const double _AlphaWidth = 0.05;

      // Upper bound on update iterations in one trial (checked every 5th iteration).
      private const int _IterationLimit = 1000;

      // Languages whose averaged probability is not above this value are not reported.
      private const double _ProbThreshold = 0.1;

      // A trial stops as soon as the largest normalized probability exceeds this value.
      private const double _ConvThreshold = 0.99999;

      // Divisor applied to alpha before it is added to the n-gram probabilities.
      private const int _BaseFreq = 10000;

      // Number of randomized trials whose results are averaged.
      private const int _TrialsCount = 7;

      private static readonly Regex _UrlRegex = new Regex("https?://[-_.?&~;+=/#0-9A-Za-z]+", RegexOptions.Compiled);
      private static readonly Regex _MailRegex = new Regex("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+", RegexOptions.Compiled);

      private readonly Dictionary<string, ProbVector> _wordLangProbMap;
      private readonly List<string> _langlist;
      private StringBuilder _text;
      private double[] _langprob;
      private double _alpha = _AlphaDefault;
      private int _maxTextLength = 10000;
      private double[] _priorMap;
      private int? _seed;

      #region Constructor(s)

      /// <summary>
      /// Creates a detector that uses the n-gram profiles, language list and
      /// random seed of the given factory.
      /// </summary>
      /// <param name="factory">Factory providing the language data.</param>
      /// <exception cref="ArgumentNullException"><paramref name="factory"/> is null.</exception>
      public Detector(DetectorFactory factory)
      {
          if (factory == null)
          {
              throw new ArgumentNullException("factory");
          }

          _wordLangProbMap = factory.WordLangProbMap;
          _langlist = factory.Langlist;
          _text = new StringBuilder();
          _seed = factory.Seed;
      }

      #endregion

      #region Public methods

      /// <summary>
      /// Sets the base alpha value. It is randomly perturbed per trial and, divided
      /// by an internal constant, added to every n-gram probability.
      /// </summary>
      /// <param name="alpha">New base alpha value.</param>
      public void SetAlpha(double alpha)
      {
          _alpha = alpha;
      }

      /// <summary>
      /// Sets the prior probability of each language. Languages missing from
      /// <paramref name="priorMap"/> get a prior of zero; the supplied values are
      /// normalized so that they sum to one.
      /// </summary>
      /// <param name="priorMap">Prior probabilities keyed by language name.</param>
      /// <exception cref="ArgumentNullException"><paramref name="priorMap"/> is null.</exception>
      /// <exception cref="NLangDetectException">
      /// A prior is negative, or no known language has a positive prior. The
      /// previously configured priors are left untouched in that case.
      /// </exception>
      public void SetPriorMap(Dictionary<string, double> priorMap)
      {
          if (priorMap == null)
          {
              throw new ArgumentNullException("priorMap");
          }

          // Build into a local array so a validation failure cannot leave the
          // detector with a half-filled, un-normalized prior table.
          var priors = new double[_langlist.Count];
          double sump = 0;

          for (int i = 0; i < priors.Length; i++)
          {
              double p;

              if (!priorMap.TryGetValue(_langlist[i], out p))
              {
                  continue;
              }

              if (p < 0)
              {
                  throw new NLangDetectException("Prior probability must be non-negative.", ErrorCode.InitParamError);
              }

              priors[i] = p;
              sump += p;
          }

          if (sump <= 0)
          {
              throw new NLangDetectException("More one of prior probability must be non-zero.", ErrorCode.InitParamError);
          }

          for (int i = 0; i < priors.Length; i++)
          {
              priors[i] /= sump;
          }

          _priorMap = priors;
      }

      /// <summary>
      /// Sets the text length limit used by the <c>Append</c> overloads.
      /// </summary>
      /// <param name="max_text_length">Maximum number of characters.</param>
      public void SetMaxTextLength(int max_text_length)
      {
          _maxTextLength = max_text_length;
      }

      // TODO IMM HI: TextReader?
      /// <summary>
      /// Reads from <paramref name="streamReader"/> in chunks and appends them
      /// until the accumulated text reaches the maximum text length or the stream
      /// is exhausted.
      /// </summary>
      /// <param name="streamReader">Source of the text.</param>
      /// <exception cref="ArgumentNullException"><paramref name="streamReader"/> is null.</exception>
      public void Append(StreamReader streamReader)
      {
          if (streamReader == null)
          {
              throw new ArgumentNullException("streamReader");
          }

          // At least one char: an empty buffer would make Read return 0 forever
          // when the configured limit is 1, and a negative size would throw.
          var buf = new char[Math.Max(1, _maxTextLength / 2)];

          while (_text.Length < _maxTextLength && !streamReader.EndOfStream)
          {
              int length = streamReader.Read(buf, 0, buf.Length);

              if (length <= 0)
              {
                  // Nothing was read; stop instead of spinning.
                  break;
              }

              Append(new string(buf, 0, length));
          }
      }

      /// <summary>
      /// Appends text to the detection buffer. URLs and e-mail addresses are
      /// replaced by a single space, each character is passed through
      /// <c>NGram.Normalize</c>, and runs of spaces are collapsed to one. At most
      /// the first "maximum text length" characters of the cleaned input are
      /// considered per call.
      /// </summary>
      /// <param name="text">Text to append.</param>
      /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
      public void Append(string text)
      {
          if (text == null)
          {
              throw new ArgumentNullException("text");
          }

          text = _UrlRegex.Replace(text, " ");
          text = _MailRegex.Replace(text, " ");

          char pre = '\0';

          for (int i = 0; i < text.Length && i < _maxTextLength; i++)
          {
              char c = NGram.Normalize(text[i]);

              // Skip a space that directly follows another space.
              if (c != ' ' || pre != ' ')
              {
                  _text.Append(c);
              }

              pre = c;
          }
      }

      /// <summary>
      /// Returns the name of the most probable language.
      /// </summary>
      /// <returns>
      /// The language name, or null when no language exceeds the reporting threshold.
      /// </returns>
      /// <exception cref="NLangDetectException">The text contains no known n-grams.</exception>
      public string Detect()
      {
          List<Language> probabilities = GetProbabilities();

          return
              probabilities.Count > 0
                  ? probabilities[0].Name
                  : null;
      }

      /// <summary>
      /// Returns the languages whose probability exceeds the reporting threshold,
      /// most probable first. The underlying probabilities are computed on the
      /// first call and reused afterwards.
      /// </summary>
      /// <returns>Candidate languages ordered by descending probability.</returns>
      /// <exception cref="NLangDetectException">The text contains no known n-grams.</exception>
      public List<Language> GetProbabilities()
      {
          if (_langprob == null)
          {
              DetectBlock();
          }

          return SortProbability(_langprob);
      }

      #endregion

      #region Private helper methods

      /// <summary>
      /// Drops the characters in the range 'A'..'z' from the buffer when they are
      /// clearly outnumbered (less than half as many) by characters at or above
      /// U+0300 that are not in the Latin Extended Additional block.
      /// </summary>
      private void CleanText()
      {
          int latinCount = 0;
          int nonLatinCount = 0;

          for (int i = 0; i < _text.Length; i++)
          {
              char c = _text[i];

              if (c <= 'z' && c >= 'A')
              {
                  latinCount++;
              }
              else if (c >= '\u0300' && c.GetUnicodeBlock() != UnicodeBlock.LatinExtendedAdditional)
              {
                  nonLatinCount++;
              }
          }

          if (latinCount * 2 >= nonLatinCount)
          {
              return;
          }

          var textWithoutLatin = new StringBuilder();

          for (int i = 0; i < _text.Length; i++)
          {
              char c = _text[i];

              if (c > 'z' || c < 'A')
              {
                  textWithoutLatin.Append(c);
              }
          }

          _text = textWithoutLatin;
      }

      /// <summary>
      /// Divides every element of <paramref name="probs"/> by the sum of all
      /// elements, in place.
      /// </summary>
      /// <param name="probs">Probabilities to normalize.</param>
      /// <returns>The largest normalized value (0 if none is positive).</returns>
      private static double NormalizeProb(double[] probs)
      {
          double sump = probs.Sum();
          double maxp = 0;

          for (int i = 0; i < probs.Length; i++)
          {
              double p = probs[i] / sump;

              if (maxp < p)
              {
                  maxp = p;
              }

              probs[i] = p;
          }

          return maxp;
      }

      /// <summary>
      /// Replaces every character at or above U+0080 with a <c>\uXXXX</c> escape
      /// (four lower-case hexadecimal digits); other characters are copied as is.
      /// </summary>
      /// <param name="word">Text to encode.</param>
      /// <returns>The escaped text.</returns>
      private static string UnicodeEncode(string word)
      {
          var resultSb = new StringBuilder();

          foreach (char ch in word)
          {
              if (ch >= '\u0080')
              {
                  // "x4" zero-pads the UTF-16 code unit to exactly four hex digits.
                  resultSb
                      .Append("\\u")
                      .Append(((int)ch).ToString("x4"));
              }
              else
              {
                  resultSb.Append(ch);
              }
          }

          return resultSb.ToString();
      }

      /// <summary>
      /// Runs the randomized trials and stores the averaged per-language
      /// probabilities.
      /// </summary>
      /// <exception cref="NLangDetectException">The text contains no known n-grams.</exception>
      private void DetectBlock()
      {
          CleanText();

          List<string> ngrams = ExtractNGrams();

          if (ngrams.Count == 0)
          {
              throw new NLangDetectException("no features in text", ErrorCode.CantDetectError);
          }

          // Accumulate locally and publish at the end so that a failure part-way
          // through cannot be mistaken for a finished result by later calls.
          var averaged = new double[_langlist.Count];
          var rand = (_seed.HasValue ? new Random(_seed.Value) : new Random());

          for (int t = 0; t < _TrialsCount; t++)
          {
              double[] prob = InitProbability();

              // TODO IMM HI: verify it works
              double alpha = _alpha + rand.NextGaussian() * _AlphaWidth;

              for (int i = 0; ; i++)
              {
                  int r = rand.Next(ngrams.Count);

                  UpdateLangProb(prob, ngrams[r], alpha);

                  // Normalize and test for convergence only on every 5th iteration.
                  if (i % 5 == 0)
                  {
                      if (NormalizeProb(prob) > _ConvThreshold || i >= _IterationLimit)
                      {
                          break;
                      }
                  }
              }

              for (int j = 0; j < averaged.Length; j++)
              {
                  averaged[j] += prob[j] / _TrialsCount;
              }
          }

          _langprob = averaged;
      }

      /// <summary>
      /// Creates the starting probabilities for one trial: the configured priors
      /// when present, otherwise a uniform distribution over all languages.
      /// </summary>
      /// <returns>A new array with one entry per language.</returns>
      private double[] InitProbability()
      {
          var prob = new double[_langlist.Count];

          if (_priorMap != null)
          {
              for (int i = 0; i < prob.Length; i++)
              {
                  prob[i] = _priorMap[i];
              }
          }
          else
          {
              double uniform = 1.0 / _langlist.Count;

              for (int i = 0; i < prob.Length; i++)
              {
                  prob[i] = uniform;
              }
          }

          return prob;
      }

      /// <summary>
      /// Collects, in text order, every n-gram of length 1 to
      /// <c>NGram.GramsCount</c> that has an entry in the language profile map.
      /// </summary>
      /// <returns>The known n-grams found in the text (duplicates included).</returns>
      private List<string> ExtractNGrams()
      {
          var list = new List<string>();
          var ngram = new NGram();

          for (int i = 0; i < _text.Length; i++)
          {
              ngram.AddChar(_text[i]);

              for (int n = 1; n <= NGram.GramsCount; n++)
              {
                  string w = ngram.Get(n);

                  if (w != null && _wordLangProbMap.ContainsKey(w))
                  {
                      list.Add(w);
                  }
              }
          }

          return list;
      }

      /// <summary>
      /// Multiplies each language probability by the n-gram's probability for
      /// that language plus <c>alpha / _BaseFreq</c>. Unknown or null n-grams
      /// leave <paramref name="prob"/> unchanged.
      /// </summary>
      /// <param name="prob">Per-language probabilities, updated in place.</param>
      /// <param name="word">The n-gram to apply.</param>
      /// <param name="alpha">Alpha value of the current trial.</param>
      private void UpdateLangProb(double[] prob, string word, double alpha)
      {
          ProbVector langProbMap;

          if (word == null || !_wordLangProbMap.TryGetValue(word, out langProbMap))
          {
              return;
          }

          double weight = alpha / _BaseFreq;

          for (int i = 0; i < prob.Length; i++)
          {
              prob[i] *= weight + langProbMap[i];
          }
      }

      /// <summary>
      /// Converts the probabilities above the reporting threshold into
      /// <see cref="Language"/> entries ordered by descending probability.
      /// Languages with equal probability keep their language-list order.
      /// </summary>
      /// <param name="prob">Per-language probabilities, indexed like the language list.</param>
      /// <returns>The qualifying languages, most probable first.</returns>
      private List<Language> SortProbability(double[] prob)
      {
          // OrderByDescending is a stable sort, so ties stay in index order.
          return prob
              .Select((p, j) => new { Index = j, Probability = p })
              .Where(entry => entry.Probability > _ProbThreshold)
              .OrderByDescending(entry => entry.Probability)
              .Select(entry => new Language(_langlist[entry.Index], entry.Probability))
              .ToList();
      }

      #endregion
  }
  291. }