123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371 |
- using System;
- using System.Collections.Generic;
- using System.IO;
- using System.Linq;
- using System.Text;
- using System.Text.RegularExpressions;
- using NLangDetect.Core.Extensions;
- using NLangDetect.Core.Utils;
- namespace NLangDetect.Core
- {
- public class Detector
- {
// ---- Tuning constants -------------------------------------------------

// Initial value of _alpha. UpdateLangProb adds (alpha / _BaseFreq) to every
// per-language probability before multiplying it in.
private const double _AlphaDefault = 0.5;

// Factor applied to the Gaussian sample that perturbs alpha once per trial in DetectBlock.
private const double _AlphaWidth = 0.05;

// Divisor that converts alpha into the additive weight used by UpdateLangProb.
private const int _BaseFreq = 10000;

// Number of randomized trials whose results DetectBlock averages.
private const int _trialsCount = 7;

// Bound on the per-trial iteration counter; DetectBlock tests it on every fifth iteration.
private const int _IterationLimit = 1000;

// A trial stops early once its largest normalized probability exceeds this value.
private const double _ConvThreshold = 0.99999;

// SortProbability reports only languages whose probability is strictly greater than this.
private const double _ProbThreshold = 0.1;

// ---- Patterns blanked out of the input by Append(string) --------------

private static readonly Regex _UrlRegex = new Regex("https?://[-_.?&~;+=/#0-9A-Za-z]+", RegexOptions.Compiled);
private static readonly Regex _MailRegex = new Regex("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+", RegexOptions.Compiled);

// ---- Model data handed over by the factory in the constructor ---------

// Maps an n-gram to its per-language probability vector (indexed like _langlist).
private readonly Dictionary<string, ProbVector> _wordLangProbMap;

// Language names; index i here corresponds to index i of every probability array in this class.
private readonly List<string> _langlist;

// ---- Per-detector state -----------------------------------------------

// Text accumulated by the Append overloads; CleanText may replace it with a filtered copy.
private StringBuilder _text;

// Averaged per-language probabilities; null until DetectBlock has run.
private double[] _langprob;

// Normalized prior probabilities set by SetPriorMap; null means "use a uniform prior".
private double[] _priorMap;

// Base alpha for detection; changeable through SetAlpha.
private double _alpha = _AlphaDefault;

// Length limit consulted by both Append overloads; changeable through SetMaxTextLength.
private int _maxTextLength = 10000;

// Optional seed for the random generator created in DetectBlock.
private int? _seed;
- #region Constructor(s)
/// <summary>
/// Creates a detector that works on the model data exposed by <paramref name="factory"/>.
/// The n-gram map and the language list are stored by reference, not copied.
/// </summary>
/// <param name="factory">Provides the n-gram probability map, the language list and the optional random seed.</param>
/// <exception cref="ArgumentNullException"><paramref name="factory"/> is null.</exception>
public Detector(DetectorFactory factory)
{
    // Fail fast with a descriptive exception instead of a NullReferenceException
    // raised by the first property access below.
    if (factory == null)
    {
        throw new ArgumentNullException("factory");
    }

    _wordLangProbMap = factory.WordLangProbMap;
    _langlist = factory.Langlist;
    _text = new StringBuilder();
    _seed = factory.Seed;
}
- #endregion
- #region Public methods
/// <summary>
/// Replaces the base alpha value used by later detection runs.
/// </summary>
/// <param name="alpha">
/// New base value. DetectBlock adds a scaled Gaussian sample to it for each trial and
/// UpdateLangProb divides the result by _BaseFreq to obtain an additive weight.
/// </param>
public void SetAlpha(double alpha)
{
    this._alpha = alpha;
}
/// <summary>
/// Sets the prior probability of each known language. Values are normalized so that
/// they sum to 1; languages missing from <paramref name="priorMap"/> get a prior of 0,
/// and keys that are not in the detector's language list are ignored.
/// </summary>
/// <param name="priorMap">Language name to non-negative prior weight.</param>
/// <exception cref="ArgumentNullException"><paramref name="priorMap"/> is null.</exception>
/// <exception cref="NLangDetectException">
/// A weight is negative or NaN, or no known language has a positive weight
/// (both reported with <c>ErrorCode.InitParamError</c>).
/// </exception>
public void SetPriorMap(Dictionary<string, double> priorMap)
{
    if (priorMap == null)
    {
        throw new ArgumentNullException("priorMap");
    }

    // Build the new priors in a local array and publish them only after validation
    // succeeded, so a rejected map never leaves half-filled priors behind.
    var candidate = new double[_langlist.Count];
    double total = 0;

    for (int i = 0; i < candidate.Length; i++)
    {
        double weight;
        if (!priorMap.TryGetValue(_langlist[i], out weight))
        {
            continue; // not mentioned: prior stays 0
        }

        // Written as !(weight >= 0) so that NaN is rejected together with negative values.
        if (!(weight >= 0))
        {
            throw new NLangDetectException("Prior probability must be non-negative.", ErrorCode.InitParamError);
        }

        candidate[i] = weight;
        total += weight;
    }

    if (total <= 0)
    {
        throw new NLangDetectException("More one of prior probability must be non-zero.", ErrorCode.InitParamError);
    }

    for (int i = 0; i < candidate.Length; i++)
    {
        candidate[i] /= total;
    }

    _priorMap = candidate;
}
/// <summary>
/// Changes the text length limit used by the Append overloads.
/// </summary>
/// <param name="max_text_length">
/// New limit. Append(string) looks at no more than this many leading characters of each
/// string it is given; Append(StreamReader) keeps reading only while the buffered text is
/// shorter than this and sizes its read buffer to half of it.
/// </param>
public void SetMaxTextLength(int max_text_length)
{
    this._maxTextLength = max_text_length;
}
// TODO IMM HI: TextReader?
/// <summary>
/// Reads text from <paramref name="streamReader"/> in chunks and feeds each chunk to
/// <see cref="Append(string)"/> until the buffered text reaches the configured maximum
/// length or the stream is exhausted. The length check happens before each read, so the
/// buffered text can end up longer than the limit by up to one chunk.
/// </summary>
/// <param name="streamReader">Reader positioned at the text to append; it is not closed here.</param>
/// <exception cref="ArgumentNullException"><paramref name="streamReader"/> is null.</exception>
public void Append(StreamReader streamReader)
{
    if (streamReader == null)
    {
        throw new ArgumentNullException("streamReader");
    }

    // Half the limit per chunk, but never less than one character: with a limit of 1 the
    // buffer would be empty, Read would return 0 every time and the loop could not progress;
    // with a negative limit the array allocation itself would throw.
    var chunk = new char[Math.Max(1, _maxTextLength / 2)];

    while (_text.Length < _maxTextLength && !streamReader.EndOfStream)
    {
        int read = streamReader.Read(chunk, 0, chunk.Length);
        if (read <= 0)
        {
            // Read reports no more data; stop instead of spinning.
            break;
        }

        Append(new string(chunk, 0, read));
    }
}
/// <summary>
/// Appends <paramref name="text"/> to the detection buffer. URLs and mail addresses are
/// replaced by a single space, every character is passed through <c>NGram.Normalize</c>,
/// and runs of spaces are collapsed to one. Only the first <c>_maxTextLength</c> characters
/// of the (URL/mail-stripped) input are considered; the limit applies per call, not to the
/// accumulated buffer.
/// </summary>
/// <param name="text">Text to add; must not be null (the regex replacement rejects null).</param>
public void Append(string text)
{
    text = _UrlRegex.Replace(text, " ");
    text = _MailRegex.Replace(text, " ");

    // Start from the last buffered character so that a leading space in this call does not
    // duplicate a space that ended the previous call.
    char previous = _text.Length > 0 ? _text[_text.Length - 1] : '\0';
    int limit = Math.Min(text.Length, _maxTextLength);

    for (int i = 0; i < limit; i++)
    {
        char current = NGram.Normalize(text[i]);
        if (current != ' ' || previous != ' ')
        {
            _text.Append(current);
        }

        previous = current;
    }

    // A previously computed result no longer reflects the buffered text; dropping it makes
    // the next GetProbabilities/Detect call run detection again.
    _langprob = null;
}
/// <summary>
/// Drops the characters in the range 'A'..'z' from the buffered text when they are
/// clearly outnumbered: if twice their count is still smaller than the count of
/// characters at or above U+0300 that are outside the Latin Extended Additional block.
/// Otherwise the buffer is left as it is.
/// </summary>
/// <remarks>
/// The 'A'..'z' range is a plain code-point comparison, so it also covers the six
/// punctuation characters that sit between 'Z' and 'a'.
/// </remarks>
private void CleanText()
{
    string snapshot = _text.ToString();

    int inLetterRange = 0;
    int foreignScript = 0;
    foreach (char ch in snapshot)
    {
        bool isInLetterRange = ch >= 'A' && ch <= 'z';
        if (isInLetterRange)
        {
            inLetterRange++;
            continue;
        }

        if (ch >= '\u0300' && ch.GetUnicodeBlock() != UnicodeBlock.LatinExtendedAdditional)
        {
            foreignScript++;
        }
    }

    // Keep the text untouched unless the other script dominates.
    if (inLetterRange * 2 >= foreignScript)
    {
        return;
    }

    var filtered = new StringBuilder();
    foreach (char ch in snapshot)
    {
        if (ch < 'A' || ch > 'z')
        {
            filtered.Append(ch);
        }
    }

    _text = filtered;
}
/// <summary>
/// Returns the name of the most probable language for the buffered text.
/// </summary>
/// <returns>
/// The name of the first entry of <see cref="GetProbabilities"/>, or null when that
/// list is empty.
/// </returns>
/// <exception cref="NLangDetectException">
/// Raised by the detection run when the buffered text yields no known n-gram.
/// </exception>
public string Detect()
{
    List<Language> ranked = GetProbabilities();

    if (ranked.Count == 0)
    {
        return null;
    }

    return ranked[0].Name;
}
/// <summary>
/// Returns the candidate languages ordered from most to least probable. Detection is
/// run on first use; later calls reuse the stored probabilities for as long as they are present.
/// </summary>
/// <returns>A new list holding the languages whose probability passes the reporting threshold.</returns>
/// <exception cref="NLangDetectException">
/// Raised by the detection run when the buffered text yields no known n-gram.
/// </exception>
public List<Language> GetProbabilities()
{
    bool detectionPending = _langprob == null;
    if (detectionPending)
    {
        DetectBlock();
    }

    return SortProbability(_langprob);
}
- #endregion
- #region Private helper methods
/// <summary>
/// Scales <paramref name="probs"/> in place so that its entries sum to 1.
/// </summary>
/// <param name="probs">Values to normalize; modified in place.</param>
/// <returns>
/// The largest normalized entry, or 0 when the array is empty or its entries sum to
/// exactly zero (in which case the array is left unchanged).
/// </returns>
private static double NormalizeProb(double[] probs)
{
    double total = probs.Sum();

    // Dividing by a zero sum would turn every entry into NaN (0/0) and poison all
    // later updates; leave the values alone and report "no maximum" instead.
    if (total == 0.0)
    {
        return 0;
    }

    double largest = 0;
    for (int i = 0; i < probs.Length; i++)
    {
        double scaled = probs[i] / total;
        if (largest < scaled)
        {
            largest = scaled;
        }

        probs[i] = scaled;
    }

    return largest;
}
/// <summary>
/// Returns <paramref name="word"/> with every character at or above U+0080 replaced by a
/// <c>\uXXXX</c> escape (four lower-case hexadecimal digits). Characters below U+0080
/// are copied unchanged.
/// </summary>
/// <param name="word">Text to escape; must not be null.</param>
/// <returns>The escaped text.</returns>
private static string UnicodeEncode(string word)
{
    var escaped = new StringBuilder(word.Length);

    foreach (char ch in word)
    {
        if (ch < '\u0080')
        {
            escaped.Append(ch);
            continue;
        }

        // "x4" pads to exactly four lower-case hex digits, which covers the whole char range.
        escaped
            .Append("\\u")
            .Append(((int)ch).ToString("x4", System.Globalization.CultureInfo.InvariantCulture));
    }

    return escaped.ToString();
}
/// <summary>
/// Runs the detection and stores the averaged per-language probabilities in
/// <c>_langprob</c>. The buffered text is cleaned, its known n-grams are extracted, and
/// <c>_trialsCount</c> randomized trials are run; each trial repeatedly multiplies in the
/// probabilities of a randomly picked n-gram until it converges or hits the iteration limit.
/// </summary>
/// <exception cref="NLangDetectException">
/// The text contains no n-gram known to the model (<c>ErrorCode.CantDetectError</c>).
/// </exception>
private void DetectBlock()
{
    CleanText();

    List<string> features = ExtractNGrams();
    if (features.Count == 0)
    {
        throw new NLangDetectException("no features in text", ErrorCode.CantDetectError);
    }

    _langprob = new double[_langlist.Count];

    // A fixed seed makes the sequence of draws below repeatable.
    Random rng;
    if (_seed.HasValue)
    {
        rng = new Random(_seed.Value);
    }
    else
    {
        rng = new Random();
    }

    for (int trial = 0; trial < _trialsCount; trial++)
    {
        double[] trialProb = InitProbability();

        // TODO IMM HI: verify it works
        // NOTE(review): NextGaussian is a project extension, presumably a standard-normal sample - confirm.
        double trialAlpha = _alpha + rng.NextGaussian() * _AlphaWidth;

        int step = 0;
        while (true)
        {
            string picked = features[rng.Next(features.Count)];
            UpdateLangProb(trialProb, picked, trialAlpha);

            // Normalization and the stop test happen only on every fifth step, so the
            // loop always leaves right after a normalization.
            bool isCheckpoint = step % 5 == 0;
            if (isCheckpoint && (NormalizeProb(trialProb) > _ConvThreshold || step >= _IterationLimit))
            {
                break;
            }

            step++;
        }

        for (int lang = 0; lang < _langprob.Length; lang++)
        {
            _langprob[lang] += trialProb[lang] / _trialsCount;
        }
    }
}
/// <summary>
/// Builds the starting probability vector for one trial: a copy of the configured
/// priors when SetPriorMap was called, otherwise a uniform distribution over all languages.
/// </summary>
/// <returns>A new array with one entry per language.</returns>
private double[] InitProbability()
{
    int languageCount = _langlist.Count;
    var start = new double[languageCount];
    bool hasPriors = _priorMap != null;
    double uniform = 1.0 / languageCount;

    for (int i = 0; i < start.Length; i++)
    {
        start[i] = hasPriors ? _priorMap[i] : uniform;
    }

    return start;
}
/// <summary>
/// Walks the buffered text character by character and collects, in order of appearance,
/// every n-gram of size 1 to <c>NGram.GramsCount</c> that the model knows.
/// </summary>
/// <returns>The known n-grams; duplicates are kept. Empty when none is known.</returns>
private List<string> ExtractNGrams()
{
    var known = new List<string>();
    var window = new NGram();
    string snapshot = _text.ToString();

    foreach (char ch in snapshot)
    {
        window.AddChar(ch);

        for (int size = 1; size <= NGram.GramsCount; size++)
        {
            string gram = window.Get(size);

            // Get may yield null; such results and n-grams unknown to the model are skipped.
            if (gram == null)
            {
                continue;
            }

            if (!_wordLangProbMap.ContainsKey(gram))
            {
                continue;
            }

            known.Add(gram);
        }
    }

    return known;
}
/// <summary>
/// Multiplies each entry of <paramref name="prob"/> by the smoothed probability of
/// <paramref name="word"/> for that language: the model value plus <c>alpha / _BaseFreq</c>.
/// Does nothing when <paramref name="word"/> is null or unknown to the model.
/// </summary>
/// <param name="prob">Per-language probabilities, updated in place.</param>
/// <param name="word">The n-gram to apply.</param>
/// <param name="alpha">Smoothing value for this trial.</param>
private void UpdateLangProb(double[] prob, string word, double alpha)
{
    // Single lookup instead of ContainsKey followed by the indexer.
    ProbVector langProbMap;
    if (word == null || !_wordLangProbMap.TryGetValue(word, out langProbMap))
    {
        return;
    }

    double weight = alpha / _BaseFreq;
    for (int i = 0; i < prob.Length; i++)
    {
        prob[i] *= weight + langProbMap[i];
    }
}
/// <summary>
/// Converts a per-language probability array into a list of <c>Language</c> entries sorted
/// by descending probability. Only entries strictly above <c>_ProbThreshold</c> are kept;
/// entries with equal probability keep their original relative order.
/// </summary>
/// <param name="prob">Probabilities indexed like <c>_langlist</c>.</param>
/// <returns>A new, possibly empty, list.</returns>
private List<Language> SortProbability(double[] prob)
{
    var ranked = new List<Language>();

    for (int lang = 0; lang < prob.Length; lang++)
    {
        double p = prob[lang];
        if (!(p > _ProbThreshold))
        {
            continue;
        }

        // Insert before the first entry that is strictly less probable, or at the end.
        int slot = ranked.FindIndex(entry => entry.Probability < p);
        if (slot < 0)
        {
            slot = ranked.Count;
        }

        ranked.Insert(slot, new Language(_langlist[lang], p));
    }

    return ranked;
}
- #endregion
- }
- }
|