| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371 | 
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using NLangDetect.Core.Extensions;
using NLangDetect.Core.Utils;
 
namespace NLangDetect.Core
{
    /// <summary>
    /// Accumulates text and estimates which of the factory's languages it is written in.
    /// </summary>
    /// <remarks>
    /// Feed text through one of the <c>Append</c> overloads, then call <see cref="Detect"/> or
    /// <see cref="GetProbabilities"/>. The probabilities are computed on the first query and cached;
    /// text appended after that point does not change the result.
    /// </remarks>
    public class Detector
    {
        /// <summary>Default smoothing parameter used until <see cref="SetAlpha"/> is called.</summary>
        private const double _AlphaDefault = 0.5;

        /// <summary>Scale applied to the random perturbation of alpha in each trial.</summary>
        private const double _AlphaWidth = 0.05;

        /// <summary>Iteration count at which a trial stops even if it has not converged.</summary>
        private const int _IterationLimit = 1000;

        /// <summary>Languages whose probability does not exceed this value are omitted from results.</summary>
        private const double _ProbThreshold = 0.1;

        /// <summary>A trial stops once the largest normalized probability exceeds this value.</summary>
        private const double _ConvThreshold = 0.99999;

        /// <summary>Divisor applied to alpha to obtain the per-update smoothing weight.</summary>
        private const int _BaseFreq = 10000;

        /// <summary>Number of randomized trials averaged into the final probabilities.</summary>
        private const int _trialsCount = 7;

        private static readonly Regex _UrlRegex = new Regex("https?://[-_.?&~;+=/#0-9A-Za-z]+", RegexOptions.Compiled);
        private static readonly Regex _MailRegex = new Regex("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+", RegexOptions.Compiled);

        private readonly Dictionary<string, ProbVector> _wordLangProbMap;
        private readonly List<string> _langlist;
        private readonly int? _seed;

        private StringBuilder _text;
        private double[] _langprob;
        private double _alpha = _AlphaDefault;
        private int _maxTextLength = 10000;
        private double[] _priorMap;

        #region Constructor(s)

        /// <summary>
        /// Creates a detector that uses the n-gram table, language list and seed of <paramref name="factory"/>.
        /// </summary>
        /// <param name="factory">Source of the language profiles; must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="factory"/> is null.</exception>
        public Detector(DetectorFactory factory)
        {
            if (factory == null)
            {
                throw new ArgumentNullException("factory");
            }

            _wordLangProbMap = factory.WordLangProbMap;
            _langlist = factory.Langlist;
            _text = new StringBuilder();
            _seed = factory.Seed;
        }

        #endregion

        #region Public methods

        /// <summary>Sets the base smoothing parameter used by subsequent detection.</summary>
        /// <param name="alpha">New base value; each trial perturbs it randomly.</param>
        public void SetAlpha(double alpha)
        {
            _alpha = alpha;
        }

        /// <summary>
        /// Sets per-language prior probabilities. Languages missing from <paramref name="priorMap"/>
        /// get a prior of zero; the supplied values are normalized so that they sum to one.
        /// </summary>
        /// <param name="priorMap">Language name to non-negative prior weight.</param>
        /// <exception cref="ArgumentNullException"><paramref name="priorMap"/> is null.</exception>
        /// <exception cref="NLangDetectException">
        /// A weight is negative, or the weights of the known languages do not sum to a positive value.
        /// </exception>
        public void SetPriorMap(Dictionary<string, double> priorMap)
        {
            if (priorMap == null)
            {
                throw new ArgumentNullException("priorMap");
            }

            // Build into a local array so a failed call leaves the previous priors untouched.
            var priors = new double[_langlist.Count];
            double total = 0;

            for (int i = 0; i < priors.Length; i++)
            {
                double weight;

                if (!priorMap.TryGetValue(_langlist[i], out weight))
                {
                    continue;
                }

                if (weight < 0)
                {
                    throw new NLangDetectException("Prior probability must be non-negative.", ErrorCode.InitParamError);
                }

                priors[i] = weight;
                total += weight;
            }

            if (total <= 0)
            {
                throw new NLangDetectException("More one of prior probability must be non-zero.", ErrorCode.InitParamError);
            }

            for (int i = 0; i < priors.Length; i++)
            {
                priors[i] /= total;
            }

            _priorMap = priors;
        }

        /// <summary>Sets the text length limit consulted by the <c>Append</c> overloads.</summary>
        /// <param name="max_text_length">New limit, in characters.</param>
        public void SetMaxTextLength(int max_text_length)
        {
            _maxTextLength = max_text_length;
        }

        /// <summary>
        /// Reads from <paramref name="streamReader"/> and appends the text until the accumulated
        /// text reaches the maximum length or the stream is exhausted.
        /// </summary>
        /// <param name="streamReader">Reader to consume; must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="streamReader"/> is null.</exception>
        public void Append(StreamReader streamReader)
        {
            if (streamReader == null)
            {
                throw new ArgumentNullException("streamReader");
            }

            Append((TextReader)streamReader);
        }

        /// <summary>
        /// Reads from <paramref name="reader"/> and appends the text until the accumulated
        /// text reaches the maximum length or the reader has no more characters.
        /// </summary>
        /// <param name="reader">Reader to consume; must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
        public void Append(TextReader reader)
        {
            if (reader == null)
            {
                throw new ArgumentNullException("reader");
            }

            // Nothing to do when the limit is already reached (this also covers limits <= 0,
            // which would otherwise make the buffer allocation below invalid).
            if (_text.Length >= _maxTextLength)
            {
                return;
            }

            // At least one char: a zero-length buffer would make Read return 0 forever.
            var buffer = new char[Math.Max(1, _maxTextLength / 2)];

            while (_text.Length < _maxTextLength)
            {
                int read = reader.Read(buffer, 0, buffer.Length);

                if (read <= 0)
                {
                    break;
                }

                Append(new string(buffer, 0, read));
            }
        }

        /// <summary>
        /// Appends <paramref name="text"/> after replacing URLs and e-mail addresses with a space,
        /// normalizing each character through <c>NGram.Normalize</c> and collapsing runs of spaces.
        /// </summary>
        /// <remarks>
        /// At most the first "maximum text length" characters of the (URL/mail-stripped) argument are
        /// examined; the limit applies to each call, not to the accumulated total.
        /// </remarks>
        /// <param name="text">Text to add.</param>
        public void Append(string text)
        {
            text = _UrlRegex.Replace(text, " ");
            text = _MailRegex.Replace(text, " ");

            char previous = '\0';

            for (int i = 0; i < text.Length && i < _maxTextLength; i++)
            {
                char current = NGram.Normalize(text[i]);

                // Skip a space that directly follows another space.
                if (current != ' ' || previous != ' ')
                {
                    _text.Append(current);
                }

                previous = current;
            }
        }

        /// <summary>Returns the name of the most probable language.</summary>
        /// <returns>
        /// The first entry of <see cref="GetProbabilities"/>, or null when that list is empty.
        /// </returns>
        public string Detect()
        {
            List<Language> probabilities = GetProbabilities();

            return probabilities.Count > 0 ? probabilities[0].Name : null;
        }

        /// <summary>
        /// Returns the candidate languages ordered from most to least probable. Only languages whose
        /// probability exceeds the internal threshold are included.
        /// </summary>
        /// <returns>A new list on every call; the underlying probabilities are computed once and cached.</returns>
        /// <exception cref="NLangDetectException">The appended text contains no known n-grams.</exception>
        public List<Language> GetProbabilities()
        {
            if (_langprob == null)
            {
                DetectBlock();
            }

            return SortProbability(_langprob);
        }

        #endregion

        #region Private helper methods

        /// <summary>
        /// Drops characters in the range 'A'..'z' from the accumulated text when they are heavily
        /// outnumbered (by more than two to one) by characters at or above U+0300 that are not in
        /// the Latin Extended Additional block.
        /// </summary>
        /// <remarks>
        /// The range 'A'..'z' also spans the six ASCII symbols that sit between 'Z' and 'a'.
        /// </remarks>
        private void CleanText()
        {
            int latinCount = 0;
            int nonLatinCount = 0;

            for (int i = 0; i < _text.Length; i++)
            {
                char c = _text[i];

                if (c <= 'z' && c >= 'A')
                {
                    latinCount++;
                }
                else if (c >= '\u0300' && c.GetUnicodeBlock() != UnicodeBlock.LatinExtendedAdditional)
                {
                    nonLatinCount++;
                }
            }

            if (latinCount * 2 >= nonLatinCount)
            {
                return;
            }

            var textWithoutLatin = new StringBuilder();

            for (int i = 0; i < _text.Length; i++)
            {
                char c = _text[i];

                if (c > 'z' || c < 'A')
                {
                    textWithoutLatin.Append(c);
                }
            }

            _text = textWithoutLatin;
        }

        /// <summary>Divides every element of <paramref name="probs"/> by their sum, in place.</summary>
        /// <param name="probs">Values to normalize.</param>
        /// <returns>The largest normalized value (never less than zero).</returns>
        private static double NormalizeProb(double[] probs)
        {
            double sum = probs.Sum();
            double max = 0;

            for (int i = 0; i < probs.Length; i++)
            {
                double normalized = probs[i] / sum;

                if (max < normalized)
                {
                    max = normalized;
                }

                probs[i] = normalized;
            }

            return max;
        }

        /// <summary>
        /// Returns <paramref name="word"/> with every character at or above U+0080 replaced by a
        /// backslash-u escape made of four lower-case hex digits; other characters are copied as-is.
        /// </summary>
        /// <param name="word">Text to encode.</param>
        private static string UnicodeEncode(string word)
        {
            var encoded = new StringBuilder();

            foreach (char ch in word)
            {
                if (ch >= '\u0080')
                {
                    encoded
                      .Append("\\u")
                      .Append(((int)ch).ToString("x4", CultureInfo.InvariantCulture));
                }
                else
                {
                    encoded.Append(ch);
                }
            }

            return encoded.ToString();
        }

        /// <summary>
        /// Computes the language probabilities for the accumulated text and stores them in the cache.
        /// </summary>
        /// <remarks>
        /// Runs a fixed number of trials. Each trial starts from the priors, repeatedly picks a random
        /// n-gram and folds its per-language probabilities in, and every fifth iteration normalizes and
        /// checks whether to stop (convergence or iteration limit). The trial results are averaged.
        /// </remarks>
        /// <exception cref="NLangDetectException">No known n-gram was found in the text.</exception>
        private void DetectBlock()
        {
            CleanText();

            List<string> ngrams = ExtractNGrams();

            if (ngrams.Count == 0)
            {
                throw new NLangDetectException("no features in text", ErrorCode.CantDetectError);
            }

            _langprob = new double[_langlist.Count];

            // A fixed seed makes the trials reproducible.
            Random rand = _seed.HasValue ? new Random(_seed.Value) : new Random();

            for (int t = 0; t < _trialsCount; t++)
            {
                double[] prob = InitProbability();

                // NOTE(review): NextGaussian is a project extension; presumably a standard-normal sample - confirm.
                double alpha = _alpha + rand.NextGaussian() * _AlphaWidth;

                for (int i = 0; ; i++)
                {
                    int r = rand.Next(ngrams.Count);

                    UpdateLangProb(prob, ngrams[r], alpha);

                    if (i % 5 != 0)
                    {
                        continue;
                    }

                    // Normalization must run on every check, so it is evaluated before the limit test.
                    if (NormalizeProb(prob) > _ConvThreshold || i >= _IterationLimit)
                    {
                        break;
                    }
                }

                for (int j = 0; j < _langprob.Length; j++)
                {
                    _langprob[j] += prob[j] / _trialsCount;
                }
            }
        }

        /// <summary>
        /// Creates the starting probability vector for a trial: a copy of the configured priors,
        /// or a uniform distribution over all languages when no priors were set.
        /// </summary>
        private double[] InitProbability()
        {
            var prob = new double[_langlist.Count];

            if (_priorMap != null)
            {
                for (int i = 0; i < prob.Length; i++)
                {
                    prob[i] = _priorMap[i];
                }
            }
            else
            {
                double uniform = 1.0 / _langlist.Count;

                for (int i = 0; i < prob.Length; i++)
                {
                    prob[i] = uniform;
                }
            }

            return prob;
        }

        /// <summary>
        /// Walks the accumulated text and collects every n-gram (of each supported size) that is
        /// present in the n-gram probability table.
        /// </summary>
        /// <returns>The matching n-grams in order of appearance; may be empty.</returns>
        private List<string> ExtractNGrams()
        {
            var found = new List<string>();
            var ngram = new NGram();

            for (int i = 0; i < _text.Length; i++)
            {
                ngram.AddChar(_text[i]);

                for (int n = 1; n <= NGram.GramsCount; n++)
                {
                    string w = ngram.Get(n);

                    if (w != null && _wordLangProbMap.ContainsKey(w))
                    {
                        found.Add(w);
                    }
                }
            }

            return found;
        }

        /// <summary>
        /// Multiplies each language's probability by the smoothed probability of <paramref name="word"/>
        /// in that language. Does nothing when the word is null or not in the table.
        /// </summary>
        /// <param name="prob">Per-language probabilities, updated in place.</param>
        /// <param name="word">N-gram to apply.</param>
        /// <param name="alpha">Smoothing parameter for this trial.</param>
        private void UpdateLangProb(double[] prob, string word, double alpha)
        {
            if (word == null)
            {
                return;
            }

            ProbVector langProbMap;

            if (!_wordLangProbMap.TryGetValue(word, out langProbMap))
            {
                return;
            }

            double weight = alpha / _BaseFreq;

            for (int i = 0; i < prob.Length; i++)
            {
                prob[i] *= weight + langProbMap[i];
            }
        }

        /// <summary>
        /// Converts a probability vector into a list of languages whose probability exceeds the
        /// threshold, ordered from highest to lowest probability.
        /// </summary>
        /// <param name="prob">Probability per language, indexed like the language list.</param>
        private List<Language> SortProbability(double[] prob)
        {
            // OrderByDescending is a stable sort, so languages with equal probability keep
            // their language-list order.
            return prob
                .Select((probability, index) => new { Probability = probability, Index = index })
                .Where(candidate => candidate.Probability > _ProbThreshold)
                .OrderByDescending(candidate => candidate.Probability)
                .Select(candidate => new Language(_langlist[candidate.Index], candidate.Probability))
                .ToList();
        }

        #endregion
    }
}
 
 
  |