using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using NLangDetect.Core.Extensions;
using NLangDetect.Core.Utils;

namespace NLangDetect.Core
{
    /// <summary>
    /// Accumulates text via <c>Append</c> and estimates, for each language known to the
    /// originating <see cref="DetectorFactory"/>, the probability that the text is written in it.
    /// </summary>
    /// <remarks>
    /// The result of the first detection is cached in the instance: text appended after
    /// <see cref="Detect"/> or <see cref="GetProbabilities"/> has been called does not
    /// trigger a new detection run.
    /// </remarks>
    public class Detector
    {
        // Centre and spread of the per-trial smoothing parameter (see DetectBlock).
        private const double _AlphaDefault = 0.5;
        private const double _AlphaWidth = 0.05;

        // Upper bound on the number of n-gram samples drawn in a single trial.
        private const int _IterationLimit = 1000;

        // Languages whose averaged probability is not above this value are not reported.
        private const double _ProbThreshold = 0.1;

        // A trial stops early once the largest normalized probability exceeds this value.
        private const double _ConvThreshold = 0.99999;

        // Divisor applied to alpha to obtain the additive smoothing weight.
        private const int _BaseFreq = 10000;

        private static readonly Regex _UrlRegex = new Regex("https?://[-_.?&~;+=/#0-9A-Za-z]+", RegexOptions.Compiled);
        private static readonly Regex _MailRegex = new Regex("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+", RegexOptions.Compiled);

        private readonly Dictionary<string, ProbVector> _wordLangProbMap;
        private readonly List<string> _langlist;

        private StringBuilder _text;
        private double[] _langprob;

        private double _alpha = _AlphaDefault;
        private const int _trialsCount = 7;
        private int _maxTextLength = 10000;
        private double[] _priorMap;
        private readonly int? _seed;

        #region Constructor(s)

        /// <summary>
        /// Creates a detector that uses the n-gram probability map, language list and
        /// optional random seed of <paramref name="factory"/>.
        /// </summary>
        /// <param name="factory">Factory supplying the language profiles.</param>
        /// <exception cref="ArgumentNullException"><paramref name="factory"/> is null.</exception>
        public Detector(DetectorFactory factory)
        {
            if (factory == null)
            {
                throw new ArgumentNullException("factory");
            }

            _wordLangProbMap = factory.WordLangProbMap;
            _langlist = factory.Langlist;
            _text = new StringBuilder();
            _seed = factory.Seed;
        }

        #endregion

        #region Public methods

        /// <summary>
        /// Sets the base smoothing parameter; each trial perturbs it with Gaussian noise.
        /// </summary>
        /// <param name="alpha">New base value (the default is 0.5).</param>
        public void SetAlpha(double alpha)
        {
            _alpha = alpha;
        }

        /// <summary>
        /// Sets the prior probability of each language. Languages missing from
        /// <paramref name="priorMap"/> get a prior of zero; the values are normalized to sum to 1.
        /// </summary>
        /// <param name="priorMap">Language name to (non-negative) prior weight.</param>
        /// <exception cref="ArgumentNullException"><paramref name="priorMap"/> is null.</exception>
        /// <exception cref="NLangDetectException">
        /// A weight is negative, or no known language has a positive weight.
        /// </exception>
        public void SetPriorMap(Dictionary<string, double> priorMap)
        {
            if (priorMap == null)
            {
                throw new ArgumentNullException("priorMap");
            }

            // Build into a local array and commit only after validation succeeds, so a
            // rejected map never leaves a half-filled, un-normalized prior behind.
            var newPriorMap = new double[_langlist.Count];
            double sump = 0;

            for (int i = 0; i < newPriorMap.Length; i++)
            {
                double p;

                if (priorMap.TryGetValue(_langlist[i], out p))
                {
                    if (p < 0)
                    {
                        throw new NLangDetectException("Prior probability must be non-negative.", ErrorCode.InitParamError);
                    }

                    newPriorMap[i] = p;
                    sump += p;
                }
            }

            if (sump <= 0)
            {
                throw new NLangDetectException("More one of prior probability must be non-zero.", ErrorCode.InitParamError);
            }

            for (int i = 0; i < newPriorMap.Length; i++)
            {
                newPriorMap[i] /= sump;
            }

            _priorMap = newPriorMap;
        }

        /// <summary>
        /// Sets the text length limit used by the <c>Append</c> overloads (default 10000).
        /// </summary>
        /// <param name="max_text_length">Maximum number of characters to consider.</param>
        public void SetMaxTextLength(int max_text_length)
        {
            _maxTextLength = max_text_length;
        }

        // TODO IMM HI: TextReader?
        /// <summary>
        /// Reads from <paramref name="streamReader"/> in chunks and appends them until the
        /// accumulated text reaches the maximum text length or the stream ends.
        /// </summary>
        /// <param name="streamReader">Source of the text to analyse.</param>
        /// <exception cref="ArgumentNullException"><paramref name="streamReader"/> is null.</exception>
        public void Append(StreamReader streamReader)
        {
            if (streamReader == null)
            {
                throw new ArgumentNullException("streamReader");
            }

            // The buffer must hold at least one char: a zero-length buffer makes Read return 0
            // forever (endless loop when the limit is 1), and a negative size throws.
            var buf = new char[Math.Max(1, _maxTextLength / 2)];

            while (_text.Length < _maxTextLength && !streamReader.EndOfStream)
            {
                int length = streamReader.Read(buf, 0, buf.Length);

                if (length <= 0)
                {
                    // Nothing more could be read; stop instead of spinning.
                    break;
                }

                Append(new string(buf, 0, length));
            }
        }

        /// <summary>
        /// Appends <paramref name="text"/> to the text to analyse. URLs and e-mail addresses
        /// are replaced by a space, every character is passed through <c>NGram.Normalize</c>,
        /// and runs of spaces are collapsed. At most the maximum text length of characters
        /// of <paramref name="text"/> (after URL/e-mail replacement) are examined per call.
        /// </summary>
        /// <param name="text">Text fragment to add.</param>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        public void Append(string text)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            text = _UrlRegex.Replace(text, " ");
            text = _MailRegex.Replace(text, " ");

            // Previous normalized character of this call; used to skip repeated spaces.
            char pre = '\0';

            for (int i = 0; i < text.Length && i < _maxTextLength; i++)
            {
                char c = NGram.Normalize(text[i]);

                if (c != ' ' || pre != ' ')
                {
                    _text.Append(c);
                }

                pre = c;
            }
        }

        /// <summary>
        /// Removes the characters in the range 'A'..'z' from the accumulated text when they
        /// are clearly outnumbered (less than half as many) by characters at or above U+0300
        /// that are outside the Latin Extended Additional block.
        /// </summary>
        private void CleanText()
        {
            int latinCount = 0, nonLatinCount = 0;

            for (int i = 0; i < _text.Length; i++)
            {
                char c = _text[i];

                if (c <= 'z' && c >= 'A')
                {
                    latinCount++;
                }
                else if (c >= '\u0300' && c.GetUnicodeBlock() != UnicodeBlock.LatinExtendedAdditional)
                {
                    nonLatinCount++;
                }
            }

            if (latinCount * 2 < nonLatinCount)
            {
                var textWithoutLatin = new StringBuilder();

                for (int i = 0; i < _text.Length; i++)
                {
                    char c = _text[i];

                    if (c > 'z' || c < 'A')
                    {
                        textWithoutLatin.Append(c);
                    }
                }

                _text = textWithoutLatin;
            }
        }

        /// <summary>
        /// Detects the language of the accumulated text.
        /// </summary>
        /// <returns>
        /// The name of the most probable language, or null when no language exceeds the
        /// reporting threshold.
        /// </returns>
        /// <exception cref="NLangDetectException">The text contains no known n-grams.</exception>
        public string Detect()
        {
            List<Language> probabilities = GetProbabilities();

            return
              probabilities.Count > 0
                ? probabilities[0].Name
                : null;
        }

        /// <summary>
        /// Returns the candidate languages whose probability exceeds the reporting threshold,
        /// ordered from most to least probable. The detection itself runs only on the first call.
        /// </summary>
        /// <returns>Candidate languages, most probable first; possibly empty.</returns>
        /// <exception cref="NLangDetectException">The text contains no known n-grams.</exception>
        public List<Language> GetProbabilities()
        {
            if (_langprob == null)
            {
                DetectBlock();
            }

            List<Language> list = SortProbability(_langprob);

            return list;
        }

        #endregion

        #region Private helper methods

        /// <summary>
        /// Scales <paramref name="probs"/> in place so that the entries sum to 1.
        /// </summary>
        /// <param name="probs">Values to normalize.</param>
        /// <returns>The largest normalized value.</returns>
        private static double NormalizeProb(double[] probs)
        {
            double maxp = 0;
            double sump = probs.Sum();

            for (int i = 0; i < probs.Length; i++)
            {
                double p = probs[i] / sump;

                if (maxp < p)
                {
                    maxp = p;
                }

                probs[i] = p;
            }

            return maxp;
        }

        /// <summary>
        /// Returns <paramref name="word"/> with every character at or above U+0080 replaced
        /// by a <c>\uXXXX</c> escape (four lowercase hex digits).
        /// </summary>
        /// <param name="word">Text to escape.</param>
        /// <returns>The escaped text.</returns>
        private static string UnicodeEncode(string word)
        {
            var resultSb = new StringBuilder();

            foreach (char ch in word)
            {
                if (ch >= '\u0080')
                {
                    // "x4" zero-pads to exactly four hex digits, which is always enough for a char.
                    resultSb
                      .Append("\\u")
                      .Append(((int)ch).ToString("x4"));
                }
                else
                {
                    resultSb.Append(ch);
                }
            }

            return resultSb.ToString();
        }

        /// <summary>
        /// Runs the detection and stores the per-language probabilities, averaged over all
        /// trials, in <c>_langprob</c>.
        /// </summary>
        /// <exception cref="NLangDetectException">The text contains no known n-grams.</exception>
        private void DetectBlock()
        {
            CleanText();

            List<string> ngrams = ExtractNGrams();

            if (ngrams.Count == 0)
            {
                throw new NLangDetectException("no features in text", ErrorCode.CantDetectError);
            }

            _langprob = new double[_langlist.Count];

            // A seed from the factory makes the sampling below reproducible.
            Random rand = (_seed.HasValue ? new Random(_seed.Value) : new Random());

            for (int t = 0; t < _trialsCount; t++)
            {
                double[] prob = InitProbability();

                // TODO IMM HI: verify it works
                double alpha = _alpha + rand.NextGaussian() * _AlphaWidth;

                for (int i = 0; ; i++)
                {
                    int r = rand.Next(ngrams.Count);

                    UpdateLangProb(prob, ngrams[r], alpha);

                    // Normalize (and test for convergence / the iteration limit) only on
                    // every fifth sample.
                    if (i % 5 == 0)
                    {
                        if (NormalizeProb(prob) > _ConvThreshold || i >= _IterationLimit)
                        {
                            break;
                        }
                    }
                }

                for (int j = 0; j < _langprob.Length; j++)
                {
                    _langprob[j] += prob[j] / _trialsCount;
                }
            }
        }

        /// <summary>
        /// Creates the starting probabilities for a trial: the configured priors when a prior
        /// map has been set, otherwise a uniform distribution over all languages.
        /// </summary>
        /// <returns>A new array with one entry per language.</returns>
        private double[] InitProbability()
        {
            var prob = new double[_langlist.Count];

            if (_priorMap != null)
            {
                for (int i = 0; i < prob.Length; i++)
                {
                    prob[i] = _priorMap[i];
                }
            }
            else
            {
                for (int i = 0; i < prob.Length; i++)
                {
                    prob[i] = 1.0 / _langlist.Count;
                }
            }

            return prob;
        }

        /// <summary>
        /// Collects, in text order, every n-gram (lengths 1 to <c>NGram.GramsCount</c>) of the
        /// accumulated text that is present in the word/language probability map.
        /// </summary>
        /// <returns>The known n-grams; empty when none were found.</returns>
        private List<string> ExtractNGrams()
        {
            var list = new List<string>();
            NGram ngram = new NGram();

            for (int i = 0; i < _text.Length; i++)
            {
                ngram.AddChar(_text[i]);

                for (int n = 1; n <= NGram.GramsCount; n++)
                {
                    string w = ngram.Get(n);

                    if (w != null && _wordLangProbMap.ContainsKey(w))
                    {
                        list.Add(w);
                    }
                }
            }

            return list;
        }

        /// <summary>
        /// Multiplies each entry of <paramref name="prob"/> by the smoothed probability of
        /// <paramref name="word"/> for that language. Does nothing for a null or unknown word.
        /// </summary>
        /// <param name="prob">Per-language probabilities, updated in place.</param>
        /// <param name="word">N-gram that was sampled.</param>
        /// <param name="alpha">Smoothing parameter of the current trial.</param>
        private void UpdateLangProb(double[] prob, string word, double alpha)
        {
            if (word == null)
            {
                return;
            }

            ProbVector langProbMap;

            if (!_wordLangProbMap.TryGetValue(word, out langProbMap))
            {
                return;
            }

            double weight = alpha / _BaseFreq;

            for (int i = 0; i < prob.Length; i++)
            {
                prob[i] *= weight + langProbMap[i];
            }
        }

        /// <summary>
        /// Builds the list of languages whose probability exceeds the reporting threshold,
        /// sorted by descending probability (entries with equal probability keep language-list order).
        /// </summary>
        /// <param name="prob">Probability of each language, indexed like the language list.</param>
        /// <returns>The sorted candidate list; possibly empty.</returns>
        private List<Language> SortProbability(double[] prob)
        {
            var list = new List<Language>();

            for (int j = 0; j < prob.Length; j++)
            {
                double p = prob[j];

                if (p > _ProbThreshold)
                {
                    // Insert before the first entry with a strictly smaller probability,
                    // or at the end when there is none.
                    for (int i = 0; i <= list.Count; i++)
                    {
                        if (i == list.Count || list[i].Probability < p)
                        {
                            list.Insert(i, new Language(_langlist[j], p));

                            break;
                        }
                    }
                }
            }

            return list;
        }

        #endregion
    }
}