using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace NLangDetect.Core.Utils
{
    /// <summary>
    /// Accumulates n-gram occurrence counts for a single named profile.
    /// <see cref="freq"/> maps each n-gram to its count and <see cref="n_words"/>
    /// holds the total count per n-gram length (index = length - 1).
    /// </summary>
    public class LangProfile
    {
        // Lower bound applied to the pruning threshold in OmitLessFreq.
        private const int MinimumFreq = 2;

        // The pruning threshold is n_words[0] divided by this ratio.
        private const int LessFreqRatio = 100000;

        // Built once for the whole type: constructing a RegexOptions.Compiled regex
        // on every OmitLessFreq call would repeat the compilation cost each time.
        // The patterns are unchanged from the per-call versions.
        private static readonly Regex SingleRomanLetterRegex =
            new Regex("^[A-Za-z]$", RegexOptions.Compiled);

        private static readonly Regex ContainsRomanLetterRegex =
            new Regex(".*[A-Za-z].*", RegexOptions.Compiled);

        /// <summary>Profile name. While it is null, <see cref="Add"/> and <see cref="OmitLessFreq"/> do nothing.</summary>
        public string name { get; set; }

        /// <summary>Occurrence count for each n-gram added so far.</summary>
        public Dictionary<string, int> freq { get; set; }

        /// <summary>Total occurrences per n-gram length; element i covers n-grams of length i + 1.</summary>
        public int[] n_words { get; set; }

        #region Constructor(s)

        /// <summary>
        /// Creates a profile with no name, an empty frequency table and zeroed per-length totals.
        /// </summary>
        public LangProfile()
        {
            freq = new Dictionary<string, int>();
            n_words = new int[NGram.GramsCount];
        }

        /// <summary>
        /// Creates an empty profile with the given name.
        /// </summary>
        /// <param name="name">Value assigned to <see cref="name"/>.</param>
        public LangProfile(string name)
        {
            this.name = name;
            freq = new Dictionary<string, int>();
            n_words = new int[NGram.GramsCount];
        }

        #endregion

        #region Public methods

        /// <summary>
        /// Records one occurrence of <paramref name="gram"/>.
        /// The call is silently ignored when the profile has no name, when
        /// <paramref name="gram"/> is null, or when its length is outside
        /// 1..NGram.GramsCount.
        /// </summary>
        /// <param name="gram">The n-gram to count.</param>
        public void Add(string gram)
        {
            if (name == null || gram == null)
            {
                return; // Illegal
            }

            int len = gram.Length;
            if (len < 1 || len > NGram.GramsCount)
            {
                return; // Illegal
            }

            n_words[len - 1]++;

            // TryGetValue leaves current at 0 when the gram is new, so a single
            // read plus a single write covers both the insert and increment cases.
            int current;
            freq.TryGetValue(gram, out current);
            freq[gram] = current + 1;
        }

        /// <summary>
        /// Prunes the frequency table in two passes, keeping <see cref="n_words"/>
        /// consistent with what is removed.
        /// Pass 1 removes every n-gram whose count is at or below the threshold
        /// (n_words[0] / LessFreqRatio, but never less than MinimumFreq) and sums
        /// the counts of the surviving single-letter A-Z/a-z grams.
        /// Pass 2 runs only when that sum is below one third of the (already
        /// reduced) n_words[0]; it removes every n-gram containing an A-Z/a-z letter.
        /// Does nothing when the profile has no name.
        /// </summary>
        public void OmitLessFreq()
        {
            if (name == null)
            {
                return; // Illegal
            }

            int threshold = n_words[0] / LessFreqRatio;
            if (threshold < MinimumFreq)
            {
                threshold = MinimumFreq;
            }

            int roman = 0;
            List<string> keysToRemove = new List<string>();

            // Removal is deferred: the dictionary must not be modified while it is enumerated.
            foreach (KeyValuePair<string, int> entry in freq)
            {
                if (entry.Value <= threshold)
                {
                    n_words[entry.Key.Length - 1] -= entry.Value;
                    keysToRemove.Add(entry.Key);
                }
                else if (SingleRomanLetterRegex.IsMatch(entry.Key))
                {
                    roman += entry.Value;
                }
            }

            RemoveKeys(keysToRemove);

            // roman check
            if (roman < n_words[0] / 3)
            {
                keysToRemove = new List<string>();

                foreach (KeyValuePair<string, int> entry in freq)
                {
                    if (ContainsRomanLetterRegex.IsMatch(entry.Key))
                    {
                        n_words[entry.Key.Length - 1] -= entry.Value;
                        keysToRemove.Add(entry.Key);
                    }
                }

                RemoveKeys(keysToRemove);
            }
        }

        #endregion

        #region Private helpers

        // Removes each of the given keys from the frequency table.
        private void RemoveKeys(List<string> keys)
        {
            foreach (string key in keys)
            {
                freq.Remove(key);
            }
        }

        #endregion
    }
}