123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118 |
- using System.Collections.Generic;
- using System.Text.RegularExpressions;
namespace NLangDetect.Core.Utils
{
    /// <summary>
    /// Holds the n-gram frequency table for one named language profile:
    /// how often each gram was added, plus per-length totals.
    /// </summary>
    public class LangProfile
    {
        // Lower bound for the pruning threshold used by OmitLessFreq.
        private const int MinimumFreq = 2;

        // The pruning threshold is the number of counted 1-grams divided by this value.
        private const int LessFreqRatio = 100000;

        // Built once per type instead of on every OmitLessFreq call: constructing a
        // compiled Regex is expensive. The patterns are unchanged from the originals.
        private static readonly Regex SingleRomanLetterRegex =
            new Regex("^[A-Za-z]$", RegexOptions.Compiled);

        private static readonly Regex ContainsRomanLetterRegex =
            new Regex(".*[A-Za-z].*", RegexOptions.Compiled);

        /// <summary>
        /// Profile name. While it is null, <see cref="Add"/> and
        /// <see cref="OmitLessFreq"/> do nothing.
        /// </summary>
        public string name { get; set; }

        /// <summary>Occurrence count for each gram.</summary>
        public Dictionary<string, int> freq { get; set; }

        /// <summary>
        /// Per-length totals: index <c>i</c> tracks grams of length <c>i + 1</c>.
        /// Incremented by <see cref="Add"/> and reduced by <see cref="OmitLessFreq"/>.
        /// </summary>
        public int[] n_words { get; set; }

        #region Constructor(s)

        /// <summary>
        /// Creates an unnamed profile with an empty frequency table and
        /// <c>NGram.GramsCount</c> zeroed length counters.
        /// </summary>
        public LangProfile()
        {
            freq = new Dictionary<string, int>();
            n_words = new int[NGram.GramsCount];
        }

        /// <summary>Creates an empty profile with the given name.</summary>
        /// <param name="name">Value stored in <see cref="name"/>.</param>
        public LangProfile(string name)
            : this()
        {
            this.name = name;
        }

        #endregion

        #region Public methods

        /// <summary>
        /// Counts one occurrence of <paramref name="gram"/>.
        /// </summary>
        /// <remarks>
        /// The call is silently ignored when the profile has no name, when
        /// <paramref name="gram"/> is null, or when its length is outside
        /// <c>1..NGram.GramsCount</c>.
        /// </remarks>
        /// <param name="gram">The n-gram to count.</param>
        public void Add(string gram)
        {
            if (name == null || gram == null)
            {
                return;
            }

            int len = gram.Length;

            if (len < 1 || len > NGram.GramsCount)
            {
                return;
            }

            n_words[len - 1]++;

            // TryGetValue leaves 'current' at 0 for an unseen gram, so one lookup
            // plus one write covers both the insert and the increment case.
            int current;
            freq.TryGetValue(gram, out current);
            freq[gram] = current + 1;
        }

        /// <summary>
        /// Prunes the frequency table in two passes.
        /// </summary>
        /// <remarks>
        /// <para>
        /// Pass 1 removes every gram whose count is at or below the threshold
        /// <c>max(n_words[0] / LessFreqRatio, MinimumFreq)</c> (integer division,
        /// evaluated before any removal). The counts of surviving grams that match
        /// <c>^[A-Za-z]$</c> are summed.
        /// </para>
        /// <para>
        /// Pass 2 runs only when that sum is less than a third of the already
        /// reduced <c>n_words[0]</c>; it removes every remaining gram that contains
        /// an ASCII letter.
        /// </para>
        /// <para>
        /// Each removal subtracts the gram's count from the matching
        /// <see cref="n_words"/> slot. Does nothing when <see cref="name"/> is null.
        /// </para>
        /// </remarks>
        public void OmitLessFreq()
        {
            if (name == null)
            {
                return;
            }

            int threshold = n_words[0] / LessFreqRatio;

            if (threshold < MinimumFreq)
            {
                threshold = MinimumFreq;
            }

            int roman = 0;
            List<string> keysToRemove = new List<string>();

            // Removals are collected and applied after the loop because a Dictionary
            // must not be modified while it is being enumerated.
            foreach (KeyValuePair<string, int> entry in freq)
            {
                if (entry.Value <= threshold)
                {
                    n_words[entry.Key.Length - 1] -= entry.Value;
                    keysToRemove.Add(entry.Key);
                }
                else if (SingleRomanLetterRegex.IsMatch(entry.Key))
                {
                    roman += entry.Value;
                }
            }

            RemoveKeys(keysToRemove);

            // Roman check: n_words[0] here already reflects the pass-1 removals.
            if (roman < n_words[0] / 3)
            {
                keysToRemove = new List<string>();

                foreach (KeyValuePair<string, int> entry in freq)
                {
                    if (ContainsRomanLetterRegex.IsMatch(entry.Key))
                    {
                        n_words[entry.Key.Length - 1] -= entry.Value;
                        keysToRemove.Add(entry.Key);
                    }
                }

                RemoveKeys(keysToRemove);
            }
        }

        #endregion

        // Deletes each of the given keys from the frequency table.
        private void RemoveKeys(List<string> keysToRemove)
        {
            foreach (string keyToRemove in keysToRemove)
            {
                freq.Remove(keyToRemove);
            }
        }
    }
}
|