// LangProfile.cs
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
  3. namespace NLangDetect.Core.Utils
  4. {
  5. public class LangProfile
  6. {
  7. private const int MinimumFreq = 2;
  8. private const int LessFreqRatio = 100000;
  9. public string name { get; set; }
  10. public Dictionary<string, int> freq { get; set; }
  11. public int[] n_words { get; set; }
  12. #region Constructor(s)
  13. public LangProfile()
  14. {
  15. freq = new Dictionary<string, int>();
  16. n_words = new int[NGram.GramsCount];
  17. }
  18. public LangProfile(string name)
  19. {
  20. this.name = name;
  21. freq = new Dictionary<string, int>();
  22. n_words = new int[NGram.GramsCount];
  23. }
  24. #endregion
  25. #region Public methods
  26. public void Add(string gram)
  27. {
  28. if (name == null || gram == null) return; // Illegal
  29. int len = gram.Length;
  30. if (len < 1 || len > NGram.GramsCount) return; // Illegal
  31. n_words[len - 1]++;
  32. if (freq.ContainsKey(gram))
  33. {
  34. freq[gram] = freq[gram] + 1;
  35. }
  36. else
  37. {
  38. freq.Add(gram, 1);
  39. }
  40. }
  41. public void OmitLessFreq()
  42. {
  43. if (name == null) return; // Illegal
  44. int threshold = n_words[0] / LessFreqRatio;
  45. if (threshold < MinimumFreq) threshold = MinimumFreq;
  46. ICollection<string> keys = freq.Keys;
  47. int roman = 0;
  48. // TODO IMM HI: move up?
  49. Regex regex1 = new Regex("^[A-Za-z]$", RegexOptions.Compiled);
  50. List<string> keysToRemove = new List<string>();
  51. foreach (string key in keys)
  52. {
  53. int count = freq[key];
  54. if (count <= threshold)
  55. {
  56. n_words[key.Length - 1] -= count;
  57. keysToRemove.Add(key);
  58. }
  59. else
  60. {
  61. if (regex1.IsMatch(key))
  62. {
  63. roman += count;
  64. }
  65. }
  66. }
  67. foreach (string keyToRemove in keysToRemove)
  68. {
  69. freq.Remove(keyToRemove);
  70. }
  71. // roman check
  72. keysToRemove = new List<string>();
  73. if (roman < n_words[0] / 3)
  74. {
  75. ICollection<string> keys2 = freq.Keys;
  76. // TODO IMM HI: move up?
  77. Regex regex2 = new Regex(".*[A-Za-z].*", RegexOptions.Compiled);
  78. foreach (string key in keys2)
  79. {
  80. int count = freq[key];
  81. if (regex2.IsMatch(key))
  82. {
  83. n_words[key.Length - 1] -= count;
  84. keysToRemove.Add(key);
  85. }
  86. }
  87. foreach (string keyToRemove in keysToRemove)
  88. {
  89. freq.Remove(keyToRemove);
  90. }
  91. }
  92. }
  93. #endregion
  94. }
  95. }