Latin1Prober.cs 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
  1. /* ***** BEGIN LICENSE BLOCK *****
  2. * Version: MPL 1.1/GPL 2.0/LGPL 2.1
  3. *
  4. * The contents of this file are subject to the Mozilla Public License Version
  5. * 1.1 (the "License"); you may not use this file except in compliance with
  6. * the License. You may obtain a copy of the License at
  7. * http://www.mozilla.org/MPL/
  8. *
  9. * Software distributed under the License is distributed on an "AS IS" basis,
  10. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  11. * for the specific language governing rights and limitations under the
  12. * License.
  13. *
  14. * The Original Code is Mozilla Universal charset detector code.
  15. *
  16. * The Initial Developer of the Original Code is
  17. * Netscape Communications Corporation.
  18. * Portions created by the Initial Developer are Copyright (C) 2001
  19. * the Initial Developer. All Rights Reserved.
  20. *
  21. * Contributor(s):
  22. * Shy Shalom <shooshX@gmail.com>
  23. * Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
  24. *
  25. * Alternatively, the contents of this file may be used under the terms of
  26. * either the GNU General Public License Version 2 or later (the "GPL"), or
  27. * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  28. * in which case the provisions of the GPL or the LGPL are applicable instead
  29. * of those above. If you wish to allow use of your version of this file only
  30. * under the terms of either the GPL or the LGPL, and not to allow others to
  31. * use your version of this file under the terms of the MPL, indicate your
  32. * decision by deleting the provisions above and replace them with the notice
  33. * and other provisions required by the GPL or the LGPL. If you do not delete
  34. * the provisions above, a recipient may use your version of this file under
  35. * the terms of any one of the MPL, the GPL or the LGPL.
  36. *
  37. * ***** END LICENSE BLOCK ***** */
  38. using System;
  39. namespace UniversalDetector.Core
  40. {
  41. // TODO: Using trigrams the detector should be able to discriminate between
  42. // latin-1 and iso8859-2
  /// <summary>
  /// Charset prober that reports "windows-1252". Each input byte is mapped to
  /// a character class, and every (previous class, current class) pair is
  /// scored through a fixed likelihood table; the tallies of those scores
  /// drive the confidence value.
  /// </summary>
  public class Latin1Prober : CharsetProber
  {
      // Number of likelihood categories stored in Latin1ClassModel (0..3).
      private const int FREQ_CAT_NUM = 4;

      // Likelihood categories that GetConfidence() reads explicitly.
      private const int FREQ_VERY_UNLIKELY = 1;
      private const int FREQ_VERY_LIKELY = 3;

      private const int UDF = 0; // undefined
      private const int OTH = 1; // other
      private const int ASC = 2; // ascii capital letter
      private const int ASS = 3; // ascii small letter
      private const int ACV = 4; // accent capital vowel
      private const int ACO = 5; // accent capital other
      private const int ASV = 6; // accent small vowel
      private const int ASO = 7; // accent small other
      private const int CLASS_NUM = 8; // total classes

      // Maps every byte value (0x00 - 0xFF) to one of the classes above.
      private readonly static byte[] Latin1_CharToClass = {
          OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
          OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
          OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 10 - 17
          OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 18 - 1F
          OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 20 - 27
          OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 28 - 2F
          OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 30 - 37
          OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 38 - 3F
          OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 40 - 47
          ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 48 - 4F
          ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, // 50 - 57
          ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, // 58 - 5F
          OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 60 - 67
          ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 68 - 6F
          ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, // 70 - 77
          ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, // 78 - 7F
          OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, // 80 - 87
          OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, // 88 - 8F
          UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 90 - 97
          OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, // 98 - 9F
          OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A0 - A7
          OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // A8 - AF
          OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B0 - B7
          OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // B8 - BF
          ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, // C0 - C7
          ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, // C8 - CF
          ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, // D0 - D7
          ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, // D8 - DF
          ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, // E0 - E7
          ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, // E8 - EF
          ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, // F0 - F7
          ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, // F8 - FF
      };

      /* Likelihood of a (previous class, current class) pair. The table is
         flattened row by row: index = previousClass * CLASS_NUM + currentClass.
         0 : illegal
         1 : very unlikely
         2 : normal
         3 : very likely
      */
      private readonly static byte[] Latin1ClassModel = {
          /*      UDF OTH ASC ASS ACV ACO ASV ASO */
          /*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0,
          /*OTH*/ 0, 3, 3, 3, 3, 3, 3, 3,
          /*ASC*/ 0, 3, 3, 3, 3, 3, 3, 3,
          /*ASS*/ 0, 3, 3, 3, 1, 1, 3, 3,
          /*ACV*/ 0, 3, 3, 3, 1, 2, 1, 2,
          /*ACO*/ 0, 3, 3, 3, 3, 3, 3, 3,
          /*ASV*/ 0, 3, 1, 3, 1, 1, 1, 3,
          /*ASO*/ 0, 3, 1, 3, 1, 1, 3, 3,
      };

      // Class of the most recently processed byte; row index into Latin1ClassModel.
      private byte lastCharClass;

      // freqCounter[k] counts the class pairs seen so far whose likelihood is k.
      private readonly int[] freqCounter = new int[FREQ_CAT_NUM];

      /// <summary>
      /// Creates a prober in its initial (detecting) state.
      /// </summary>
      public Latin1Prober()
      {
          // Use the private helper rather than the virtual Reset() so that a
          // subclass override never runs against a partially constructed object.
          ResetState();
      }

      /// <summary>
      /// Returns the name of the charset this prober reports.
      /// </summary>
      /// <returns>The string "windows-1252".</returns>
      public override string GetCharsetName()
      {
          return "windows-1252";
      }

      /// <summary>
      /// Puts the prober back into the detecting state, sets the previous
      /// character class to OTH and zeroes all likelihood counters.
      /// </summary>
      public override void Reset()
      {
          ResetState();
      }

      /// <summary>
      /// Feeds a chunk of bytes to the prober and updates the likelihood
      /// counters. Processing stops at the first class pair that the model
      /// marks as illegal, in which case the state becomes NotMe.
      /// </summary>
      /// <param name="buf">Buffer holding the input bytes.</param>
      /// <param name="offset">Passed through to FilterWithEnglishLetters; presumably the index of the first byte to examine.</param>
      /// <param name="len">Passed through to FilterWithEnglishLetters; presumably the number of bytes to examine.</param>
      /// <returns>The prober state after the chunk has been processed.</returns>
      public override ProbingState HandleData(byte[] buf, int offset, int len)
      {
          // FilterWithEnglishLetters is defined outside this class; only the
          // bytes it returns are scored here.
          byte[] filtered = FilterWithEnglishLetters(buf, offset, len);

          foreach (byte current in filtered) {
              byte charClass = Latin1_CharToClass[current];
              byte freq = Latin1ClassModel[lastCharClass * CLASS_NUM + charClass];
              if (freq == 0) {
                  // Illegal class pair: the data cannot be this charset.
                  state = ProbingState.NotMe;
                  break;
              }
              freqCounter[freq]++;
              lastCharClass = charClass;
          }
          return state;
      }

      /// <summary>
      /// Computes the confidence from the likelihood counters: the share of
      /// "very likely" pairs minus twenty times the share of "very unlikely"
      /// pairs, clamped at zero and then halved.
      /// </summary>
      /// <returns>
      /// 0.01 when the state is NotMe, 0 when nothing has been counted yet,
      /// otherwise the value described above.
      /// </returns>
      public override float GetConfidence()
      {
          if (state == ProbingState.NotMe) {
              return 0.01f;
          }

          int total = 0;
          for (int i = 0; i < FREQ_CAT_NUM; i++) {
              total += freqCounter[i];
          }
          if (total <= 0) {
              return 0.0f;
          }

          float confidence = freqCounter[FREQ_VERY_LIKELY] * 1.0f / total;
          confidence -= freqCounter[FREQ_VERY_UNLIKELY] * 20.0f / total;

          // lower the confidence of latin1 so that other more accurate detector
          // can take priority.
          return confidence < 0.0f ? 0.0f : confidence * 0.5f;
      }

      /// <summary>
      /// Diagnostic hook; currently does nothing because the output line is
      /// commented out.
      /// </summary>
      public override void DumpStatus()
      {
          //Console.WriteLine(" Latin1Prober: {0} [{1}]", GetConfidence(), GetCharsetName());
      }

      // Shared non-virtual implementation behind the constructor and Reset().
      private void ResetState()
      {
          state = ProbingState.Detecting;
          lastCharClass = OTH;
          Array.Clear(freqCounter, 0, freqCounter.Length);
      }
  }
  162. }