// SBCharsetProber.cs

  /* ***** BEGIN LICENSE BLOCK *****
  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
  *
  * The contents of this file are subject to the Mozilla Public License Version
  * 1.1 (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  * http://www.mozilla.org/MPL/
  *
  * Software distributed under the License is distributed on an "AS IS" basis,
  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  * for the specific language governing rights and limitations under the
  * License.
  *
  * The Original Code is Mozilla Universal charset detector code.
  *
  * The Initial Developer of the Original Code is
  * Netscape Communications Corporation.
  * Portions created by the Initial Developer are Copyright (C) 2001
  * the Initial Developer. All Rights Reserved.
  *
  * Contributor(s):
  * Shy Shalom <shooshX@gmail.com>
  * Rudi Pettazzi <rudi.pettazzi@gmail.com> (C# port)
  *
  * Alternatively, the contents of this file may be used under the terms of
  * either the GNU General Public License Version 2 or later (the "GPL"), or
  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  * in which case the provisions of the GPL or the LGPL are applicable instead
  * of those above. If you wish to allow use of your version of this file only
  * under the terms of either the GPL or the LGPL, and not to allow others to
  * use your version of this file under the terms of the MPL, indicate your
  * decision by deleting the provisions above and replace them with the notice
  * and other provisions required by the GPL or the LGPL. If you do not delete
  * the provisions above, a recipient may use your version of this file under
  * the terms of any one of the MPL, the GPL or the LGPL.
  *
  * ***** END LICENSE BLOCK ***** */
  using System;

  namespace UniversalDetector.Core
  {
      /// <summary>
      /// Charset prober that scores a byte stream against a
      /// <see cref="SequenceModel"/>. Each byte is mapped to an "order" by the
      /// model; pairs of consecutive orders inside the sampling range are looked
      /// up in the model's precedence table and tallied per category. The share
      /// of pairs that land in the highest category drives the confidence.
      /// </summary>
      public class SingleByteCharSetProber : CharsetProber
      {
          // Orders below this value take part in pair (sequence) sampling.
          private const int SampleSize = 64;

          // Number of tallied pairs that must be exceeded before HandleData
          // tries to reach an early FoundIt / NotMe decision.
          private const int EnoughRelThreshold = 1024;

          // Confidence above this value short-circuits to FoundIt.
          private const float PositiveShortcutThreshold = 0.95f;

          // Confidence below this value short-circuits to NotMe.
          private const float NegativeShortcutThreshold = 0.05f;

          // Orders below this value are counted in totalChar.
          // NOTE(review): judging by the name, orders at or above it are
          // symbol-like categories - confirm against SequenceModel.
          private const int SymbolCatOrder = 250;

          // Number of precedence categories a pair can be classified into.
          private const int NumberOfSeqCat = 4;

          // Highest category index; its counter feeds GetConfidence.
          private const int PositiveCat = NumberOfSeqCat - 1;

          // Lowest category index (not read by the current confidence formula).
          private const int NegativeCat = 0;

          // Model supplying byte orders, pair precedences, the typical positive
          // ratio and the charset name.
          protected SequenceModel model;

          // When true, the two orders of every pair are swapped for the
          // precedence lookup.
          private bool reversed;

          // Order of the previously processed byte.
          private byte lastOrder;

          // Number of pairs tallied so far.
          private int totalSeqs;

          // Number of bytes whose order was below SymbolCatOrder.
          private int totalChar;

          // One tally per precedence category.
          private readonly int[] seqCounters = new int[NumberOfSeqCat];

          // Number of bytes whose order was inside the sampling range.
          private int freqChar;

          // Optional auxiliary prober consulted only for the charset name.
          // Per the original author's note it is created and destroyed by the
          // GroupProber, not by this class.
          private CharsetProber nameProber;

          /// <summary>
          /// Creates a non-reversed prober for <paramref name="model"/> with no
          /// auxiliary name prober.
          /// </summary>
          /// <param name="model">Sequence model to score input against.</param>
          public SingleByteCharSetProber(SequenceModel model)
              : this(model, false, null)
          {
          }

          /// <summary>
          /// Creates a prober for <paramref name="model"/> and resets its counters.
          /// </summary>
          /// <param name="model">Sequence model to score input against.</param>
          /// <param name="reversed">
          /// True to swap the two orders of each pair before the precedence lookup.
          /// </param>
          /// <param name="nameProber">
          /// Prober whose charset name is reported instead of the model's; may be null.
          /// </param>
          public SingleByteCharSetProber(SequenceModel model, bool reversed,
                                         CharsetProber nameProber)
          {
              this.model = model;
              this.reversed = reversed;
              this.nameProber = nameProber;
              Reset();
          }

          /// <summary>
          /// Feeds <paramref name="len"/> bytes of <paramref name="buf"/>, starting
          /// at <paramref name="offset"/>, into the counters and, once more than
          /// <c>EnoughRelThreshold</c> pairs were tallied, tries to settle the state early.
          /// </summary>
          /// <param name="buf">Buffer holding the bytes to analyse.</param>
          /// <param name="offset">Index of the first byte to analyse.</param>
          /// <param name="len">Number of bytes to analyse.</param>
          /// <returns>The probing state after the bytes were processed.</returns>
          public override ProbingState HandleData(byte[] buf, int offset, int len)
          {
              int end = offset + len;

              for (int pos = offset; pos < end; pos++)
              {
                  byte order = model.GetOrder(buf[pos]);

                  if (order < SymbolCatOrder)
                  {
                      totalChar++;
                  }

                  if (order < SampleSize)
                  {
                      freqChar++;

                      // A pair is only tallied when the previous byte was in
                      // the sampling range as well.
                      if (lastOrder < SampleSize)
                      {
                          totalSeqs++;

                          int pairIndex = reversed
                              ? order * SampleSize + lastOrder
                              : lastOrder * SampleSize + order;

                          seqCounters[model.GetPrecedence(pairIndex)]++;
                      }
                  }

                  lastOrder = order;
              }

              // No shortcut unless we are still undecided and have seen enough pairs.
              if (state != ProbingState.Detecting || totalSeqs <= EnoughRelThreshold)
              {
                  return state;
              }

              float confidence = GetConfidence();

              if (confidence > PositiveShortcutThreshold)
              {
                  state = ProbingState.FoundIt;
              }
              else if (confidence < NegativeShortcutThreshold)
              {
                  state = ProbingState.NotMe;
              }

              return state;
          }

          /// <summary>
          /// Debugging hook; currently does nothing.
          /// </summary>
          public override void DumpStatus()
          {
              //Console.WriteLine(" SBCS: {0} [{1}]", GetConfidence(), GetCharsetName());
          }

          /// <summary>
          /// Computes the confidence from the share of pairs in the highest
          /// precedence category, normalised by the model's typical positive
          /// ratio and scaled by the fraction of counted characters that fell
          /// inside the sampling range.
          /// </summary>
          /// <returns>
          /// 0.01 when no pair has been tallied yet; otherwise the computed
          /// value, replaced by 0.99 when it reaches 1.0 or more.
          /// </returns>
          public override float GetConfidence()
          {
              if (totalSeqs <= 0)
              {
                  return 0.01f;
              }

              float confidence = 1.0f * seqCounters[PositiveCat] / totalSeqs / model.TypicalPositiveRatio;
              confidence = confidence * freqChar / totalChar;

              if (confidence >= 1.0f)
              {
                  confidence = 0.99f;
              }

              return confidence;
          }

          /// <summary>
          /// Returns the prober to its initial Detecting state and zeroes every counter.
          /// </summary>
          public override void Reset()
          {
              state = ProbingState.Detecting;

              // 255 is outside the sampling range, so the first byte after a
              // reset never forms a pair with a stale predecessor.
              lastOrder = 255;

              Array.Clear(seqCounters, 0, NumberOfSeqCat);
              totalSeqs = 0;
              totalChar = 0;
              freqChar = 0;
          }

          /// <summary>
          /// Gets the charset name reported by this prober.
          /// </summary>
          /// <returns>
          /// The auxiliary name prober's charset name when one was supplied,
          /// otherwise the model's charset name.
          /// </returns>
          public override string GetCharsetName()
          {
              if (nameProber == null)
              {
                  return model.CharsetName;
              }

              return nameProber.GetCharsetName();
          }
      }
  }