TextEncodingDetect.cs 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409
  1. namespace Emby.Server.Implementations.TextEncoding
  2. {
  3. // Copyright 2015-2016 Jonathan Bennett <jon@autoitscript.com>
  4. //
  5. // https://www.autoitscript.com
  6. //
  7. // Licensed under the Apache License, Version 2.0 (the "License");
  8. // you may not use this file except in compliance with the License.
  9. // You may obtain a copy of the License at
  10. //
  11. // http://www.apache.org/licenses/LICENSE-2.0
  12. //
  13. // Unless required by applicable law or agreed to in writing, software
  14. // distributed under the License is distributed on an "AS IS" BASIS,
  15. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16. // See the License for the specific language governing permissions and
  17. // limitations under the License.
  /// <summary>
  /// Heuristically classifies a byte buffer as ASCII, ANSI, UTF-8, UTF-16 (LE/BE) or binary.
  /// A byte order mark is honoured first; otherwise the content is inspected.
  /// Credit: https://github.com/AutoIt/text-encoding-detect
  /// </summary>
  /// <remarks>
  /// All methods taking <c>buffer</c> and <c>size</c> expect <c>buffer</c> to hold at least
  /// <c>size</c> bytes; no argument validation is performed beyond what the runtime does
  /// when indexing the array.
  /// </remarks>
  public class TextEncodingDetect
  {
      // Byte order marks. These never change, so they are shared by all instances.
      private static readonly byte[] Utf16BeBomBytes = { 0xFE, 0xFF };
      private static readonly byte[] Utf16LeBomBytes = { 0xFF, 0xFE };
      private static readonly byte[] Utf8BomBytes = { 0xEF, 0xBB, 0xBF };

      private bool _nullSuggestsBinary = true;
      private double _utf16ExpectedNullPercent = 70;
      private double _utf16UnexpectedNullPercent = 10;

      /// <summary>
      /// The classifications that the detector can report.
      /// </summary>
      public enum CharacterEncoding
      {
          None, // Unknown or binary
          Ansi, // 0-255
          Ascii, // 0-127
          Utf8Bom, // UTF8 with BOM
          Utf8Nobom, // UTF8 without BOM
          Utf16LeBom, // UTF16 LE with BOM
          Utf16LeNoBom, // UTF16 LE without BOM
          Utf16BeBom, // UTF16-BE with BOM
          Utf16BeNoBom // UTF16-BE without BOM
      }

      /// <summary>
      /// Gets or sets a value indicating whether a null byte in the buffer marks it as
      /// binary data rather than text. Defaults to <c>true</c>.
      /// </summary>
      public bool NullSuggestsBinary
      {
          get
          {
              return _nullSuggestsBinary;
          }

          set
          {
              _nullSuggestsBinary = value;
          }
      }

      /// <summary>
      /// Gets or sets the percentage of null bytes that must be exceeded in the "null" half of
      /// each byte pair for ASCII-like text to be treated as UTF-16. Defaults to 70.
      /// Values outside the open interval (0, 100) are ignored and the previous value is kept.
      /// </summary>
      public double Utf16ExpectedNullPercent
      {
          get
          {
              return _utf16ExpectedNullPercent;
          }

          set
          {
              if (value > 0 && value < 100)
              {
                  _utf16ExpectedNullPercent = value;
              }
          }
      }

      /// <summary>
      /// Gets or sets the percentage of null bytes that the "non-null" half of each byte pair
      /// must stay below for ASCII-like text to be treated as UTF-16. Defaults to 10.
      /// Values outside the open interval (0, 100) are ignored and the previous value is kept.
      /// </summary>
      public double Utf16UnexpectedNullPercent
      {
          get
          {
              return _utf16UnexpectedNullPercent;
          }

          set
          {
              if (value > 0 && value < 100)
              {
                  _utf16UnexpectedNullPercent = value;
              }
          }
      }

      /// <summary>
      /// Gets the BOM length for a given Encoding mode.
      /// </summary>
      /// <param name="encoding">The detected encoding.</param>
      /// <returns>2 for the UTF-16 BOM variants, 3 for UTF-8 with BOM, otherwise 0.</returns>
      public static int GetBomLengthFromEncodingMode(CharacterEncoding encoding)
      {
          switch (encoding)
          {
              case CharacterEncoding.Utf16BeBom:
              case CharacterEncoding.Utf16LeBom:
                  return 2;
              case CharacterEncoding.Utf8Bom:
                  return 3;
              default:
                  return 0;
          }
      }

      /// <summary>
      /// Checks for a BOM sequence at the start of a byte buffer.
      /// </summary>
      /// <param name="buffer">The byte buffer.</param>
      /// <param name="size">The number of valid bytes in the buffer.</param>
      /// <returns>
      /// <see cref="CharacterEncoding.Utf16LeBom"/>, <see cref="CharacterEncoding.Utf16BeBom"/> or
      /// <see cref="CharacterEncoding.Utf8Bom"/> when the matching mark is present,
      /// otherwise <see cref="CharacterEncoding.None"/>.
      /// </returns>
      public CharacterEncoding CheckBom(byte[] buffer, int size)
      {
          if (StartsWith(buffer, size, Utf16LeBomBytes))
          {
              return CharacterEncoding.Utf16LeBom;
          }

          if (StartsWith(buffer, size, Utf16BeBomBytes))
          {
              return CharacterEncoding.Utf16BeBom;
          }

          if (StartsWith(buffer, size, Utf8BomBytes))
          {
              return CharacterEncoding.Utf8Bom;
          }

          return CharacterEncoding.None;
      }

      /// <summary>
      /// Automatically detects the Encoding type of a given byte buffer.
      /// </summary>
      /// <remarks>
      /// Checks run in this order: BOM, UTF-8/ASCII validity, UTF-16 newline pairs,
      /// UTF-16 null distribution. If none matches, the buffer is ANSI unless it contains
      /// a null byte and <see cref="NullSuggestsBinary"/> is set.
      /// </remarks>
      /// <param name="buffer">The byte buffer.</param>
      /// <param name="size">The number of valid bytes in the buffer.</param>
      /// <returns>The Encoding type or <see cref="CharacterEncoding.None"/> if unknown or binary.</returns>
      public CharacterEncoding DetectEncoding(byte[] buffer, int size)
      {
          // First check if we have a BOM and return that if so
          CharacterEncoding encoding = CheckBom(buffer, size);
          if (encoding != CharacterEncoding.None)
          {
              return encoding;
          }

          // Now check for valid UTF8 (this also reports plain ASCII)
          encoding = CheckUtf8(buffer, size);
          if (encoding != CharacterEncoding.None)
          {
              return encoding;
          }

          // Now try UTF16, first via newline pairs, then via the distribution of nulls
          encoding = CheckUtf16NewlineChars(buffer, size);
          if (encoding != CharacterEncoding.None)
          {
              return encoding;
          }

          encoding = CheckUtf16Ascii(buffer, size);
          if (encoding != CharacterEncoding.None)
          {
              return encoding;
          }

          // ANSI or None (binary) then
          if (!DoesContainNulls(buffer, size))
          {
              return CharacterEncoding.Ansi;
          }

          // Found a null, return based on the NullSuggestsBinary preference
          return _nullSuggestsBinary ? CharacterEncoding.None : CharacterEncoding.Ansi;
      }

      /// <summary>
      /// Tests whether the first bytes of a buffer equal a given byte sequence.
      /// </summary>
      /// <param name="buffer">The byte buffer.</param>
      /// <param name="size">The number of valid bytes in the buffer.</param>
      /// <param name="prefix">The byte sequence to look for.</param>
      /// <returns><c>true</c> if the buffer is long enough and starts with the sequence.</returns>
      private static bool StartsWith(byte[] buffer, int size, byte[] prefix)
      {
          if (size < prefix.Length)
          {
              return false;
          }

          for (int i = 0; i < prefix.Length; i++)
          {
              if (buffer[i] != prefix[i])
              {
                  return false;
              }
          }

          return true;
      }

      /// <summary>
      /// Checks if a buffer contains text that looks like utf16 by scanning for
      /// newline chars that would be present even in non-english text.
      /// </summary>
      /// <param name="buffer">The byte buffer.</param>
      /// <param name="size">The number of valid bytes in the buffer.</param>
      /// <returns>
      /// <see cref="CharacterEncoding.Utf16LeNoBom"/> or <see cref="CharacterEncoding.Utf16BeNoBom"/>
      /// when CR/LF pairs of exactly one byte order are found, otherwise <see cref="CharacterEncoding.None"/>.
      /// </returns>
      private static CharacterEncoding CheckUtf16NewlineChars(byte[] buffer, int size)
      {
          if (size < 2)
          {
              return CharacterEncoding.None;
          }

          int leControlChars = 0;
          int beControlChars = 0;

          // Walk whole byte pairs only; a trailing odd byte is ignored.
          for (int i = 0; i + 1 < size; i += 2)
          {
              byte first = buffer[i];
              byte second = buffer[i + 1];

              if (first == 0)
              {
                  if (second == 0x0a || second == 0x0d)
                  {
                      ++beControlChars;
                  }
              }
              else if (second == 0)
              {
                  if (first == 0x0a || first == 0x0d)
                  {
                      ++leControlChars;
                  }
              }

              // If we are getting both LE and BE control chars then this file is not utf16
              if (leControlChars > 0 && beControlChars > 0)
              {
                  return CharacterEncoding.None;
              }
          }

          if (leControlChars > 0)
          {
              return CharacterEncoding.Utf16LeNoBom;
          }

          return beControlChars > 0 ? CharacterEncoding.Utf16BeNoBom : CharacterEncoding.None;
      }

      /// <summary>
      /// Checks if a buffer contains any nulls. Used to check for binary vs text data.
      /// </summary>
      /// <param name="buffer">The byte buffer.</param>
      /// <param name="size">The number of valid bytes in the buffer.</param>
      /// <returns><c>true</c> if any of the first <paramref name="size"/> bytes is zero.</returns>
      private static bool DoesContainNulls(byte[] buffer, int size)
      {
          for (int i = 0; i < size; i++)
          {
              if (buffer[i] == 0)
              {
                  return true;
              }
          }

          return false;
      }

      /// <summary>
      /// Checks if a buffer contains text that looks like utf16. This is done based
      /// on the use of nulls which in ASCII/script like text can be useful to identify.
      /// </summary>
      /// <param name="buffer">The byte buffer.</param>
      /// <param name="size">The number of valid bytes in the buffer.</param>
      /// <returns>
      /// <see cref="CharacterEncoding.Utf16LeNoBom"/>, <see cref="CharacterEncoding.Utf16BeNoBom"/>
      /// or <see cref="CharacterEncoding.None"/>.
      /// </returns>
      private CharacterEncoding CheckUtf16Ascii(byte[] buffer, int size)
      {
          // Nothing to measure; also avoids dividing by zero below.
          if (size <= 0)
          {
              return CharacterEncoding.None;
          }

          int numEvenNulls = 0;
          int numOddNulls = 0;

          for (int i = 0; i < size; i++)
          {
              if (buffer[i] != 0)
              {
                  continue;
              }

              if ((i & 1) == 0)
              {
                  numEvenNulls++;
              }
              else
              {
                  numOddNulls++;
              }
          }

          // Each parity covers roughly half of the buffer, hence the factor of two.
          double evenNullRatio = numEvenNulls * 2.0 / size;
          double oddNullRatio = numOddNulls * 2.0 / size;
          double expectedNullThreshold = _utf16ExpectedNullPercent / 100.0;
          double unexpectedNullThreshold = _utf16UnexpectedNullPercent / 100.0;

          // Lots of odd nulls, low number of even nulls
          if (evenNullRatio < unexpectedNullThreshold && oddNullRatio > expectedNullThreshold)
          {
              return CharacterEncoding.Utf16LeNoBom;
          }

          // Lots of even nulls, low number of odd nulls
          if (oddNullRatio < unexpectedNullThreshold && evenNullRatio > expectedNullThreshold)
          {
              return CharacterEncoding.Utf16BeNoBom;
          }

          // Don't know
          return CharacterEncoding.None;
      }

      /// <summary>
      /// Checks if a buffer contains valid utf8.
      /// </summary>
      /// <remarks>
      /// A multi-byte sequence that is cut off by the end of the buffer is tolerated, and
      /// still counts as non-ASCII content.
      /// </remarks>
      /// <param name="buffer">The byte buffer.</param>
      /// <param name="size">The number of valid bytes in the buffer.</param>
      /// <returns>
      /// <see cref="CharacterEncoding.None"/> (invalid UTF8, or a null byte while
      /// <see cref="NullSuggestsBinary"/> is set), <see cref="CharacterEncoding.Utf8Nobom"/>
      /// (valid utf8 multibyte strings) or <see cref="CharacterEncoding.Ascii"/> (data in 0-127 range).
      /// </returns>
      private CharacterEncoding CheckUtf8(byte[] buffer, int size)
      {
          // UTF8 Valid sequences
          // 0xxxxxxx ASCII
          // 110xxxxx 10xxxxxx 2-byte
          // 1110xxxx 10xxxxxx 10xxxxxx 3-byte
          // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 4-byte
          //
          // Width in UTF8
          // Decimal Width
          // 0-127 1 byte
          // 194-223 2 bytes
          // 224-239 3 bytes
          // 240-244 4 bytes
          //
          // Subsequent chars are in the range 128-191
          bool onlySawAsciiRange = true;
          int pos = 0;

          while (pos < size)
          {
              byte lead = buffer[pos++];
              if (lead == 0 && _nullSuggestsBinary)
              {
                  return CharacterEncoding.None;
              }

              int continuationBytes;
              if (lead <= 127)
              {
                  // 1 byte
                  continuationBytes = 0;
              }
              else if (lead >= 194 && lead <= 223)
              {
                  // 2 Byte
                  continuationBytes = 1;
              }
              else if (lead >= 224 && lead <= 239)
              {
                  // 3 Byte
                  continuationBytes = 2;
              }
              else if (lead >= 240 && lead <= 244)
              {
                  // 4 Byte
                  continuationBytes = 3;
              }
              else
              {
                  return CharacterEncoding.None; // Not utf8
              }

              // Record the non-ASCII lead byte here rather than in the loop below: when the
              // lead byte is the last byte of the buffer that loop never runs, and the data
              // must not be reported as pure ASCII.
              if (continuationBytes > 0)
              {
                  onlySawAsciiRange = false;
              }

              // Check secondary chars are in range if we are expecting any
              while (continuationBytes > 0 && pos < size)
              {
                  byte continuation = buffer[pos++];
                  if (continuation < 128 || continuation > 191)
                  {
                      return CharacterEncoding.None; // Not utf8
                  }

                  --continuationBytes;
              }
          }

          // If we get to here then only valid UTF-8 sequences have been processed
          // If we only saw chars in the range 0-127 then we can't assume UTF8 (the caller will need to decide)
          return onlySawAsciiRange ? CharacterEncoding.Ascii : CharacterEncoding.Utf8Nobom;
      }
  }
  359. }