// TextEncodingDetect.cs
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Linq;
  4. using System.Threading.Tasks;
  5. namespace Emby.Common.Implementations.TextEncoding
  6. {
  7. // Copyright 2015-2016 Jonathan Bennett <jon@autoitscript.com>
  8. //
  9. // https://www.autoitscript.com
  10. //
  11. // Licensed under the Apache License, Version 2.0 (the "License");
  12. // you may not use this file except in compliance with the License.
  13. // You may obtain a copy of the License at
  14. //
  15. // http://www.apache.org/licenses/LICENSE-2.0
  16. //
  17. // Unless required by applicable law or agreed to in writing, software
  18. // distributed under the License is distributed on an "AS IS" BASIS,
  19. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  20. // See the License for the specific language governing permissions and
  21. // limitations under the License.
  /// <summary>
  /// Heuristic detector for the text encoding of a raw byte buffer. It recognises
  /// UTF-8 and UTF-16 byte order marks, validates BOM-less UTF-8, guesses BOM-less
  /// UTF-16 from newline code units and the distribution of null bytes, and
  /// otherwise reports ANSI or "none" (unknown or binary).
  /// Credit: https://github.com/AutoIt/text-encoding-detect
  /// </summary>
  public class TextEncodingDetect
  {
      // Byte order marks. They never change, so they are shared by all instances.
      private static readonly byte[] Utf16BeBomBytes = { 0xFE, 0xFF };
      private static readonly byte[] Utf16LeBomBytes = { 0xFF, 0xFE };
      private static readonly byte[] Utf8BomBytes = { 0xEF, 0xBB, 0xBF };

      private bool _nullSuggestsBinary = true;
      private double _utf16ExpectedNullPercent = 70;
      private double _utf16UnexpectedNullPercent = 10;

      /// <summary>
      /// The encodings this detector can report.
      /// </summary>
      public enum CharacterEncoding
      {
          /// <summary>Unknown or binary.</summary>
          None,

          /// <summary>Single-byte text using the full 0-255 range.</summary>
          Ansi,

          /// <summary>Text using only the 0-127 range.</summary>
          Ascii,

          /// <summary>UTF-8 with a byte order mark.</summary>
          Utf8Bom,

          /// <summary>UTF-8 without a byte order mark.</summary>
          Utf8Nobom,

          /// <summary>UTF-16 little endian with a byte order mark.</summary>
          Utf16LeBom,

          /// <summary>UTF-16 little endian without a byte order mark.</summary>
          Utf16LeNoBom,

          /// <summary>UTF-16 big endian with a byte order mark.</summary>
          Utf16BeBom,

          /// <summary>UTF-16 big endian without a byte order mark.</summary>
          Utf16BeNoBom
      }

      /// <summary>
      /// Gets or sets whether the presence of nulls in a buffer indicates that the
      /// buffer is binary data rather than text. Defaults to <c>true</c>.
      /// </summary>
      public bool NullSuggestsBinary
      {
          get
          {
              return _nullSuggestsBinary;
          }

          set
          {
              _nullSuggestsBinary = value;
          }
      }

      /// <summary>
      /// Gets or sets the percentage of nulls that must be exceeded on the "null"
      /// side of the byte pairs (odd offsets for little endian, even offsets for
      /// big endian) before BOM-less data is reported as UTF-16. Defaults to 70.
      /// Values that are not strictly between 0 and 100 are ignored.
      /// </summary>
      public double Utf16ExpectedNullPercent
      {
          get
          {
              return _utf16ExpectedNullPercent;
          }

          set
          {
              if (value > 0 && value < 100)
              {
                  _utf16ExpectedNullPercent = value;
              }
          }
      }

      /// <summary>
      /// Gets or sets the percentage of nulls that the opposite side of the byte
      /// pairs must stay below for BOM-less data to be reported as UTF-16.
      /// Defaults to 10. Values that are not strictly between 0 and 100 are ignored.
      /// </summary>
      public double Utf16UnexpectedNullPercent
      {
          get
          {
              return _utf16UnexpectedNullPercent;
          }

          set
          {
              if (value > 0 && value < 100)
              {
                  _utf16UnexpectedNullPercent = value;
              }
          }
      }

      /// <summary>
      /// Gets the BOM length for a given encoding mode.
      /// </summary>
      /// <param name="encoding">The encoding to query.</param>
      /// <returns>
      /// 2 for the UTF-16 BOM encodings, 3 for UTF-8 with BOM and 0 for everything else.
      /// </returns>
      public static int GetBomLengthFromEncodingMode(CharacterEncoding encoding)
      {
          switch (encoding)
          {
              case CharacterEncoding.Utf16BeBom:
              case CharacterEncoding.Utf16LeBom:
                  return 2;

              case CharacterEncoding.Utf8Bom:
                  return 3;

              default:
                  return 0;
          }
      }

      /// <summary>
      /// Checks for a BOM sequence at the start of a byte buffer.
      /// </summary>
      /// <param name="buffer">The byte buffer. A null buffer is treated as empty.</param>
      /// <param name="size">
      /// The number of bytes to inspect. Values larger than the buffer are clamped
      /// to the buffer length; negative values are treated as zero.
      /// </param>
      /// <returns>
      /// <see cref="CharacterEncoding.Utf16LeBom"/>, <see cref="CharacterEncoding.Utf16BeBom"/>,
      /// <see cref="CharacterEncoding.Utf8Bom"/>, or <see cref="CharacterEncoding.None"/> if no BOM is present.
      /// </returns>
      public CharacterEncoding CheckBom(byte[] buffer, int size)
      {
          size = ClampSize(buffer, size);

          // Only the first two bytes are compared for UTF-16, so any buffer that
          // starts with FF FE is reported as UTF-16 little endian.
          if (StartsWith(buffer, size, Utf16LeBomBytes))
          {
              return CharacterEncoding.Utf16LeBom;
          }

          if (StartsWith(buffer, size, Utf16BeBomBytes))
          {
              return CharacterEncoding.Utf16BeBom;
          }

          if (StartsWith(buffer, size, Utf8BomBytes))
          {
              return CharacterEncoding.Utf8Bom;
          }

          return CharacterEncoding.None;
      }

      /// <summary>
      /// Automatically detects the encoding type of a given byte buffer.
      /// The checks run in this order: BOM, valid UTF-8 / ASCII, UTF-16 by newline
      /// code units, UTF-16 by null distribution, and finally ANSI versus binary.
      /// </summary>
      /// <param name="buffer">The byte buffer. A null buffer is treated as empty.</param>
      /// <param name="size">
      /// The number of bytes to inspect. Values larger than the buffer are clamped
      /// to the buffer length; negative values are treated as zero.
      /// </param>
      /// <returns>The detected encoding, or <see cref="CharacterEncoding.None"/> if unknown.</returns>
      public CharacterEncoding DetectEncoding(byte[] buffer, int size)
      {
          size = ClampSize(buffer, size);

          // First check if we have a BOM and return that if so.
          CharacterEncoding encoding = CheckBom(buffer, size);
          if (encoding != CharacterEncoding.None)
          {
              return encoding;
          }

          // Now check for valid UTF-8 (this also reports pure ASCII).
          encoding = CheckUtf8(buffer, size);
          if (encoding != CharacterEncoding.None)
          {
              return encoding;
          }

          // Now try UTF-16, first by newline code units and then by null distribution.
          encoding = CheckUtf16NewlineChars(buffer, size);
          if (encoding != CharacterEncoding.None)
          {
              return encoding;
          }

          encoding = CheckUtf16Ascii(buffer, size);
          if (encoding != CharacterEncoding.None)
          {
              return encoding;
          }

          // ANSI or None (binary) then.
          if (!DoesContainNulls(buffer, size))
          {
              return CharacterEncoding.Ansi;
          }

          // Found a null, so the answer depends on the NullSuggestsBinary preference.
          return _nullSuggestsBinary ? CharacterEncoding.None : CharacterEncoding.Ansi;
      }

      /// <summary>
      /// Restricts a caller-supplied size to the range that can safely be indexed.
      /// </summary>
      /// <param name="buffer">The byte buffer, possibly null.</param>
      /// <param name="size">The caller-supplied size.</param>
      /// <returns>0 for a null buffer or non-positive size, otherwise the smaller of size and the buffer length.</returns>
      private static int ClampSize(byte[] buffer, int size)
      {
          if (buffer == null || size <= 0)
          {
              return 0;
          }

          return size > buffer.Length ? buffer.Length : size;
      }

      /// <summary>
      /// Tests whether the first <paramref name="size"/> bytes of the buffer begin with the given prefix.
      /// </summary>
      /// <param name="buffer">The byte buffer.</param>
      /// <param name="size">The number of valid bytes in the buffer.</param>
      /// <param name="prefix">The byte sequence to look for.</param>
      private static bool StartsWith(byte[] buffer, int size, byte[] prefix)
      {
          if (size < prefix.Length)
          {
              return false;
          }

          for (int i = 0; i < prefix.Length; i++)
          {
              if (buffer[i] != prefix[i])
              {
                  return false;
              }
          }

          return true;
      }

      /// <summary>
      /// Checks if a buffer contains text that looks like UTF-16 by scanning for
      /// newline chars that would be present even in non-English text.
      /// </summary>
      /// <param name="buffer">The byte buffer.</param>
      /// <param name="size">The size of the byte buffer.</param>
      /// <returns>
      /// <see cref="CharacterEncoding.None"/>, <see cref="CharacterEncoding.Utf16LeNoBom"/>
      /// or <see cref="CharacterEncoding.Utf16BeNoBom"/>.
      /// </returns>
      private static CharacterEncoding CheckUtf16NewlineChars(byte[] buffer, int size)
      {
          if (size < 2)
          {
              return CharacterEncoding.None;
          }

          int leControlChars = 0;
          int beControlChars = 0;

          // Walk the buffer in byte pairs; a trailing unpaired byte is ignored.
          for (int pos = 0; pos + 1 < size; pos += 2)
          {
              byte first = buffer[pos];
              byte second = buffer[pos + 1];

              if (first == 0)
              {
                  // 00 0A / 00 0D: newline in big endian byte order.
                  if (second == 0x0a || second == 0x0d)
                  {
                      ++beControlChars;
                  }
              }
              else if (second == 0)
              {
                  // 0A 00 / 0D 00: newline in little endian byte order.
                  if (first == 0x0a || first == 0x0d)
                  {
                      ++leControlChars;
                  }
              }

              // If we are getting both LE and BE control chars then this is not UTF-16.
              if (leControlChars > 0 && beControlChars > 0)
              {
                  return CharacterEncoding.None;
              }
          }

          if (leControlChars > 0)
          {
              return CharacterEncoding.Utf16LeNoBom;
          }

          return beControlChars > 0 ? CharacterEncoding.Utf16BeNoBom : CharacterEncoding.None;
      }

      /// <summary>
      /// Checks if a buffer contains any nulls. Used to check for binary vs text data.
      /// </summary>
      /// <param name="buffer">The byte buffer.</param>
      /// <param name="size">The size of the byte buffer.</param>
      /// <returns><c>true</c> if any of the first <paramref name="size"/> bytes is zero.</returns>
      private static bool DoesContainNulls(byte[] buffer, int size)
      {
          for (int pos = 0; pos < size; pos++)
          {
              if (buffer[pos] == 0)
              {
                  return true;
              }
          }

          return false;
      }

      /// <summary>
      /// Checks if a buffer contains text that looks like UTF-16. This is done based
      /// on the use of nulls, which in ASCII/script-like text can be useful to identify.
      /// </summary>
      /// <param name="buffer">The byte buffer.</param>
      /// <param name="size">The size of the byte buffer.</param>
      /// <returns>
      /// <see cref="CharacterEncoding.None"/>, <see cref="CharacterEncoding.Utf16LeNoBom"/>
      /// or <see cref="CharacterEncoding.Utf16BeNoBom"/>.
      /// </returns>
      private CharacterEncoding CheckUtf16Ascii(byte[] buffer, int size)
      {
          if (size <= 0)
          {
              // Nothing to measure; also avoids dividing by zero below.
              return CharacterEncoding.None;
          }

          int numEvenNulls = 0;
          int numOddNulls = 0;

          for (int pos = 0; pos < size; pos++)
          {
              if (buffer[pos] != 0)
              {
                  continue;
              }

              if ((pos & 1) == 0)
              {
                  numEvenNulls++;
              }
              else
              {
                  numOddNulls++;
              }
          }

          // Each side holds roughly half of the bytes, hence the factor of two.
          double evenNullRatio = numEvenNulls * 2.0 / size;
          double oddNullRatio = numOddNulls * 2.0 / size;
          double expectedNullThreshold = _utf16ExpectedNullPercent / 100.0;
          double unexpectedNullThreshold = _utf16UnexpectedNullPercent / 100.0;

          // Lots of odd nulls, low number of even nulls.
          if (evenNullRatio < unexpectedNullThreshold && oddNullRatio > expectedNullThreshold)
          {
              return CharacterEncoding.Utf16LeNoBom;
          }

          // Lots of even nulls, low number of odd nulls.
          if (oddNullRatio < unexpectedNullThreshold && evenNullRatio > expectedNullThreshold)
          {
              return CharacterEncoding.Utf16BeNoBom;
          }

          // Don't know.
          return CharacterEncoding.None;
      }

      /// <summary>
      /// Checks if a buffer contains valid UTF-8.
      /// </summary>
      /// <param name="buffer">The byte buffer.</param>
      /// <param name="size">The size of the byte buffer.</param>
      /// <returns>
      /// <see cref="CharacterEncoding.None"/> (invalid UTF-8, or a null byte while nulls suggest binary),
      /// <see cref="CharacterEncoding.Utf8Nobom"/> (at least one multi-byte sequence was seen) or
      /// <see cref="CharacterEncoding.Ascii"/> (all data in the 0-127 range).
      /// </returns>
      private CharacterEncoding CheckUtf8(byte[] buffer, int size)
      {
          // UTF8 Valid sequences
          // 0xxxxxxx ASCII
          // 110xxxxx 10xxxxxx 2-byte
          // 1110xxxx 10xxxxxx 10xxxxxx 3-byte
          // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 4-byte
          //
          // Width in UTF8
          // Decimal Width
          // 0-127   1 byte
          // 194-223 2 bytes
          // 224-239 3 bytes
          // 240-244 4 bytes
          //
          // Subsequent chars are in the range 128-191
          bool onlySawAsciiRange = true;
          int pos = 0;

          while (pos < size)
          {
              byte lead = buffer[pos++];

              if (lead == 0 && _nullSuggestsBinary)
              {
                  return CharacterEncoding.None;
              }

              int continuationBytes;
              if (lead <= 127)
              {
                  continuationBytes = 0;
              }
              else if (lead >= 194 && lead <= 223)
              {
                  continuationBytes = 1;
              }
              else if (lead >= 224 && lead <= 239)
              {
                  continuationBytes = 2;
              }
              else if (lead >= 240 && lead <= 244)
              {
                  continuationBytes = 3;
              }
              else
              {
                  return CharacterEncoding.None; // Not UTF-8
              }

              if (continuationBytes > 0)
              {
                  // The lead byte itself is already outside the ASCII range. Recording
                  // that here (rather than on the first continuation byte) keeps a
                  // buffer that ends right after a lead byte from being reported as ASCII.
                  onlySawAsciiRange = false;
              }

              // Check continuation bytes are in range. A sequence cut off by the end
              // of the buffer is tolerated.
              while (continuationBytes > 0 && pos < size)
              {
                  byte next = buffer[pos++];
                  if (next < 128 || next > 191)
                  {
                      return CharacterEncoding.None; // Not UTF-8
                  }

                  --continuationBytes;
              }
          }

          // If we get to here then only valid UTF-8 sequences have been processed.
          // If we only saw chars in the range 0-127 then we can't assume UTF-8
          // (the caller will need to decide).
          return onlySawAsciiRange ? CharacterEncoding.Ascii : CharacterEncoding.Utf8Nobom;
      }
  }
  363. }