// TextEncoding.cs (8.6 KB)
  1. using System;
  2. using System.Text;
  3. using MediaBrowser.Model.IO;
  4. using MediaBrowser.Model.Serialization;
  5. using MediaBrowser.Model.Text;
  6. using Microsoft.Extensions.Logging;
  7. using NLangDetect.Core;
  8. using UniversalDetector;
  9. namespace Emby.Server.Implementations.TextEncoding
  10. {
  /// <summary>
  /// Guesses the character encoding of a byte buffer by combining byte-order-mark sniffing,
  /// <c>TextEncodingDetect</c>, <c>CharsetDetector</c> and, as a last resort, a
  /// language-to-code-page lookup table.
  /// </summary>
  public class TextEncoding : ITextEncoding
  {
      // Encoding.UTF32 is little-endian; the framework offers no static big-endian instance.
      private static readonly Encoding Utf32BigEndian = new UTF32Encoding(true, true);

      // Decodings tried, in this order, when feeding raw bytes to the language detector.
      private static readonly Encoding[] LanguageProbeEncodings =
      {
          Encoding.UTF8,
          Encoding.ASCII,
          Encoding.Unicode
      };

      private readonly IFileSystem _fileSystem;
      private readonly ILogger _logger;
      private readonly IJsonSerializer _json;

      // True once LanguageDetector.Initialize has completed successfully.
      private bool _langDetectInitialized;

      /// <summary>
      /// Initializes a new instance of the <see cref="TextEncoding"/> class.
      /// </summary>
      /// <param name="fileSystem">The file system abstraction (stored, not used by the code in this class).</param>
      /// <param name="logger">The logger used for debug output.</param>
      /// <param name="json">The JSON serializer handed to <c>LanguageDetector.Initialize</c>.</param>
      public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
      {
          _fileSystem = fileSystem;
          _logger = logger;
          _json = json;
      }

      /// <summary>
      /// Gets the ASCII encoding.
      /// </summary>
      /// <returns><see cref="Encoding.ASCII"/>.</returns>
      public Encoding GetASCIIEncoding()
      {
          return Encoding.ASCII;
      }

      /// <summary>
      /// Determines an encoding from a leading byte order mark or, failing that, from
      /// <c>TextEncodingDetect</c>.
      /// </summary>
      /// <param name="buffer">The bytes to inspect.</param>
      /// <param name="count">The number of valid bytes in <paramref name="buffer"/>.</param>
      /// <returns>The detected encoding, or <c>null</c> when the detector reports none of the handled results.</returns>
      private static Encoding GetInitialEncoding(byte[] buffer, int count)
      {
          // UTF-8 BOM: EF BB BF
          if (count >= 3 && buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
          {
              return Encoding.UTF8;
          }

          if (count >= 4)
          {
              // UTF-32 big-endian BOM: 00 00 FE FF
              if (buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
              {
                  return Utf32BigEndian;
              }

              // UTF-32 little-endian BOM: FF FE 00 00. Must be tested before UTF-16 LE,
              // whose BOM (FF FE) is a prefix of this one.
              if (buffer[0] == 0xff && buffer[1] == 0xfe && buffer[2] == 0 && buffer[3] == 0)
              {
                  return Encoding.UTF32;
              }
          }

          if (count >= 2)
          {
              // UTF-16 big-endian BOM: FE FF
              if (buffer[0] == 0xfe && buffer[1] == 0xff)
              {
                  return Encoding.BigEndianUnicode;
              }

              // UTF-16 little-endian BOM: FF FE
              if (buffer[0] == 0xff && buffer[1] == 0xfe)
              {
                  return Encoding.Unicode;
              }
          }

          // UTF-7 signature: 2B 2F 76
          if (count >= 3 && buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
          {
              return Encoding.UTF7;
          }

          var result = new TextEncodingDetect().DetectEncoding(buffer, count);

          switch (result)
          {
              case TextEncodingDetect.CharacterEncoding.Ansi:
              case TextEncodingDetect.CharacterEncoding.Ascii:
                  return Encoding.ASCII;
              case TextEncodingDetect.CharacterEncoding.Utf16BeBom:
              case TextEncodingDetect.CharacterEncoding.Utf16BeNoBom:
                  return Encoding.BigEndianUnicode;
              case TextEncodingDetect.CharacterEncoding.Utf16LeBom:
              case TextEncodingDetect.CharacterEncoding.Utf16LeNoBom:
                  return Encoding.Unicode;
              case TextEncodingDetect.CharacterEncoding.Utf8Bom:
              case TextEncodingDetect.CharacterEncoding.Utf8Nobom:
                  return Encoding.UTF8;
              default:
                  return null;
          }
      }

      /// <summary>
      /// Detects the name of the character set used by the given bytes.
      /// </summary>
      /// <param name="bytes">The bytes to inspect.</param>
      /// <param name="count">The number of bytes of <paramref name="bytes"/> to inspect, starting at index 0.</param>
      /// <param name="language">An optional language code; when blank it may be auto-detected.</param>
      /// <param name="enableLanguageDetection">Whether to detect the language when <paramref name="language"/> is blank.</param>
      /// <returns>
      /// <c>"utf-8"</c> when the content is recognised as UTF-8; otherwise the charset reported by the
      /// charset detector unless that is blank or windows-1252, in which case the code page mapped from
      /// the language; or <c>null</c> when no language is known either.
      /// </returns>
      public string GetDetectedEncodingName(byte[] bytes, int count, string language, bool enableLanguageDetection)
      {
          const int index = 0;

          var encoding = GetInitialEncoding(bytes, count);
          if (encoding != null && encoding.Equals(Encoding.UTF8))
          {
              return "utf-8";
          }

          if (string.IsNullOrWhiteSpace(language) && enableLanguageDetection)
          {
              if (!_langDetectInitialized)
              {
                  // Only mark as initialized once Initialize has returned, so that a
                  // failed initialization is retried on the next call.
                  LanguageDetector.Initialize(_json);
                  _langDetectInitialized = true;
              }

              language = DetectLanguage(bytes, index, count);

              if (!string.IsNullOrWhiteSpace(language))
              {
                  _logger.LogDebug("Text language detected as {0}", language);
              }
          }

          var charset = DetectCharset(bytes, index, count, language);

          if (!string.IsNullOrWhiteSpace(charset))
          {
              if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
              {
                  return "utf-8";
              }

              // A windows-1252 result is not returned directly; the language-based
              // lookup below gets the final say instead.
              if (!string.Equals(charset, "windows-1252", StringComparison.OrdinalIgnoreCase))
              {
                  return charset;
              }
          }

          if (!string.IsNullOrWhiteSpace(language))
          {
              return GetFileCharacterSetFromLanguage(language);
          }

          return null;
      }

      /// <summary>
      /// Runs the language detector over the bytes decoded as UTF-8, then ASCII, then UTF-16 LE.
      /// </summary>
      /// <param name="bytes">The raw bytes.</param>
      /// <param name="index">The index of the first byte to decode.</param>
      /// <param name="count">The number of bytes to decode.</param>
      /// <returns>
      /// The result of the first detection attempt that does not throw an
      /// <c>NLangDetectException</c>, or <c>null</c> when every attempt throws.
      /// </returns>
      private string DetectLanguage(byte[] bytes, int index, int count)
      {
          foreach (var candidate in LanguageProbeEncodings)
          {
              try
              {
                  return LanguageDetector.DetectLanguage(candidate.GetString(bytes, index, count));
              }
              catch (NLangDetectException ex)
              {
                  _logger.LogDebug(ex, "LanguageDetector.DetectLanguage threw a NLangDetectException.");
              }
          }

          return null;
      }

      /// <summary>
      /// Gets the <see cref="Encoding"/> for a character set name.
      /// </summary>
      /// <param name="charset">The character set name, e.g. <c>windows-1250</c>.</param>
      /// <returns>The matching encoding.</returns>
      /// <exception cref="ArgumentNullException"><paramref name="charset"/> is null, empty or whitespace.</exception>
      /// <exception cref="ArgumentException">The name is not recognised, even with its hyphens removed.</exception>
      public Encoding GetEncodingFromCharset(string charset)
      {
          if (string.IsNullOrWhiteSpace(charset))
          {
              throw new ArgumentNullException(nameof(charset));
          }

          _logger.LogDebug("Getting encoding object for character set: {0}", charset);

          try
          {
              return Encoding.GetEncoding(charset);
          }
          catch (ArgumentException)
          {
              // Retry once with the hyphens stripped from the name.
              charset = charset.Replace("-", string.Empty);
              _logger.LogDebug("Getting encoding object for character set: {0}", charset);

              return Encoding.GetEncoding(charset);
          }
      }

      /// <summary>
      /// Detects the encoding used by the given bytes.
      /// </summary>
      /// <param name="bytes">The bytes to inspect.</param>
      /// <param name="size">The number of bytes to inspect.</param>
      /// <param name="language">An optional language code.</param>
      /// <param name="enableLanguageDetection">Whether to detect the language when none is supplied.</param>
      /// <returns>The detected encoding.</returns>
      /// <exception cref="ArgumentNullException">
      /// No character set could be determined (<see cref="GetDetectedEncodingName"/> returned a blank name).
      /// </exception>
      public Encoding GetDetectedEncoding(byte[] bytes, int size, string language, bool enableLanguageDetection)
      {
          var charset = GetDetectedEncodingName(bytes, size, language, enableLanguageDetection);

          return GetEncodingFromCharset(charset);
      }

      /// <summary>
      /// Maps a three-letter language code to a character set name.
      /// </summary>
      /// <param name="language">The language code; matched case-insensitively.</param>
      /// <returns>The mapped character set name, or <c>windows-1252</c> for any unlisted code.</returns>
      private static string GetFileCharacterSetFromLanguage(string language)
      {
          // https://developer.xamarin.com/api/type/System.Text.Encoding/
          // Invariant lower-casing: a culture-sensitive ToLower() turns 'I' into a dotless
          // 'ı' under Turkish cultures, so e.g. "VIE" would no longer match "vie".
          switch (language.ToLowerInvariant())
          {
              case "tha":
                  return "windows-874";

              // NOTE(review): Hungarian is explicitly mapped to the same value as the default;
              // kept as in the original - confirm whether windows-1250 was intended.
              case "hun":
                  return "windows-1252";

              case "pol":
              case "cze":
              case "ces":
              case "slo":
              case "srp":
              case "hrv":
              case "rum":
              case "ron":
              case "rom":
              case "rup":
              // Albanian
              case "alb":
              case "sqi":
              // Slovak (slk) and Slovenian (slv)
              case "slk":
              case "slv":
                  return "windows-1250";

              case "ara":
                  return "windows-1256";

              case "heb":
                  return "windows-1255";

              // Greek
              case "grc":
              case "gre":
              case "ell":
                  return "windows-1253";

              case "crh":
              case "ota":
              case "tur":
                  return "windows-1254";

              // Bulgarian
              case "bul":
              case "bgr":
              case "rus":
                  return "windows-1251";

              case "vie":
                  return "windows-1258";

              case "kor":
                  return "cp949";

              default:
                  return "windows-1252";
          }
      }

      /// <summary>
      /// Runs <c>CharsetDetector</c> over the given bytes.
      /// </summary>
      /// <param name="bytes">The raw bytes.</param>
      /// <param name="index">The index of the first byte to feed to the detector.</param>
      /// <param name="count">The number of bytes to feed to the detector.</param>
      /// <param name="language">The known or detected language, if any.</param>
      /// <returns>
      /// The charset reported by the detector, or <c>null</c> when it reports
      /// <c>x-mac-cyrillic</c> while a language is available.
      /// </returns>
      private static string DetectCharset(byte[] bytes, int index, int count, string language)
      {
          var detector = new CharsetDetector();
          detector.Feed(bytes, index, count);
          detector.DataEnd();

          var charset = detector.Charset;

          // x-mac-cyrillic is often detected incorrectly. When a language is known,
          // discard the result so the caller can fall back to the language-based lookup.
          if (string.Equals("x-mac-cyrillic", charset, StringComparison.OrdinalIgnoreCase)
              && !string.IsNullOrWhiteSpace(language))
          {
              return null;
          }

          return charset;
      }
  }
  235. }