TextEncoding.cs 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265
  1. using System;
  2. using System.Text;
  3. using MediaBrowser.Model.IO;
  4. using MediaBrowser.Model.Logging;
  5. using MediaBrowser.Model.Serialization;
  6. using MediaBrowser.Model.Text;
  7. using NLangDetect.Core;
  8. using UniversalDetector;
  9. namespace Emby.Server.Implementations.TextEncoding
  10. {
  11. public class TextEncoding : ITextEncoding
  12. {
  // File-system abstraction supplied by the host. It is stored here but not referenced
  // by any other member in this file.
  private readonly IFileSystem _fileSystem;

  // Receives the debug messages written by this class.
  private readonly ILogger _logger;

  // Serializer handed to LanguageDetector.Initialize the first time language detection runs.
  // Only ever assigned in the constructor, hence readonly like the other dependencies.
  private readonly IJsonSerializer _json;

  /// <summary>
  /// Initializes a new instance of the <see cref="TextEncoding"/> class.
  /// </summary>
  /// <param name="fileSystem">The file system abstraction.</param>
  /// <param name="logger">The logger used for diagnostic output.</param>
  /// <param name="json">The JSON serializer passed on to the language detector.</param>
  public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
  {
      _fileSystem = fileSystem;
      _logger = logger;
      _json = json;
  }
  /// <summary>
  /// Gets the ASCII encoding.
  /// </summary>
  /// <returns>The value of <see cref="Encoding.ASCII"/>.</returns>
  public Encoding GetASCIIEncoding()
  {
      Encoding ascii = Encoding.ASCII;
      return ascii;
  }
  /// <summary>
  /// Makes a first guess at the encoding of a buffer. Byte-order marks are checked first;
  /// if none is present the bytes are handed to <see cref="TextEncodingDetect"/>.
  /// </summary>
  /// <param name="buffer">The bytes to inspect.</param>
  /// <param name="count">The number of bytes at the start of <paramref name="buffer"/> to consider.</param>
  /// <returns>
  /// The recognized encoding, or <c>null</c> when the detector reports a result that is not mapped here.
  /// </returns>
  private Encoding GetInitialEncoding(byte[] buffer, int count)
  {
      // UTF-8 BOM: EF BB BF
      if (count >= 3 && buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
      {
          return Encoding.UTF8;
      }

      // The UTF-32 marks are tested before the UTF-16 ones because the UTF-32 LE mark
      // (FF FE 00 00) begins with the UTF-16 LE mark (FF FE).
      if (count >= 4)
      {
          // UTF-32 big-endian BOM: 00 00 FE FF. Encoding.UTF32 is little-endian, so build the BE variant.
          if (buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
          {
              return new UTF32Encoding(true, true);
          }

          // UTF-32 little-endian BOM: FF FE 00 00
          if (buffer[0] == 0xff && buffer[1] == 0xfe && buffer[2] == 0 && buffer[3] == 0)
          {
              return Encoding.UTF32;
          }
      }

      if (count >= 2)
      {
          // UTF-16 big-endian BOM: FE FF
          if (buffer[0] == 0xfe && buffer[1] == 0xff)
          {
              return Encoding.BigEndianUnicode;
          }

          // UTF-16 little-endian BOM: FF FE
          if (buffer[0] == 0xff && buffer[1] == 0xfe)
          {
              return Encoding.Unicode;
          }
      }

      // UTF-7 signature: 2B 2F 76
      if (count >= 3 && buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
      {
          return Encoding.UTF7;
      }

      // No byte-order mark: fall back to content-based detection.
      var result = new TextEncodingDetect().DetectEncoding(buffer, count);

      switch (result)
      {
          // NOTE(review): "Ansi" presumably means an unspecified 8-bit code page; it is mapped to
          // ASCII here, and the only caller in this file merely tests the result for UTF-8.
          case TextEncodingDetect.CharacterEncoding.Ansi:
          case TextEncodingDetect.CharacterEncoding.Ascii:
              return Encoding.ASCII;

          case TextEncodingDetect.CharacterEncoding.Utf16BeBom:
          case TextEncodingDetect.CharacterEncoding.Utf16BeNoBom:
              return Encoding.BigEndianUnicode;

          case TextEncodingDetect.CharacterEncoding.Utf16LeBom:
          case TextEncodingDetect.CharacterEncoding.Utf16LeNoBom:
              return Encoding.Unicode;

          case TextEncodingDetect.CharacterEncoding.Utf8Bom:
          case TextEncodingDetect.CharacterEncoding.Utf8Nobom:
              return Encoding.UTF8;

          default:
              return null;
      }
  }
  // Raised once LanguageDetector.Initialize has been invoked, so later calls skip it.
  private bool _langDetectInitialized;

  /// <summary>
  /// Determines the name of the character set used by a block of text bytes.
  /// </summary>
  /// <param name="bytes">The bytes to analyze.</param>
  /// <param name="count">The number of bytes, counted from the start of <paramref name="bytes"/>, to analyze.</param>
  /// <param name="language">The language of the text, if known; may be null or blank.</param>
  /// <param name="enableLanguageDetection">
  /// Whether the language should be detected from the bytes when <paramref name="language"/> is blank.
  /// </param>
  /// <returns>
  /// "utf-8", the charset reported by the detector, a charset derived from the language,
  /// or <c>null</c> when nothing usable was found and no language is available.
  /// </returns>
  public string GetDetectedEncodingName(byte[] bytes, int count, string language, bool enableLanguageDetection)
  {
      const int start = 0;

      // A UTF-8 result from the initial check settles the question immediately.
      var initial = GetInitialEncoding(bytes, count);
      if (initial != null && initial.Equals(Encoding.UTF8))
      {
          return "utf-8";
      }

      var languageKnown = !string.IsNullOrWhiteSpace(language);

      if (enableLanguageDetection && !languageKnown)
      {
          if (!_langDetectInitialized)
          {
              // The flag is raised before initializing, so a failed initialization is not retried.
              _langDetectInitialized = true;
              LanguageDetector.Initialize(_json);
          }

          language = DetectLanguage(bytes, start, count);
          languageKnown = !string.IsNullOrWhiteSpace(language);

          if (languageKnown)
          {
              _logger.Debug("Text language detected as {0}", language);
          }
      }

      var detected = DetectCharset(bytes, start, count, language);
      var detectedUsable = !string.IsNullOrWhiteSpace(detected);

      if (detectedUsable && string.Equals(detected, "utf-8", StringComparison.OrdinalIgnoreCase))
      {
          return "utf-8";
      }

      // A windows-1252 result is not returned directly; the language-based mapping below is consulted instead.
      if (detectedUsable && !string.Equals(detected, "windows-1252", StringComparison.OrdinalIgnoreCase))
      {
          return detected;
      }

      return languageKnown ? GetFileCharacterSetFromLanguage(language) : null;
  }
  /// <summary>
  /// Tries to detect the language of the text contained in a byte range.
  /// </summary>
  /// <param name="bytes">The buffer holding the text.</param>
  /// <param name="index">The offset of the first byte to decode.</param>
  /// <param name="count">The number of bytes to decode.</param>
  /// <returns>
  /// The value returned by <c>LanguageDetector.DetectLanguage</c> for the first decoding that does
  /// not raise <see cref="NLangDetectException"/>, or <c>null</c> when every attempt fails.
  /// </returns>
  private string DetectLanguage(byte[] bytes, int index, int count)
  {
      // The bytes are decoded with each of these in turn: UTF-8, then ASCII, then UTF-16 LE.
      var candidates = new Encoding[] { Encoding.UTF8, Encoding.ASCII, Encoding.Unicode };

      foreach (var candidate in candidates)
      {
          try
          {
              return LanguageDetector.DetectLanguage(candidate.GetString(bytes, index, count));
          }
          catch (NLangDetectException ex)
          {
              // Detection is best-effort: record the failure and move on to the next decoding.
              _logger.Debug("Language detection failed using {0} decoding: {1}", candidate.WebName, ex.Message);
          }
      }

      return null;
  }
  /// <summary>
  /// Resolves a character set name to an <see cref="Encoding"/> instance.
  /// </summary>
  /// <param name="charset">The character set name, for example "windows-1251".</param>
  /// <returns>The encoding registered under the given name.</returns>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="charset"/> is null, empty or consists only of white space.
  /// </exception>
  /// <exception cref="ArgumentException">
  /// The name is not recognized, neither as given nor with its hyphens removed.
  /// </exception>
  public Encoding GetEncodingFromCharset(string charset)
  {
      if (string.IsNullOrWhiteSpace(charset))
      {
          throw new ArgumentNullException("charset");
      }

      _logger.Debug("Getting encoding object for character set: {0}", charset);

      Encoding resolved;

      try
      {
          resolved = Encoding.GetEncoding(charset);
      }
      catch (ArgumentException)
      {
          // Second attempt with the hyphens stripped from the name; a failure here propagates to the caller.
          var compactName = charset.Replace("-", string.Empty);
          _logger.Debug("Getting encoding object for character set: {0}", compactName);
          resolved = Encoding.GetEncoding(compactName);
      }

      return resolved;
  }
  /// <summary>
  /// Detects the character set of a block of text bytes and returns the matching encoding.
  /// </summary>
  /// <param name="bytes">The bytes to analyze.</param>
  /// <param name="size">The number of bytes to analyze.</param>
  /// <param name="language">The language of the text, if known; may be null or blank.</param>
  /// <param name="enableLanguageDetection">Whether the language may be detected from the bytes.</param>
  /// <returns>The encoding for the detected character set.</returns>
  /// <remarks>
  /// The detected name is passed unchecked to <see cref="GetEncodingFromCharset"/>, which throws
  /// <see cref="ArgumentNullException"/> for a null or blank name, so that exception surfaces here
  /// when no character set could be determined.
  /// </remarks>
  public Encoding GetDetectedEncoding(byte[] bytes, int size, string language, bool enableLanguageDetection)
  {
      return GetEncodingFromCharset(
          GetDetectedEncodingName(bytes, size, language, enableLanguageDetection));
  }
  /// <summary>
  /// Maps a three-letter language code to the name of the character set assumed for text in that language.
  /// </summary>
  /// <param name="language">The language code; matched case-insensitively. Must not be null.</param>
  /// <returns>The character set name; "windows-1252" for any code that is not listed.</returns>
  private string GetFileCharacterSetFromLanguage(string language)
  {
      // https://developer.xamarin.com/api/type/System.Text.Encoding/

      // Invariant lower-casing: a culture-sensitive ToLower() turns 'I' into a dotless 'ı' under
      // Turkish cultures, which would stop codes such as "VIE" or "SQI" from matching.
      switch (language.ToLowerInvariant())
      {
          // NOTE(review): Hungarian is normally grouped with the Central European (windows-1250)
          // languages; confirm that windows-1252 is intended here.
          case "hun":
              return "windows-1252";

          case "pol":
          case "cze":
          case "ces":
          case "slo":
          case "srp":
          case "hrv":
          case "rum":
          case "ron":
          case "rup":
          // albanian
          case "alb":
          case "sqi":
          // slovak
          case "slk":
          case "slv":
              return "windows-1250";

          case "ara":
              return "windows-1256";

          case "heb":
              return "windows-1255";

          case "grc":
          // greek
          case "gre":
          case "ell":
              return "windows-1253";

          case "crh":
          case "ota":
          case "tur":
              return "windows-1254";

          // bulgarian
          case "bul":
          case "bgr":
          case "rus":
              return "windows-1251";

          case "vie":
              return "windows-1258";

          case "kor":
              return "cp949";

          default:
              return "windows-1252";
      }
  }
  /// <summary>
  /// Runs the <see cref="CharsetDetector"/> over a byte range and returns the charset it reports.
  /// </summary>
  /// <param name="bytes">The buffer holding the text.</param>
  /// <param name="index">The offset of the first byte to feed to the detector.</param>
  /// <param name="count">The number of bytes to feed to the detector.</param>
  /// <param name="language">The language of the text, if known; may be null or blank.</param>
  /// <returns>
  /// The detector's charset, or <c>null</c> when it reports "x-mac-cyrillic" while a language is known.
  /// </returns>
  private string DetectCharset(byte[] bytes, int index, int count, string language)
  {
      var detector = new CharsetDetector();
      detector.Feed(bytes, index, count);
      detector.DataEnd();

      var detected = detector.Charset;

      // x-mac-cyrillic is often detected incorrectly. When a language is available the result is
      // discarded so that the caller can use other techniques instead.
      var isMacCyrillic = string.Equals("x-mac-cyrillic", detected, StringComparison.OrdinalIgnoreCase);
      var languageKnown = !string.IsNullOrWhiteSpace(language);

      return (isMacCyrillic && languageKnown) ? null : detected;
  }
  228. }
  229. }