TextEncoding.cs 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. using System;
  2. using System.Text;
  3. using MediaBrowser.Model.IO;
  4. using MediaBrowser.Model.Text;
  5. using System.IO;
  6. using System.Threading;
  7. using System.Threading.Tasks;
  8. using MediaBrowser.Model.MediaInfo;
  9. using MediaBrowser.Model.Logging;
  10. using UniversalDetector;
  11. using NLangDetect.Core;
  12. using MediaBrowser.Model.Serialization;
  13. namespace Emby.Common.Implementations.TextEncoding
  14. {
  /// <summary>
  /// Guesses the character encoding of raw text bytes by combining byte-order-mark
  /// inspection, statistical charset detection and an optional language hint.
  /// </summary>
  public class TextEncoding : ITextEncoding
  {
      private readonly IFileSystem _fileSystem;
      private readonly ILogger _logger;
      private readonly IJsonSerializer _json;

      // Serializes the one-time language detector initialization so that a concurrent
      // caller cannot observe the flag as set while initialization is still running.
      private readonly object _langDetectLock = new object();
      private bool _langDetectInitialized;

      /// <summary>
      /// Creates a new instance.
      /// </summary>
      /// <param name="fileSystem">File system abstraction (stored, not used by the members of this class).</param>
      /// <param name="logger">Logger used for debug output.</param>
      /// <param name="json">Serializer handed to <c>LanguageDetector.Initialize</c> on first use.</param>
      public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
      {
          _fileSystem = fileSystem;
          _logger = logger;
          _json = json;
      }

      /// <summary>
      /// Returns the ASCII encoding.
      /// </summary>
      public Encoding GetASCIIEncoding()
      {
          return Encoding.ASCII;
      }

      /// <summary>
      /// Makes a first guess at the encoding from a leading signature or, failing that,
      /// from <c>TextEncodingDetect</c>.
      /// </summary>
      /// <param name="buffer">The raw bytes to inspect.</param>
      /// <returns>The guessed encoding, or <c>null</c> when nothing could be determined.</returns>
      private Encoding GetInitialEncoding(byte[] buffer)
      {
          if (buffer == null || buffer.Length == 0)
          {
              // Nothing to inspect.
              return null;
          }

          var length = buffer.Length;

          // Every signature test is guarded by a length check so that buffers shorter
          // than the signature do not raise IndexOutOfRangeException.

          // UTF-8 BOM: EF BB BF
          if (length >= 3 && buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
          {
              return Encoding.UTF8;
          }

          // UTF-16 big-endian BOM: FE FF (Encoding.Unicode would be little-endian)
          if (length >= 2 && buffer[0] == 0xfe && buffer[1] == 0xff)
          {
              return Encoding.BigEndianUnicode;
          }

          // UTF-32 big-endian BOM: 00 00 FE FF (Encoding.UTF32 would be little-endian)
          if (length >= 4 && buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
          {
              return new UTF32Encoding(true, true);
          }

          // UTF-7 signature: 2B 2F 76
          if (length >= 3 && buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
          {
              return Encoding.UTF7;
          }

          var result = new TextEncodingDetect().DetectEncoding(buffer, length);

          switch (result)
          {
              case TextEncodingDetect.CharacterEncoding.Ansi:
              case TextEncodingDetect.CharacterEncoding.Ascii:
                  return Encoding.ASCII;
              case TextEncodingDetect.CharacterEncoding.Utf16BeBom:
              case TextEncodingDetect.CharacterEncoding.Utf16BeNoBom:
                  return Encoding.BigEndianUnicode;
              case TextEncodingDetect.CharacterEncoding.Utf16LeBom:
              case TextEncodingDetect.CharacterEncoding.Utf16LeNoBom:
                  return Encoding.Unicode;
              case TextEncodingDetect.CharacterEncoding.Utf8Bom:
              case TextEncodingDetect.CharacterEncoding.Utf8Nobom:
                  return Encoding.UTF8;
              default:
                  return null;
          }
      }

      /// <summary>
      /// Initializes the language detector on first use.
      /// </summary>
      private void EnsureLanguageDetectorInitialized()
      {
          lock (_langDetectLock)
          {
              if (_langDetectInitialized)
              {
                  return;
              }

              // The flag is set before the call so that initialization is attempted only
              // once, even if it throws; the lock keeps other callers waiting until the
              // attempt has finished.
              _langDetectInitialized = true;
              LanguageDetector.Initialize(_json);
          }
      }

      /// <summary>
      /// Determines the name of the character set that <paramref name="bytes"/> are most likely encoded in.
      /// </summary>
      /// <param name="bytes">The raw text bytes.</param>
      /// <param name="language">Optional language code of the text; used as a hint and as a fallback.</param>
      /// <param name="enableLanguageDetection">
      /// When <c>true</c> and no <paramref name="language"/> is supplied, the language is detected from the content.
      /// </param>
      /// <returns>
      /// A charset name such as <c>utf-8</c> or <c>windows-1251</c>, or <c>null</c> when no charset other than
      /// <c>windows-1252</c> was detected and no language is known.
      /// </returns>
      /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is <c>null</c>.</exception>
      public string GetDetectedEncodingName(byte[] bytes, string language, bool enableLanguageDetection)
      {
          if (bytes == null)
          {
              throw new ArgumentNullException("bytes");
          }

          var initialEncoding = GetInitialEncoding(bytes);

          if (initialEncoding != null && initialEncoding.Equals(Encoding.UTF8))
          {
              return "utf-8";
          }

          if (string.IsNullOrWhiteSpace(language) && enableLanguageDetection)
          {
              EnsureLanguageDetectorInitialized();

              language = DetectLanguage(bytes);

              if (!string.IsNullOrWhiteSpace(language))
              {
                  _logger.Debug("Text language detected as {0}", language);
              }
          }

          var charset = DetectCharset(bytes, language);

          if (!string.IsNullOrWhiteSpace(charset))
          {
              if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
              {
                  return "utf-8";
              }

              // A windows-1252 result is not returned directly; the language-based
              // mapping below decides instead.
              if (!string.Equals(charset, "windows-1252", StringComparison.OrdinalIgnoreCase))
              {
                  return charset;
              }
          }

          if (!string.IsNullOrWhiteSpace(language))
          {
              return GetFileCharacterSetFromLanguage(language);
          }

          return null;
      }

      /// <summary>
      /// Tries to detect the language of the text by decoding the bytes with several
      /// encodings in turn (UTF-8, ASCII, UTF-16 little-endian).
      /// </summary>
      /// <param name="bytes">The raw text bytes.</param>
      /// <returns>The detected language, or <c>null</c> when every attempt failed.</returns>
      private string DetectLanguage(byte[] bytes)
      {
          var candidates = new[] { Encoding.UTF8, Encoding.ASCII, Encoding.Unicode };

          foreach (var candidate in candidates)
          {
              try
              {
                  return LanguageDetector.DetectLanguage(candidate.GetString(bytes));
              }
              catch (NLangDetectException)
              {
                  // Best effort: a failure with one decoding simply moves on to the next.
                  _logger.Debug("Language detection failed for text decoded as {0}", candidate.WebName);
              }
          }

          return null;
      }

      /// <summary>
      /// Resolves a charset name to an <see cref="Encoding"/>. If the name is rejected,
      /// a second lookup is made with all hyphens removed.
      /// </summary>
      /// <param name="charset">The charset name.</param>
      /// <returns>The matching encoding.</returns>
      /// <exception cref="ArgumentNullException"><paramref name="charset"/> is null, empty or whitespace.</exception>
      /// <exception cref="ArgumentException">Neither form of the name is a known encoding.</exception>
      public Encoding GetEncodingFromCharset(string charset)
      {
          if (string.IsNullOrWhiteSpace(charset))
          {
              throw new ArgumentNullException("charset");
          }

          _logger.Debug("Getting encoding object for character set: {0}", charset);

          try
          {
              return Encoding.GetEncoding(charset);
          }
          catch (ArgumentException)
          {
              // Retry without hyphens, e.g. "utf-8" -> "utf8".
              charset = charset.Replace("-", string.Empty);
              _logger.Debug("Getting encoding object for character set: {0}", charset);

              return Encoding.GetEncoding(charset);
          }
      }

      /// <summary>
      /// Detects the charset of <paramref name="bytes"/> and returns the corresponding <see cref="Encoding"/>.
      /// </summary>
      /// <param name="bytes">The raw text bytes.</param>
      /// <param name="language">Optional language code of the text.</param>
      /// <param name="enableLanguageDetection">Whether to detect the language when none is supplied.</param>
      /// <returns>The detected encoding.</returns>
      /// <exception cref="ArgumentNullException">
      /// <paramref name="bytes"/> is <c>null</c>, or no charset could be determined
      /// (<see cref="GetDetectedEncodingName"/> returned <c>null</c>).
      /// </exception>
      public Encoding GetDetectedEncoding(byte[] bytes, string language, bool enableLanguageDetection)
      {
          var charset = GetDetectedEncodingName(bytes, language, enableLanguageDetection);

          return GetEncodingFromCharset(charset);
      }

      /// <summary>
      /// Maps a three-letter language code to the Windows code page assumed for it.
      /// </summary>
      /// <param name="language">The language code; matched case-insensitively.</param>
      /// <returns>The charset name; <c>windows-1252</c> for any code not listed.</returns>
      private string GetFileCharacterSetFromLanguage(string language)
      {
          // https://developer.xamarin.com/api/type/System.Text.Encoding/

          // Invariant lower-casing: a culture-sensitive ToLower() maps 'I' to a dotless
          // 'ı' under Turkish cultures, so codes such as "VIE" would never match.
          switch (language.ToLowerInvariant())
          {
              // NOTE(review): Hungarian is usually grouped with the Central European
              // code page (windows-1250); kept as in the original - confirm intent.
              case "hun":
                  return "windows-1252";

              // central european
              case "pol":
              case "cze":
              case "ces":
              case "slo":
              case "srp":
              case "hrv":
              case "rum":
              case "ron":
              case "rup":
              // albanian
              case "alb":
              case "sqi":
              // slovak, slovenian
              case "slk":
              case "slv":
                  return "windows-1250";

              case "ara":
                  return "windows-1256";

              case "heb":
                  return "windows-1255";

              // greek
              case "grc":
              case "gre":
              case "ell":
                  return "windows-1253";

              case "crh":
              case "ota":
              case "tur":
                  return "windows-1254";

              // bulgarian, russian
              case "bul":
              case "bgr":
              case "rus":
                  return "windows-1251";

              case "vie":
                  return "windows-1258";

              case "kor":
                  return "cp949";

              default:
                  return "windows-1252";
          }
      }

      /// <summary>
      /// Runs <c>CharsetDetector</c> over the bytes.
      /// </summary>
      /// <param name="bytes">The raw text bytes.</param>
      /// <param name="language">The known or detected language, if any.</param>
      /// <returns>
      /// The detected charset name, or <c>null</c> when the detector reported
      /// <c>x-mac-cyrillic</c> while a language is known.
      /// </returns>
      private string DetectCharset(byte[] bytes, string language)
      {
          var detector = new CharsetDetector();
          detector.Feed(bytes, 0, bytes.Length);
          detector.DataEnd();

          var charset = detector.Charset;

          // x-mac-cyrillic is often detected incorrectly. When a language is available,
          // discard the result so the caller falls back to the language-based mapping.
          var isMacCyrillic = string.Equals("x-mac-cyrillic", charset, StringComparison.OrdinalIgnoreCase);

          if (isMacCyrillic && !string.IsNullOrWhiteSpace(language))
          {
              return null;
          }

          return charset;
      }
  }
  220. }