2
0

TextEncoding.cs 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. using System;
  2. using System.Text;
  3. using MediaBrowser.Model.IO;
  4. using MediaBrowser.Model.Logging;
  5. using MediaBrowser.Model.Serialization;
  6. using MediaBrowser.Model.Text;
  7. using NLangDetect.Core;
  8. using UniversalDetector;
  9. namespace Emby.Server.Implementations.TextEncoding
  10. {
  /// <summary>
  /// Determines the character encoding of raw text bytes by combining byte-order-mark
  /// sniffing, <c>TextEncodingDetect</c>, optional language detection and the
  /// <c>CharsetDetector</c> heuristic.
  /// </summary>
  public class TextEncoding : ITextEncoding
  {
      // Injected in the constructor; not read by any member of this class.
      private readonly IFileSystem _fileSystem;
      private readonly ILogger _logger;
      private readonly IJsonSerializer _json;

      // Flipped to true immediately before the first LanguageDetector.Initialize call.
      // NOTE(review): access is not synchronized - confirm callers never race on first use.
      private bool _langDetectInitialized;

      /// <summary>
      /// Creates the encoding helper.
      /// </summary>
      /// <param name="fileSystem">File system abstraction (stored, currently unused here).</param>
      /// <param name="logger">Logger that receives debug messages.</param>
      /// <param name="json">Serializer handed to <c>LanguageDetector.Initialize</c>.</param>
      public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
      {
          _fileSystem = fileSystem;
          _logger = logger;
          _json = json;
      }

      /// <summary>
      /// Returns the ASCII encoding.
      /// </summary>
      /// <returns><see cref="Encoding.ASCII"/>.</returns>
      public Encoding GetASCIIEncoding()
      {
          return Encoding.ASCII;
      }

      /// <summary>
      /// Makes a first guess at the encoding from a byte-order mark, falling back to
      /// <c>TextEncodingDetect</c>.
      /// </summary>
      /// <param name="buffer">The raw bytes to inspect; must not be null.</param>
      /// <returns>The recognised encoding, or null when nothing could be determined.</returns>
      private Encoding GetInitialEncoding(byte[] buffer)
      {
          var length = buffer.Length;

          // An empty buffer has nothing to sniff; indexing it would throw.
          if (length == 0)
          {
              return null;
          }

          // Every probe is guarded by a length check so short inputs cannot
          // trigger an IndexOutOfRangeException.
          if (length >= 3 && buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
          {
              return Encoding.UTF8;
          }

          // FE FF is the big-endian UTF-16 BOM (Encoding.Unicode is little-endian).
          if (length >= 2 && buffer[0] == 0xfe && buffer[1] == 0xff)
          {
              return Encoding.BigEndianUnicode;
          }

          // 00 00 FE FF is the big-endian UTF-32 BOM (Encoding.UTF32 is little-endian).
          if (length >= 4 && buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
          {
              return new UTF32Encoding(true, true);
          }

          if (length >= 3 && buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
          {
              return Encoding.UTF7;
          }

          var result = new TextEncodingDetect().DetectEncoding(buffer, length);

          switch (result)
          {
              // NOTE(review): "Ansi" is reported as ASCII, as before - confirm this is intended.
              case TextEncodingDetect.CharacterEncoding.Ansi:
              case TextEncodingDetect.CharacterEncoding.Ascii:
                  return Encoding.ASCII;

              case TextEncodingDetect.CharacterEncoding.Utf16BeBom:
              case TextEncodingDetect.CharacterEncoding.Utf16BeNoBom:
                  return Encoding.BigEndianUnicode;

              case TextEncodingDetect.CharacterEncoding.Utf16LeBom:
              case TextEncodingDetect.CharacterEncoding.Utf16LeNoBom:
                  return Encoding.Unicode;

              case TextEncodingDetect.CharacterEncoding.Utf8Bom:
              case TextEncodingDetect.CharacterEncoding.Utf8Nobom:
                  return Encoding.UTF8;

              default:
                  return null;
          }
      }

      /// <summary>
      /// Detects the name of the character set used by <paramref name="bytes"/>.
      /// </summary>
      /// <param name="bytes">The raw text bytes.</param>
      /// <param name="language">Language code of the text, if known; may be null or blank.</param>
      /// <param name="enableLanguageDetection">
      /// When true and <paramref name="language"/> is blank, the language is detected from the bytes.
      /// </param>
      /// <returns>
      /// "utf-8" when the content is recognised as UTF-8; otherwise the detected charset unless it is
      /// "windows-1252", in which case the charset mapped from the language is returned. Returns null
      /// when neither a usable charset nor a language is available.
      /// </returns>
      /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
      public string GetDetectedEncodingName(byte[] bytes, string language, bool enableLanguageDetection)
      {
          if (bytes == null)
          {
              throw new ArgumentNullException("bytes");
          }

          var encoding = GetInitialEncoding(bytes);

          if (encoding != null && encoding.Equals(Encoding.UTF8))
          {
              return "utf-8";
          }

          if (string.IsNullOrWhiteSpace(language) && enableLanguageDetection)
          {
              if (!_langDetectInitialized)
              {
                  _langDetectInitialized = true;
                  LanguageDetector.Initialize(_json);
              }

              language = DetectLanguage(bytes);

              if (!string.IsNullOrWhiteSpace(language))
              {
                  _logger.Debug("Text language detected as {0}", language);
              }
          }

          var charset = DetectCharset(bytes, language);

          if (!string.IsNullOrWhiteSpace(charset))
          {
              if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
              {
                  return "utf-8";
              }

              // A "windows-1252" result is not returned directly; the language-based
              // mapping below is consulted instead.
              if (!string.Equals(charset, "windows-1252", StringComparison.OrdinalIgnoreCase))
              {
                  return charset;
              }
          }

          if (!string.IsNullOrWhiteSpace(language))
          {
              return GetFileCharacterSetFromLanguage(language);
          }

          return null;
      }

      /// <summary>
      /// Tries to detect the language of the text by decoding the bytes as UTF-8, then ASCII,
      /// then UTF-16 (little-endian) and handing each decoded string to the language detector.
      /// </summary>
      /// <param name="bytes">The raw text bytes.</param>
      /// <returns>
      /// The result of the first detection attempt that does not throw, or null when every
      /// attempt throws <c>NLangDetectException</c>.
      /// </returns>
      private string DetectLanguage(byte[] bytes)
      {
          var candidates = new Encoding[] { Encoding.UTF8, Encoding.ASCII, Encoding.Unicode };

          foreach (var candidate in candidates)
          {
              try
              {
                  return LanguageDetector.DetectLanguage(candidate.GetString(bytes));
              }
              catch (NLangDetectException ex)
              {
                  // Best effort: a failed attempt is not fatal, try the next decoding.
                  _logger.Debug("Language detection failed using {0}: {1}", candidate.WebName, ex.Message);
              }
          }

          return null;
      }

      /// <summary>
      /// Resolves a charset name to an <see cref="Encoding"/> instance.
      /// </summary>
      /// <param name="charset">The charset name, e.g. "windows-1251".</param>
      /// <returns>The matching encoding.</returns>
      /// <exception cref="ArgumentNullException"><paramref name="charset"/> is null or blank.</exception>
      /// <exception cref="ArgumentException">
      /// The name is not recognised, even after removing hyphens from it.
      /// </exception>
      public Encoding GetEncodingFromCharset(string charset)
      {
          if (string.IsNullOrWhiteSpace(charset))
          {
              throw new ArgumentNullException("charset");
          }

          _logger.Debug("Getting encoding object for character set: {0}", charset);

          try
          {
              return Encoding.GetEncoding(charset);
          }
          catch (ArgumentException)
          {
              // Retry once with the hyphens stripped from the name.
              charset = charset.Replace("-", string.Empty);
              _logger.Debug("Getting encoding object for character set: {0}", charset);

              return Encoding.GetEncoding(charset);
          }
      }

      /// <summary>
      /// Detects the charset of <paramref name="bytes"/> and returns the matching encoding.
      /// </summary>
      /// <param name="bytes">The raw text bytes.</param>
      /// <param name="language">Language code of the text, if known; may be null or blank.</param>
      /// <param name="enableLanguageDetection">Whether the language may be detected from the bytes.</param>
      /// <returns>The encoding for the detected charset.</returns>
      /// <exception cref="ArgumentNullException">
      /// <paramref name="bytes"/> is null, or no charset could be determined (the detected name
      /// is passed straight to <see cref="GetEncodingFromCharset"/>, which rejects blank names).
      /// </exception>
      public Encoding GetDetectedEncoding(byte[] bytes, string language, bool enableLanguageDetection)
      {
          var charset = GetDetectedEncodingName(bytes, language, enableLanguageDetection);

          return GetEncodingFromCharset(charset);
      }

      /// <summary>
      /// Maps a three-letter language code to the charset assumed for text in that language.
      /// </summary>
      /// <param name="language">The language code; matched case-insensitively.</param>
      /// <returns>The charset name; "windows-1252" for any code without a specific mapping.</returns>
      private string GetFileCharacterSetFromLanguage(string language)
      {
          // https://developer.xamarin.com/api/type/System.Text.Encoding/

          // Invariant lowering: with a culture-sensitive ToLower() a Turkish current culture
          // turns 'I' into a dotless 'ı', so codes such as "VIE" would miss their case label.
          switch (language.ToLowerInvariant())
          {
              // NOTE(review): Hungarian is mapped to windows-1252 here, unlike its
              // Central European neighbours below - confirm this is deliberate.
              case "hun":
                  return "windows-1252";

              case "pol":
              case "cze":
              case "ces":
              case "slo":
              case "srp":
              case "hrv":
              case "rum":
              case "ron":
              case "rup":
              // albanian
              case "alb":
              case "sqi":
              // slovak (slk) and slovenian (slv)
              case "slk":
              case "slv":
                  return "windows-1250";

              case "ara":
                  return "windows-1256";

              case "heb":
                  return "windows-1255";

              case "grc":
              // greek
              case "gre":
              case "ell":
                  return "windows-1253";

              case "crh":
              case "ota":
              case "tur":
                  return "windows-1254";

              // bulgarian
              case "bul":
              case "bgr":
              case "rus":
                  return "windows-1251";

              case "vie":
                  return "windows-1258";

              case "kor":
                  return "cp949";

              default:
                  return "windows-1252";
          }
      }

      /// <summary>
      /// Runs <c>CharsetDetector</c> over the bytes.
      /// </summary>
      /// <param name="bytes">The raw text bytes.</param>
      /// <param name="language">Language code of the text, if known; may be null or blank.</param>
      /// <returns>
      /// The charset reported by the detector, or null when it reports "x-mac-cyrillic" while a
      /// language is available.
      /// </returns>
      private string DetectCharset(byte[] bytes, string language)
      {
          var detector = new CharsetDetector();
          detector.Feed(bytes, 0, bytes.Length);
          detector.DataEnd();

          var charset = detector.Charset;

          // x-mac-cyrillic is often misdetected. When a language is known, discard the
          // result so the caller can fall back to the language-based mapping instead.
          if (string.Equals("x-mac-cyrillic", charset, StringComparison.OrdinalIgnoreCase)
              && !string.IsNullOrWhiteSpace(language))
          {
              return null;
          }

          return charset;
      }
  }
  216. }