TextEncoding.cs 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191
  1. using System;
  2. using System.Text;
  3. using MediaBrowser.Model.IO;
  4. using MediaBrowser.Model.Text;
  5. using System.IO;
  6. using System.Threading;
  7. using System.Threading.Tasks;
  8. using MediaBrowser.Model.MediaInfo;
  9. using MediaBrowser.Model.Logging;
  10. using UniversalDetector;
  11. namespace Emby.Common.Implementations.TextEncoding
  12. {
  /// <summary>
  /// Detects the character set of raw text bytes (byte-order marks first, then
  /// statistical detection, then a language-based guess) and resolves charset
  /// names to <see cref="Encoding"/> instances.
  /// </summary>
  public class TextEncoding : ITextEncoding
  {
      // Charset assumed when neither detection nor a language hint produced a
      // usable name. It matches the default branch of the language mapping below.
      private const string DefaultCharset = "windows-1252";

      // Not used by any member of this class; kept because the constructor
      // signature is part of the public surface.
      private readonly IFileSystem _fileSystem;
      private readonly ILogger _logger;

      /// <summary>
      /// Creates the service.
      /// </summary>
      /// <param name="fileSystem">File system abstraction (stored, currently unused here).</param>
      /// <param name="logger">Logger used for debug output while resolving charsets.</param>
      public TextEncoding(IFileSystem fileSystem, ILogger logger)
      {
          _fileSystem = fileSystem;
          _logger = logger;
      }

      /// <summary>
      /// Returns the ASCII encoding.
      /// </summary>
      public Encoding GetASCIIEncoding()
      {
          return Encoding.ASCII;
      }

      /// <summary>
      /// Returns true when <paramref name="buffer"/> is at least as long as
      /// <paramref name="prefix"/> and begins with exactly those bytes.
      /// </summary>
      private static bool StartsWith(byte[] buffer, params byte[] prefix)
      {
          if (buffer.Length < prefix.Length)
          {
              return false;
          }

          for (var i = 0; i < prefix.Length; i++)
          {
              if (buffer[i] != prefix[i])
              {
                  return false;
              }
          }

          return true;
      }

      /// <summary>
      /// Makes a first guess at the encoding: explicit byte-order-mark checks,
      /// then whatever <c>TextEncodingDetect</c> reports.
      /// </summary>
      /// <param name="buffer">The raw bytes; must not be null, may be empty.</param>
      /// <returns>The guessed encoding, or null when no guess could be made.</returns>
      private Encoding GetInitialEncoding(byte[] buffer)
      {
          // Every signature check is length-guarded so that empty or very short
          // inputs cannot index past the end of the buffer.

          // EF BB BF: UTF-8 BOM.
          if (StartsWith(buffer, 0xef, 0xbb, 0xbf))
          {
              return Encoding.UTF8;
          }

          // FE FF: UTF-16 big-endian BOM (Encoding.Unicode would be little-endian).
          if (StartsWith(buffer, 0xfe, 0xff))
          {
              return Encoding.BigEndianUnicode;
          }

          // 00 00 FE FF: UTF-32 big-endian BOM (Encoding.UTF32 would be little-endian).
          if (StartsWith(buffer, 0x00, 0x00, 0xfe, 0xff))
          {
              return new UTF32Encoding(true, true);
          }

          // 2B 2F 76: UTF-7 signature.
          if (StartsWith(buffer, 0x2b, 0x2f, 0x76))
          {
              return Encoding.UTF7;
          }

          var detected = new TextEncodingDetect().DetectEncoding(buffer, buffer.Length);

          switch (detected)
          {
              // NOTE(review): "Ansi" is reported as ASCII here, as before. Within
              // this class only a UTF-8 result is acted upon, so this is harmless
              // today; confirm before using the value for actual decoding.
              case TextEncodingDetect.CharacterEncoding.Ansi:
              case TextEncodingDetect.CharacterEncoding.Ascii:
                  return Encoding.ASCII;

              case TextEncodingDetect.CharacterEncoding.Utf16BeBom:
              case TextEncodingDetect.CharacterEncoding.Utf16BeNoBom:
                  return Encoding.BigEndianUnicode;

              case TextEncodingDetect.CharacterEncoding.Utf16LeBom:
              case TextEncodingDetect.CharacterEncoding.Utf16LeNoBom:
                  return Encoding.Unicode;

              case TextEncodingDetect.CharacterEncoding.Utf8Bom:
              case TextEncodingDetect.CharacterEncoding.Utf8Nobom:
                  return Encoding.UTF8;

              default:
                  return null;
          }
      }

      /// <summary>
      /// Determines the charset name for the given bytes.
      /// </summary>
      /// <param name="bytes">The raw text bytes.</param>
      /// <param name="language">
      /// Optional three-letter language code used as a fallback hint; may be null or empty.
      /// </param>
      /// <returns>
      /// "utf-8" when UTF-8 is detected; otherwise the detected charset unless it is
      /// "windows-1252", in which case the language hint (if any) decides; null when
      /// nothing usable was detected and no language was supplied.
      /// </returns>
      /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
      public string GetDetectedEncodingName(byte[] bytes, string language)
      {
          if (bytes == null)
          {
              throw new ArgumentNullException("bytes");
          }

          var initialGuess = GetInitialEncoding(bytes);
          if (initialGuess != null && initialGuess.Equals(Encoding.UTF8))
          {
              return "utf-8";
          }

          var charset = DetectCharset(bytes, language);
          if (!string.IsNullOrWhiteSpace(charset))
          {
              if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
              {
                  return "utf-8";
              }

              // A detected "windows-1252" is not returned directly; the language
              // hint below gets the chance to pick a more specific code page.
              if (!string.Equals(charset, "windows-1252", StringComparison.OrdinalIgnoreCase))
              {
                  return charset;
              }
          }

          if (!string.IsNullOrWhiteSpace(language))
          {
              return GetFileCharacterSetFromLanguage(language);
          }

          return null;
      }

      /// <summary>
      /// Resolves a charset name to an <see cref="Encoding"/>.
      /// </summary>
      /// <param name="charset">The charset name, e.g. "windows-1250".</param>
      /// <returns>The matching encoding.</returns>
      /// <exception cref="ArgumentNullException">
      /// <paramref name="charset"/> is null, empty or whitespace.
      /// </exception>
      /// <exception cref="ArgumentException">
      /// The name is not recognised, even after removing hyphens.
      /// </exception>
      public Encoding GetEncodingFromCharset(string charset)
      {
          if (string.IsNullOrWhiteSpace(charset))
          {
              throw new ArgumentNullException("charset");
          }

          _logger.Debug("Getting encoding object for character set: {0}", charset);

          try
          {
              return Encoding.GetEncoding(charset);
          }
          catch (ArgumentException)
          {
              // Second attempt with the hyphens stripped from the name.
              charset = charset.Replace("-", string.Empty);
              _logger.Debug("Getting encoding object for character set: {0}", charset);

              return Encoding.GetEncoding(charset);
          }
      }

      /// <summary>
      /// Detects the charset of the given bytes and returns the matching encoding.
      /// </summary>
      /// <param name="bytes">The raw text bytes.</param>
      /// <param name="language">Optional three-letter language code; may be null or empty.</param>
      /// <returns>
      /// The detected encoding, or the windows-1252 encoding when no charset name
      /// could be determined.
      /// </returns>
      /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
      public Encoding GetDetectedEncoding(byte[] bytes, string language)
      {
          var charset = GetDetectedEncodingName(bytes, language);

          // GetDetectedEncodingName returns null when there is no language hint and
          // detection produced nothing (or only "windows-1252"). Passing that null
          // on would throw ArgumentNullException("charset"), so use the default.
          if (string.IsNullOrWhiteSpace(charset))
          {
              charset = DefaultCharset;
          }

          return GetEncodingFromCharset(charset);
      }

      /// <summary>
      /// Maps a three-letter language code to the Windows code page to assume for it.
      /// </summary>
      /// <param name="language">The language code; must not be null.</param>
      /// <returns>A charset name; "windows-1252" for unlisted languages.</returns>
      private string GetFileCharacterSetFromLanguage(string language)
      {
          // https://developer.xamarin.com/api/type/System.Text.Encoding/
          // Invariant lower-casing: culture-sensitive ToLower() maps 'I' to a
          // dotless i under Turkish cultures, so e.g. "VIE" would miss its case.
          switch (language.ToLowerInvariant())
          {
              // NOTE(review): Hungarian is mapped to windows-1252 rather than the
              // Central European windows-1250 - kept as-is, confirm it is intended.
              case "hun":
                  return "windows-1252";
              case "pol":
              case "cze":
              case "ces":
              case "slo":
              case "slk":
              case "slv":
              case "srp":
              case "hrv":
              case "rum":
              case "ron":
              case "rup":
              case "alb":
              case "sqi":
                  return "windows-1250";
              case "ara":
                  return "windows-1256";
              case "heb":
                  return "windows-1255";
              case "grc":
              case "gre":
                  return "windows-1253";
              case "crh":
              case "ota":
              case "tur":
                  return "windows-1254";
              case "rus":
                  return "windows-1251";
              case "vie":
                  return "windows-1258";
              case "kor":
                  return "cp949";
              default:
                  return DefaultCharset;
          }
      }

      /// <summary>
      /// Runs <c>CharsetDetector</c> over the bytes and returns the charset it reports.
      /// </summary>
      /// <param name="bytes">The raw text bytes.</param>
      /// <param name="language">Optional language hint; may be null or empty.</param>
      /// <returns>
      /// The charset reported by the detector, or null when the detector reported
      /// "x-mac-cyrillic" and a language hint is available to use instead.
      /// </returns>
      private string DetectCharset(byte[] bytes, string language)
      {
          var detector = new CharsetDetector();
          detector.Feed(bytes, 0, bytes.Length);
          detector.DataEnd();

          var charset = detector.Charset;

          // This is often incorrectly detected. If this happens, try to use other
          // techniques instead (the caller falls back to the language hint).
          var isMacCyrillic = string.Equals("x-mac-cyrillic", charset, StringComparison.OrdinalIgnoreCase);
          if (isMacCyrillic && !string.IsNullOrWhiteSpace(language))
          {
              return null;
          }

          return charset;
      }
  }
  166. }