| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265 | 
							- using System;
 
- using System.Text;
 
- using MediaBrowser.Model.IO;
 
- using MediaBrowser.Model.Logging;
 
- using MediaBrowser.Model.Serialization;
 
- using MediaBrowser.Model.Text;
 
- using NLangDetect.Core;
 
- using UniversalDetector;
 
namespace Emby.Server.Implementations.TextEncoding
{
    /// <summary>
    /// Detects the character set of raw text buffers by combining byte-order-mark sniffing,
    /// the <see cref="TextEncodingDetect"/> heuristics, the <see cref="CharsetDetector"/> and,
    /// as a last resort, a fixed language-to-code-page table.
    /// </summary>
    public class TextEncoding : ITextEncoding
    {
        /// <summary>
        /// Encodings tried, in order, when decoding a buffer for language detection.
        /// </summary>
        private static readonly Encoding[] LanguageProbeEncodings =
        {
            Encoding.UTF8,
            Encoding.ASCII,
            Encoding.Unicode
        };

        private readonly IFileSystem _fileSystem;
        private readonly ILogger _logger;
        private readonly IJsonSerializer _json;

        // Set the first time language detection is requested so that LanguageDetector.Initialize
        // is only invoked once per instance. The flag is raised before Initialize is called and
        // is not synchronized.
        private bool _langDetectInitialized;

        /// <summary>
        /// Initializes a new instance of the <see cref="TextEncoding"/> class.
        /// </summary>
        /// <param name="fileSystem">The file system abstraction (stored, not used by this class directly).</param>
        /// <param name="logger">The logger used for debug output.</param>
        /// <param name="json">The serializer handed to <c>LanguageDetector.Initialize</c>.</param>
        public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
        {
            _fileSystem = fileSystem;
            _logger = logger;
            _json = json;
        }

        /// <summary>
        /// Gets the ASCII encoding.
        /// </summary>
        /// <returns><see cref="Encoding.ASCII"/>.</returns>
        public Encoding GetASCIIEncoding()
        {
            return Encoding.ASCII;
        }

        /// <summary>
        /// Makes a first guess at the encoding of <paramref name="buffer"/> from its leading
        /// signature bytes and, failing that, from <see cref="TextEncodingDetect"/>.
        /// </summary>
        /// <param name="buffer">The raw bytes to inspect.</param>
        /// <param name="count">The number of valid bytes in <paramref name="buffer"/>.</param>
        /// <returns>The guessed encoding, or <c>null</c> when nothing could be determined.</returns>
        private Encoding GetInitialEncoding(byte[] buffer, int count)
        {
            // UTF-8 byte order mark: EF BB BF.
            if (count >= 3 && buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
            {
                return Encoding.UTF8;
            }

            // UTF-16 big-endian byte order mark: FE FF.
            // Encoding.Unicode is little-endian, so BigEndianUnicode is the matching encoding.
            if (count >= 2 && buffer[0] == 0xfe && buffer[1] == 0xff)
            {
                return Encoding.BigEndianUnicode;
            }

            // UTF-32 big-endian byte order mark: 00 00 FE FF.
            // Encoding.UTF32 is little-endian, so an explicit big-endian instance is required.
            if (count >= 4 && buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
            {
                return new UTF32Encoding(true, true);
            }

            // UTF-7 signature: 2B 2F 76.
            if (count >= 3 && buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
            {
                return Encoding.UTF7;
            }

            switch (new TextEncodingDetect().DetectEncoding(buffer, count))
            {
                // There is no single Encoding for "some 8-bit code page"; ASCII is kept as the
                // placeholder because the only caller merely checks the result for UTF-8.
                case TextEncodingDetect.CharacterEncoding.Ansi:
                case TextEncodingDetect.CharacterEncoding.Ascii:
                    return Encoding.ASCII;

                case TextEncodingDetect.CharacterEncoding.Utf16BeBom:
                case TextEncodingDetect.CharacterEncoding.Utf16BeNoBom:
                    return Encoding.BigEndianUnicode;

                case TextEncodingDetect.CharacterEncoding.Utf16LeBom:
                case TextEncodingDetect.CharacterEncoding.Utf16LeNoBom:
                    return Encoding.Unicode;

                case TextEncodingDetect.CharacterEncoding.Utf8Bom:
                case TextEncodingDetect.CharacterEncoding.Utf8Nobom:
                    return Encoding.UTF8;

                default:
                    return null;
            }
        }

        /// <summary>
        /// Determines the name of the character set used by the given bytes.
        /// </summary>
        /// <param name="bytes">The raw text bytes.</param>
        /// <param name="count">The number of valid bytes in <paramref name="bytes"/>.</param>
        /// <param name="language">
        /// The language of the text, if known. Used as a fallback to pick a code page.
        /// </param>
        /// <param name="enableLanguageDetection">
        /// When <c>true</c> and <paramref name="language"/> is blank, the language is detected from the bytes.
        /// </param>
        /// <returns>
        /// <c>"utf-8"</c> when the content is recognized as UTF-8; otherwise the charset reported by the
        /// detector unless it is windows-1252; otherwise the code page associated with the language;
        /// or <c>null</c> when neither a usable charset nor a language is available.
        /// </returns>
        public string GetDetectedEncodingName(byte[] bytes, int count, string language, bool enableLanguageDetection)
        {
            const int index = 0;

            var initialEncoding = GetInitialEncoding(bytes, count);
            if (initialEncoding != null && initialEncoding.Equals(Encoding.UTF8))
            {
                return "utf-8";
            }

            if (enableLanguageDetection && string.IsNullOrWhiteSpace(language))
            {
                if (!_langDetectInitialized)
                {
                    _langDetectInitialized = true;
                    LanguageDetector.Initialize(_json);
                }

                language = DetectLanguage(bytes, index, count);

                if (!string.IsNullOrWhiteSpace(language))
                {
                    _logger.Debug("Text language detected as {0}", language);
                }
            }

            var charset = DetectCharset(bytes, index, count, language);

            if (!string.IsNullOrWhiteSpace(charset))
            {
                if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
                {
                    return "utf-8";
                }

                // A windows-1252 result is not returned directly; the language table below
                // gets the final say in that case.
                if (!string.Equals(charset, "windows-1252", StringComparison.OrdinalIgnoreCase))
                {
                    return charset;
                }
            }

            return string.IsNullOrWhiteSpace(language)
                ? null
                : GetFileCharacterSetFromLanguage(language);
        }

        /// <summary>
        /// Attempts to detect the language of the text, decoding the bytes with each of
        /// <see cref="LanguageProbeEncodings"/> in turn until the detector stops throwing.
        /// </summary>
        /// <param name="bytes">The raw text bytes.</param>
        /// <param name="index">The offset of the first byte to decode.</param>
        /// <param name="count">The number of bytes to decode.</param>
        /// <returns>
        /// The detector's result for the first decoding that does not raise
        /// <see cref="NLangDetectException"/>, or <c>null</c> when every attempt fails.
        /// </returns>
        private string DetectLanguage(byte[] bytes, int index, int count)
        {
            foreach (var encoding in LanguageProbeEncodings)
            {
                try
                {
                    return LanguageDetector.DetectLanguage(encoding.GetString(bytes, index, count));
                }
                catch (NLangDetectException ex)
                {
                    // Detection is best effort: record the failure and fall through to the next decoding.
                    _logger.Debug("Language detection failed for text decoded as {0}: {1}", encoding.WebName, ex.Message);
                }
            }

            return null;
        }

        /// <summary>
        /// Resolves a character set name to an <see cref="Encoding"/>.
        /// </summary>
        /// <param name="charset">The character set name, for example <c>windows-1251</c>.</param>
        /// <returns>The matching encoding.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="charset"/> is null, empty or whitespace.</exception>
        /// <exception cref="ArgumentException">
        /// The name is not recognized, even after removing its hyphens.
        /// </exception>
        public Encoding GetEncodingFromCharset(string charset)
        {
            if (string.IsNullOrWhiteSpace(charset))
            {
                throw new ArgumentNullException("charset");
            }

            _logger.Debug("Getting encoding object for character set: {0}", charset);

            try
            {
                return Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                // Retry once with the hyphens stripped from the name.
                charset = charset.Replace("-", string.Empty);
                _logger.Debug("Getting encoding object for character set: {0}", charset);

                return Encoding.GetEncoding(charset);
            }
        }

        /// <summary>
        /// Detects the character set of the given bytes and returns the corresponding <see cref="Encoding"/>.
        /// </summary>
        /// <param name="bytes">The raw text bytes.</param>
        /// <param name="size">The number of valid bytes in <paramref name="bytes"/>.</param>
        /// <param name="language">The language of the text, if known.</param>
        /// <param name="enableLanguageDetection">Whether the language may be detected from the bytes.</param>
        /// <returns>The detected encoding.</returns>
        /// <exception cref="ArgumentNullException">
        /// No character set could be determined, i.e. <see cref="GetDetectedEncodingName"/> returned <c>null</c>.
        /// </exception>
        public Encoding GetDetectedEncoding(byte[] bytes, int size, string language, bool enableLanguageDetection)
        {
            var charset = GetDetectedEncodingName(bytes, size, language, enableLanguageDetection);

            return GetEncodingFromCharset(charset);
        }

        /// <summary>
        /// Maps a three-letter language code to the code page assumed for text in that language.
        /// </summary>
        /// <param name="language">The language code; matched case-insensitively.</param>
        /// <returns>The character set name; <c>windows-1252</c> for any language not listed.</returns>
        private string GetFileCharacterSetFromLanguage(string language)
        {
            // https://developer.xamarin.com/api/type/System.Text.Encoding/

            // Language codes are identifiers, so lower-case them with the invariant culture;
            // a culture-sensitive ToLower() would turn "I" into a dotless "ı" under Turkish cultures.
            switch (language.ToLowerInvariant())
            {
                // NOTE(review): Hungarian is mapped to windows-1252 rather than a Central European
                // code page - confirm this is intentional.
                case "hun":
                    return "windows-1252";

                case "pol":
                case "cze":
                case "ces":
                case "slo":
                case "srp":
                case "hrv":
                case "rum":
                case "ron":
                case "rup":
                // albanian
                case "alb":
                case "sqi":
                // slovak / slovenian
                case "slk":
                case "slv":
                    return "windows-1250";

                case "ara":
                    return "windows-1256";

                case "heb":
                    return "windows-1255";

                case "grc":
                // greek
                case "gre":
                case "ell":
                    return "windows-1253";

                case "crh":
                case "ota":
                case "tur":
                    return "windows-1254";

                // bulgarian
                case "bul":
                case "bgr":
                case "rus":
                    return "windows-1251";

                case "vie":
                    return "windows-1258";

                case "kor":
                    return "cp949";

                default:
                    return "windows-1252";
            }
        }

        /// <summary>
        /// Runs the <see cref="CharsetDetector"/> over the given bytes.
        /// </summary>
        /// <param name="bytes">The raw text bytes.</param>
        /// <param name="index">The offset of the first byte to feed to the detector.</param>
        /// <param name="count">The number of bytes to feed to the detector.</param>
        /// <param name="language">The language of the text, if known.</param>
        /// <returns>
        /// The charset reported by the detector, or <c>null</c> when it reports <c>x-mac-cyrillic</c>
        /// while a language is available.
        /// </returns>
        private string DetectCharset(byte[] bytes, int index, int count, string language)
        {
            var detector = new CharsetDetector();
            detector.Feed(bytes, index, count);
            detector.DataEnd();

            var charset = detector.Charset;

            // x-mac-cyrillic is often detected incorrectly. When a language is known, discard the
            // result so that the caller can fall back to other techniques instead.
            var isMacCyrillic = string.Equals("x-mac-cyrillic", charset, StringComparison.OrdinalIgnoreCase);
            if (isMacCyrillic && !string.IsNullOrWhiteSpace(language))
            {
                return null;
            }

            return charset;
        }
    }
}
 
 
  |