using System;
using System.Text;
using MediaBrowser.Model.IO;
using MediaBrowser.Model.Logging;
using MediaBrowser.Model.Serialization;
using MediaBrowser.Model.Text;
using NLangDetect.Core;
using UniversalDetector;

namespace Emby.Server.Implementations.TextEncoding
{
    /// <summary>
    /// Guesses the character encoding of raw text buffers by combining byte-order-mark
    /// sniffing, statistical charset detection and, optionally, language detection.
    /// </summary>
    public class TextEncoding : ITextEncoding
    {
        /// <summary>
        /// Encodings tried, in order, when decoding a buffer for language detection.
        /// </summary>
        private static readonly Encoding[] LanguageProbeEncodings =
        {
            Encoding.UTF8,
            Encoding.ASCII,
            Encoding.Unicode
        };

        private readonly IFileSystem _fileSystem;
        private readonly ILogger _logger;
        private readonly IJsonSerializer _json;

        // True once LanguageDetector.Initialize has been invoked by this instance.
        // The flag is raised before the call, so initialization is attempted at most once.
        private bool _langDetectInitialized;

        /// <summary>
        /// Initializes a new instance of the <see cref="TextEncoding"/> class.
        /// </summary>
        /// <param name="fileSystem">The file system abstraction (stored, not used by the methods below).</param>
        /// <param name="logger">The logger used for debug output.</param>
        /// <param name="json">The JSON serializer handed to the language detector on first use.</param>
        public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
        {
            _fileSystem = fileSystem;
            _logger = logger;
            _json = json;
        }

        /// <summary>
        /// Gets the ASCII encoding.
        /// </summary>
        /// <returns><see cref="Encoding.ASCII"/>.</returns>
        public Encoding GetASCIIEncoding()
        {
            return Encoding.ASCII;
        }

        /// <summary>
        /// Makes a first guess at the encoding from a leading byte order mark and, failing that,
        /// from <see cref="TextEncodingDetect"/>.
        /// </summary>
        /// <param name="buffer">The raw bytes.</param>
        /// <param name="count">The number of valid bytes at the start of <paramref name="buffer"/>.</param>
        /// <returns>The guessed encoding, or <c>null</c> when the detector result is not mapped.</returns>
        private Encoding GetInitialEncoding(byte[] buffer, int count)
        {
            // UTF-8 BOM: EF BB BF
            if (count >= 3 && buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
            {
                return Encoding.UTF8;
            }

            // UTF-16 big-endian BOM: FE FF (Encoding.Unicode would be little-endian)
            if (count >= 2 && buffer[0] == 0xfe && buffer[1] == 0xff)
            {
                return Encoding.BigEndianUnicode;
            }

            // UTF-32 big-endian BOM: 00 00 FE FF (Encoding.UTF32 would be little-endian)
            if (count >= 4 && buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
            {
                return new UTF32Encoding(true, true);
            }

            // UTF-7 signature: 2B 2F 76
            if (count >= 3 && buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
            {
                return Encoding.UTF7;
            }

            var result = new TextEncodingDetect().DetectEncoding(buffer, count);

            switch (result)
            {
                case TextEncodingDetect.CharacterEncoding.Ansi:
                case TextEncodingDetect.CharacterEncoding.Ascii:
                    return Encoding.ASCII;
                case TextEncodingDetect.CharacterEncoding.Utf16BeBom:
                case TextEncodingDetect.CharacterEncoding.Utf16BeNoBom:
                    return Encoding.BigEndianUnicode;
                case TextEncodingDetect.CharacterEncoding.Utf16LeBom:
                case TextEncodingDetect.CharacterEncoding.Utf16LeNoBom:
                    return Encoding.Unicode;
                case TextEncodingDetect.CharacterEncoding.Utf8Bom:
                case TextEncodingDetect.CharacterEncoding.Utf8Nobom:
                    return Encoding.UTF8;
                default:
                    return null;
            }
        }

        /// <summary>
        /// Determines the name of the character set the supplied bytes are most likely encoded in.
        /// </summary>
        /// <param name="bytes">The raw bytes.</param>
        /// <param name="count">The number of valid bytes at the start of <paramref name="bytes"/>.</param>
        /// <param name="language">An optional language code used as a hint; may be null or empty.</param>
        /// <param name="enableLanguageDetection">
        /// If <c>true</c> and no <paramref name="language"/> was supplied, the language is detected from the content.
        /// </param>
        /// <returns>
        /// A charset name, or <c>null</c> when no usable charset was detected and no language is known.
        /// </returns>
        public string GetDetectedEncodingName(byte[] bytes, int count, string language, bool enableLanguageDetection)
        {
            const int index = 0;

            var encoding = GetInitialEncoding(bytes, count);

            if (encoding != null && encoding.Equals(Encoding.UTF8))
            {
                return "utf-8";
            }

            if (string.IsNullOrWhiteSpace(language) && enableLanguageDetection)
            {
                if (!_langDetectInitialized)
                {
                    _langDetectInitialized = true;
                    LanguageDetector.Initialize(_json);
                }

                language = DetectLanguage(bytes, index, count);

                if (!string.IsNullOrWhiteSpace(language))
                {
                    _logger.Debug("Text language detected as {0}", language);
                }
            }

            var charset = DetectCharset(bytes, index, count, language);

            if (!string.IsNullOrWhiteSpace(charset))
            {
                if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
                {
                    return "utf-8";
                }

                // A windows-1252 result is not returned directly; the language-based
                // mapping below gets the final say instead.
                if (!string.Equals(charset, "windows-1252", StringComparison.OrdinalIgnoreCase))
                {
                    return charset;
                }
            }

            if (!string.IsNullOrWhiteSpace(language))
            {
                return GetFileCharacterSetFromLanguage(language);
            }

            return null;
        }

        /// <summary>
        /// Tries to detect the language of the text, decoding the bytes with each of
        /// <see cref="LanguageProbeEncodings"/> in turn until the detector succeeds.
        /// </summary>
        /// <param name="bytes">The raw bytes.</param>
        /// <param name="index">The offset of the first byte to decode.</param>
        /// <param name="count">The number of bytes to decode.</param>
        /// <returns>The detector's result, or <c>null</c> if detection failed for every encoding.</returns>
        private string DetectLanguage(byte[] bytes, int index, int count)
        {
            foreach (var candidate in LanguageProbeEncodings)
            {
                try
                {
                    return LanguageDetector.DetectLanguage(candidate.GetString(bytes, index, count));
                }
                catch (NLangDetectException ex)
                {
                    // Best effort: note the failure and move on to the next candidate encoding.
                    _logger.Debug("Language detection failed using {0}: {1}", candidate.WebName, ex.Message);
                }
            }

            return null;
        }

        /// <summary>
        /// Resolves a charset name to an <see cref="Encoding"/>.
        /// </summary>
        /// <param name="charset">The charset name, e.g. "windows-1250".</param>
        /// <returns>The matching encoding.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="charset"/> is null, empty or whitespace.</exception>
        /// <exception cref="ArgumentException">
        /// The name is not recognized, even after removing its hyphens.
        /// </exception>
        public Encoding GetEncodingFromCharset(string charset)
        {
            if (string.IsNullOrWhiteSpace(charset))
            {
                throw new ArgumentNullException("charset");
            }

            _logger.Debug("Getting encoding object for character set: {0}", charset);

            try
            {
                return Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                // Retry once with the hyphens stripped from the name.
                charset = charset.Replace("-", string.Empty);
                _logger.Debug("Getting encoding object for character set: {0}", charset);

                return Encoding.GetEncoding(charset);
            }
        }

        /// <summary>
        /// Detects the encoding of the supplied bytes and returns it as an <see cref="Encoding"/>.
        /// </summary>
        /// <param name="bytes">The raw bytes.</param>
        /// <param name="size">The number of valid bytes at the start of <paramref name="bytes"/>.</param>
        /// <param name="language">An optional language code used as a hint; may be null or empty.</param>
        /// <param name="enableLanguageDetection">Whether the language may be detected from the content.</param>
        /// <returns>The detected encoding.</returns>
        /// <exception cref="ArgumentNullException">
        /// No charset could be determined (<see cref="GetDetectedEncodingName"/> returned <c>null</c>).
        /// </exception>
        public Encoding GetDetectedEncoding(byte[] bytes, int size, string language, bool enableLanguageDetection)
        {
            var charset = GetDetectedEncodingName(bytes, size, language, enableLanguageDetection);

            return GetEncodingFromCharset(charset);
        }

        /// <summary>
        /// Maps a three-letter language code to the charset assumed for text in that language.
        /// </summary>
        /// <param name="language">The language code; matched case-insensitively.</param>
        /// <returns>The charset name; "windows-1252" for any code not listed.</returns>
        private string GetFileCharacterSetFromLanguage(string language)
        {
            // https://developer.xamarin.com/api/type/System.Text.Encoding/

            // Invariant lower-casing: a culture-sensitive ToLower() turns "I" into a dotless
            // "ı" under Turkish cultures, which would make codes such as "VIE" miss their case.
            switch (language.ToLowerInvariant())
            {
                case "tha":
                    return "windows-874";
                // NOTE(review): Hungarian is mapped to windows-1252 rather than windows-1250;
                // kept as-is, confirm this is intentional.
                case "hun":
                    return "windows-1252";
                case "pol":
                case "cze":
                case "ces":
                case "slo":
                case "srp":
                case "hrv":
                case "rum":
                case "ron":
                case "rom":
                case "rup":
                // albanian
                case "alb":
                case "sqi":
                // slovak / slovenian
                case "slk":
                case "slv":
                    return "windows-1250";
                case "ara":
                    return "windows-1256";
                case "heb":
                    return "windows-1255";
                // greek
                case "grc":
                case "gre":
                case "ell":
                    return "windows-1253";
                case "crh":
                case "ota":
                case "tur":
                    return "windows-1254";
                // bulgarian
                case "bul":
                case "bgr":
                case "rus":
                    return "windows-1251";
                case "vie":
                    return "windows-1258";
                case "kor":
                    return "cp949";
                default:
                    return "windows-1252";
            }
        }

        /// <summary>
        /// Runs the <see cref="CharsetDetector"/> over the supplied bytes.
        /// </summary>
        /// <param name="bytes">The raw bytes.</param>
        /// <param name="index">The offset of the first byte to inspect.</param>
        /// <param name="count">The number of bytes to inspect.</param>
        /// <param name="language">The known or detected language, if any.</param>
        /// <returns>
        /// The detector's charset, or <c>null</c> when it reports "x-mac-cyrillic" while a language is known.
        /// </returns>
        private string DetectCharset(byte[] bytes, int index, int count, string language)
        {
            var detector = new CharsetDetector();
            detector.Feed(bytes, index, count);
            detector.DataEnd();

            var charset = detector.Charset;

            // x-mac-cyrillic is often detected incorrectly. When a language is available,
            // discard the result so the caller falls back to the language-based mapping.
            if (string.Equals("x-mac-cyrillic", charset, StringComparison.OrdinalIgnoreCase)
                && !string.IsNullOrWhiteSpace(language))
            {
                return null;
            }

            return charset;
        }
    }
}