123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271 |
- using System;
- using System.Text;
- using MediaBrowser.Model.IO;
- using MediaBrowser.Model.Serialization;
- using MediaBrowser.Model.Text;
- using Microsoft.Extensions.Logging;
- using NLangDetect.Core;
- using UniversalDetector;
namespace Emby.Server.Implementations.TextEncoding
{
    /// <summary>
    /// <see cref="ITextEncoding"/> implementation that guesses the character encoding of a raw
    /// text buffer by combining byte-order-mark inspection, <see cref="TextEncodingDetect"/>,
    /// <see cref="CharsetDetector"/> and, optionally, language detection.
    /// </summary>
    public class TextEncoding : ITextEncoding
    {
        /// <summary>
        /// Big-endian UTF-32 with a byte-order mark. <see cref="Encoding.UTF32"/> is little-endian,
        /// so the big-endian variant has to be constructed explicitly.
        /// </summary>
        private static readonly Encoding Utf32BigEndian = new UTF32Encoding(true, true);

        /// <summary>
        /// Encodings used, in this order, to decode a buffer before handing it to the language detector.
        /// </summary>
        private static readonly Encoding[] LanguageDetectionEncodings =
        {
            Encoding.UTF8,
            Encoding.ASCII,
            Encoding.Unicode
        };

        private readonly IFileSystem _fileSystem;
        private readonly ILogger _logger;
        private readonly IJsonSerializer _json;

        // Set to true right before the first LanguageDetector.Initialize call so that
        // initialization is attempted at most once per instance.
        private bool _langDetectInitialized;

        /// <summary>
        /// Initializes a new instance of the <see cref="TextEncoding"/> class.
        /// </summary>
        /// <param name="fileSystem">The file system abstraction (stored, not used by the members below).</param>
        /// <param name="logger">The logger that receives debug messages.</param>
        /// <param name="json">The serializer passed to <c>LanguageDetector.Initialize</c>.</param>
        public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
        {
            _fileSystem = fileSystem;
            _logger = logger;
            _json = json;
        }

        /// <summary>
        /// Gets the ASCII encoding.
        /// </summary>
        /// <returns><see cref="Encoding.ASCII"/>.</returns>
        public Encoding GetASCIIEncoding()
        {
            return Encoding.ASCII;
        }

        /// <summary>
        /// Makes a first guess at the encoding of <paramref name="buffer"/>: byte-order marks are
        /// checked first, then the buffer is passed to <see cref="TextEncodingDetect"/>.
        /// </summary>
        /// <param name="buffer">The raw bytes.</param>
        /// <param name="count">The number of bytes at the start of <paramref name="buffer"/> to inspect.</param>
        /// <returns>The guessed encoding, or <c>null</c> when the detector result is not mapped.</returns>
        private static Encoding GetInitialEncoding(byte[] buffer, int count)
        {
            // EF BB BF: UTF-8 byte-order mark.
            if (count >= 3 && buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf)
            {
                return Encoding.UTF8;
            }

            // The 4-byte UTF-32 marks are tested before the 2-byte UTF-16 marks because
            // FF FE 00 00 (UTF-32 LE) starts with FF FE (UTF-16 LE).
            if (count >= 4)
            {
                if (buffer[0] == 0 && buffer[1] == 0 && buffer[2] == 0xfe && buffer[3] == 0xff)
                {
                    return Utf32BigEndian;
                }

                if (buffer[0] == 0xff && buffer[1] == 0xfe && buffer[2] == 0 && buffer[3] == 0)
                {
                    return Encoding.UTF32;
                }
            }

            if (count >= 2)
            {
                // FE FF: UTF-16 big-endian.
                if (buffer[0] == 0xfe && buffer[1] == 0xff)
                {
                    return Encoding.BigEndianUnicode;
                }

                // FF FE: UTF-16 little-endian.
                if (buffer[0] == 0xff && buffer[1] == 0xfe)
                {
                    return Encoding.Unicode;
                }
            }

            // 2B 2F 76: UTF-7 signature.
            if (count >= 3 && buffer[0] == 0x2b && buffer[1] == 0x2f && buffer[2] == 0x76)
            {
                return Encoding.UTF7;
            }

            switch (new TextEncodingDetect().DetectEncoding(buffer, count))
            {
                // NOTE(review): "Ansi" presumably denotes an 8-bit code page; it is reported as
                // ASCII here. The only caller in this class checks solely for UTF-8.
                case TextEncodingDetect.CharacterEncoding.Ansi:
                case TextEncodingDetect.CharacterEncoding.Ascii:
                    return Encoding.ASCII;
                case TextEncodingDetect.CharacterEncoding.Utf16BeBom:
                case TextEncodingDetect.CharacterEncoding.Utf16BeNoBom:
                    return Encoding.BigEndianUnicode;
                case TextEncodingDetect.CharacterEncoding.Utf16LeBom:
                case TextEncodingDetect.CharacterEncoding.Utf16LeNoBom:
                    return Encoding.Unicode;
                case TextEncodingDetect.CharacterEncoding.Utf8Bom:
                case TextEncodingDetect.CharacterEncoding.Utf8Nobom:
                    return Encoding.UTF8;
                default:
                    return null;
            }
        }

        /// <summary>
        /// Determines the name of the character set used by the given bytes.
        /// </summary>
        /// <param name="bytes">The raw bytes.</param>
        /// <param name="count">The number of bytes, counted from the start of <paramref name="bytes"/>, to examine.</param>
        /// <param name="language">A language code used as a hint; may be null or blank.</param>
        /// <param name="enableLanguageDetection">
        /// Whether to run language detection when <paramref name="language"/> is null or blank.
        /// </param>
        /// <returns>
        /// <c>"utf-8"</c> when UTF-8 is recognized; otherwise the charset reported by the charset
        /// detector unless it is blank or <c>windows-1252</c>; otherwise a charset derived from the
        /// (given or detected) language; otherwise <c>null</c>.
        /// </returns>
        public string GetDetectedEncodingName(byte[] bytes, int count, string language, bool enableLanguageDetection)
        {
            // Detection always starts at the beginning of the buffer.
            const int Index = 0;

            var encoding = GetInitialEncoding(bytes, count);
            if (encoding != null && encoding.Equals(Encoding.UTF8))
            {
                return "utf-8";
            }

            if (enableLanguageDetection && string.IsNullOrWhiteSpace(language))
            {
                if (!_langDetectInitialized)
                {
                    _langDetectInitialized = true;
                    LanguageDetector.Initialize(_json);
                }

                language = DetectLanguage(bytes, Index, count);

                if (!string.IsNullOrWhiteSpace(language))
                {
                    _logger.LogDebug("Text language detected as {0}", language);
                }
            }

            var charset = DetectCharset(bytes, Index, count, language);

            if (!string.IsNullOrWhiteSpace(charset))
            {
                if (string.Equals(charset, "utf-8", StringComparison.OrdinalIgnoreCase))
                {
                    return "utf-8";
                }

                // A windows-1252 result is not returned directly; the language-based
                // lookup below gets the chance to pick a charset instead.
                if (!string.Equals(charset, "windows-1252", StringComparison.OrdinalIgnoreCase))
                {
                    return charset;
                }
            }

            if (!string.IsNullOrWhiteSpace(language))
            {
                return GetFileCharacterSetFromLanguage(language);
            }

            return null;
        }

        /// <summary>
        /// Runs the language detector on the bytes, decoding them as UTF-8, then ASCII, then UTF-16
        /// until a decoding does not make the detector throw.
        /// </summary>
        /// <param name="bytes">The raw bytes.</param>
        /// <param name="index">The index of the first byte to decode.</param>
        /// <param name="count">The number of bytes to decode.</param>
        /// <returns>
        /// The detector's result for the first decoding that did not raise
        /// <see cref="NLangDetectException"/>, or <c>null</c> when all of them did.
        /// </returns>
        private string DetectLanguage(byte[] bytes, int index, int count)
        {
            foreach (var candidate in LanguageDetectionEncodings)
            {
                try
                {
                    return LanguageDetector.DetectLanguage(candidate.GetString(bytes, index, count));
                }
                catch (NLangDetectException ex)
                {
                    // Best effort: log and fall through to the next decoding.
                    _logger.LogDebug(ex, "LanguageDetector.DetectLanguage threw a NLangDetectException.");
                }
            }

            return null;
        }

        /// <summary>
        /// Resolves a charset name to an <see cref="Encoding"/>.
        /// </summary>
        /// <param name="charset">The charset name, e.g. <c>windows-1252</c>.</param>
        /// <returns>The matching encoding.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="charset"/> is null, empty or whitespace.</exception>
        /// <remarks>
        /// When the first lookup throws <see cref="ArgumentException"/>, the lookup is retried once
        /// with all hyphens removed from the name; an exception from that retry is not caught.
        /// </remarks>
        public Encoding GetEncodingFromCharset(string charset)
        {
            if (string.IsNullOrWhiteSpace(charset))
            {
                throw new ArgumentNullException(nameof(charset));
            }

            _logger.LogDebug("Getting encoding object for character set: {0}", charset);

            try
            {
                return Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                charset = charset.Replace("-", string.Empty);
                _logger.LogDebug("Getting encoding object for character set: {0}", charset);

                return Encoding.GetEncoding(charset);
            }
        }

        /// <summary>
        /// Detects the charset of the given bytes and resolves it to an <see cref="Encoding"/>.
        /// </summary>
        /// <param name="bytes">The raw bytes.</param>
        /// <param name="size">The number of bytes, counted from the start of <paramref name="bytes"/>, to examine.</param>
        /// <param name="language">A language code used as a hint; may be null or blank.</param>
        /// <param name="enableLanguageDetection">
        /// Whether to run language detection when <paramref name="language"/> is null or blank.
        /// </param>
        /// <returns>The detected encoding.</returns>
        /// <exception cref="ArgumentNullException">
        /// No charset name could be determined; the null name is rejected by
        /// <see cref="GetEncodingFromCharset"/>.
        /// </exception>
        public Encoding GetDetectedEncoding(byte[] bytes, int size, string language, bool enableLanguageDetection)
        {
            var charset = GetDetectedEncodingName(bytes, size, language, enableLanguageDetection);

            return GetEncodingFromCharset(charset);
        }

        /// <summary>
        /// Maps a three-letter language code to the charset name assumed for text in that language.
        /// </summary>
        /// <param name="language">The language code; matched case-insensitively.</param>
        /// <returns>The charset name; <c>windows-1252</c> for codes that are not listed.</returns>
        private static string GetFileCharacterSetFromLanguage(string language)
        {
            // https://developer.xamarin.com/api/type/System.Text.Encoding/
            // Invariant lower-casing keeps the match independent of the current culture
            // (under a Turkish culture "VIE".ToLower() yields "vıe" and would miss its label).
            switch (language.ToLowerInvariant())
            {
                case "tha":
                    return "windows-874";

                // NOTE(review): Hungarian is usually associated with windows-1250 (Central
                // European); the value is kept as found - confirm before changing.
                case "hun":
                    return "windows-1252";

                case "pol":
                case "cze":
                case "ces":
                case "slo":
                case "srp":
                case "hrv":
                case "rum":
                case "ron":
                case "rom":
                case "rup":
                // albanian
                case "alb":
                case "sqi":
                // slovak, slovenian
                case "slk":
                case "slv":
                    return "windows-1250";

                case "ara":
                    return "windows-1256";

                case "heb":
                    return "windows-1255";

                // greek
                case "grc":
                case "gre":
                case "ell":
                    return "windows-1253";

                case "crh":
                case "ota":
                case "tur":
                    return "windows-1254";

                // bulgarian, russian
                case "bul":
                case "bgr":
                case "rus":
                    return "windows-1251";

                case "vie":
                    return "windows-1258";

                case "kor":
                    return "cp949";

                default:
                    return "windows-1252";
            }
        }

        /// <summary>
        /// Feeds the bytes to a <see cref="CharsetDetector"/> and returns the charset it reports.
        /// </summary>
        /// <param name="bytes">The raw bytes.</param>
        /// <param name="index">The index of the first byte to feed.</param>
        /// <param name="count">The number of bytes to feed.</param>
        /// <param name="language">The known language, if any; may be null or blank.</param>
        /// <returns>
        /// The detector's charset, or <c>null</c> when it reports <c>x-mac-cyrillic</c> while a
        /// language is known.
        /// </returns>
        private static string DetectCharset(byte[] bytes, int index, int count, string language)
        {
            var detector = new CharsetDetector();
            detector.Feed(bytes, index, count);
            detector.DataEnd();

            var charset = detector.Charset;

            // x-mac-cyrillic is often detected incorrectly. When a language is known, discard the
            // result so the caller falls back to the language-based lookup instead.
            if (string.Equals("x-mac-cyrillic", charset, StringComparison.OrdinalIgnoreCase)
                && !string.IsNullOrWhiteSpace(language))
            {
                return null;
            }

            return charset;
        }
    }
}
|