Browse Source

add subtitle language detection

Luke Pulverenti 8 years ago
parent
commit
0e7cbb0465
76 changed files with 2101 additions and 26 deletions
  1. 72 0
      Emby.Common.Implementations/Emby.Common.Implementations.csproj
  2. 0 18
      Emby.Common.Implementations/IO/SharpCifs/Util/Sharpen/Extensions.cs
  3. 371 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Detector.cs
  4. 127 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/DetectorFactory.cs
  5. 15 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/ErrorCode.cs
  6. 374 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/CharExtensions.cs
  7. 51 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/RandomExtensions.cs
  8. 25 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/StringExtensions.cs
  9. 131 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/UnicodeBlock.cs
  10. 67 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/GenProfile.cs
  11. 22 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/InternalException.cs
  12. 45 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Language.cs
  13. 37 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/LanguageDetector.cs
  14. 23 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/NLangDetectException.cs
  15. 35 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/ProbVector.cs
  16. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/afr
  17. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/ara
  18. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/ben
  19. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/bul
  20. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/ces
  21. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/dan
  22. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/deu
  23. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/ell
  24. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/eng
  25. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/est
  26. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/fas
  27. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/fin
  28. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/fra
  29. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/guj
  30. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/heb
  31. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/hin
  32. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/hrv
  33. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/hun
  34. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/ind
  35. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/ita
  36. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/jpn
  37. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/kan
  38. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/kor
  39. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/lav
  40. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/lit
  41. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/mal
  42. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/mar
  43. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/mkd
  44. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/nep
  45. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/nld
  46. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/nor
  47. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/pan
  48. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/pol
  49. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/por
  50. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/ron
  51. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/rus
  52. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/slk
  53. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/slv
  54. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/som
  55. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/spa
  56. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/sqi
  57. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/swa
  58. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/swe
  59. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/tam
  60. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/tel
  61. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/tgl
  62. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/tha
  63. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/tur
  64. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/ukr
  65. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/urd
  66. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/vie
  67. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/zh-cn
  68. 0 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/zh-tw
  69. 118 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Utils/LangProfile.cs
  70. 91 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Utils/Messages.cs
  71. 330 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Utils/NGram.cs
  72. 76 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Utils/TagExtractor.cs
  73. 26 0
      Emby.Common.Implementations/TextEncoding/NLangDetect/Utils/messages.properties
  74. 60 3
      Emby.Common.Implementations/TextEncoding/TextEncoding.cs
  75. 1 1
      Emby.Server.Core/ApplicationHost.cs
  76. 4 4
      MediaBrowser.Api/UserLibrary/BaseItemsRequest.cs

+ 72 - 0
Emby.Common.Implementations/Emby.Common.Implementations.csproj

@@ -317,6 +317,23 @@
     <Compile Include="ScheduledTasks\WeeklyTrigger.cs" />
     <Compile Include="ScheduledTasks\WeeklyTrigger.cs" />
     <Compile Include="Serialization\JsonSerializer.cs" />
     <Compile Include="Serialization\JsonSerializer.cs" />
     <Compile Include="Serialization\XmlSerializer.cs" />
     <Compile Include="Serialization\XmlSerializer.cs" />
+    <Compile Include="TextEncoding\NLangDetect\Detector.cs" />
+    <Compile Include="TextEncoding\NLangDetect\DetectorFactory.cs" />
+    <Compile Include="TextEncoding\NLangDetect\ErrorCode.cs" />
+    <Compile Include="TextEncoding\NLangDetect\Extensions\CharExtensions.cs" />
+    <Compile Include="TextEncoding\NLangDetect\Extensions\RandomExtensions.cs" />
+    <Compile Include="TextEncoding\NLangDetect\Extensions\StringExtensions.cs" />
+    <Compile Include="TextEncoding\NLangDetect\Extensions\UnicodeBlock.cs" />
+    <Compile Include="TextEncoding\NLangDetect\GenProfile.cs" />
+    <Compile Include="TextEncoding\NLangDetect\InternalException.cs" />
+    <Compile Include="TextEncoding\NLangDetect\Language.cs" />
+    <Compile Include="TextEncoding\NLangDetect\LanguageDetector.cs" />
+    <Compile Include="TextEncoding\NLangDetect\NLangDetectException.cs" />
+    <Compile Include="TextEncoding\NLangDetect\ProbVector.cs" />
+    <Compile Include="TextEncoding\NLangDetect\Utils\LangProfile.cs" />
+    <Compile Include="TextEncoding\NLangDetect\Utils\Messages.cs" />
+    <Compile Include="TextEncoding\NLangDetect\Utils\NGram.cs" />
+    <Compile Include="TextEncoding\NLangDetect\Utils\TagExtractor.cs" />
     <Compile Include="TextEncoding\TextEncoding.cs" />
     <Compile Include="TextEncoding\TextEncoding.cs" />
     <Compile Include="TextEncoding\TextEncodingDetect.cs" />
     <Compile Include="TextEncoding\TextEncodingDetect.cs" />
     <Compile Include="TextEncoding\UniversalDetector\CharsetDetector.cs" />
     <Compile Include="TextEncoding\UniversalDetector\CharsetDetector.cs" />
@@ -368,7 +385,62 @@
   </ItemGroup>
   </ItemGroup>
   <ItemGroup>
   <ItemGroup>
     <None Include="packages.config" />
     <None Include="packages.config" />
+    <!-- Language profiles are read at runtime via Assembly.GetManifestResourceNames(), so every profile must be embedded. -->
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\afr" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\ara" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\bul" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\ben" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\ces" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\dan" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\deu" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\ell" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\eng" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\spa" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\est" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\fas" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\fin" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\fra" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\guj" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\heb" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\hin" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\hrv" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\hun" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\ind" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\ita" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\jpn" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\kan" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\kor" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\lit" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\lav" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\mkd" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\mal" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\mar" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\nep" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\nld" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\nor" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\pan" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\pol" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\por" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\ron" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\rus" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\slk" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\slv" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\som" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\sqi" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\swe" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\swa" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\tam" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\tel" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\tha" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\tgl" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\tur" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\ukr" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\urd" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\vie" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\zh-cn" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Profiles\zh-tw" />
+    <EmbeddedResource Include="TextEncoding\NLangDetect\Utils\messages.properties" />
   </ItemGroup>
   </ItemGroup>
+  <ItemGroup />
   <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
   <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
   <!-- To modify your build process, add your task inside one of the targets below and uncomment it. 
   <!-- To modify your build process, add your task inside one of the targets below and uncomment it. 
        Other similar extension points exist, see Microsoft.Common.targets.
        Other similar extension points exist, see Microsoft.Common.targets.

+ 0 - 18
Emby.Common.Implementations/IO/SharpCifs/Util/Sharpen/Extensions.cs

@@ -207,24 +207,6 @@ namespace SharpCifs.Util.Sharpen
             return (int)tzone.GetUtcOffset(MillisToDateTimeOffset(date, 0).DateTime).TotalMilliseconds;
             return (int)tzone.GetUtcOffset(MillisToDateTimeOffset(date, 0).DateTime).TotalMilliseconds;
         }
         }
 
 
-        public static InputStream GetResourceAsStream(this Type type, string name)
-        {
-            //Type.`Assembly` property deleted
-            //string str2 = type.Assembly.GetName().Name + ".resources";
-            string str2 = type.GetTypeInfo().Assembly.GetName().Name + ".resources";
-            string[] textArray1 = { str2, ".", type.Namespace, ".", name };
-            string str = string.Concat(textArray1);
-            
-            //Type.`Assembly` property deleted
-            //Stream manifestResourceStream = type.Assembly.GetManifestResourceStream(str);
-            Stream manifestResourceStream = type.GetTypeInfo().Assembly.GetManifestResourceStream(str);
-            if (manifestResourceStream == null)
-            {
-                return null;
-            }
-            return InputStream.Wrap(manifestResourceStream);
-        }
-
         public static long GetTime(this DateTime dateTime)
         public static long GetTime(this DateTime dateTime)
         {
         {
             return new DateTimeOffset(DateTime.SpecifyKind(dateTime, DateTimeKind.Utc), TimeSpan.Zero).ToMillisecondsSinceEpoch();
             return new DateTimeOffset(DateTime.SpecifyKind(dateTime, DateTimeKind.Utc), TimeSpan.Zero).ToMillisecondsSinceEpoch();

+ 371 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Detector.cs

@@ -0,0 +1,371 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Text.RegularExpressions;
+using NLangDetect.Core.Extensions;
+using NLangDetect.Core.Utils;
+
+namespace NLangDetect.Core
+{
+    /// <summary>
+    /// Collects text through the <c>Append</c> overloads and estimates which of the
+    /// factory's languages it is written in. A detection runs several randomized
+    /// trials; each trial repeatedly multiplies the per-language probabilities by
+    /// the profile weights of randomly drawn n-grams, and the trial results are averaged.
+    /// </summary>
+    public class Detector
+    {
+        private const double _AlphaDefault = 0.5;
+        private const double _AlphaWidth = 0.05;
+
+        private const int _IterationLimit = 1000;
+        private const double _ProbThreshold = 0.1;
+        private const double _ConvThreshold = 0.99999;
+        private const int _BaseFreq = 10000;
+        private const int _trialsCount = 7;
+
+        private static readonly Regex _UrlRegex = new Regex("https?://[-_.?&~;+=/#0-9A-Za-z]+", RegexOptions.Compiled);
+        private static readonly Regex _MailRegex = new Regex("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+", RegexOptions.Compiled);
+
+        private readonly Dictionary<string, ProbVector> _wordLangProbMap;
+        private readonly List<string> _langlist;
+        private readonly int? _seed;
+
+        private StringBuilder _text;
+        private double[] _langprob;
+
+        private double _alpha = _AlphaDefault;
+        private int _maxTextLength = 10000;
+        private double[] _priorMap;
+
+        #region Constructor(s)
+
+        /// <summary>
+        /// Creates a detector that shares the factory's n-gram table, language list and seed.
+        /// </summary>
+        /// <param name="factory">Factory whose loaded profiles are used for detection.</param>
+        /// <exception cref="ArgumentNullException"><paramref name="factory"/> is null.</exception>
+        public Detector(DetectorFactory factory)
+        {
+            if (factory == null)
+            {
+                throw new ArgumentNullException("factory");
+            }
+
+            _wordLangProbMap = factory.WordLangProbMap;
+            _langlist = factory.Langlist;
+            _text = new StringBuilder();
+            _seed = factory.Seed;
+        }
+
+        #endregion
+
+        #region Public methods
+
+        /// <summary>
+        /// Sets the base smoothing value; each trial perturbs it with Gaussian noise.
+        /// </summary>
+        /// <param name="alpha">New smoothing value.</param>
+        public void SetAlpha(double alpha)
+        {
+            _alpha = alpha;
+        }
+
+        /// <summary>
+        /// Sets the prior probability of each language. Languages missing from
+        /// <paramref name="priorMap"/> get a prior of zero; the values are normalized to sum to one.
+        /// </summary>
+        /// <param name="priorMap">Language name to non-negative prior weight.</param>
+        /// <exception cref="ArgumentNullException"><paramref name="priorMap"/> is null.</exception>
+        /// <exception cref="NLangDetectException">A weight is negative, or no known language has a positive weight.</exception>
+        public void SetPriorMap(Dictionary<string, double> priorMap)
+        {
+            if (priorMap == null)
+            {
+                throw new ArgumentNullException("priorMap");
+            }
+
+            // Build into a local array so a validation failure leaves the previous prior untouched.
+            var normalized = new double[_langlist.Count];
+            double sump = 0;
+
+            for (int i = 0; i < normalized.Length; i++)
+            {
+                double p;
+
+                if (!priorMap.TryGetValue(_langlist[i], out p))
+                {
+                    continue;
+                }
+
+                if (p < 0)
+                {
+                    throw new NLangDetectException("Prior probability must be non-negative.", ErrorCode.InitParamError);
+                }
+
+                normalized[i] = p;
+                sump += p;
+            }
+
+            if (sump <= 0)
+            {
+                throw new NLangDetectException("At least one prior probability must be non-zero.", ErrorCode.InitParamError);
+            }
+
+            for (int i = 0; i < normalized.Length; i++)
+            {
+                normalized[i] /= sump;
+            }
+
+            _priorMap = normalized;
+        }
+
+        /// <summary>
+        /// Sets the length limit used by the <c>Append</c> overloads.
+        /// </summary>
+        /// <param name="max_text_length">Maximum number of characters to consider.</param>
+        public void SetMaxTextLength(int max_text_length)
+        {
+            _maxTextLength = max_text_length;
+        }
+
+        // TODO IMM HI: TextReader?
+        /// <summary>
+        /// Reads chunks from <paramref name="streamReader"/> and appends them until the
+        /// accumulated text reaches the maximum text length or the stream ends.
+        /// </summary>
+        /// <param name="streamReader">Source of the text to analyse.</param>
+        /// <exception cref="ArgumentNullException"><paramref name="streamReader"/> is null.</exception>
+        public void Append(StreamReader streamReader)
+        {
+            if (streamReader == null)
+            {
+                throw new ArgumentNullException("streamReader");
+            }
+
+            // A zero-length buffer would make Read return 0 on every call and spin forever.
+            var buf = new char[Math.Max(1, _maxTextLength / 2)];
+
+            while (_text.Length < _maxTextLength && !streamReader.EndOfStream)
+            {
+                int length = streamReader.Read(buf, 0, buf.Length);
+
+                if (length <= 0)
+                {
+                    break;
+                }
+
+                Append(new string(buf, 0, length));
+            }
+        }
+
+        /// <summary>
+        /// Appends <paramref name="text"/> after replacing URLs and mail addresses with a
+        /// space, normalizing every character and collapsing runs of spaces. At most the
+        /// first "max text length" characters of the cleaned input are examined per call.
+        /// </summary>
+        /// <param name="text">Text to add to the detection buffer.</param>
+        public void Append(string text)
+        {
+            text = _UrlRegex.Replace(text, " ");
+            text = _MailRegex.Replace(text, " ");
+
+            char pre = '\0';
+
+            for (int i = 0; i < text.Length && i < _maxTextLength; i++)
+            {
+                char c = NGram.Normalize(text[i]);
+
+                // Skip a space that directly follows another space.
+                if (c != ' ' || pre != ' ')
+                {
+                    _text.Append(c);
+                }
+
+                pre = c;
+            }
+        }
+
+        /// <summary>
+        /// Drops the characters in the range 'A'..'z' from the buffer when they are
+        /// outnumbered more than two to one by characters at or above U+0300 that are
+        /// not in the Latin Extended Additional block.
+        /// </summary>
+        private void CleanText()
+        {
+            int latinCount = 0, nonLatinCount = 0;
+
+            for (int i = 0; i < _text.Length; i++)
+            {
+                char c = _text[i];
+
+                // NOTE: the 'A'..'z' range also covers the ASCII punctuation between 'Z' and 'a'.
+                if (c <= 'z' && c >= 'A')
+                {
+                    latinCount++;
+                }
+                else if (c >= '\u0300' && c.GetUnicodeBlock() != UnicodeBlock.LatinExtendedAdditional)
+                {
+                    nonLatinCount++;
+                }
+            }
+
+            if (latinCount * 2 >= nonLatinCount)
+            {
+                return;
+            }
+
+            var textWithoutLatin = new StringBuilder();
+
+            for (int i = 0; i < _text.Length; i++)
+            {
+                char c = _text[i];
+
+                if (c > 'z' || c < 'A')
+                {
+                    textWithoutLatin.Append(c);
+                }
+            }
+
+            _text = textWithoutLatin;
+        }
+
+        /// <summary>
+        /// Returns the name of the most probable language, or null when no language
+        /// exceeds the probability threshold.
+        /// </summary>
+        /// <exception cref="NLangDetectException">The text contains no known n-grams.</exception>
+        public string Detect()
+        {
+            List<Language> probabilities = GetProbabilities();
+
+            return
+              probabilities.Count > 0
+                ? probabilities[0].Name
+                : null;
+        }
+
+        /// <summary>
+        /// Returns the languages whose probability exceeds the threshold, most probable first.
+        /// The underlying probabilities are computed on the first call and reused afterwards.
+        /// </summary>
+        /// <exception cref="NLangDetectException">The text contains no known n-grams.</exception>
+        public List<Language> GetProbabilities()
+        {
+            if (_langprob == null)
+            {
+                DetectBlock();
+            }
+
+            return SortProbability(_langprob);
+        }
+
+        #endregion
+
+        #region Private helper methods
+
+        /// <summary>
+        /// Scales <paramref name="probs"/> in place so it sums to one and returns the largest
+        /// scaled value. When the sum is not positive the array is left untouched and 0 is returned.
+        /// </summary>
+        private static double NormalizeProb(double[] probs)
+        {
+            double sump = probs.Sum();
+
+            // Dividing by a zero (or NaN) sum would turn every entry into NaN.
+            if (!(sump > 0))
+            {
+                return 0;
+            }
+
+            double maxp = 0;
+
+            for (int i = 0; i < probs.Length; i++)
+            {
+                double p = probs[i] / sump;
+
+                if (maxp < p)
+                {
+                    maxp = p;
+                }
+
+                probs[i] = p;
+            }
+
+            return maxp;
+        }
+
+        /// <summary>
+        /// Returns <paramref name="word"/> with every character at or above U+0080 replaced
+        /// by a <c>\uXXXX</c> escape using four lower-case hexadecimal digits.
+        /// </summary>
+        private static string UnicodeEncode(string word)
+        {
+            var resultSb = new StringBuilder();
+
+            foreach (char ch in word)
+            {
+                if (ch >= '\u0080')
+                {
+                    resultSb
+                      .Append("\\u")
+                      .Append(((int)ch).ToString("x4"));
+                }
+                else
+                {
+                    resultSb.Append(ch);
+                }
+            }
+
+            return resultSb.ToString();
+        }
+
+        /// <summary>
+        /// Cleans the buffer, extracts its known n-grams and fills <c>_langprob</c> with the
+        /// average of the per-trial language probabilities.
+        /// </summary>
+        /// <exception cref="NLangDetectException">The text contains no known n-grams.</exception>
+        private void DetectBlock()
+        {
+            CleanText();
+
+            List<string> ngrams = ExtractNGrams();
+
+            if (ngrams.Count == 0)
+            {
+                throw new NLangDetectException("no features in text", ErrorCode.CantDetectError);
+            }
+
+            _langprob = new double[_langlist.Count];
+
+            // A fixed seed makes the random draws below reproducible.
+            Random rand = _seed.HasValue ? new Random(_seed.Value) : new Random();
+
+            for (int t = 0; t < _trialsCount; t++)
+            {
+                double[] prob = InitProbability();
+
+                // TODO IMM HI: verify it works
+                double alpha = _alpha + rand.NextGaussian() * _AlphaWidth;
+
+                for (int i = 0; ; i++)
+                {
+                    int r = rand.Next(ngrams.Count);
+
+                    UpdateLangProb(prob, ngrams[r], alpha);
+
+                    // Normalize and test for convergence every fifth iteration only.
+                    if (i % 5 == 0
+                        && (NormalizeProb(prob) > _ConvThreshold || i >= _IterationLimit))
+                    {
+                        break;
+                    }
+                }
+
+                for (int j = 0; j < _langprob.Length; j++)
+                {
+                    _langprob[j] += prob[j] / _trialsCount;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Returns the starting probabilities for a trial: the configured prior when one
+        /// was set, otherwise a uniform distribution over all languages.
+        /// </summary>
+        private double[] InitProbability()
+        {
+            var prob = new double[_langlist.Count];
+
+            for (int i = 0; i < prob.Length; i++)
+            {
+                prob[i] = _priorMap != null
+                    ? _priorMap[i]
+                    : 1.0 / _langlist.Count;
+            }
+
+            return prob;
+        }
+
+        /// <summary>
+        /// Returns every n-gram of the buffer (lengths 1 to <c>NGram.GramsCount</c>) that
+        /// occurs in the n-gram table, in text order and with repetitions.
+        /// </summary>
+        private List<string> ExtractNGrams()
+        {
+            var list = new List<string>();
+            NGram ngram = new NGram();
+
+            for (int i = 0; i < _text.Length; i++)
+            {
+                ngram.AddChar(_text[i]);
+
+                for (int n = 1; n <= NGram.GramsCount; n++)
+                {
+                    string w = ngram.Get(n);
+
+                    if (w != null && _wordLangProbMap.ContainsKey(w))
+                    {
+                        list.Add(w);
+                    }
+                }
+            }
+
+            return list;
+        }
+
+        /// <summary>
+        /// Multiplies each language probability by the smoothed weight of <paramref name="word"/>
+        /// for that language. Unknown or null words leave <paramref name="prob"/> unchanged.
+        /// </summary>
+        private void UpdateLangProb(double[] prob, string word, double alpha)
+        {
+            ProbVector langProbMap;
+
+            if (word == null || !_wordLangProbMap.TryGetValue(word, out langProbMap))
+            {
+                return;
+            }
+
+            double weight = alpha / _BaseFreq;
+
+            for (int i = 0; i < prob.Length; i++)
+            {
+                prob[i] *= weight + langProbMap[i];
+            }
+        }
+
+        /// <summary>
+        /// Returns the languages whose probability exceeds the threshold, ordered by
+        /// descending probability; equal probabilities keep language-list order.
+        /// </summary>
+        private List<Language> SortProbability(double[] prob)
+        {
+            // OrderByDescending is a stable sort, so ties stay in language-list order.
+            return prob
+                .Select((p, j) => new { Probability = p, Index = j })
+                .Where(entry => entry.Probability > _ProbThreshold)
+                .OrderByDescending(entry => entry.Probability)
+                .Select(entry => new Language(_langlist[entry.Index], entry.Probability))
+                .ToList();
+        }
+
+        #endregion
+    }
+}

+ 127 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/DetectorFactory.cs

@@ -0,0 +1,127 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.IO.Compression;
+using NLangDetect.Core.Utils;
+using MediaBrowser.Model.Serialization;
+using System.Linq;
+
+namespace NLangDetect.Core
+{
+    /// <summary>
+    /// Holds the loaded language profiles in a single static instance and creates
+    /// <see cref="Detector"/> objects that share them.
+    /// </summary>
+    public class DetectorFactory
+    {
+        /// <summary>N-gram to per-language probability vector, filled by <c>AddProfile</c>.</summary>
+        public Dictionary<string, ProbVector> WordLangProbMap;
+
+        /// <summary>Names of the loaded languages; a language's position is its probability-vector index.</summary>
+        public List<string> Langlist;
+
+        private static readonly DetectorFactory _instance = new DetectorFactory();
+
+        #region Constructor(s)
+
+        private DetectorFactory()
+        {
+            WordLangProbMap = new Dictionary<string, ProbVector>();
+            Langlist = new List<string>();
+        }
+
+        #endregion
+
+        #region Public methods
+
+        /// <summary>
+        /// Loads every embedded resource of this assembly whose name contains
+        /// "NLangDetect.Profiles" as a language profile.
+        /// </summary>
+        /// <param name="json">Serializer used to deserialize each profile stream.</param>
+        /// <exception cref="ArgumentNullException"><paramref name="json"/> is null.</exception>
+        /// <exception cref="NLangDetectException">A profile's language is already loaded.</exception>
+        public static void LoadProfiles(IJsonSerializer json)
+        {
+            if (json == null)
+            {
+                throw new ArgumentNullException("json");
+            }
+
+            var assembly = typeof(DetectorFactory).Assembly;
+            var names = assembly.GetManifestResourceNames()
+                      .Where(i => i.IndexOf("NLangDetect.Profiles", StringComparison.Ordinal) != -1)
+                      .ToList();
+
+            foreach (var name in names)
+            {
+                using (var stream = assembly.GetManifestResourceStream(name))
+                {
+                    var langProfile = (LangProfile)json.DeserializeFromStream(stream, typeof(LangProfile));
+
+                    // The vector index must equal the slot the language takes in Langlist,
+                    // even when languages were registered before this call.
+                    AddProfile(langProfile, _instance.Langlist.Count);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Creates a detector using the loaded profiles.
+        /// </summary>
+        /// <exception cref="NLangDetectException">No profiles have been loaded.</exception>
+        public static Detector Create()
+        {
+            return CreateDetector();
+        }
+
+        /// <summary>
+        /// Creates a detector using the loaded profiles and the given smoothing value.
+        /// </summary>
+        /// <param name="alpha">Value passed to <c>Detector.SetAlpha</c>.</param>
+        /// <exception cref="NLangDetectException">No profiles have been loaded.</exception>
+        public static Detector Create(double alpha)
+        {
+            Detector detector = CreateDetector();
+
+            detector.SetAlpha(alpha);
+
+            return detector;
+        }
+
+        /// <summary>
+        /// Sets the seed handed to detectors created afterwards; null clears it.
+        /// </summary>
+        public static void SetSeed(int? seed)
+        {
+            _instance.Seed = seed;
+        }
+
+        #endregion
+
+        #region Internal methods
+
+        /// <summary>
+        /// Registers <paramref name="profile"/> and stores the relative frequency of each of
+        /// its n-grams at position <paramref name="index"/> of that n-gram's probability vector.
+        /// </summary>
+        /// <param name="profile">Profile to register.</param>
+        /// <param name="index">Probability-vector slot for the profile's language.</param>
+        /// <exception cref="ArgumentNullException"><paramref name="profile"/> is null.</exception>
+        /// <exception cref="NLangDetectException">The profile's language is already registered.</exception>
+        internal static void AddProfile(LangProfile profile, int index)
+        {
+            if (profile == null)
+            {
+                throw new ArgumentNullException("profile");
+            }
+
+            var lang = profile.name;
+
+            if (_instance.Langlist.Contains(lang))
+            {
+                throw new NLangDetectException("duplicate the same language profile", ErrorCode.DuplicateLangError);
+            }
+
+            _instance.Langlist.Add(lang);
+
+            foreach (string word in profile.freq.Keys)
+            {
+                ProbVector vector;
+
+                if (!_instance.WordLangProbMap.TryGetValue(word, out vector))
+                {
+                    vector = new ProbVector();
+                    _instance.WordLangProbMap.Add(word, vector);
+                }
+
+                int length = word.Length;
+
+                // n_words is indexed by n-gram length - 1; skip lengths it cannot describe.
+                if (length < 1 || length > NGram.GramsCount)
+                {
+                    continue;
+                }
+
+                double prob = (double)profile.freq[word] / profile.n_words[length - 1];
+
+                vector[index] = prob;
+            }
+        }
+
+        /// <summary>
+        /// Removes every loaded language and n-gram.
+        /// </summary>
+        internal static void Clear()
+        {
+            _instance.Langlist.Clear();
+            _instance.WordLangProbMap.Clear();
+        }
+
+        #endregion
+
+        #region Private helper methods
+
+        /// <summary>
+        /// Creates a detector bound to the shared instance, failing when no profile is loaded.
+        /// </summary>
+        private static Detector CreateDetector()
+        {
+            if (_instance.Langlist.Count == 0)
+            {
+                throw new NLangDetectException("need to load profiles", ErrorCode.NeedLoadProfileError);
+            }
+
+            return new Detector(_instance);
+        }
+
+        #endregion
+
+        #region Properties
+
+        /// <summary>Seed for the detectors' random number generator; null when unset.</summary>
+        public int? Seed { get; private set; }
+
+        #endregion
+    }
+}

+ 15 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/ErrorCode.cs

@@ -0,0 +1,15 @@
+namespace NLangDetect.Core
+{
+  /// <summary>
+  /// Failure categories passed to <c>NLangDetectException</c>.
+  /// The numeric values match the original declaration order.
+  /// </summary>
+  public enum ErrorCode
+  {
+    NoTextError = 0,
+    FormatError = 1,
+    FileLoadError = 2,
+    DuplicateLangError = 3,
+    NeedLoadProfileError = 4,
+    CantDetectError = 5,
+    CantOpenTrainData = 6,
+    TrainDataFormatError = 7,
+    InitParamError = 8,
+  }
+}

+ 374 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/CharExtensions.cs

@@ -0,0 +1,374 @@
+using System;
+
+namespace NLangDetect.Core.Extensions
+{
+  public static class CharExtensions
+  {
+    private const int MIN_CODE_POINT = 0x000000;
+    private const int MAX_CODE_POINT = 0x10ffff;
+
+    private static readonly int[] _unicodeBlockStarts =
+      {
+        #region Unicode block starts
+
+        0x0000, // Basic Latin
+        0x0080, // Latin-1 Supplement
+        0x0100, // Latin Extended-A
+        0x0180, // Latin Extended-B
+        0x0250, // IPA Extensions
+        0x02B0, // Spacing Modifier Letters
+        0x0300, // Combining Diacritical Marks
+        0x0370, // Greek and Coptic
+        0x0400, // Cyrillic
+        0x0500, // Cyrillic Supplementary
+        0x0530, // Armenian
+        0x0590, // Hebrew
+        0x0600, // Arabic
+        0x0700, // Syriac
+        0x0750, // unassigned
+        0x0780, // Thaana
+        0x07C0, // unassigned
+        0x0900, // Devanagari
+        0x0980, // Bengali
+        0x0A00, // Gurmukhi
+        0x0A80, // Gujarati
+        0x0B00, // Oriya
+        0x0B80, // Tamil
+        0x0C00, // Telugu
+        0x0C80, // Kannada
+        0x0D00, // Malayalam
+        0x0D80, // Sinhala
+        0x0E00, // Thai
+        0x0E80, // Lao
+        0x0F00, // Tibetan
+        0x1000, // Myanmar
+        0x10A0, // Georgian
+        0x1100, // Hangul Jamo
+        0x1200, // Ethiopic
+        0x1380, // unassigned
+        0x13A0, // Cherokee
+        0x1400, // Unified Canadian Aboriginal Syllabics
+        0x1680, // Ogham
+        0x16A0, // Runic
+        0x1700, // Tagalog
+        0x1720, // Hanunoo
+        0x1740, // Buhid
+        0x1760, // Tagbanwa
+        0x1780, // Khmer
+        0x1800, // Mongolian
+        0x18B0, // unassigned
+        0x1900, // Limbu
+        0x1950, // Tai Le
+        0x1980, // unassigned
+        0x19E0, // Khmer Symbols
+        0x1A00, // unassigned
+        0x1D00, // Phonetic Extensions
+        0x1D80, // unassigned
+        0x1E00, // Latin Extended Additional
+        0x1F00, // Greek Extended
+        0x2000, // General Punctuation
+        0x2070, // Superscripts and Subscripts
+        0x20A0, // Currency Symbols
+        0x20D0, // Combining Diacritical Marks for Symbols
+        0x2100, // Letterlike Symbols
+        0x2150, // Number Forms
+        0x2190, // Arrows
+        0x2200, // Mathematical Operators
+        0x2300, // Miscellaneous Technical
+        0x2400, // Control Pictures
+        0x2440, // Optical Character Recognition
+        0x2460, // Enclosed Alphanumerics
+        0x2500, // Box Drawing
+        0x2580, // Block Elements
+        0x25A0, // Geometric Shapes
+        0x2600, // Miscellaneous Symbols
+        0x2700, // Dingbats
+        0x27C0, // Miscellaneous Mathematical Symbols-A
+        0x27F0, // Supplemental Arrows-A
+        0x2800, // Braille Patterns
+        0x2900, // Supplemental Arrows-B
+        0x2980, // Miscellaneous Mathematical Symbols-B
+        0x2A00, // Supplemental Mathematical Operators
+        0x2B00, // Miscellaneous Symbols and Arrows
+        0x2C00, // unassigned
+        0x2E80, // CJK Radicals Supplement
+        0x2F00, // Kangxi Radicals
+        0x2FE0, // unassigned
+        0x2FF0, // Ideographic Description Characters
+        0x3000, // CJK Symbols and Punctuation
+        0x3040, // Hiragana
+        0x30A0, // Katakana
+        0x3100, // Bopomofo
+        0x3130, // Hangul Compatibility Jamo
+        0x3190, // Kanbun
+        0x31A0, // Bopomofo Extended
+        0x31C0, // unassigned
+        0x31F0, // Katakana Phonetic Extensions
+        0x3200, // Enclosed CJK Letters and Months
+        0x3300, // CJK Compatibility
+        0x3400, // CJK Unified Ideographs Extension A
+        0x4DC0, // Yijing Hexagram Symbols
+        0x4E00, // CJK Unified Ideographs
+        0xA000, // Yi Syllables
+        0xA490, // Yi Radicals
+        0xA4D0, // unassigned
+        0xAC00, // Hangul Syllables
+        0xD7B0, // unassigned
+        0xD800, // High Surrogates
+        0xDB80, // High Private Use Surrogates
+        0xDC00, // Low Surrogates
+        0xE000, // Private Use
+        0xF900, // CJK Compatibility Ideographs
+        0xFB00, // Alphabetic Presentation Forms
+        0xFB50, // Arabic Presentation Forms-A
+        0xFE00, // Variation Selectors
+        0xFE10, // unassigned
+        0xFE20, // Combining Half Marks
+        0xFE30, // CJK Compatibility Forms
+        0xFE50, // Small Form Variants
+        0xFE70, // Arabic Presentation Forms-B
+        0xFF00, // Halfwidth and Fullwidth Forms
+        0xFFF0, // Specials
+        0x10000, // Linear B Syllabary
+        0x10080, // Linear B Ideograms
+        0x10100, // Aegean Numbers
+        0x10140, // unassigned
+        0x10300, // Old Italic
+        0x10330, // Gothic
+        0x10350, // unassigned
+        0x10380, // Ugaritic
+        0x103A0, // unassigned
+        0x10400, // Deseret
+        0x10450, // Shavian
+        0x10480, // Osmanya
+        0x104B0, // unassigned
+        0x10800, // Cypriot Syllabary
+        0x10840, // unassigned
+        0x1D000, // Byzantine Musical Symbols
+        0x1D100, // Musical Symbols
+        0x1D200, // unassigned
+        0x1D300, // Tai Xuan Jing Symbols
+        0x1D360, // unassigned
+        0x1D400, // Mathematical Alphanumeric Symbols
+        0x1D800, // unassigned
+        0x20000, // CJK Unified Ideographs Extension B
+        0x2A6E0, // unassigned
+        0x2F800, // CJK Compatibility Ideographs Supplement
+        0x2FA20, // unassigned
+        0xE0000, // Tags
+        0xE0080, // unassigned
+        0xE0100, // Variation Selectors Supplement
+        0xE01F0, // unassigned
+        0xF0000, // Supplementary Private Use Area-A
+        0x100000, // Supplementary Private Use Area-B
+
+        #endregion
+      };
+
+    private static readonly UnicodeBlock?[] _unicodeBlocks =
+      {
+        #region Unicode blocks
+        UnicodeBlock.BasicLatin,
+        UnicodeBlock.Latin1Supplement,
+        UnicodeBlock.LatinExtendedA,
+        UnicodeBlock.LatinExtendedB,
+        UnicodeBlock.IpaExtensions,
+        UnicodeBlock.SpacingModifierLetters,
+        UnicodeBlock.CombiningDiacriticalMarks,
+        UnicodeBlock.Greek,
+        UnicodeBlock.Cyrillic,
+        UnicodeBlock.CyrillicSupplementary,
+        UnicodeBlock.Armenian,
+        UnicodeBlock.Hebrew,
+        UnicodeBlock.Arabic,
+        UnicodeBlock.Syriac,
+        null,
+        UnicodeBlock.Thaana,
+        null,
+        UnicodeBlock.Devanagari,
+        UnicodeBlock.Bengali,
+        UnicodeBlock.Gurmukhi,
+        UnicodeBlock.Gujarati,
+        UnicodeBlock.Oriya,
+        UnicodeBlock.Tamil,
+        UnicodeBlock.Telugu,
+        UnicodeBlock.Kannada,
+        UnicodeBlock.Malayalam,
+        UnicodeBlock.Sinhala,
+        UnicodeBlock.Thai,
+        UnicodeBlock.Lao,
+        UnicodeBlock.Tibetan,
+        UnicodeBlock.Myanmar,
+        UnicodeBlock.Georgian,
+        UnicodeBlock.HangulJamo,
+        UnicodeBlock.Ethiopic,
+        null,
+        UnicodeBlock.Cherokee,
+        UnicodeBlock.UnifiedCanadianAboriginalSyllabics,
+        UnicodeBlock.Ogham,
+        UnicodeBlock.Runic,
+        UnicodeBlock.Tagalog,
+        UnicodeBlock.Hanunoo,
+        UnicodeBlock.Buhid,
+        UnicodeBlock.Tagbanwa,
+        UnicodeBlock.Khmer,
+        UnicodeBlock.Mongolian,
+        null,
+        UnicodeBlock.Limbu,
+        UnicodeBlock.TaiLe,
+        null,
+        UnicodeBlock.KhmerSymbols,
+        null,
+        UnicodeBlock.PhoneticExtensions,
+        null,
+        UnicodeBlock.LatinExtendedAdditional,
+        UnicodeBlock.GreekExtended,
+        UnicodeBlock.GeneralPunctuation,
+        UnicodeBlock.SuperscriptsAndSubscripts,
+        UnicodeBlock.CurrencySymbols,
+        UnicodeBlock.CombiningMarksForSymbols,
+        UnicodeBlock.LetterlikeSymbols,
+        UnicodeBlock.NumberForms,
+        UnicodeBlock.Arrows,
+        UnicodeBlock.MathematicalOperators,
+        UnicodeBlock.MiscellaneousTechnical,
+        UnicodeBlock.ControlPictures,
+        UnicodeBlock.OpticalCharacterRecognition,
+        UnicodeBlock.EnclosedAlphanumerics,
+        UnicodeBlock.BoxDrawing,
+        UnicodeBlock.BlockElements,
+        UnicodeBlock.GeometricShapes,
+        UnicodeBlock.MiscellaneousSymbols,
+        UnicodeBlock.Dingbats,
+        UnicodeBlock.MiscellaneousMathematicalSymbolsA,
+        UnicodeBlock.SupplementalArrowsA,
+        UnicodeBlock.BraillePatterns,
+        UnicodeBlock.SupplementalArrowsB,
+        UnicodeBlock.MiscellaneousMathematicalSymbolsB,
+        UnicodeBlock.SupplementalMathematicalOperators,
+        UnicodeBlock.MiscellaneousSymbolsAndArrows,
+        null,
+        UnicodeBlock.CjkRadicalsSupplement,
+        UnicodeBlock.KangxiRadicals,
+        null,
+        UnicodeBlock.IdeographicDescriptionCharacters,
+        UnicodeBlock.CjkSymbolsAndPunctuation,
+        UnicodeBlock.Hiragana,
+        UnicodeBlock.Katakana,
+        UnicodeBlock.Bopomofo,
+        UnicodeBlock.HangulCompatibilityJamo,
+        UnicodeBlock.Kanbun,
+        UnicodeBlock.BopomofoExtended,
+        null,
+        UnicodeBlock.KatakanaPhoneticExtensions,
+        UnicodeBlock.EnclosedCjkLettersAndMonths,
+        UnicodeBlock.CjkCompatibility,
+        UnicodeBlock.CjkUnifiedIdeographsExtensionA,
+        UnicodeBlock.YijingHexagramSymbols,
+        UnicodeBlock.CjkUnifiedIdeographs,
+        UnicodeBlock.YiSyllables,
+        UnicodeBlock.YiRadicals,
+        null,
+        UnicodeBlock.HangulSyllables,
+        null,
+        UnicodeBlock.HighSurrogates,
+        UnicodeBlock.HighPrivateUseSurrogates,
+        UnicodeBlock.LowSurrogates,
+        UnicodeBlock.PrivateUseArea,
+        UnicodeBlock.CjkCompatibilityIdeographs,
+        UnicodeBlock.AlphabeticPresentationForms,
+        UnicodeBlock.ArabicPresentationFormsA,
+        UnicodeBlock.VariationSelectors,
+        null,
+        UnicodeBlock.CombiningHalfMarks,
+        UnicodeBlock.CjkCompatibilityForms,
+        UnicodeBlock.SmallFormVariants,
+        UnicodeBlock.ArabicPresentationFormsB,
+        UnicodeBlock.HalfwidthAndFullwidthForms,
+        UnicodeBlock.Specials,
+        UnicodeBlock.LinearBSyllabary,
+        UnicodeBlock.LinearBIdeograms,
+        UnicodeBlock.AegeanNumbers,
+        null,
+        UnicodeBlock.OldItalic,
+        UnicodeBlock.Gothic,
+        null,
+        UnicodeBlock.Ugaritic,
+        null,
+        UnicodeBlock.Deseret,
+        UnicodeBlock.Shavian,
+        UnicodeBlock.Osmanya,
+        null,
+        UnicodeBlock.CypriotSyllabary,
+        null,
+        UnicodeBlock.ByzantineMusicalSymbols,
+        UnicodeBlock.MusicalSymbols,
+        null,
+        UnicodeBlock.TaiXuanJingSymbols,
+        null,
+        UnicodeBlock.MathematicalAlphanumericSymbols,
+        null,
+        UnicodeBlock.CjkUnifiedIdeographsExtensionB,
+        null,
+        UnicodeBlock.CjkCompatibilityIdeographsSupplement,
+        null,
+        UnicodeBlock.Tags,
+        null,
+        UnicodeBlock.VariationSelectorsSupplement,
+        null,
+        UnicodeBlock.SupplementaryPrivateUseAreaA,
+        UnicodeBlock.SupplementaryPrivateUseAreaB,
+
+        #endregion
+      };
+
+    #region Public methods
+
+    /// <summary>
+    /// Looks up the Unicode block whose range contains the given character.
+    /// </summary>
+    /// <remarks>
+    /// Taken from JDK source: http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/lang/Character.java#Character.UnicodeBlock.0LATIN_EXTENDED_ADDITIONAL
+    /// </remarks>
+    /// <param name="ch">Character to classify.</param>
+    /// <returns>
+    /// The block stored for the matching range, or <c>null</c> when the table holds no block
+    /// for that range (the ranges marked as unassigned).
+    /// </returns>
+    /// <exception cref="ArgumentException">Thrown when the value fails the code point range check.</exception>
+    public static UnicodeBlock? GetUnicodeBlock(this char ch)
+    {
+      int codePoint = ch;
+
+      if (!IsValidCodePoint(codePoint))
+      {
+        throw new ArgumentException("Argument is not a valid code point.", "ch");
+      }
+
+      // Binary search for the last block start that is <= codePoint.
+      // Invariant: high > low and codePoint >= _unicodeBlockStarts[low].
+      int low = 0;
+      int high = _unicodeBlockStarts.Length;
+
+      while (high - low > 1)
+      {
+        int middle = (low + high) / 2;
+
+        if (_unicodeBlockStarts[middle] <= codePoint)
+        {
+          low = middle;
+        }
+        else
+        {
+          high = middle;
+        }
+      }
+
+      return _unicodeBlocks[low];
+    }
+
+    #endregion
+
+    #region Private helper methods
+
+    /// <summary>
+    /// Tells whether a value lies in the inclusive range MIN_CODE_POINT..MAX_CODE_POINT.
+    /// </summary>
+    private static bool IsValidCodePoint(int codePoint)
+    {
+      if (codePoint < MIN_CODE_POINT)
+      {
+        return false;
+      }
+
+      return codePoint <= MAX_CODE_POINT;
+    }
+
+    #endregion
+  }
+}

+ 51 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/RandomExtensions.cs

@@ -0,0 +1,51 @@
+using System;
+
+namespace NLangDetect.Core.Extensions
+{
+  /// <summary>
+  /// Extension methods adding Gaussian sampling to <see cref="Random"/>.
+  /// </summary>
+  public static class RandomExtensions
+  {
+    private const double _Epsilon = 2.22044604925031E-15;
+
+    // The polar method produces two values per round. The spare one is remembered per Random
+    // instance, so generators never hand out each other's values and a seeded generator
+    // always yields the same sequence. Entries do not keep their Random alive.
+    private static readonly System.Runtime.CompilerServices.ConditionalWeakTable<Random, GaussianState> _states =
+      new System.Runtime.CompilerServices.ConditionalWeakTable<Random, GaussianState>();
+
+    /// <summary>
+    /// Returns the next pseudorandom, Gaussian ("normally") distributed double value with mean 0.0 and standard deviation 1.0 from this random number generator's sequence.
+    /// The general contract of nextGaussian is that one double value, chosen from (approximately) the usual normal distribution with mean 0.0 and standard deviation 1.0, is pseudorandomly generated and returned.
+    /// </summary>
+    /// <remarks>
+    /// Taken from: http://download.oracle.com/javase/6/docs/api/java/util/Random.html (nextGaussian())
+    /// </remarks>
+    /// <param name="random">Generator supplying the underlying uniform values.</param>
+    /// <returns>A normally distributed value.</returns>
+    /// <exception cref="ArgumentNullException">Thrown when <paramref name="random"/> is null.</exception>
+    public static double NextGaussian(this Random random)
+    {
+      if (random == null)
+      {
+        throw new ArgumentNullException("random");
+      }
+
+      GaussianState state = _states.GetOrCreateValue(random);
+
+      // Serialises callers that share one generator; distinct generators do not contend.
+      lock (state)
+      {
+        if (state.HasSpare)
+        {
+          state.HasSpare = false;
+
+          return state.Spare;
+        }
+
+        double v1, v2, s;
+
+        do
+        {
+          v1 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
+          v2 = 2.0 * random.NextDouble() - 1.0; // between -1.0 and 1.0
+          s = v1 * v1 + v2 * v2;
+        }
+        while (s >= 1.0 || Math.Abs(s - 0.0) < _Epsilon); // reject points outside the unit circle or too close to the origin
+
+        double multiplier = Math.Sqrt(-2.0 * Math.Log(s) / s);
+
+        state.Spare = v2 * multiplier;
+        state.HasSpare = true;
+
+        return v1 * multiplier;
+      }
+    }
+
+    // Spare value of the last polar-method round for one generator.
+    private sealed class GaussianState
+    {
+      public double Spare;
+      public bool HasSpare;
+    }
+  }
+}

+ 25 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/StringExtensions.cs

@@ -0,0 +1,25 @@
+using System;
+
+namespace NLangDetect.Core.Extensions
+{
+  /// <summary>
+  /// String helpers using start/end index semantics.
+  /// </summary>
+  public static class StringExtensions
+  {
+    /// <summary>
+    /// Returns a new character sequence that is a subsequence of this sequence. The subsequence starts with the character at the specified index and ends with the character at index end - 1. The length of the returned sequence is end - start, so if start == end then an empty sequence is returned.
+    /// </summary>
+    /// <param name="s">the string to take the subsequence from</param>
+    /// <param name="start">the start index, inclusive</param>
+    /// <param name="end">the end index, exclusive</param>
+    /// <returns>the specified subsequence</returns>
+    /// <exception cref="ArgumentNullException">if <paramref name="s"/> is null</exception>
+    /// <exception cref="ArgumentOutOfRangeException">if start or end are negative, if end is greater than the string's length, or if start is greater than end</exception>
+    public static string SubSequence(this string s, int start, int end)
+    {
+      if (s == null) throw new ArgumentNullException("s");
+      if (start < 0) throw new ArgumentOutOfRangeException("start", "Argument must not be negative.");
+      if (end < 0) throw new ArgumentOutOfRangeException("end", "Argument must not be negative.");
+      if (end > s.Length) throw new ArgumentOutOfRangeException("end", "Argument must not be greater than the input string's length.");
+      if (start > end) throw new ArgumentOutOfRangeException("start", "Argument must not be greater than the 'end' argument.");
+
+      return s.Substring(start, end - start);
+    }
+  }
+}

+ 131 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Extensions/UnicodeBlock.cs

@@ -0,0 +1,131 @@
+namespace NLangDetect.Core.Extensions
+{
+  /// <summary>
+  /// Names of the Unicode blocks that can be reported by the character-to-block lookup
+  /// (<c>CharExtensions.GetUnicodeBlock</c>).
+  /// </summary>
+  public enum UnicodeBlock
+  {
+    BasicLatin,
+    Latin1Supplement,
+    LatinExtendedA,
+    LatinExtendedB,
+    IpaExtensions,
+    SpacingModifierLetters,
+    CombiningDiacriticalMarks,
+    Greek,
+    Cyrillic,
+    CyrillicSupplementary,
+    Armenian,
+    Hebrew,
+    Arabic,
+    Syriac,
+    Thaana,
+    Devanagari,
+    Bengali,
+    Gurmukhi,
+    Gujarati,
+    Oriya,
+    Tamil,
+    Telugu,
+    Kannada,
+    Malayalam,
+    Sinhala,
+    Thai,
+    Lao,
+    Tibetan,
+    Myanmar,
+    Georgian,
+    HangulJamo,
+    Ethiopic,
+    Cherokee,
+    UnifiedCanadianAboriginalSyllabics,
+    Ogham,
+    Runic,
+    Tagalog,
+    Hanunoo,
+    Buhid,
+    Tagbanwa,
+    Khmer,
+    Mongolian,
+    Limbu,
+    TaiLe,
+    KhmerSymbols,
+    PhoneticExtensions,
+    LatinExtendedAdditional,
+    GreekExtended,
+    GeneralPunctuation,
+    SuperscriptsAndSubscripts,
+    CurrencySymbols,
+    CombiningMarksForSymbols,
+    LetterlikeSymbols,
+    NumberForms,
+    Arrows,
+    MathematicalOperators,
+    MiscellaneousTechnical,
+    ControlPictures,
+    OpticalCharacterRecognition,
+    EnclosedAlphanumerics,
+    BoxDrawing,
+    BlockElements,
+    GeometricShapes,
+    MiscellaneousSymbols,
+    Dingbats,
+    MiscellaneousMathematicalSymbolsA,
+    SupplementalArrowsA,
+    BraillePatterns,
+    SupplementalArrowsB,
+    MiscellaneousMathematicalSymbolsB,
+    SupplementalMathematicalOperators,
+    MiscellaneousSymbolsAndArrows,
+    CjkRadicalsSupplement,
+    KangxiRadicals,
+    IdeographicDescriptionCharacters,
+    CjkSymbolsAndPunctuation,
+    Hiragana,
+    Katakana,
+    Bopomofo,
+    HangulCompatibilityJamo,
+    Kanbun,
+    BopomofoExtended,
+    KatakanaPhoneticExtensions,
+    EnclosedCjkLettersAndMonths,
+    CjkCompatibility,
+    CjkUnifiedIdeographsExtensionA,
+    YijingHexagramSymbols,
+    CjkUnifiedIdeographs,
+    YiSyllables,
+    YiRadicals,
+    HangulSyllables,
+    HighSurrogates,
+    HighPrivateUseSurrogates,
+    LowSurrogates,
+    PrivateUseArea,
+    CjkCompatibilityIdeographs,
+    AlphabeticPresentationForms,
+    ArabicPresentationFormsA,
+    VariationSelectors,
+    CombiningHalfMarks,
+    CjkCompatibilityForms,
+    SmallFormVariants,
+    ArabicPresentationFormsB,
+    HalfwidthAndFullwidthForms,
+    Specials,
+    LinearBSyllabary,
+    LinearBIdeograms,
+    AegeanNumbers,
+    OldItalic,
+    Gothic,
+    Ugaritic,
+    Deseret,
+    Shavian,
+    Osmanya,
+    CypriotSyllabary,
+    ByzantineMusicalSymbols,
+    MusicalSymbols,
+    TaiXuanJingSymbols,
+    MathematicalAlphanumericSymbols,
+    CjkUnifiedIdeographsExtensionB,
+    CjkCompatibilityIdeographsSupplement,
+    Tags,
+    VariationSelectorsSupplement,
+    SupplementaryPrivateUseAreaA,
+    SupplementaryPrivateUseAreaB,
+  }
+}

+ 67 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/GenProfile.cs

@@ -0,0 +1,67 @@
+using System;
+using System.IO.Compression;
+using System.Xml;
+using NLangDetect.Core.Utils;
+using System.IO;
+
+namespace NLangDetect.Core
+{
+  // TODO IMM HI: xml reader not tested
+  /// <summary>
+  /// Builds a <see cref="LangProfile"/> from an XML training file.
+  /// </summary>
+  public static class GenProfile
+  {
+    #region Public methods
+
+    /// <summary>
+    /// Reads an XML file and passes its element names, text nodes and end tags to a
+    /// <c>TagExtractor</c> created for the "abstract" tag, which fills the returned profile.
+    /// Files whose extension is ".gz" (any casing) are decompressed on the fly.
+    /// Writes "<paramref name="lang"/>: count" to the console when done.
+    /// </summary>
+    /// <param name="lang">Name given to the created profile.</param>
+    /// <param name="file">Path of the XML (or gzip-compressed XML) file to read.</param>
+    /// <returns>The profile populated by the tag extractor.</returns>
+    public static LangProfile load(string lang, string file)
+    {
+      LangProfile profile = new LangProfile(lang);
+      // NOTE(review): the second argument (100) is presumably a threshold used by TagExtractor; confirm there.
+      TagExtractor tagextractor = new TagExtractor("abstract", 100);
+
+      using (Stream fileStream = File.OpenRead(file))
+      {
+        string extension = Path.GetExtension(file) ?? "";
+
+        // Ordinal comparison: a file extension is an identifier, not culture-dependent text.
+        bool isGzip = string.Equals(extension, ".gz", StringComparison.OrdinalIgnoreCase);
+
+        // When not compressed this is the file stream itself; disposing it twice is harmless.
+        using (Stream inputStream = isGzip ? new GZipStream(fileStream, CompressionMode.Decompress) : fileStream)
+        using (XmlReader xmlReader = XmlReader.Create(inputStream))
+        {
+          while (xmlReader.Read())
+          {
+            switch (xmlReader.NodeType)
+            {
+              case XmlNodeType.Element:
+                tagextractor.SetTag(xmlReader.Name);
+                break;
+
+              case XmlNodeType.Text:
+                tagextractor.Add(xmlReader.Value);
+                break;
+
+              case XmlNodeType.EndElement:
+                tagextractor.CloseTag(profile);
+                break;
+            }
+          }
+        }
+      }
+
+      Console.WriteLine(lang + ": " + tagextractor.Count);
+
+      return profile;
+    }
+
+    #endregion
+  }
+}

+ 22 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/InternalException.cs

@@ -0,0 +1,22 @@
+using System;
+
+namespace NLangDetect.Core
+{
+  /// <summary>
+  /// Exception carrying a message and, optionally, the exception that caused it.
+  /// </summary>
+  [Serializable]
+  public class InternalException : Exception
+  {
+    #region Constructor(s)
+
+    /// <summary>Creates the exception with a message and no inner exception.</summary>
+    public InternalException(string message)
+      : base(message, null)
+    {
+    }
+
+    /// <summary>Creates the exception with a message and the exception that caused it.</summary>
+    public InternalException(string message, Exception innerException)
+      : base(message, innerException)
+    {
+    }
+
+    #endregion
+  }
+}

+ 45 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Language.cs

@@ -0,0 +1,45 @@
+using System.Globalization;
+
+namespace NLangDetect.Core
+{
+  // TODO IMM HI: name??
+  /// <summary>
+  /// A language name paired with a probability value.
+  /// </summary>
+  public class Language
+  {
+    #region Constructor(s)
+
+    /// <summary>Creates an entry from a name and a probability.</summary>
+    public Language(string name, double probability)
+    {
+      Probability = probability;
+      Name = name;
+    }
+
+    #endregion
+
+    #region Object overrides
+
+    /// <summary>
+    /// Formats the entry as "name:probability" with six decimals, using invariant number formatting.
+    /// Returns an empty string when <see cref="Name"/> is null.
+    /// </summary>
+    public override string ToString()
+    {
+      if (Name != null)
+      {
+        string formattedProbability = Probability.ToString("0.000000", CultureInfo.InvariantCulture.NumberFormat);
+
+        return Name + ":" + formattedProbability;
+      }
+
+      return "";
+    }
+
+    #endregion
+
+    #region Properties
+
+    public string Name { get; set; }
+
+    public double Probability { get; set; }
+
+    #endregion
+  }
+}

+ 37 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/LanguageDetector.cs

@@ -0,0 +1,37 @@
+using System;
+using MediaBrowser.Model.Serialization;
+
+namespace NLangDetect.Core
+{
+    // TODO IMM HI: change to non-static class
+    // TODO IMM HI: hide other, unnecessary classes via internal?
+    /// <summary>
+    /// Static entry point: load profiles, detect the language of a text, release the profiles.
+    /// </summary>
+    public static class LanguageDetector
+    {
+        private const double _DefaultAlpha = 0.5;
+
+        #region Public methods
+
+        /// <summary>Loads the language profiles through <c>DetectorFactory.LoadProfiles</c>.</summary>
+        /// <param name="json">Serializer handed to the profile loader.</param>
+        public static void Initialize(IJsonSerializer json)
+        {
+            DetectorFactory.LoadProfiles(json);
+        }
+
+        /// <summary>Clears the data held by <c>DetectorFactory</c>.</summary>
+        public static void Release()
+        {
+            DetectorFactory.Clear();
+        }
+
+        /// <summary>
+        /// Runs a freshly created detector over the given text and returns its result.
+        /// </summary>
+        /// <param name="plainText">Text to analyse; must be neither null nor empty.</param>
+        /// <returns>The value returned by the detector's <c>Detect</c> call.</returns>
+        /// <exception cref="ArgumentException">Thrown when <paramref name="plainText"/> is null or empty.</exception>
+        public static string DetectLanguage(string plainText)
+        {
+            bool hasText = !string.IsNullOrEmpty(plainText);
+
+            if (!hasText)
+            {
+                throw new ArgumentException("Argument can't be null nor empty.", "plainText");
+            }
+
+            Detector languageDetector = DetectorFactory.Create(_DefaultAlpha);
+            languageDetector.Append(plainText);
+
+            string detected = languageDetector.Detect();
+
+            return detected;
+        }
+
+        #endregion
+    }
+}

+ 23 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/NLangDetectException.cs

@@ -0,0 +1,23 @@
+using System;
+
+namespace NLangDetect.Core
+{
+  /// <summary>
+  /// Exception that carries an <see cref="NLangDetect.Core.ErrorCode"/> alongside its message.
+  /// </summary>
+  public class NLangDetectException : Exception
+  {
+    private readonly ErrorCode _errorCode;
+
+    #region Constructor(s)
+
+    /// <summary>Creates the exception with a message and an error category.</summary>
+    public NLangDetectException(string message, ErrorCode errorCode)
+      : base(message)
+    {
+      _errorCode = errorCode;
+    }
+
+    #endregion
+
+    #region Properties
+
+    /// <summary>Error category supplied at construction.</summary>
+    public ErrorCode ErrorCode
+    {
+      get { return _errorCode; }
+    }
+
+    #endregion
+  }
+}

+ 35 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/ProbVector.cs

@@ -0,0 +1,35 @@
+using System;
+using System.Collections.Generic;
+
+namespace NLangDetect.Core
+{
+  /// <summary>
+  /// Sparse vector of doubles indexed by int: only non-zero entries are stored,
+  /// and reading a missing index yields 0.0.
+  /// </summary>
+  public class ProbVector
+  {
+    private readonly Dictionary<int, double> _entries = new Dictionary<int, double>();
+
+    /// <summary>
+    /// Gets the value stored at <paramref name="key"/> (0.0 when absent), or sets it.
+    /// Setting a zero value removes the entry instead of storing it.
+    /// </summary>
+    public double this[int key]
+    {
+      get
+      {
+        double stored;
+
+        if (_entries.TryGetValue(key, out stored))
+        {
+          return stored;
+        }
+
+        return 0.0;
+      }
+
+      set
+      {
+        // double.Epsilon is the smallest positive double, so this holds only for zero.
+        bool isZero = Math.Abs(value) < double.Epsilon;
+
+        if (!isZero)
+        {
+          _entries[key] = value;
+
+          return;
+        }
+
+        // Remove is a no-op when the key is absent.
+        _entries.Remove(key);
+      }
+    }
+  }
+}

File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/afr


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/ara


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/ben


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/bul


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/ces


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/dan


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/deu


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/ell


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/eng


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/est


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/fas


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/fin


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/fra


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/guj


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/heb


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/hin


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/hrv


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/hun


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/ind


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/ita


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/jpn


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/kan


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/kor


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/lav


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/lit


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/mal


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/mar


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/mkd


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/nep


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/nld


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/nor


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/pan


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/pol


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/por


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/ron


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/rus


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/slk


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/slv


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/som


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/spa


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/sqi


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/swa


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/swe


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/tam


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/tel


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/tgl


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/tha


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/tur


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/ukr


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/urd


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/vie


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/zh-cn


File diff suppressed because it is too large
+ 0 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Profiles/zh-tw


+ 118 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Utils/LangProfile.cs

@@ -0,0 +1,118 @@
+using System.Collections.Generic;
+using System.Text.RegularExpressions;
+
+namespace NLangDetect.Core.Utils
+{
+    /// <summary>
+    /// N-gram frequency profile of a single language: how many times each gram
+    /// was seen (<see cref="freq"/>) and how many grams were counted per gram
+    /// length (<see cref="n_words"/>, indexed by length - 1).
+    /// </summary>
+    public class LangProfile
+    {
+        private const int MinimumFreq = 2;
+        private const int LessFreqRatio = 100000;
+
+        // Built once for the type instead of on every OmitLessFreq() call:
+        // constructing a RegexOptions.Compiled regex is expensive.
+        private static readonly Regex SingleRomanLetterRegex = new Regex("^[A-Za-z]$", RegexOptions.Compiled);
+        private static readonly Regex AnyRomanLetterRegex = new Regex(".*[A-Za-z].*", RegexOptions.Compiled);
+
+        public string name { get; set; }
+
+        public Dictionary<string, int> freq { get; set; }
+        public int[] n_words { get; set; }
+
+        #region Constructor(s)
+
+        /// <summary>
+        /// Creates an unnamed, empty profile. <see cref="Add"/> and
+        /// <see cref="OmitLessFreq"/> do nothing until <see cref="name"/> is set.
+        /// </summary>
+        public LangProfile()
+        {
+            freq = new Dictionary<string, int>();
+            n_words = new int[NGram.GramsCount];
+        }
+
+        /// <summary>
+        /// Creates an empty profile with the given name.
+        /// </summary>
+        /// <param name="name">Value stored in <see cref="name"/>.</param>
+        public LangProfile(string name)
+        {
+            this.name = name;
+            freq = new Dictionary<string, int>();
+            n_words = new int[NGram.GramsCount];
+        }
+
+        #endregion
+
+        #region Public methods
+
+        /// <summary>
+        /// Counts one occurrence of <paramref name="gram"/>.
+        /// </summary>
+        /// <param name="gram">
+        /// The gram to count. Ignored when null, when the profile has no name, or
+        /// when its length is outside 1..<see cref="NGram.GramsCount"/>.
+        /// </param>
+        public void Add(string gram)
+        {
+            if (name == null || gram == null) return; // Illegal
+            int len = gram.Length;
+            if (len < 1 || len > NGram.GramsCount) return; // Illegal
+
+            n_words[len - 1]++;
+
+            // Single lookup: "seen" stays 0 when the gram is not in the map yet.
+            int seen;
+            freq.TryGetValue(gram, out seen);
+            freq[gram] = seen + 1;
+        }
+
+        /// <summary>
+        /// Removes low-frequency grams from the profile and keeps
+        /// <see cref="n_words"/> in step with the removals.
+        /// </summary>
+        /// <remarks>
+        /// Pass 1 drops every gram whose count is at or below the threshold
+        /// (<c>n_words[0] / LessFreqRatio</c>, but at least <c>MinimumFreq</c>)
+        /// and sums the counts of the surviving single-letter A-Z/a-z grams.
+        /// Pass 2 runs only when that sum is below a third of the remaining
+        /// 1-gram total; it then drops every gram containing an A-Z/a-z letter.
+        /// Does nothing when the profile has no name.
+        /// </remarks>
+        public void OmitLessFreq()
+        {
+            if (name == null) return; // Illegal
+
+            int threshold = n_words[0] / LessFreqRatio;
+            if (threshold < MinimumFreq) threshold = MinimumFreq;
+
+            int roman = 0;
+            // Removals are collected first: the dictionary must not be modified
+            // while it is being enumerated.
+            var keysToRemove = new List<string>();
+
+            foreach (KeyValuePair<string, int> entry in freq)
+            {
+                if (entry.Value <= threshold)
+                {
+                    n_words[entry.Key.Length - 1] -= entry.Value;
+                    keysToRemove.Add(entry.Key);
+                }
+                else if (SingleRomanLetterRegex.IsMatch(entry.Key))
+                {
+                    roman += entry.Value;
+                }
+            }
+
+            foreach (string keyToRemove in keysToRemove)
+            {
+                freq.Remove(keyToRemove);
+            }
+
+            // roman check (n_words[0] here already reflects the removals above)
+            if (roman < n_words[0] / 3)
+            {
+                keysToRemove = new List<string>();
+
+                foreach (KeyValuePair<string, int> entry in freq)
+                {
+                    if (AnyRomanLetterRegex.IsMatch(entry.Key))
+                    {
+                        n_words[entry.Key.Length - 1] -= entry.Value;
+                        keysToRemove.Add(entry.Key);
+                    }
+                }
+
+                foreach (string keyToRemove in keysToRemove)
+                {
+                    freq.Remove(keyToRemove);
+                }
+            }
+        }
+
+        #endregion
+    }
+}

+ 91 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Utils/Messages.cs

@@ -0,0 +1,91 @@
+using System.Collections.Generic;
+using System.Globalization;
+using System.IO;
+using System.Reflection;
+using System.Text.RegularExpressions;
+using System.Linq;
+using System;
+
+namespace NLangDetect.Core.Utils
+{
+    /// <summary>
+    /// Read-only lookup over the key=value pairs stored in the embedded
+    /// "messages.properties" resource. The resource is loaded once, when the
+    /// type is first used.
+    /// </summary>
+    public static class Messages
+    {
+        // Matches a \uXXXX escape. Only hexadecimal digits are accepted so that
+        // the parse in UnescapeUnicodeString can never fail on a matched value.
+        private static readonly Regex UnicodeEscapeRegex = new Regex(@"\\u(?<Value>[0-9a-fA-F]{4})");
+
+        private static readonly Dictionary<string, string> _messages;
+
+        static Messages()
+        {
+            _messages = LoadMessages();
+        }
+
+        /// <summary>
+        /// Returns the message stored under <paramref name="key"/>.
+        /// </summary>
+        /// <param name="key">Key as written in the properties resource.</param>
+        /// <returns>
+        /// The stored value, or the key wrapped in exclamation marks
+        /// (<c>!key!</c>) when no such key exists.
+        /// </returns>
+        public static string getString(string key)
+        {
+            string value;
+
+            return
+              _messages.TryGetValue(key, out value)
+                ? value
+                : string.Format("!{0}!", key);
+        }
+
+        /// <summary>
+        /// Reads the embedded properties resource into a dictionary.
+        /// </summary>
+        /// <exception cref="InternalException">
+        /// The resource is missing or cannot be opened, or a non-empty line has no
+        /// '=' separator.
+        /// </exception>
+        private static Dictionary<string, string> LoadMessages()
+        {
+            var assembly = typeof(Messages).Assembly;
+            var manifestName = assembly.GetManifestResourceNames().FirstOrDefault(i => i.IndexOf("messages.properties", StringComparison.Ordinal) != -1);
+
+            // FirstOrDefault yields null when nothing matches; passing null on to
+            // GetManifestResourceStream would throw ArgumentNullException instead
+            // of the descriptive error below.
+            if (manifestName == null)
+            {
+                throw new InternalException("Couldn't find an embedded resource whose name contains 'messages.properties'.");
+            }
+
+            Stream messagesStream = assembly.GetManifestResourceStream(manifestName);
+
+            if (messagesStream == null)
+            {
+                throw new InternalException(string.Format("Couldn't get embedded resource named '{0}'.", manifestName));
+            }
+
+            using (messagesStream)
+            using (var sr = new StreamReader(messagesStream))
+            {
+                var messages = new Dictionary<string, string>();
+                string line;
+
+                while ((line = sr.ReadLine()) != null)
+                {
+                    if (line.Length == 0)
+                    {
+                        continue;
+                    }
+
+                    // Split on the FIRST '=' only, so a value may itself contain '='.
+                    int separator = line.IndexOf('=');
+
+                    if (separator < 0)
+                    {
+                        throw new InternalException(string.Format("Invalid format of the 'Messages.properties' resource. Offending line: '{0}'.", line.Trim()));
+                    }
+
+                    string key = line.Substring(0, separator);
+                    string value = UnescapeUnicodeString(line.Substring(separator + 1));
+
+                    messages.Add(key, value);
+                }
+
+                return messages;
+            }
+        }
+
+        /// <summary>
+        /// Replaces every <c>\uXXXX</c> escape (four hex digits) in
+        /// <paramref name="s"/> with the character it denotes.
+        /// </summary>
+        /// <returns>The unescaped string, or null when <paramref name="s"/> is null.</returns>
+        /// <remarks>
+        /// Taken from: http://stackoverflow.com/questions/1615559/converting-unicode-strings-to-escaped-ascii-string/1615860#1615860
+        /// </remarks>
+        private static string UnescapeUnicodeString(string s)
+        {
+            if (s == null)
+            {
+                return null;
+            }
+
+            return
+              UnicodeEscapeRegex.Replace(
+                s,
+                match => ((char)int.Parse(match.Groups["Value"].Value, NumberStyles.HexNumber, CultureInfo.InvariantCulture)).ToString());
+        }
+    }
+}

+ 330 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Utils/NGram.cs

@@ -0,0 +1,330 @@
+// TODO IMM HI: check which classes can be made internal?
+
+using System.Collections.Generic;
+using System.Text;
+using NLangDetect.Core.Extensions;
+
+namespace NLangDetect.Core.Utils
+{
+  public class NGram
+  {
+    public const int GramsCount = 3;
+
+    private static readonly string Latin1Excluded = Messages.getString("NGram.LATIN1_EXCLUDE");
+
+    private static readonly string[] CjkClass =
+      {
+        #region CJK classes
+
+        Messages.getString("NGram.KANJI_1_0"),
+        Messages.getString("NGram.KANJI_1_2"),
+        Messages.getString("NGram.KANJI_1_4"),
+        Messages.getString("NGram.KANJI_1_8"),
+        Messages.getString("NGram.KANJI_1_11"),
+        Messages.getString("NGram.KANJI_1_12"),
+        Messages.getString("NGram.KANJI_1_13"),
+        Messages.getString("NGram.KANJI_1_14"),
+        Messages.getString("NGram.KANJI_1_16"),
+        Messages.getString("NGram.KANJI_1_18"),
+        Messages.getString("NGram.KANJI_1_22"),
+        Messages.getString("NGram.KANJI_1_27"),
+        Messages.getString("NGram.KANJI_1_29"),
+        Messages.getString("NGram.KANJI_1_31"),
+        Messages.getString("NGram.KANJI_1_35"),
+        Messages.getString("NGram.KANJI_2_0"),
+        Messages.getString("NGram.KANJI_2_1"),
+        Messages.getString("NGram.KANJI_2_4"),
+        Messages.getString("NGram.KANJI_2_9"),
+        Messages.getString("NGram.KANJI_2_10"),
+        Messages.getString("NGram.KANJI_2_11"),
+        Messages.getString("NGram.KANJI_2_12"),
+        Messages.getString("NGram.KANJI_2_13"),
+        Messages.getString("NGram.KANJI_2_15"),
+        Messages.getString("NGram.KANJI_2_16"),
+        Messages.getString("NGram.KANJI_2_18"),
+        Messages.getString("NGram.KANJI_2_21"),
+        Messages.getString("NGram.KANJI_2_22"),
+        Messages.getString("NGram.KANJI_2_23"),
+        Messages.getString("NGram.KANJI_2_28"),
+        Messages.getString("NGram.KANJI_2_29"),
+        Messages.getString("NGram.KANJI_2_30"),
+        Messages.getString("NGram.KANJI_2_31"),
+        Messages.getString("NGram.KANJI_2_32"),
+        Messages.getString("NGram.KANJI_2_35"),
+        Messages.getString("NGram.KANJI_2_36"),
+        Messages.getString("NGram.KANJI_2_37"),
+        Messages.getString("NGram.KANJI_2_38"),
+        Messages.getString("NGram.KANJI_3_1"),
+        Messages.getString("NGram.KANJI_3_2"),
+        Messages.getString("NGram.KANJI_3_3"),
+        Messages.getString("NGram.KANJI_3_4"),
+        Messages.getString("NGram.KANJI_3_5"),
+        Messages.getString("NGram.KANJI_3_8"),
+        Messages.getString("NGram.KANJI_3_9"),
+        Messages.getString("NGram.KANJI_3_11"),
+        Messages.getString("NGram.KANJI_3_12"),
+        Messages.getString("NGram.KANJI_3_13"),
+        Messages.getString("NGram.KANJI_3_15"),
+        Messages.getString("NGram.KANJI_3_16"),
+        Messages.getString("NGram.KANJI_3_18"),
+        Messages.getString("NGram.KANJI_3_19"),
+        Messages.getString("NGram.KANJI_3_22"),
+        Messages.getString("NGram.KANJI_3_23"),
+        Messages.getString("NGram.KANJI_3_27"),
+        Messages.getString("NGram.KANJI_3_29"),
+        Messages.getString("NGram.KANJI_3_30"),
+        Messages.getString("NGram.KANJI_3_31"),
+        Messages.getString("NGram.KANJI_3_32"),
+        Messages.getString("NGram.KANJI_3_35"),
+        Messages.getString("NGram.KANJI_3_36"),
+        Messages.getString("NGram.KANJI_3_37"),
+        Messages.getString("NGram.KANJI_3_38"),
+        Messages.getString("NGram.KANJI_4_0"),
+        Messages.getString("NGram.KANJI_4_9"),
+        Messages.getString("NGram.KANJI_4_10"),
+        Messages.getString("NGram.KANJI_4_16"),
+        Messages.getString("NGram.KANJI_4_17"),
+        Messages.getString("NGram.KANJI_4_18"),
+        Messages.getString("NGram.KANJI_4_22"),
+        Messages.getString("NGram.KANJI_4_24"),
+        Messages.getString("NGram.KANJI_4_28"),
+        Messages.getString("NGram.KANJI_4_34"),
+        Messages.getString("NGram.KANJI_4_39"),
+        Messages.getString("NGram.KANJI_5_10"),
+        Messages.getString("NGram.KANJI_5_11"),
+        Messages.getString("NGram.KANJI_5_12"),
+        Messages.getString("NGram.KANJI_5_13"),
+        Messages.getString("NGram.KANJI_5_14"),
+        Messages.getString("NGram.KANJI_5_18"),
+        Messages.getString("NGram.KANJI_5_26"),
+        Messages.getString("NGram.KANJI_5_29"),
+        Messages.getString("NGram.KANJI_5_34"),
+        Messages.getString("NGram.KANJI_5_39"),
+        Messages.getString("NGram.KANJI_6_0"),
+        Messages.getString("NGram.KANJI_6_3"),
+        Messages.getString("NGram.KANJI_6_9"),
+        Messages.getString("NGram.KANJI_6_10"),
+        Messages.getString("NGram.KANJI_6_11"),
+        Messages.getString("NGram.KANJI_6_12"),
+        Messages.getString("NGram.KANJI_6_16"),
+        Messages.getString("NGram.KANJI_6_18"),
+        Messages.getString("NGram.KANJI_6_20"),
+        Messages.getString("NGram.KANJI_6_21"),
+        Messages.getString("NGram.KANJI_6_22"),
+        Messages.getString("NGram.KANJI_6_23"),
+        Messages.getString("NGram.KANJI_6_25"),
+        Messages.getString("NGram.KANJI_6_28"),
+        Messages.getString("NGram.KANJI_6_29"),
+        Messages.getString("NGram.KANJI_6_30"),
+        Messages.getString("NGram.KANJI_6_32"),
+        Messages.getString("NGram.KANJI_6_34"),
+        Messages.getString("NGram.KANJI_6_35"),
+        Messages.getString("NGram.KANJI_6_37"),
+        Messages.getString("NGram.KANJI_6_39"),
+        Messages.getString("NGram.KANJI_7_0"),
+        Messages.getString("NGram.KANJI_7_3"),
+        Messages.getString("NGram.KANJI_7_6"),
+        Messages.getString("NGram.KANJI_7_7"),
+        Messages.getString("NGram.KANJI_7_9"),
+        Messages.getString("NGram.KANJI_7_11"),
+        Messages.getString("NGram.KANJI_7_12"),
+        Messages.getString("NGram.KANJI_7_13"),
+        Messages.getString("NGram.KANJI_7_16"),
+        Messages.getString("NGram.KANJI_7_18"),
+        Messages.getString("NGram.KANJI_7_19"),
+        Messages.getString("NGram.KANJI_7_20"),
+        Messages.getString("NGram.KANJI_7_21"),
+        Messages.getString("NGram.KANJI_7_23"),
+        Messages.getString("NGram.KANJI_7_25"),
+        Messages.getString("NGram.KANJI_7_28"),
+        Messages.getString("NGram.KANJI_7_29"),
+        Messages.getString("NGram.KANJI_7_32"),
+        Messages.getString("NGram.KANJI_7_33"),
+        Messages.getString("NGram.KANJI_7_35"),
+        Messages.getString("NGram.KANJI_7_37"),
+
+        #endregion
+      };
+
+    private static readonly Dictionary<char, char> _cjkMap;
+
+    private StringBuilder _grams;
+    private bool _capitalword;
+
+    #region Constructor(s)
+
+    /// <summary>
+    /// Builds the CJK map: every character of each string in CjkClass is mapped
+    /// to the first character of that string.
+    /// </summary>
+    static NGram()
+    {
+      _cjkMap = new Dictionary<char, char>();
+
+      foreach (string cjk_list in CjkClass)
+      {
+        // An empty class string has no representative character; skip it
+        // instead of failing type initialization with an index error.
+        if (string.IsNullOrEmpty(cjk_list))
+        {
+          continue;
+        }
+
+        char representative = cjk_list[0];
+
+        foreach (char member in cjk_list)
+        {
+          // Indexer assignment rather than Add(): Add() throws on a repeated
+          // character, e.g. the two '!' in the "!key!" placeholder that
+          // Messages.getString returns for a missing key. With the indexer the
+          // last mapping for a repeated character wins.
+          _cjkMap[member] = representative;
+        }
+      }
+    }
+
+    /// <summary>
+    /// Creates an n-gram window that holds a single space and has the
+    /// capital-word flag cleared.
+    /// </summary>
+    public NGram()
+    {
+      _capitalword = false;
+      _grams = new StringBuilder().Append(' ');
+    }
+
+    #endregion
+
+    #region Public methods
+
+    /// <summary>
+    /// Maps a character to the form used when building n-grams, based on the
+    /// Unicode block reported by GetUnicodeBlock().
+    /// </summary>
+    /// <param name="ch">Character to normalize.</param>
+    /// <returns>
+    /// A space for Basic Latin non-letters, for Latin-1 Supplement characters
+    /// listed in Latin1Excluded and for General Punctuation; one fixed
+    /// representative for Hiragana, Katakana, Bopomofo and Hangul syllables; the
+    /// mapped representative for CJK ideographs present in the CJK map; otherwise
+    /// the character itself (with the two special cases noted inline).
+    /// </returns>
+    public static char Normalize(char ch)
+    {
+      UnicodeBlock? block = ch.GetUnicodeBlock();
+
+      if (block == null)
+      {
+        return ch;
+      }
+
+      switch (block.Value)
+      {
+        case UnicodeBlock.BasicLatin:
+          // Only A-Z and a-z are kept; everything else becomes a space.
+          return ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) ? ch : ' ';
+
+        case UnicodeBlock.Latin1Supplement:
+          return Latin1Excluded.IndexOf(ch) < 0 ? ch : ' ';
+
+        case UnicodeBlock.GeneralPunctuation:
+          return ' ';
+
+        case UnicodeBlock.Arabic:
+          // U+06CC is folded into U+064A; other Arabic characters are unchanged.
+          return ch == '\u06cc' ? '\u064a' : ch;
+
+        case UnicodeBlock.LatinExtendedAdditional:
+          // Everything from U+1EA0 upwards collapses to U+1EC3.
+          return ch < '\u1ea0' ? ch : '\u1ec3';
+
+        case UnicodeBlock.Hiragana:
+          return '\u3042';
+
+        case UnicodeBlock.Katakana:
+          return '\u30a2';
+
+        case UnicodeBlock.Bopomofo:
+        case UnicodeBlock.BopomofoExtended:
+          return '\u3105';
+
+        case UnicodeBlock.CjkUnifiedIdeographs:
+          {
+            char representative;
+
+            return _cjkMap.TryGetValue(ch, out representative) ? representative : ch;
+          }
+
+        case UnicodeBlock.HangulSyllables:
+          return '\uac00';
+
+        default:
+          return ch;
+      }
+    }
+
+    /// <summary>
+    /// Normalizes <paramref name="ch"/> and pushes it into the sliding window of
+    /// at most GramsCount characters, updating the capital-word flag.
+    /// </summary>
+    /// <param name="ch">Next character of the text being scanned.</param>
+    public void AddChar(char ch)
+    {
+      char normalized = Normalize(ch);
+      char previous = _grams[_grams.Length - 1];
+
+      if (previous == ' ')
+      {
+        // The window ended on a space: start over with just that space.
+        _grams = new StringBuilder(" ");
+        _capitalword = false;
+
+        if (normalized == ' ')
+        {
+          // Consecutive spaces add nothing.
+          return;
+        }
+      }
+      else if (_grams.Length >= GramsCount)
+      {
+        // Window is full: drop the oldest character before appending.
+        _grams.Remove(0, 1);
+      }
+
+      _grams.Append(normalized);
+
+      if (!char.IsUpper(normalized))
+      {
+        _capitalword = false;
+      }
+      else if (char.IsUpper(previous))
+      {
+        // Two upper-case characters in a row mark a capitalized word.
+        _capitalword = true;
+      }
+    }
+
+    /// <summary>
+    /// Returns the n-gram made of the last <paramref name="n"/> characters of
+    /// the current window.
+    /// </summary>
+    /// <param name="n">Gram length, 1..GramsCount.</param>
+    /// <returns>
+    /// The gram, or null when the capital-word flag is set, when
+    /// <paramref name="n"/> is out of range or larger than the window, or when
+    /// a 1-gram would be a single space.
+    /// </returns>
+    public string Get(int n)
+    {
+      int len = _grams.Length;
+
+      // Upper bound is GramsCount (not a literal 3) so it stays in step with
+      // the window size enforced by AddChar.
+      if (_capitalword || n < 1 || n > GramsCount || len < n)
+      {
+        return null;
+      }
+
+      if (n == 1)
+      {
+        char last = _grams[len - 1];
+
+        return last == ' ' ? null : last.ToString();
+      }
+
+      // TODO IMM HI: is ToString() here effective?
+      return _grams.ToString().SubSequence(len - n, len);
+    }
+
+    #endregion
+  }
+}

+ 76 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Utils/TagExtractor.cs

@@ -0,0 +1,76 @@
+using System.Text;
+
+namespace NLangDetect.Core.Utils
+{
+  /// <summary>
+  /// Accumulates the text seen inside one target tag and, when the tag is
+  /// closed, feeds the n-grams of that text into a language profile.
+  /// </summary>
+  public class TagExtractor
+  {
+    // TODO IMM HI: do they really need to be internal?
+    internal string Target;
+    internal int Threshold;
+    internal StringBuilder StringBuilder;
+    internal string Tag;
+
+    #region Properties
+
+    /// <summary>Number of closed tags whose text was added to a profile.</summary>
+    public int Count { get; private set; }
+
+    #endregion
+
+    #region Constructor(s)
+
+    /// <summary>
+    /// Creates an extractor for one tag.
+    /// </summary>
+    /// <param name="tag">Tag whose text is collected.</param>
+    /// <param name="threshold">Collected text must be longer than this to be profiled.</param>
+    public TagExtractor(string tag, int threshold)
+    {
+      Count = 0;
+      Target = tag;
+      Threshold = threshold;
+      Clear();
+    }
+
+    #endregion
+
+    #region Public methods
+
+    /// <summary>Discards the collected text and forgets the current tag.</summary>
+    public void Clear()
+    {
+      Tag = null;
+      StringBuilder = new StringBuilder();
+    }
+
+    /// <summary>Records <paramref name="tag"/> as the tag currently being read.</summary>
+    public void SetTag(string tag)
+    {
+      Tag = tag;
+    }
+
+    /// <summary>
+    /// Appends <paramref name="line"/> to the collected text, but only while
+    /// the current tag is the target tag and the line is not null.
+    /// </summary>
+    public void Add(string line)
+    {
+      if (line == null || Tag != Target)
+      {
+        return;
+      }
+
+      StringBuilder.Append(line);
+    }
+
+    /// <summary>
+    /// Ends the current tag. When the tag is the target tag, a profile is
+    /// supplied and the collected text is longer than the threshold, every
+    /// n-gram of the text is added to <paramref name="profile"/> and
+    /// <see cref="Count"/> is incremented. The extractor is cleared in all cases.
+    /// </summary>
+    /// <param name="profile">Profile receiving the grams; may be null.</param>
+    public void CloseTag(LangProfile profile)
+    {
+      bool shouldProfile = profile != null && Tag == Target && StringBuilder.Length > Threshold;
+
+      if (shouldProfile)
+      {
+        var window = new NGram();
+        int length = StringBuilder.Length;
+
+        for (int position = 0; position < length; position++)
+        {
+          window.AddChar(StringBuilder[position]);
+
+          // Get() returns null for unavailable grams; the profile ignores those.
+          for (int size = 1; size <= NGram.GramsCount; size++)
+          {
+            profile.Add(window.Get(size));
+          }
+        }
+
+        Count++;
+      }
+
+      Clear();
+    }
+
+    #endregion
+  }
+}

File diff suppressed because it is too large
+ 26 - 0
Emby.Common.Implementations/TextEncoding/NLangDetect/Utils/messages.properties


+ 60 - 3
Emby.Common.Implementations/TextEncoding/TextEncoding.cs

@@ -8,6 +8,8 @@ using System.Threading.Tasks;
 using MediaBrowser.Model.MediaInfo;
 using MediaBrowser.Model.MediaInfo;
 using MediaBrowser.Model.Logging;
 using MediaBrowser.Model.Logging;
 using UniversalDetector;
 using UniversalDetector;
+using NLangDetect.Core;
+using MediaBrowser.Model.Serialization;
 
 
 namespace Emby.Common.Implementations.TextEncoding
 namespace Emby.Common.Implementations.TextEncoding
 {
 {
@@ -15,11 +17,13 @@ namespace Emby.Common.Implementations.TextEncoding
     {
     {
         private readonly IFileSystem _fileSystem;
         private readonly IFileSystem _fileSystem;
         private readonly ILogger _logger;
         private readonly ILogger _logger;
+        private IJsonSerializer _json;
 
 
-        public TextEncoding(IFileSystem fileSystem, ILogger logger)
+        public TextEncoding(IFileSystem fileSystem, ILogger logger, IJsonSerializer json)
         {
         {
             _fileSystem = fileSystem;
             _fileSystem = fileSystem;
             _logger = logger;
             _logger = logger;
+            _json = json;
         }
         }
 
 
         public Encoding GetASCIIEncoding()
         public Encoding GetASCIIEncoding()
@@ -63,6 +67,7 @@ namespace Emby.Common.Implementations.TextEncoding
             }
             }
         }
         }
 
 
+        private bool _langDetectInitialized;
         public string GetDetectedEncodingName(byte[] bytes, string language)
         public string GetDetectedEncodingName(byte[] bytes, string language)
         {
         {
             var encoding = GetInitialEncoding(bytes);
             var encoding = GetInitialEncoding(bytes);
@@ -72,6 +77,22 @@ namespace Emby.Common.Implementations.TextEncoding
                 return "utf-8";
                 return "utf-8";
             }
             }
 
 
+            if (!_langDetectInitialized)
+            {
+                _langDetectInitialized = true;
+                LanguageDetector.Initialize(_json);
+            }
+
+            if (string.IsNullOrWhiteSpace(language))
+            {
+                language = DetectLanguage(bytes);
+
+                if (!string.IsNullOrWhiteSpace(language))
+                {
+                    _logger.Debug("Text language detected as {0}", language);
+                }
+            }
+
             var charset = DetectCharset(bytes, language);
             var charset = DetectCharset(bytes, language);
 
 
             if (!string.IsNullOrWhiteSpace(charset))
             if (!string.IsNullOrWhiteSpace(charset))
@@ -95,6 +116,35 @@ namespace Emby.Common.Implementations.TextEncoding
             return null;
             return null;
         }
         }
 
 
+        /// <summary>
+        /// Tries to detect the language of the text in <paramref name="bytes"/>,
+        /// decoding it as UTF-8, then ASCII, then UTF-16 (Encoding.Unicode), and
+        /// returning the first result the detector produces.
+        /// </summary>
+        /// <param name="bytes">Raw text bytes of unknown encoding.</param>
+        /// <returns>
+        /// The detector's result for the first decoding that does not throw
+        /// NLangDetectException, or null when every decoding fails.
+        /// </returns>
+        private string DetectLanguage(byte[] bytes)
+        {
+            // Tried in this order; the first decoding the detector accepts wins.
+            Encoding[] candidates = { Encoding.UTF8, Encoding.ASCII, Encoding.Unicode };
+
+            foreach (Encoding candidate in candidates)
+            {
+                try
+                {
+                    return LanguageDetector.DetectLanguage(candidate.GetString(bytes));
+                }
+                catch (NLangDetectException ex)
+                {
+                    // Best effort: a failed detection is expected for the wrong
+                    // decoding, so note it and fall through to the next candidate.
+                    _logger.Debug("Language detection failed using {0}: {1}", candidate.WebName, ex.Message);
+                }
+            }
+
+            return null;
+        }
+
         public Encoding GetEncodingFromCharset(string charset)
         public Encoding GetEncodingFromCharset(string charset)
         {
         {
             if (string.IsNullOrWhiteSpace(charset))
             if (string.IsNullOrWhiteSpace(charset))
@@ -136,22 +186,29 @@ namespace Emby.Common.Implementations.TextEncoding
                 case "cze":
                 case "cze":
                 case "ces":
                 case "ces":
                 case "slo":
                 case "slo":
-                case "slk":
-                case "slv":
                 case "srp":
                 case "srp":
                 case "hrv":
                 case "hrv":
                 case "rum":
                 case "rum":
                 case "ron":
                 case "ron":
                 case "rup":
                 case "rup":
+                    return "windows-1250";
+                // albanian
                 case "alb":
                 case "alb":
                 case "sqi":
                 case "sqi":
                     return "windows-1250";
                     return "windows-1250";
+                // slovak (slk) and slovenian (slv)
+                case "slk":
+                case "slv":
+                    return "windows-1250";
                 case "ara":
                 case "ara":
                     return "windows-1256";
                     return "windows-1256";
                 case "heb":
                 case "heb":
                     return "windows-1255";
                     return "windows-1255";
                 case "grc":
                 case "grc":
+                    return "windows-1253";
+                // greek
                 case "gre":
                 case "gre":
+                case "ell":
                     return "windows-1253";
                     return "windows-1253";
                 case "crh":
                 case "crh":
                 case "ota":
                 case "ota":

+ 1 - 1
Emby.Server.Core/ApplicationHost.cs

@@ -561,7 +561,7 @@ namespace Emby.Server.Core
             StringExtensions.LocalizationManager = LocalizationManager;
             StringExtensions.LocalizationManager = LocalizationManager;
             RegisterSingleInstance(LocalizationManager);
             RegisterSingleInstance(LocalizationManager);
 
 
-            ITextEncoding textEncoding = new TextEncoding(FileSystemManager, LogManager.GetLogger("TextEncoding"));
+            ITextEncoding textEncoding = new TextEncoding(FileSystemManager, LogManager.GetLogger("TextEncoding"), JsonSerializer);
             RegisterSingleInstance(textEncoding);
             RegisterSingleInstance(textEncoding);
             Utilities.EncodingHelper = textEncoding;
             Utilities.EncodingHelper = textEncoding;
             RegisterSingleInstance<IBlurayExaminer>(() => new BdInfoExaminer(FileSystemManager, textEncoding));
             RegisterSingleInstance<IBlurayExaminer>(() => new BdInfoExaminer(FileSystemManager, textEncoding));

+ 4 - 4
MediaBrowser.Api/UserLibrary/BaseItemsRequest.cs

@@ -82,16 +82,16 @@ namespace MediaBrowser.Api.UserLibrary
         [ApiMember(Name = "AiredDuringSeason", Description = "Gets all episodes that aired during a season, including specials.", IsRequired = false, DataType = "int", ParameterType = "query", Verb = "GET")]
         [ApiMember(Name = "AiredDuringSeason", Description = "Gets all episodes that aired during a season, including specials.", IsRequired = false, DataType = "int", ParameterType = "query", Verb = "GET")]
         public int? AiredDuringSeason { get; set; }
         public int? AiredDuringSeason { get; set; }
 
 
-        [ApiMember(Name = "MinPremiereDate", Description = "Optional. The minimum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "POST")]
+        [ApiMember(Name = "MinPremiereDate", Description = "Optional. The minimum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "GET")]
         public string MinPremiereDate { get; set; }
         public string MinPremiereDate { get; set; }
 
 
-        [ApiMember(Name = "MinDateLastSaved", Description = "Optional. The minimum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "POST")]
+        [ApiMember(Name = "MinDateLastSaved", Description = "Optional. The minimum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "GET")]
         public string MinDateLastSaved { get; set; }
         public string MinDateLastSaved { get; set; }
 
 
-        [ApiMember(Name = "MinDateLastSavedForUser", Description = "Optional. The minimum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "POST")]
+        [ApiMember(Name = "MinDateLastSavedForUser", Description = "Optional. The minimum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "GET")]
         public string MinDateLastSavedForUser { get; set; }
         public string MinDateLastSavedForUser { get; set; }
 
 
-        [ApiMember(Name = "MaxPremiereDate", Description = "Optional. The maximum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "POST")]
+        [ApiMember(Name = "MaxPremiereDate", Description = "Optional. The maximum premiere date. Format = ISO", IsRequired = false, DataType = "string", ParameterType = "query", Verb = "GET")]
         public string MaxPremiereDate { get; set; }
         public string MaxPremiereDate { get; set; }
 
 
         [ApiMember(Name = "HasOverview", Description = "Optional filter by items that have an overview or not.", IsRequired = false, DataType = "bool", ParameterType = "query", Verb = "GET")]
         [ApiMember(Name = "HasOverview", Description = "Optional filter by items that have an overview or not.", IsRequired = false, DataType = "bool", ParameterType = "query", Verb = "GET")]

Some files were not shown because too many files changed in this diff