소스 검색

Reduce number of TMDB lookups if filenames have punctuation chars

Previosly TMDB would be queried with the raw name and always fail,
then retry with the cleaned name. Now non-word chars are always
cleaned out first.

If first query fails, retry with more aggressive cleaning.
Erik Rigtorp 5 년 전
부모
커밋
8bd356ab20
1개의 변경된 파일40개의 추가작업 그리고 27개의 파일을 삭제
  1. 40 27
      MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs

+ 40 - 27
MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs

@@ -5,6 +5,7 @@ using System.Linq;
 using System.Net;
 using System.Threading;
 using System.Threading.Tasks;
+using System.Text.RegularExpressions;
 using MediaBrowser.Common.Net;
 using MediaBrowser.Controller.Library;
 using MediaBrowser.Controller.Providers;
@@ -19,6 +20,20 @@ namespace MediaBrowser.Providers.Tmdb.Movies
     public class TmdbSearch
     {
         private static readonly CultureInfo EnUs = new CultureInfo("en-US");
+
+        private static readonly Regex cleanEnclosed = new Regex(@"\p{Ps}.*\p{Pe}", RegexOptions.Compiled);
+        private static readonly Regex cleanNonWord = new Regex(@"[\W_]+", RegexOptions.Compiled);
+        private static readonly Regex cleanStopWords = new Regex(@"\b( # Start at word boundary
+            19[0-9]{2}|20[0-9]{2}| # 1900-2099
+            S[0-9]{2}| # Season
+            E[0-9]{2}| # Episode
+            (2160|1080|720|576|480)[ip]?| # Resolution
+            [xh]?264| # Encoding
+            (web|dvd|bd|hdtv|hd)rip| # *Rip
+            web|hdtv|mp4|bluray|ktr|dl|single|imageset|internal|doku|dubbed|retail|xxx|flac
+            ).* # Match rest of string",
+            RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);
+
         private const string Search3 = TmdbUtils.BaseTmdbApiUrl + @"3/search/{3}?api_key={1}&query={0}&language={2}";
 
         private readonly ILogger _logger;
@@ -61,19 +76,18 @@ namespace MediaBrowser.Providers.Tmdb.Movies
 
             var tmdbImageUrl = tmdbSettings.images.GetImageUrl("original");
 
-            if (!string.IsNullOrWhiteSpace(name))
-            {
-                var parsedName = _libraryManager.ParseName(name);
-                var yearInName = parsedName.Year;
-                name = parsedName.Name;
-                year = year ?? yearInName;
-            }
+            // Does this mean we are reparsing already parsed ItemLookupInfo?
+            var parsedName = _libraryManager.ParseName(name);
+            var yearInName = parsedName.Year;
+            name = parsedName.Name;
+            year = year ?? yearInName;
 
-            _logger.LogInformation("MovieDbProvider: Finding id for item: " + name);
+            _logger.LogInformation("TmdbSearch: Finding id for item: {0} ({1})", name, year);
             var language = idInfo.MetadataLanguage.ToLowerInvariant();
 
-            //nope - search for it
-            //var searchType = item is BoxSet ? "collection" : "movie";
+            // Replace sequences of non-word characters with space
+            // TMDB expects a space separated list of words make sure that is the case
+            name = cleanNonWord.Replace(name, " ").Trim();
 
             var results = await GetSearchResults(name, searchType, year, language, tmdbImageUrl, cancellationToken).ConfigureAwait(false);
 
@@ -86,36 +100,35 @@ namespace MediaBrowser.Providers.Tmdb.Movies
                 }
             }
 
+            // Ideally retrying alternatives should be done outside the search
+            // provider so that the retry logic can be common for all search
+            // providers
             if (results.Count == 0)
             {
-                // try with dot and _ turned to space
-                var originalName = name;
-
-                name = name.Replace(",", " ");
-                name = name.Replace(".", " ");
-                name = name.Replace("_", " ");
-                name = name.Replace("-", " ");
-                name = name.Replace("!", " ");
-                name = name.Replace("?", " ");
-
-                var parenthIndex = name.IndexOf('(');
-                if (parenthIndex != -1)
-                {
-                    name = name.Substring(0, parenthIndex);
-                }
+                name = parsedName.Name;
+
+                // Remove things enclosed in []{}() etc
+                name = cleanEnclosed.Replace(name, string.Empty);
 
+                // Replace sequences of non-word characters with space
+                name = cleanNonWord.Replace(name, " ");
+
+                // Clean based on common stop words / tokens
+                name = cleanStopWords.Replace(name, string.Empty);
+
+                // Trim whitespace
                 name = name.Trim();
 
                 // Search again if the new name is different
-                if (!string.Equals(name, originalName))
+                if (!string.Equals(name, parsedName.Name) && !string.IsNullOrWhiteSpace(name))
                 {
+                    _logger.LogInformation("TmdbSearch: Finding id for item: {0} ({1})", name, year);
                     results = await GetSearchResults(name, searchType, year, language, tmdbImageUrl, cancellationToken).ConfigureAwait(false);
 
                     if (results.Count == 0 && !string.Equals(language, "en", StringComparison.OrdinalIgnoreCase))
                     {
                         //one more time, in english
                         results = await GetSearchResults(name, searchType, year, "en", tmdbImageUrl, cancellationToken).ConfigureAwait(false);
-
                     }
                 }
             }