Browse Source

Reduce number of TMDB lookups if filenames have punctuation chars

Previosly TMDB would be queried with the raw name and always fail,
then retry with the cleaned name. Now non-word chars are always
cleaned out first.

If first query fails, retry with more aggressive cleaning.
Erik Rigtorp 5 years ago
parent
commit
8bd356ab20
1 changed files with 40 additions and 27 deletions
  1. 40 27
      MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs

+ 40 - 27
MediaBrowser.Providers/Tmdb/Movies/TmdbSearch.cs

@@ -5,6 +5,7 @@ using System.Linq;
 using System.Net;
 using System.Threading;
 using System.Threading.Tasks;
+using System.Text.RegularExpressions;
 using MediaBrowser.Common.Net;
 using MediaBrowser.Controller.Library;
 using MediaBrowser.Controller.Providers;
@@ -19,6 +20,20 @@ namespace MediaBrowser.Providers.Tmdb.Movies
     public class TmdbSearch
     {
         private static readonly CultureInfo EnUs = new CultureInfo("en-US");
+
+        private static readonly Regex cleanEnclosed = new Regex(@"\p{Ps}.*\p{Pe}", RegexOptions.Compiled);
+        private static readonly Regex cleanNonWord = new Regex(@"[\W_]+", RegexOptions.Compiled);
+        private static readonly Regex cleanStopWords = new Regex(@"\b( # Start at word boundary
+            19[0-9]{2}|20[0-9]{2}| # 1900-2099
+            S[0-9]{2}| # Season
+            E[0-9]{2}| # Episode
+            (2160|1080|720|576|480)[ip]?| # Resolution
+            [xh]?264| # Encoding
+            (web|dvd|bd|hdtv|hd)rip| # *Rip
+            web|hdtv|mp4|bluray|ktr|dl|single|imageset|internal|doku|dubbed|retail|xxx|flac
+            ).* # Match rest of string",
+            RegexOptions.Compiled | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);
+
         private const string Search3 = TmdbUtils.BaseTmdbApiUrl + @"3/search/{3}?api_key={1}&query={0}&language={2}";
 
         private readonly ILogger _logger;
@@ -61,19 +76,18 @@ namespace MediaBrowser.Providers.Tmdb.Movies
 
             var tmdbImageUrl = tmdbSettings.images.GetImageUrl("original");
 
-            if (!string.IsNullOrWhiteSpace(name))
-            {
-                var parsedName = _libraryManager.ParseName(name);
-                var yearInName = parsedName.Year;
-                name = parsedName.Name;
-                year = year ?? yearInName;
-            }
+            // Does this mean we are reparsing already parsed ItemLookupInfo?
+            var parsedName = _libraryManager.ParseName(name);
+            var yearInName = parsedName.Year;
+            name = parsedName.Name;
+            year = year ?? yearInName;
 
-            _logger.LogInformation("MovieDbProvider: Finding id for item: " + name);
+            _logger.LogInformation("TmdbSearch: Finding id for item: {0} ({1})", name, year);
             var language = idInfo.MetadataLanguage.ToLowerInvariant();
 
-            //nope - search for it
-            //var searchType = item is BoxSet ? "collection" : "movie";
+            // Replace sequences of non-word characters with space
+            // TMDB expects a space separated list of words make sure that is the case
+            name = cleanNonWord.Replace(name, " ").Trim();
 
             var results = await GetSearchResults(name, searchType, year, language, tmdbImageUrl, cancellationToken).ConfigureAwait(false);
 
@@ -86,36 +100,35 @@ namespace MediaBrowser.Providers.Tmdb.Movies
                 }
             }
 
+            // Ideally retrying alternatives should be done outside the search
+            // provider so that the retry logic can be common for all search
+            // providers
             if (results.Count == 0)
             {
-                // try with dot and _ turned to space
-                var originalName = name;
-
-                name = name.Replace(",", " ");
-                name = name.Replace(".", " ");
-                name = name.Replace("_", " ");
-                name = name.Replace("-", " ");
-                name = name.Replace("!", " ");
-                name = name.Replace("?", " ");
-
-                var parenthIndex = name.IndexOf('(');
-                if (parenthIndex != -1)
-                {
-                    name = name.Substring(0, parenthIndex);
-                }
+                name = parsedName.Name;
+
+                // Remove things enclosed in []{}() etc
+                name = cleanEnclosed.Replace(name, string.Empty);
 
+                // Replace sequences of non-word characters with space
+                name = cleanNonWord.Replace(name, " ");
+
+                // Clean based on common stop words / tokens
+                name = cleanStopWords.Replace(name, string.Empty);
+
+                // Trim whitespace
                 name = name.Trim();
 
                 // Search again if the new name is different
-                if (!string.Equals(name, originalName))
+                if (!string.Equals(name, parsedName.Name) && !string.IsNullOrWhiteSpace(name))
                 {
+                    _logger.LogInformation("TmdbSearch: Finding id for item: {0} ({1})", name, year);
                     results = await GetSearchResults(name, searchType, year, language, tmdbImageUrl, cancellationToken).ConfigureAwait(false);
 
                     if (results.Count == 0 && !string.Equals(language, "en", StringComparison.OrdinalIgnoreCase))
                     {
                         //one more time, in english
                         results = await GetSearchResults(name, searchType, year, "en", tmdbImageUrl, cancellationToken).ConfigureAwait(false);
-
                     }
                 }
             }