Browse Source

[YoutubeDL] Add generic video filtering (Fixes #4916)

This functionality is intended to eventually encompass the current format filtering.
Philipp Hagemeister 10 years ago
parent
commit
347de4931c
5 changed files with 147 additions and 2 deletions
  1. 32 0
      test/test_utils.py
  2. 13 1
      youtube_dl/YoutubeDL.py
  3. 6 1
      youtube_dl/__init__.py
  4. 19 0
      youtube_dl/options.py
  5. 77 0
      youtube_dl/utils.py

+ 32 - 0
test/test_utils.py

@@ -53,6 +53,7 @@ from youtube_dl.utils import (
     version_tuple,
     version_tuple,
     xpath_with_ns,
     xpath_with_ns,
     render_table,
     render_table,
+    match_str,
 )
 )
 
 
 
 
@@ -459,6 +460,37 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')
             '123  4\n'
             '123  4\n'
             '9999 51')
             '9999 51')
 
 
+    def test_match_str(self):
+        self.assertRaises(ValueError, match_str, 'xy>foobar', {})
+        self.assertFalse(match_str('xy', {'x': 1200}))
+        self.assertTrue(match_str('!xy', {'x': 1200}))
+        self.assertTrue(match_str('x', {'x': 1200}))
+        self.assertFalse(match_str('!x', {'x': 1200}))
+        self.assertTrue(match_str('x', {'x': 0}))
+        self.assertFalse(match_str('x>0', {'x': 0}))
+        self.assertFalse(match_str('x>0', {}))
+        self.assertTrue(match_str('x>?0', {}))
+        self.assertTrue(match_str('x>1K', {'x': 1200}))
+        self.assertFalse(match_str('x>2K', {'x': 1200}))
+        self.assertTrue(match_str('x>=1200 & x < 1300', {'x': 1200}))
+        self.assertFalse(match_str('x>=1100 & x < 1200', {'x': 1200}))
+        self.assertFalse(match_str('y=a212', {'y': 'foobar42'}))
+        self.assertTrue(match_str('y=foobar42', {'y': 'foobar42'}))
+        self.assertFalse(match_str('y!=foobar42', {'y': 'foobar42'}))
+        self.assertTrue(match_str('y!=foobar2', {'y': 'foobar42'}))
+        self.assertFalse(match_str(
+            'like_count > 100 & dislike_count <? 50 & description',
+            {'like_count': 90, 'description': 'foo'}))
+        self.assertTrue(match_str(
+            'like_count > 100 & dislike_count <? 50 & description',
+            {'like_count': 190, 'description': 'foo'}))
+        self.assertFalse(match_str(
+            'like_count > 100 & dislike_count <? 50 & description',
+            {'like_count': 190, 'dislike_count': 60, 'description': 'foo'}))
+        self.assertFalse(match_str(
+            'like_count > 100 & dislike_count <? 50 & description',
+            {'like_count': 190, 'dislike_count': 10}))
+
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
     unittest.main()
     unittest.main()

+ 13 - 1
youtube_dl/YoutubeDL.py

@@ -228,6 +228,11 @@ class YoutubeDL(object):
     external_downloader:  Executable of the external downloader to call.
     external_downloader:  Executable of the external downloader to call.
     listformats:       Print an overview of available video formats and exit.
     listformats:       Print an overview of available video formats and exit.
     list_thumbnails:   Print a table of all thumbnails and exit.
     list_thumbnails:   Print a table of all thumbnails and exit.
+    match_filter:      A function that gets called with the info_dict of
+                       every video.
+                       If it returns a message, the video is ignored.
+                       If it returns None, the video is downloaded.
+                       match_filter_func in utils.py is one example for this.
 
 
 
 
     The following parameters are not used by YoutubeDL itself, they are used by
     The following parameters are not used by YoutubeDL itself, they are used by
@@ -583,9 +588,16 @@ class YoutubeDL(object):
             if max_views is not None and view_count > max_views:
             if max_views is not None and view_count > max_views:
                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
-            return 'Skipping "%s" because it is age restricted' % title
+            return 'Skipping "%s" because it is age restricted' % video_title
         if self.in_download_archive(info_dict):
         if self.in_download_archive(info_dict):
             return '%s has already been recorded in archive' % video_title
             return '%s has already been recorded in archive' % video_title
+
+        match_filter = self.params.get('match_filter')
+        if match_filter is not None:
+            ret = match_filter(info_dict)
+            if ret is not None:
+                return ret
+
         return None
         return None
 
 
     @staticmethod
     @staticmethod

+ 6 - 1
youtube_dl/__init__.py

@@ -23,9 +23,10 @@ from .compat import (
 )
 )
 from .utils import (
 from .utils import (
     DateRange,
     DateRange,
-    DEFAULT_OUTTMPL,
     decodeOption,
     decodeOption,
+    DEFAULT_OUTTMPL,
     DownloadError,
     DownloadError,
+    match_filter_func,
     MaxDownloadsReached,
     MaxDownloadsReached,
     preferredencoding,
     preferredencoding,
     read_batch_urls,
     read_batch_urls,
@@ -247,6 +248,9 @@ def _real_main(argv=None):
             xattr  # Confuse flake8
             xattr  # Confuse flake8
         except ImportError:
         except ImportError:
             parser.error('setting filesize xattr requested but python-xattr is not available')
             parser.error('setting filesize xattr requested but python-xattr is not available')
+    match_filter = (
+        None if opts.match_filter is None
+        else match_filter_func(opts.match_filter))
 
 
     ydl_opts = {
     ydl_opts = {
         'usenetrc': opts.usenetrc,
         'usenetrc': opts.usenetrc,
@@ -344,6 +348,7 @@ def _real_main(argv=None):
         'list_thumbnails': opts.list_thumbnails,
         'list_thumbnails': opts.list_thumbnails,
         'playlist_items': opts.playlist_items,
         'playlist_items': opts.playlist_items,
         'xattr_set_filesize': opts.xattr_set_filesize,
         'xattr_set_filesize': opts.xattr_set_filesize,
+        'match_filter': match_filter,
     }
     }
 
 
     with YoutubeDL(ydl_opts) as ydl:
     with YoutubeDL(ydl_opts) as ydl:

+ 19 - 0
youtube_dl/options.py

@@ -244,6 +244,25 @@ def parseOpts(overrideArguments=None):
         '--max-views',
         '--max-views',
         metavar='COUNT', dest='max_views', default=None, type=int,
         metavar='COUNT', dest='max_views', default=None, type=int,
         help='Do not download any videos with more than COUNT views')
         help='Do not download any videos with more than COUNT views')
+    selection.add_option(
+        '--match-filter',
+        metavar='FILTER', dest='match_filter', default=None,
+        help=(
+            '(Experimental) Generic video filter. '
+            'Specify any key (see help for -o for a list of available keys) to'
+            ' match if the key is present, '
+            '!key to check if the key is not present,'
+            'key > NUMBER (like "comment_count > 12", also works with '
+            '>=, <, <=, !=, =) to compare against a number, and '
+            '& to require multiple matches. '
+            'Values which are not known are excluded unless you'
+            ' put a question mark (?) after the operator.'
+            'For example, to only match videos that have been liked more than '
+            '100 times and disliked less than 50 times (or the dislike '
+            'functionality is not available at the given service), but who '
+            'also have a description, use  --match-filter '
+            '"like_count > 100 & dislike_count <? 50 & description" .'
+        ))
     selection.add_option(
     selection.add_option(
         '--no-playlist',
         '--no-playlist',
         action='store_true', dest='noplaylist', default=False,
         action='store_true', dest='noplaylist', default=False,

+ 77 - 0
youtube_dl/utils.py

@@ -17,6 +17,7 @@ import io
 import json
 import json
 import locale
 import locale
 import math
 import math
+import operator
 import os
 import os
 import pipes
 import pipes
 import platform
 import platform
@@ -1678,3 +1679,79 @@ def render_table(header_row, data):
     max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
     max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
     format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
     format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
     return '\n'.join(format_str % tuple(row) for row in table)
     return '\n'.join(format_str % tuple(row) for row in table)
+
+
+def _match_one(filter_part, dct):
+    COMPARISON_OPERATORS = {
+        '<': operator.lt,
+        '<=': operator.le,
+        '>': operator.gt,
+        '>=': operator.ge,
+        '=': operator.eq,
+        '!=': operator.ne,
+    }
+    operator_rex = re.compile(r'''(?x)\s*
+        (?P<key>[a-z_]+)
+        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
+        (?:
+            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
+            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
+        )
+        \s*$
+        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
+    m = operator_rex.search(filter_part)
+    if m:
+        op = COMPARISON_OPERATORS[m.group('op')]
+        if m.group('strval') is not None:
+            if m.group('op') not in ('=', '!='):
+                raise ValueError(
+                    'Operator %s does not support string values!' % m.group('op'))
+            comparison_value = m.group('strval')
+        else:
+            try:
+                comparison_value = int(m.group('intval'))
+            except ValueError:
+                comparison_value = parse_filesize(m.group('intval'))
+                if comparison_value is None:
+                    comparison_value = parse_filesize(m.group('intval') + 'B')
+                if comparison_value is None:
+                    raise ValueError(
+                        'Invalid integer value %r in filter part %r' % (
+                            m.group('intval'), filter_part))
+        actual_value = dct.get(m.group('key'))
+        if actual_value is None:
+            return m.group('none_inclusive')
+        return op(actual_value, comparison_value)
+
+    UNARY_OPERATORS = {
+        '': lambda v: v is not None,
+        '!': lambda v: v is None,
+    }
+    operator_rex = re.compile(r'''(?x)\s*
+        (?P<op>%s)\s*(?P<key>[a-z_]+)
+        \s*$
+        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
+    m = operator_rex.search(filter_part)
+    if m:
+        op = UNARY_OPERATORS[m.group('op')]
+        actual_value = dct.get(m.group('key'))
+        return op(actual_value)
+
+    raise ValueError('Invalid filter part %r' % filter_part)
+
+
+def match_str(filter_str, dct):
+    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
+
+    return all(
+        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
+
+
+def match_filter_func(filter_str):
+    def _match_func(info_dict):
+        if match_str(filter_str, info_dict):
+            return None
+        else:
+            video_title = info_dict.get('title', info_dict.get('id', 'video'))
+            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
+    return _match_func