
[servingsys] Add support

This also adds support for brightcove advertisements.
Fixes #2181
Philipp Hagemeister, 11 years ago
parent commit 7b0817e8e1

+ 1 - 0
youtube_dl/YoutubeDL.py

@@ -151,6 +151,7 @@ class YoutubeDL(object):
     bidi_workaround:   Work around buggy terminals without bidirectional text
                        support, using fridibi
     debug_printtraffic:Print out sent and received HTTP traffic
+    include_ads:       Download ads as well
 
     The following parameters are not used by YoutubeDL itself, they are used by
     the FileDownloader:
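
For context, a minimal sketch of how this new parameter can be passed when embedding youtube-dl as a library; the options dict and URL below are placeholders, not part of this commit:

    from youtube_dl import YoutubeDL

    ydl_opts = {
        'include_ads': True,  # new in this commit: also download ads (experimental)
    }
    with YoutubeDL(ydl_opts) as ydl:
        # Placeholder URL; any page handled by an extractor that honours
        # include_ads (currently Brightcove) would work here.
        ydl.download(['http://example.com/some-brightcove-page'])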

+ 5 - 1
youtube_dl/__init__.py

@@ -238,7 +238,10 @@ def parseOpts(overrideArguments=None):
     selection.add_option('--download-archive', metavar='FILE',
                          dest='download_archive',
                          help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.')
-
+    selection.add_option(
+        '--include-ads', dest='include_ads',
+        action='store_true',
+        help='Download advertisements as well (experimental)')
 
     authentication.add_option('-u', '--username',
             dest='username', metavar='USERNAME', help='account username')
@@ -716,6 +719,7 @@ def _real_main(argv=None):
         'bidi_workaround': opts.bidi_workaround,
         'debug_printtraffic': opts.debug_printtraffic,
         'prefer_ffmpeg': opts.prefer_ffmpeg,
+        'include_ads': opts.include_ads,
     }
 
     with YoutubeDL(ydl_opts) as ydl:
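
A side note on the wiring above: optparse's store_true action leaves opts.include_ads at None when the flag is absent, which is why the Brightcove extractor later reads it defensively with params.get('include_ads', False). A standalone illustration (assumed snippet, not part of the diff):

    import optparse

    parser = optparse.OptionParser()
    parser.add_option('--include-ads', dest='include_ads', action='store_true',
                      help='Download advertisements as well (experimental)')

    opts, _ = parser.parse_args(['--include-ads'])
    print(opts.include_ads)  # True
    opts, _ = parser.parse_args([])
    print(opts.include_ads)  # None -> falsy, so ads are skipped by default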

+ 1 - 0
youtube_dl/extractor/__init__.py

@@ -152,6 +152,7 @@ from .rottentomatoes import RottenTomatoesIE
 from .roxwel import RoxwelIE
 from .rtlnow import RTLnowIE
 from .rutube import RutubeIE
+from .servingsys import ServingSysIE
 from .sina import SinaIE
 from .slashdot import SlashdotIE
 from .slideshare import SlideshareIE

+ 44 - 7
youtube_dl/extractor/brightcove.py

@@ -9,9 +9,11 @@ from .common import InfoExtractor
 from ..utils import (
     compat_urllib_parse,
     find_xpath_attr,
+    fix_xml_ampersands,
     compat_urlparse,
     compat_str,
     compat_urllib_request,
+    compat_parse_qs,
 
     ExtractorError,
     unsmuggle_url,
@@ -83,17 +85,30 @@ class BrightcoveIE(InfoExtractor):
                             lambda m: m.group(1) + '/>', object_str)
         # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608
         object_str = object_str.replace('<--', '<!--')
+        object_str = fix_xml_ampersands(object_str)
 
         object_doc = xml.etree.ElementTree.fromstring(object_str)
-        assert 'BrightcoveExperience' in object_doc.attrib['class']
-        params = {
-            'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
-        }
+
+        fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
+        flashvars = dict(
+            (k, v[0])
+            for k, v in compat_parse_qs(fv_el.attrib['value']).items())
+
         def find_param(name):
+            if name in flashvars:
+                return flashvars[name]
             node = find_xpath_attr(object_doc, './param', 'name', name)
             if node is not None:
                 return node.attrib['value']
             return None
+
+        params = {}
+
+        playerID = find_param('playerID')
+        if playerID is None:
+            raise ExtractorError('Cannot find player ID')
+        params['playerID'] = playerID
+
         playerKey = find_param('playerKey')
         # Not all pages define this value
         if playerKey is not None:
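
The flashVars handling above relies on query-string parsing returning a list of values per key, hence the (k, v[0]) unpacking. A standalone illustration using the stdlib equivalent of compat_parse_qs (the flashVars string is made up):

    try:
        from urllib.parse import parse_qs  # Python 3
    except ImportError:
        from urlparse import parse_qs      # Python 2

    fv = 'playerID=1234&playerKey=AQ~~,AAAA&autoStart=false'
    print(parse_qs(fv)['playerID'])                               # ['1234'] -- a list
    flashvars = dict((k, v[0]) for k, v in parse_qs(fv).items())
    print(flashvars['playerID'])                                  # '1234'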
@@ -114,8 +129,12 @@ class BrightcoveIE(InfoExtractor):
         if it can't be found
         """
         m_brightcove = re.search(
-            r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>',
-            webpage, re.DOTALL)
+            r'''(?sx)<object
+            (?:
+                [^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1 |
+                [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
+            ).+?</object>''',
+            webpage)
         if m_brightcove is not None:
             return cls._build_brighcove_url(m_brightcove.group())
         else:
@@ -156,6 +175,7 @@ class BrightcoveIE(InfoExtractor):
         info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
         info = json.loads(info)['data']
         video_info = info['programmedContent']['videoPlayer']['mediaDTO']
+        video_info['_youtubedl_adServerURL'] = info.get('adServerURL')
 
         return self._extract_video_info(video_info)
 
@@ -193,6 +213,23 @@ class BrightcoveIE(InfoExtractor):
             info.update({
                 'url': video_info['FLVFullLengthURL'],
             })
-        else:
+
+        if self._downloader.params.get('include_ads', False):
+            adServerURL = video_info.get('_youtubedl_adServerURL')
+            if adServerURL:
+                ad_info = {
+                    '_type': 'url',
+                    'url': adServerURL,
+                }
+                if 'url' in info:
+                    return {
+                        '_type': 'playlist',
+                        'title': info['title'],
+                        'entries': [ad_info, info],
+                    }
+                else:
+                    return ad_info
+
+        if 'url' not in info:
             raise ExtractorError('Unable to extract video url for %s' % info['id'])
             raise ExtractorError('Unable to extract video url for %s' % info['id'])
         return info
         return info

+ 70 - 0
youtube_dl/extractor/servingsys.py

@@ -0,0 +1,70 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+)
+
+
+class ServingSysIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^.]+\.)?serving-sys\.com/BurstingPipe/adServer\.bs\?.*?&pli=(?P<id>[0-9]+)'
+
+    _TEST = {
+        'url': 'http://bs.serving-sys.com/BurstingPipe/adServer.bs?cn=is&c=23&pl=VAST&pli=5349193&PluID=0&pos=7135&ord=[timestamp]&cim=1?',
+        'playlist': [{
+            'file': '29955898.flv',
+            'md5': 'baed851342df6846eb8677a60a011a0f',
+            'info_dict': {
+                'title': 'AdAPPter_Hyundai_demo (1)',
+                'duration': 74,
+                'tbr': 1378,
+                'width': 640,
+                'height': 400,
+            },
+        }, {
+            'file': '29907998.flv',
+            'md5': '979b4da2655c4bc2d81aeb915a8c5014',
+            'info_dict': {
+                'title': 'AdAPPter_Hyundai_demo (2)',
+                'duration': 34,
+                'width': 854,
+                'height': 480,
+                'tbr': 516,
+            },
+        }],
+        'params': {
+            'playlistend': 2,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        pl_id = mobj.group('id')
+
+        vast_doc = self._download_xml(url, pl_id)
+        title = vast_doc.find('.//AdTitle').text
+        media = vast_doc.find('.//MediaFile').text
+        info_url = self._search_regex(r'&adData=([^&]+)&', media, 'info URL')
+
+        doc = self._download_xml(info_url, pl_id, 'Downloading video info')
+        entries = [{
+            '_type': 'video',
+            'id': a.attrib['id'],
+            'title': '%s (%s)' % (title, a.attrib['assetID']),
+            'url': a.attrib['URL'],
+            'duration': int_or_none(a.attrib.get('length')),
+            'tbr': int_or_none(a.attrib.get('bitrate')),
+            'height': int_or_none(a.attrib.get('height')),
+            'width': int_or_none(a.attrib.get('width')),
+        } for a in doc.findall('.//AdditionalAssets/asset')]
+
+        return {
+            '_type': 'playlist',
+            'id': pl_id,
+            'title': title,
+            'entries': entries,
+        }
+
+
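
For reference, a standalone sketch of the two XML lookups _real_extract performs; the documents below are minimal made-up stand-ins for the real VAST response and the adData asset list:

    import re
    import xml.etree.ElementTree as ET

    vast_doc = ET.fromstring(
        '<VAST><Ad><AdTitle>Demo ad</AdTitle>'
        '<MediaFile>http://player.example/p.swf?a=1&amp;adData=http://info.example/assets.xml&amp;b=2</MediaFile>'
        '</Ad></VAST>')
    title = vast_doc.find('.//AdTitle').text   # 'Demo ad'
    media = vast_doc.find('.//MediaFile').text
    info_url = re.search(r'&adData=([^&]+)&', media).group(1)
    print(info_url)  # 'http://info.example/assets.xml', fetched next via _download_xml

    asset_doc = ET.fromstring(
        '<AdInfo><AdditionalAssets>'
        '<asset id="29955898" assetID="1" URL="http://cdn.example/1.flv"'
        ' length="74" bitrate="1378" width="640" height="400"/>'
        '</AdditionalAssets></AdInfo>')
    for a in asset_doc.findall('.//AdditionalAssets/asset'):
        print('%s (%s): %s' % (title, a.attrib['assetID'], a.attrib['URL']))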