Browse Source

[Senate] Add new extractor (#5302)

Yen Chi Hsuan 10 years ago
parent
commit
c6391cd587

+ 2 - 0
youtube_dl/downloader/f4m.py

@@ -389,6 +389,8 @@ class F4mFD(FileDownloader):
             url = base_url + name
             url = base_url + name
             if akamai_pv:
             if akamai_pv:
                 url += '?' + akamai_pv.strip(';')
                 url += '?' + akamai_pv.strip(';')
+            if info_dict.get('extra_param_to_segment_url'):
+                url += info_dict.get('extra_param_to_segment_url')
             frag_filename = '%s-%s' % (tmpfilename, name)
             frag_filename = '%s-%s' % (tmpfilename, name)
             try:
             try:
                 success = http_dl.download(frag_filename, {'url': url})
                 success = http_dl.download(frag_filename, {'url': url})

+ 1 - 0
youtube_dl/extractor/__init__.py

@@ -447,6 +447,7 @@ from .scivee import SciVeeIE
 from .screencast import ScreencastIE
 from .screencast import ScreencastIE
 from .screencastomatic import ScreencastOMaticIE
 from .screencastomatic import ScreencastOMaticIE
 from .screenwavemedia import CinemassacreIE, ScreenwaveMediaIE, TeamFourIE
 from .screenwavemedia import CinemassacreIE, ScreenwaveMediaIE, TeamFourIE
+from .senateisvp import SenateISVPIE
 from .servingsys import ServingSysIE
 from .servingsys import ServingSysIE
 from .sexu import SexuIE
 from .sexu import SexuIE
 from .sexykarma import SexyKarmaIE
 from .sexykarma import SexyKarmaIE

+ 129 - 0
youtube_dl/extractor/senateisvp.py

@@ -0,0 +1,129 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+from .common import InfoExtractor
+from ..utils import ExtractorError
+from ..compat import (
+    compat_parse_qs,
+    compat_urlparse,
+)
+
+
class SenateISVPIE(InfoExtractor):
    """Extractor for the Integrated Senate Video Player (senate.gov/isvp).

    The player URL carries everything needed in its query string:
    ``filename`` (the video id, optionally with a ``.mp4`` suffix),
    ``type`` (``arch`` for archived videos, anything else is handled as a
    stream) and ``comm`` (the committee key looked up in ``_COMM_MAP``).
    """

    # Each entry is [committee key, stream number, Akamai base URL].
    # The final "arch" entry is not a committee: it is selected whenever the
    # query string says type=arch, and it has no stream number.
    _COMM_MAP = [
        ["ag", "76440", "http://ag-f.akamaihd.net"],
        ["aging", "76442", "http://aging-f.akamaihd.net"],
        ["approps", "76441", "http://approps-f.akamaihd.net"],
        ["armed", "76445", "http://armed-f.akamaihd.net"],
        ["banking", "76446", "http://banking-f.akamaihd.net"],
        ["budget", "76447", "http://budget-f.akamaihd.net"],
        ["cecc", "76486", "http://srs-f.akamaihd.net"],
        ["commerce", "80177", "http://commerce1-f.akamaihd.net"],
        ["csce", "75229", "http://srs-f.akamaihd.net"],
        ["dpc", "76590", "http://dpc-f.akamaihd.net"],
        ["energy", "76448", "http://energy-f.akamaihd.net"],
        ["epw", "76478", "http://epw-f.akamaihd.net"],
        ["ethics", "76449", "http://ethics-f.akamaihd.net"],
        ["finance", "76450", "http://finance-f.akamaihd.net"],
        ["foreign", "76451", "http://foreign-f.akamaihd.net"],
        ["govtaff", "76453", "http://govtaff-f.akamaihd.net"],
        ["help", "76452", "http://help-f.akamaihd.net"],
        ["indian", "76455", "http://indian-f.akamaihd.net"],
        ["intel", "76456", "http://intel-f.akamaihd.net"],
        ["intlnarc", "76457", "http://intlnarc-f.akamaihd.net"],
        ["jccic", "85180", "http://jccic-f.akamaihd.net"],
        ["jec", "76458", "http://jec-f.akamaihd.net"],
        ["judiciary", "76459", "http://judiciary-f.akamaihd.net"],
        ["rpc", "76591", "http://rpc-f.akamaihd.net"],
        ["rules", "76460", "http://rules-f.akamaihd.net"],
        ["saa", "76489", "http://srs-f.akamaihd.net"],
        ["smbiz", "76461", "http://smbiz-f.akamaihd.net"],
        ["srs", "75229", "http://srs-f.akamaihd.net"],
        ["uscc", "76487", "http://srs-f.akamaihd.net"],
        ["vetaff", "76462", "http://vetaff-f.akamaihd.net"],
        ["arch", "", "http://ussenate-f.akamaihd.net/"]
    ]
    # NOTE(review): the base class presumably reads the display name from
    # ``IE_NAME`` (no leading underscore) -- confirm whether this attribute
    # is actually picked up before renaming it.
    _IE_NAME = 'senate.gov'
    _VALID_URL = r'http://www\.senate\.gov/isvp/\?(?P<qs>.+)'
    _TESTS = [{
        'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
        'md5': '7314c4b96dad66dd8e63dc3518ceaa6f',
        'info_dict': {
            'id': 'judiciary031715',
            'ext': 'flv',
            'title': 'Integrated Senate Video Player',
        }
    }, {
        'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false',
        'md5': '2917c827513700aa9b70eaebf25116da',
        'info_dict': {
            'id': 'commerce011514',
            'ext': 'flv',
            'title': 'Integrated Senate Video Player'
        }
    }, {
        'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi',
        # checksum differs each time
        'info_dict': {
            'id': 'intel090613',
            'ext': 'mp4',
            'title': 'Integrated Senate Video Player'
        }
    }]

    def _get_info_for_comm(self, committee):
        """Return ``[stream_num, domain]`` for *committee*.

        Returns None when *committee* has no entry in ``_COMM_MAP``.
        """
        for entry in self._COMM_MAP:
            if entry[0] == committee:
                return entry[1:]
        return None

    def _real_extract(self, url):
        """Build the info dict for an ISVP player URL.

        Raises ExtractorError (expected) when a required query parameter is
        missing or the committee key is not listed in ``_COMM_MAP``.
        """
        qs = compat_parse_qs(re.match(self._VALID_URL, url).group('qs'))
        if not qs.get('filename') or not qs.get('type') or not qs.get('comm'):
            raise ExtractorError('Invalid URL', expected=True)

        # Escape the dot so only a literal ".mp4" suffix is stripped; the
        # unescaped pattern would also truncate names such as "xmp4".
        video_id = re.sub(r'\.mp4$', '', qs['filename'][0])

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<title>([^<]+)</title>', webpage, 'title')

        video_type = qs['type'][0]
        # Archived videos share one dedicated host regardless of committee.
        committee = video_type if video_type == 'arch' else qs['comm'][0]
        comm_info = self._get_info_for_comm(committee)
        if comm_info is None:
            # Without this check the unpacking below would fail with an
            # opaque TypeError on None.
            raise ExtractorError(
                'Unknown committee: %s' % committee, expected=True)
        stream_num, domain = comm_info

        formats = []
        if video_type == 'arch':
            filename = video_id if '.' in video_id else video_id + '.mp4'
            formats = [{
                # All parameters in the query string are necessary to prevent a 403 error
                'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=',
            }]
        else:
            hdcore_sign = '?hdcore=3.1.0'
            url_params = (domain, video_id, stream_num)
            f4m_url = '%s/z/%s_1@%s/manifest.f4m' % url_params + hdcore_sign
            m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params
            for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'):
                # URLs without the extra param induce a 404 error
                entry['extra_param_to_segment_url'] = hdcore_sign
                formats.append(entry)
            for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'):
                # Keep the "-p" / "-b" playlist variants apart by appending
                # the tag found in the playlist URL to the format id.
                mobj = re.search(r'(?P<tag>(?:-p|-b))\.m3u8', entry['url'])
                if mobj:
                    entry['format_id'] += mobj.group('tag')
                formats.append(entry)

            self._sort_formats(formats)

        # Always hand back the list: the former "else: formats[0]" branch was
        # only reachable with an empty list and could do nothing but raise
        # IndexError.
        return {
            'id': video_id,
            'title': title,
            'formats': formats,
        }