浏览代码

[Senate] Add new extractor (#5302)

Yen Chi Hsuan 10 年之前
父节点
当前提交
c6391cd587
共有 3 个文件被更改,包括 132 次插入0 次删除
  1. 2 0
      youtube_dl/downloader/f4m.py
  2. 1 0
      youtube_dl/extractor/__init__.py
  3. 129 0
      youtube_dl/extractor/senateisvp.py

+ 2 - 0
youtube_dl/downloader/f4m.py

@@ -389,6 +389,8 @@ class F4mFD(FileDownloader):
             url = base_url + name
             url = base_url + name
             if akamai_pv:
             if akamai_pv:
                 url += '?' + akamai_pv.strip(';')
                 url += '?' + akamai_pv.strip(';')
+            if info_dict.get('extra_param_to_segment_url'):
+                url += info_dict.get('extra_param_to_segment_url')
             frag_filename = '%s-%s' % (tmpfilename, name)
             frag_filename = '%s-%s' % (tmpfilename, name)
             try:
             try:
                 success = http_dl.download(frag_filename, {'url': url})
                 success = http_dl.download(frag_filename, {'url': url})

+ 1 - 0
youtube_dl/extractor/__init__.py

@@ -447,6 +447,7 @@ from .scivee import SciVeeIE
 from .screencast import ScreencastIE
 from .screencast import ScreencastIE
 from .screencastomatic import ScreencastOMaticIE
 from .screencastomatic import ScreencastOMaticIE
 from .screenwavemedia import CinemassacreIE, ScreenwaveMediaIE, TeamFourIE
 from .screenwavemedia import CinemassacreIE, ScreenwaveMediaIE, TeamFourIE
+from .senateisvp import SenateISVPIE
 from .servingsys import ServingSysIE
 from .servingsys import ServingSysIE
 from .sexu import SexuIE
 from .sexu import SexuIE
 from .sexykarma import SexyKarmaIE
 from .sexykarma import SexyKarmaIE

+ 129 - 0
youtube_dl/extractor/senateisvp.py

@@ -0,0 +1,129 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+from .common import InfoExtractor
+from ..utils import ExtractorError
+from ..compat import (
+    compat_parse_qs,
+    compat_urlparse,
+)
+
+
+class SenateISVPIE(InfoExtractor):
+    _COMM_MAP = [
+        ["ag", "76440", "http://ag-f.akamaihd.net"],
+        ["aging", "76442", "http://aging-f.akamaihd.net"],
+        ["approps", "76441", "http://approps-f.akamaihd.net"],
+        ["armed", "76445", "http://armed-f.akamaihd.net"],
+        ["banking", "76446", "http://banking-f.akamaihd.net"],
+        ["budget", "76447", "http://budget-f.akamaihd.net"],
+        ["cecc", "76486", "http://srs-f.akamaihd.net"],
+        ["commerce", "80177", "http://commerce1-f.akamaihd.net"],
+        ["csce", "75229", "http://srs-f.akamaihd.net"],
+        ["dpc", "76590", "http://dpc-f.akamaihd.net"],
+        ["energy", "76448", "http://energy-f.akamaihd.net"],
+        ["epw", "76478", "http://epw-f.akamaihd.net"],
+        ["ethics", "76449", "http://ethics-f.akamaihd.net"],
+        ["finance", "76450", "http://finance-f.akamaihd.net"],
+        ["foreign", "76451", "http://foreign-f.akamaihd.net"],
+        ["govtaff", "76453", "http://govtaff-f.akamaihd.net"],
+        ["help", "76452", "http://help-f.akamaihd.net"],
+        ["indian", "76455", "http://indian-f.akamaihd.net"],
+        ["intel", "76456", "http://intel-f.akamaihd.net"],
+        ["intlnarc", "76457", "http://intlnarc-f.akamaihd.net"],
+        ["jccic", "85180", "http://jccic-f.akamaihd.net"],
+        ["jec", "76458", "http://jec-f.akamaihd.net"],
+        ["judiciary", "76459", "http://judiciary-f.akamaihd.net"],
+        ["rpc", "76591", "http://rpc-f.akamaihd.net"],
+        ["rules", "76460", "http://rules-f.akamaihd.net"],
+        ["saa", "76489", "http://srs-f.akamaihd.net"],
+        ["smbiz", "76461", "http://smbiz-f.akamaihd.net"],
+        ["srs", "75229", "http://srs-f.akamaihd.net"],
+        ["uscc", "76487", "http://srs-f.akamaihd.net"],
+        ["vetaff", "76462", "http://vetaff-f.akamaihd.net"],
+        ["arch", "", "http://ussenate-f.akamaihd.net/"]
+    ]
+    _IE_NAME = 'senate.gov'
+    _VALID_URL = r'http://www\.senate\.gov/isvp/\?(?P<qs>.+)'
+    _TESTS = [{
+        'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
+        'md5': '7314c4b96dad66dd8e63dc3518ceaa6f',
+        'info_dict': {
+            'id': 'judiciary031715',
+            'ext': 'flv',
+            'title': 'Integrated Senate Video Player',
+        }
+    }, {
+        'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false',
+        'md5': '2917c827513700aa9b70eaebf25116da',
+        'info_dict': {
+            'id': 'commerce011514',
+            'ext': 'flv',
+            'title': 'Integrated Senate Video Player'
+        }
+    }, {
+        'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi',
+        # checksum differs each time
+        'info_dict': {
+            'id': 'intel090613',
+            'ext': 'mp4',
+            'title': 'Integrated Senate Video Player'
+        }
+    }]
+
+    def _get_info_for_comm(self, committee):
+        for entry in self._COMM_MAP:
+            if entry[0] == committee:
+                return entry[1:]
+
+    def _real_extract(self, url):
+        qs = compat_parse_qs(re.match(self._VALID_URL, url).group('qs'))
+        if not qs.get('filename') or not qs.get('type') or not qs.get('comm'):
+            raise ExtractorError('Invalid URL', expected=True)
+
+        video_id = re.sub(r'.mp4$', '', qs['filename'][0])
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, video_id)
+
+        video_type = qs['type'][0]
+        committee = video_type if video_type == 'arch' else qs['comm'][0]
+        stream_num, domain = self._get_info_for_comm(committee)
+
+        formats = []
+        if video_type == 'arch':
+            filename = video_id if '.' in video_id else video_id + '.mp4'
+            formats = [{
+                # All parameters in the query string are necessary to prevent a 403 error
+                'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=',
+            }]
+        else:
+            hdcore_sign = '?hdcore=3.1.0'
+            url_params = (domain, video_id, stream_num)
+            f4m_url = '%s/z/%s_1@%s/manifest.f4m' % url_params + hdcore_sign
+            m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params
+            for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'):
+                # URLs without the extra param induce an 404 error
+                entry.update({'extra_param_to_segment_url': hdcore_sign})
+                formats.append(entry)
+            for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'):
+                mobj = re.search(r'(?P<tag>(?:-p|-b)).m3u8', entry['url'])
+                if mobj:
+                    entry['format_id'] += mobj.group('tag')
+                formats.append(entry)
+
+            self._sort_formats(formats)
+
+        info_dict = {
+            'id': video_id,
+            'title': title,
+        }
+
+        if len(formats) >= 1:
+            info_dict.update({'formats': formats})
+        else:
+            info_dict.update(formats[0])
+
+        return info_dict