浏览代码

Support BBC News (bbc.com/news)

fnord 10 年之前
父节点
当前提交
9e96dc8b35
共有 3 个文件被更改,包括 164 次插入和 0 次删除
  1. 1 0
      docs/supportedsites.md
  2. 1 0
      youtube_dl/extractor/__init__.py
  3. 162 0
      youtube_dl/extractor/bbcnews.py

+ 1 - 0
docs/supportedsites.md

@@ -50,6 +50,7 @@
  - **Bandcamp**
  - **Bandcamp**
  - **Bandcamp:album**
  - **Bandcamp:album**
  - **bbc.co.uk**: BBC iPlayer
  - **bbc.co.uk**: BBC iPlayer
+ - **bbc.com**: BBC news videos
  - **BeatportPro**
  - **BeatportPro**
  - **Beeg**
  - **Beeg**
  - **BehindKink**
  - **BehindKink**

+ 1 - 0
youtube_dl/extractor/__init__.py

@@ -36,6 +36,7 @@ from .baidu import BaiduVideoIE
 from .bambuser import BambuserIE, BambuserChannelIE
 from .bambuser import BambuserIE, BambuserChannelIE
 from .bandcamp import BandcampIE, BandcampAlbumIE
 from .bandcamp import BandcampIE, BandcampAlbumIE
 from .bbccouk import BBCCoUkIE
 from .bbccouk import BBCCoUkIE
+from .bbcnews import BBCNewsIE
 from .beeg import BeegIE
 from .beeg import BeegIE
 from .behindkink import BehindKinkIE
 from .behindkink import BehindKinkIE
 from .beatportpro import BeatportProIE
 from .beatportpro import BeatportProIE

+ 162 - 0
youtube_dl/extractor/bbcnews.py

@@ -0,0 +1,162 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+)
+from ..compat import compat_HTTPError
+import re
+from .bbccouk import BBCCoUkIE
+
+class BBCNewsIE(BBCCoUkIE):
+    """Extractor for BBC News pages (bbc.com/news and bbc.co.uk/news).
+
+    Every media entry found in the page becomes one entry of the returned
+    playlist. Format and subtitle extraction is delegated to the inherited
+    _extract_medias / _extract_audio / _extract_video / extract_subtitles
+    helpers; only the media selector endpoint differs.
+    """
+
+    IE_NAME = 'bbc.com'
+    IE_DESC = 'BBC news'
+    _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P<id>[^/]+)'
+
+    _TESTS = [{
+        'url': 'http://www.bbc.com/news/world-europe-32668511',
+        'info_dict': {
+            'id': 'world-europe-32668511',
+            'title': 'Russia stages massive WW2 parade despite Western boycott',
+        },
+        'playlist_count': 2,
+    }, {
+        'url': 'http://www.bbc.com/news/business-28299555',
+        'info_dict': {
+            'id': 'business-28299555',
+            'title': 'Farnborough Airshow: Video highlights',
+        },
+        'playlist_count': 9,
+    }, {
+        'url': 'http://www.bbc.com/news/world-europe-32041533',
+        'note': 'Video',
+        'info_dict': {
+            'id': 'p02mprgb',
+            'ext': 'mp4',
+            'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
+            'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
+            'duration': 47,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }]
+
+    def _duration_str2int(self, str):
+        """Convert a duration string to a number of seconds.
+
+        Accepts either a plain run of digits ('47') or an ISO 8601 style
+        time duration ('PT47S', 'PT1M30S', 'PT2H', ...). Returns None for
+        empty or unrecognised input.
+        """
+        # NOTE: the parameter shadows the builtin ``str``; the name is kept
+        # so that existing callers are unaffected.
+        if not str:
+            return None
+        if re.match(r'^\d+$', str):
+            return int(str)
+        mobj = re.match(
+            r'PT(?:(?P<h>\d+)H)?(?:(?P<m>\d+)M)?(?:(?P<s>\d+)S)?$', str)
+        # Every component is optional, so a bare 'PT' matches the pattern
+        # too; it carries no duration and is rejected here.
+        if not mobj or not any(mobj.groups()):
+            return None
+        return (
+            int(mobj.group('h') or 0) * 3600 +
+            int(mobj.group('m') or 0) * 60 +
+            int(mobj.group('s') or 0))
+
+    def _download_media_selector(self, programme_id):
+        """Download and parse the media selection XML for programme_id.
+
+        Returns a (formats, subtitles) tuple. formats is a list built from
+        the 'audio' and 'video' media entries; subtitles comes from the
+        'captions' entry and is None when there is none.
+
+        Raises ExtractorError when the download fails for any reason other
+        than an HTTP 403 response.
+        """
+        # bbc news uses http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/ not
+        # http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/
+        # Could add third urlspec arg to BBCCoUkIE._download_media_selector instead of duplicating it
+
+        # Imported here because this module has no top-level import for it.
+        import xml.etree.ElementTree
+
+        try:
+            media_selection = self._download_xml(
+                'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' % programme_id,
+                programme_id, 'Downloading media selection XML')
+        except ExtractorError as ee:
+            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
+                # The body of a 403 response is parsed and used as the media
+                # selection. read() already yields bytes, which fromstring()
+                # accepts as-is on both Python 2 and Python 3.
+                media_selection = xml.etree.ElementTree.fromstring(ee.cause.read())
+            else:
+                raise
+
+        formats = []
+        subtitles = None
+
+        for media in self._extract_medias(media_selection):
+            kind = media.get('kind')
+            if kind == 'audio':
+                formats.extend(self._extract_audio(media, programme_id))
+            elif kind == 'video':
+                formats.extend(self._extract_video(media, programme_id))
+            elif kind == 'captions':
+                subtitles = self.extract_subtitles(media, programme_id)
+
+        return formats, subtitles
+
+    def _real_extract(self, url):
+        """Extract every media entry of a BBC News page as a playlist.
+
+        Raises ExtractorError when the page contains no media entry, or
+        when an entry has neither an 'externalId' nor an 'href' value.
+        """
+        list_id = self._match_id(url)
+        webpage = self._download_webpage(url, list_id)
+
+        list_title = self._html_search_regex(
+            r'<title>(.*?)(?:\s*-\s*BBC News)?</title>', webpage, 'list title')
+
+        pubdate = self._html_search_regex(
+            r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None)
+        if pubdate:
+            # Drop the dashes: '2015-05-09' -> '20150509'.
+            pubdate = pubdate.replace('-', '')
+
+        # works with bbc.com/news/something-something-123456 articles
+        matches = re.findall(r"data-media-meta='({[^']+})'", webpage)
+        if not matches:
+            # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc}
+            # in http://www.bbc.com/news/video_and_audio/international
+            matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage)
+        if not matches:
+            raise ExtractorError('No video found', expected=True)
+
+        entries = []
+        for ent in matches:
+            jent = self._parse_json(ent, list_id)
+
+            programme_id = jent.get('externalId')
+            xml_url = jent.get('href')
+
+            title = jent['caption']
+            duration = self._duration_str2int(jent.get('duration'))
+            description = list_title + ' - ' + jent.get('caption', '')
+            # dict.has_key() does not exist on Python 3; use .get() instead.
+            thumbnail = (jent.get('image') or {}).get('href')
+
+            if not programme_id:
+                if not xml_url:
+                    raise ExtractorError('data-media-meta entry has no externalId or href value.')
+                # Cheap fallback
+                # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml
+                # programme_id is still unknown here, so progress messages
+                # are labelled with the page id instead.
+                playlist_xml = self._download_webpage(
+                    xml_url, list_id, 'Downloading playlist.sxml for externalId (fallback)')
+                programme_id = self._search_regex(
+                    r'<mediator [^>]*identifier="(.+?)"', playlist_xml,
+                    'playlist.sxml (externalId fallback)')
+
+            formats, subtitles = self._download_media_selector(programme_id)
+            self._sort_formats(formats)
+
+            entries.append({
+                'id': programme_id,
+                'uploader': 'BBC News',
+                'upload_date': pubdate,
+                'title': title,
+                'description': description,
+                'thumbnail': thumbnail,
+                'duration': duration,
+                'formats': formats,
+                'subtitles': subtitles,
+            })
+
+        # matches is non-empty and each iteration either appends or raises,
+        # so entries always holds at least one item at this point.
+        return self.playlist_result(entries, list_id, list_title)