Jelajahi Sumber

[bbccouk] Convert to new subtitles system

I haven't found any video available outside the UK, so I haven't added tests.

I have updated how the srt file is built, because (at least for www.bbc.co.uk/programmes/p02j9b69) the subtitles are inside 'span' elements.
Jaime Marquínez Ferrándiz 10 tahun lalu
induk
melakukan
f13b1e7d7f
1 mengubah file dengan 22 tambahan dan 11 penghapusan
  1. 22 11
      youtube_dl/extractor/bbccouk.py

+ 22 - 11
youtube_dl/extractor/bbccouk.py

@@ -2,12 +2,12 @@ from __future__ import unicode_literals
 
 
 import xml.etree.ElementTree
 import xml.etree.ElementTree
 
 
-from .subtitles import SubtitlesInfoExtractor
+from .common import InfoExtractor
 from ..utils import ExtractorError
 from ..utils import ExtractorError
 from ..compat import compat_HTTPError
 from ..compat import compat_HTTPError
 
 
 
 
-class BBCCoUkIE(SubtitlesInfoExtractor):
+class BBCCoUkIE(InfoExtractor):
     IE_NAME = 'bbc.co.uk'
     IE_NAME = 'bbc.co.uk'
     IE_DESC = 'BBC iPlayer'
     IE_DESC = 'BBC iPlayer'
     _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
     _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'
@@ -215,17 +215,32 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
             formats.extend(conn_formats)
             formats.extend(conn_formats)
         return formats
         return formats
 
 
-    def _extract_captions(self, media, programme_id):
+    def _get_subtitles(self, media, programme_id):
         subtitles = {}
         subtitles = {}
         for connection in self._extract_connections(media):
         for connection in self._extract_connections(media):
             captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
             captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
             lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
             lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
             ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
             ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
             srt = ''
             srt = ''
+
+            def _extract_text(p):
+                if p.text is not None:
+                    stripped_text = p.text.strip()
+                    if stripped_text:
+                        return stripped_text
+                return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span'))
             for pos, p in enumerate(ps):
             for pos, p in enumerate(ps):
-                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'),
-                                                          p.text.strip() if p.text is not None else '')
-            subtitles[lang] = srt
+                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p))
+            subtitles[lang] = [
+                {
+                    'url': connection.get('href'),
+                    'ext': 'ttml',
+                },
+                {
+                    'data': srt,
+                    'ext': 'srt',
+                },
+            ]
         return subtitles
         return subtitles
 
 
     def _download_media_selector(self, programme_id):
     def _download_media_selector(self, programme_id):
@@ -249,7 +264,7 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
             elif kind == 'video':
             elif kind == 'video':
                 formats.extend(self._extract_video(media, programme_id))
                 formats.extend(self._extract_video(media, programme_id))
             elif kind == 'captions':
             elif kind == 'captions':
-                subtitles = self._extract_captions(media, programme_id)
+                subtitles = self.extract_subtitles(media, programme_id)
 
 
         return formats, subtitles
         return formats, subtitles
 
 
@@ -324,10 +339,6 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
         else:
         else:
             programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
             programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
 
 
-        if self._downloader.params.get('listsubtitles', False):
-            self._list_available_subtitles(programme_id, subtitles)
-            return
-
         self._sort_formats(formats)
         self._sort_formats(formats)
 
 
         return {
         return {