Pārlūkot izejas kodu

[youtube] Support automatic captions with original language different from English (fixes #1225) and download in multiple languages.

Jaime Marquínez Ferrándiz 12 gadi atpakaļ
vecāks
revīzija
055e6f3657
2 mainīti faili ar 47 papildinājumiem un 41 dzēšanām
  1. 22 29
      youtube_dl/extractor/subtitles.py
  2. 25 12
      youtube_dl/extractor/youtube.py

+ 22 - 29
youtube_dl/extractor/subtitles.py

@@ -15,28 +15,33 @@ class SubtitlesInfoExtractor(InfoExtractor):
         self.to_screen(u'%s: Available subtitles for video: %s' %
         self.to_screen(u'%s: Available subtitles for video: %s' %
                        (video_id, sub_lang))
                        (video_id, sub_lang))
 
 
-    def _extract_subtitles(self, video_id):
+    def extract_subtitles(self, video_id, video_webpage=None):
         """ returns {sub_lang: sub}, {} if subtitles not found, or None if no subtitle option was requested """
         """ returns {sub_lang: sub}, {} if subtitles not found, or None if no subtitle option was requested """
-        available_subs_list = self._get_available_subtitles(video_id)
+        if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
+            available_subs_list = self._get_available_subtitles(video_id)
+        elif self._downloader.params.get('writeautomaticsub', False):
+            available_subs_list = self._get_available_automatic_caption(video_id, video_webpage)
+        else:
+            return None
+
         if not available_subs_list:  # error, it didn't get the available subtitles
         if not available_subs_list:  # error, it didn't get the available subtitles
             return {}
             return {}
         if self._downloader.params.get('allsubtitles', False):
         if self._downloader.params.get('allsubtitles', False):
             sub_lang_list = available_subs_list
             sub_lang_list = available_subs_list
         else:
         else:
-            if self._downloader.params.get('writesubtitles', False):
-                if self._downloader.params.get('subtitleslangs', False):
-                    requested_langs = self._downloader.params.get('subtitleslangs')
-                elif 'en' in available_subs_list:
-                    requested_langs = ['en']
-                else:
-                    requested_langs = [list(available_subs_list.keys())[0]]
+            if self._downloader.params.get('subtitleslangs', False):
+                requested_langs = self._downloader.params.get('subtitleslangs')
+            elif 'en' in available_subs_list:
+                requested_langs = ['en']
+            else:
+                requested_langs = [list(available_subs_list.keys())[0]]
 
 
-                sub_lang_list = {}
-                for sub_lang in requested_langs:
-                    if not sub_lang in available_subs_list:
-                        self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang)
-                        continue
-                    sub_lang_list[sub_lang] = available_subs_list[sub_lang]
+            sub_lang_list = {}
+            for sub_lang in requested_langs:
+                if not sub_lang in available_subs_list:
+                    self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang)
+                    continue
+                sub_lang_list[sub_lang] = available_subs_list[sub_lang]
 
 
         subtitles = {}
         subtitles = {}
         for sub_lang, url in sub_lang_list.items():
         for sub_lang, url in sub_lang_list.items():
@@ -64,23 +69,11 @@ class SubtitlesInfoExtractor(InfoExtractor):
         """
         """
         pass
         pass
 
 
-    def _request_automatic_caption(self, video_id, webpage):
+    def _get_available_automatic_caption(self, video_id, webpage):
         """
         """
-        returns {sub_lang: sub} or {} if not available
+        returns {sub_lang: url} or {} if not available
         Must be redefined by the subclasses that support automatic captions,
         Must be redefined by the subclasses that support automatic captions,
         otherwise it will return {}
         otherwise it will return {}
         """
         """
         self._downloader.report_warning(u'Automatic Captions not supported by this server')
         self._downloader.report_warning(u'Automatic Captions not supported by this server')
         return {}
         return {}
-
-    def extract_subtitles(self, video_id, video_webpage=None):
-        """
-        Extract the subtitles and/or the automatic captions if requested.
-        Returns None or a dictionary in the format {sub_lang: sub}
-        """
-        video_subtitles = None
-        if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
-            video_subtitles = self._extract_subtitles(video_id)
-        elif self._downloader.params.get('writeautomaticsub', False):
-            video_subtitles = self._request_automatic_caption(video_id, video_webpage)
-        return video_subtitles

+ 25 - 12
youtube_dl/extractor/youtube.py

@@ -5,6 +5,7 @@ import netrc
 import re
 import re
 import socket
 import socket
 import itertools
 import itertools
+import xml.etree.ElementTree
 
 
 from .common import InfoExtractor, SearchInfoExtractor
 from .common import InfoExtractor, SearchInfoExtractor
 from .subtitles import SubtitlesInfoExtractor
 from .subtitles import SubtitlesInfoExtractor
@@ -478,14 +479,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
             return {}
             return {}
         return sub_lang_list
         return sub_lang_list
 
 
-    def _request_automatic_caption(self, video_id, webpage):
+    def _get_available_automatic_caption(self, video_id, webpage):
         """We need the webpage for getting the captions url, pass it as an
         """We need the webpage for getting the captions url, pass it as an
            argument to speed up the process."""
            argument to speed up the process."""
-        sub_lang = (self._downloader.params.get('subtitleslangs') or ['en'])[0]
         sub_format = self._downloader.params.get('subtitlesformat')
         sub_format = self._downloader.params.get('subtitlesformat')
         self.to_screen(u'%s: Looking for automatic captions' % video_id)
         self.to_screen(u'%s: Looking for automatic captions' % video_id)
         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
-        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
+        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
         if mobj is None:
         if mobj is None:
             self._downloader.report_warning(err_msg)
             self._downloader.report_warning(err_msg)
             return {}
             return {}
@@ -494,16 +494,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
             args = player_config[u'args']
             args = player_config[u'args']
             caption_url = args[u'ttsurl']
             caption_url = args[u'ttsurl']
             timestamp = args[u'timestamp']
             timestamp = args[u'timestamp']
-            params = compat_urllib_parse.urlencode({
-                'lang': 'en',
-                'tlang': sub_lang,
-                'fmt': sub_format,
-                'ts': timestamp,
-                'kind': 'asr',
+            # We get the available subtitles
+            list_params = compat_urllib_parse.urlencode({
+                'type': 'list',
+                'tlangs': 1,
+                'asrs': 1,
             })
             })
-            subtitles_url = caption_url + '&' + params
-            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
-            return {sub_lang: sub}
+            list_url = caption_url + '&' + list_params
+            list_page = self._download_webpage(list_url, video_id)
+            caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
+            original_lang = caption_list.find('track').attrib['lang_code']
+
+            sub_lang_list = {}
+            for lang_node in caption_list.findall('target'):
+                sub_lang = lang_node.attrib['lang_code']
+                params = compat_urllib_parse.urlencode({
+                    'lang': original_lang,
+                    'tlang': sub_lang,
+                    'fmt': sub_format,
+                    'ts': timestamp,
+                    'kind': 'asr',
+                })
+                sub_lang_list[sub_lang] = caption_url + '&' + params
+            return sub_lang_list
         # An extractor error can be raised by the download process if there are
         # An extractor error can be raised by the download process if there are
         # no automatic captions but there are subtitles
         # no automatic captions but there are subtitles
         except (KeyError, ExtractorError):
         except (KeyError, ExtractorError):