浏览代码

[youtube] fix automatic captions extraction(closes #27162)(closes #27388)

Remita Amine 4 年之前
父节点
当前提交
4ef1fc9707
共有 1 个文件被更改，包括 14 次插入和 18 次删除
  1. 14 18
      youtube_dl/extractor/youtube.py

+ 14 - 18
youtube_dl/extractor/youtube.py

@@ -1322,17 +1322,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             return self._parse_json(
             return self._parse_json(
                 uppercase_escape(config), video_id, fatal=False)
                 uppercase_escape(config), video_id, fatal=False)
 
 
-    def _get_automatic_captions(self, video_id, webpage):
+    def _get_automatic_captions(self, video_id, player_response, player_config):
         """We need the webpage for getting the captions url, pass it as an
         """We need the webpage for getting the captions url, pass it as an
            argument to speed up the process."""
            argument to speed up the process."""
         self.to_screen('%s: Looking for automatic captions' % video_id)
         self.to_screen('%s: Looking for automatic captions' % video_id)
-        player_config = self._get_ytplayer_config(video_id, webpage)
         err_msg = 'Couldn\'t find automatic captions for %s' % video_id
         err_msg = 'Couldn\'t find automatic captions for %s' % video_id
-        if not player_config:
+        if not (player_response or player_config):
             self._downloader.report_warning(err_msg)
             self._downloader.report_warning(err_msg)
             return {}
             return {}
         try:
         try:
-            args = player_config['args']
+            args = player_config.get('args') if player_config else {}
             caption_url = args.get('ttsurl')
             caption_url = args.get('ttsurl')
             if caption_url:
             if caption_url:
                 timestamp = args['timestamp']
                 timestamp = args['timestamp']
@@ -1391,19 +1390,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 return captions
                 return captions
 
 
             # New captions format as of 22.06.2017
             # New captions format as of 22.06.2017
-            player_response = args.get('player_response')
-            if player_response and isinstance(player_response, compat_str):
-                player_response = self._parse_json(
-                    player_response, video_id, fatal=False)
-                if player_response:
-                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
-                    base_url = renderer['captionTracks'][0]['baseUrl']
-                    sub_lang_list = []
-                    for lang in renderer['translationLanguages']:
-                        lang_code = lang.get('languageCode')
-                        if lang_code:
-                            sub_lang_list.append(lang_code)
-                    return make_captions(base_url, sub_lang_list)
+            if player_response:
+                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
+                base_url = renderer['captionTracks'][0]['baseUrl']
+                sub_lang_list = []
+                for lang in renderer['translationLanguages']:
+                    lang_code = lang.get('languageCode')
+                    if lang_code:
+                        sub_lang_list.append(lang_code)
+                return make_captions(base_url, sub_lang_list)
 
 
             # Some videos don't provide ttsurl but rather caption_tracks and
             # Some videos don't provide ttsurl but rather caption_tracks and
             # caption_translation_languages (e.g. 20LmZk1hakA)
             # caption_translation_languages (e.g. 20LmZk1hakA)
@@ -1652,6 +1647,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         # Get video info
         # Get video info
         video_info = {}
         video_info = {}
         embed_webpage = None
         embed_webpage = None
+        ytplayer_config = None
 
 
         if re.search(r'["\']status["\']\s*:\s*["\']LOGIN_REQUIRED', video_webpage) is not None:
         if re.search(r'["\']status["\']\s*:\s*["\']LOGIN_REQUIRED', video_webpage) is not None:
             age_gate = True
             age_gate = True
@@ -2276,7 +2272,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 
 
         # subtitles
         # subtitles
         video_subtitles = self.extract_subtitles(video_id, video_webpage)
         video_subtitles = self.extract_subtitles(video_id, video_webpage)
-        automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
+        automatic_captions = self.extract_automatic_captions(video_id, player_response, ytplayer_config)
 
 
         video_duration = try_get(
         video_duration = try_get(
             video_info, lambda x: int_or_none(x['length_seconds'][0]))
             video_info, lambda x: int_or_none(x['length_seconds'][0]))