Explorar el Código

[youtube] fix automatic captions extraction (closes #27162) (closes #27388)

Remita Amine hace 4 años
padre
commit
4ef1fc9707
Se ha modificado 1 fichero con 14 adiciones y 18 borrados
  1. 14 18
      youtube_dl/extractor/youtube.py

+ 14 - 18
youtube_dl/extractor/youtube.py

@@ -1322,17 +1322,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             return self._parse_json(
             return self._parse_json(
                 uppercase_escape(config), video_id, fatal=False)
                 uppercase_escape(config), video_id, fatal=False)
 
 
-    def _get_automatic_captions(self, video_id, webpage):
+    def _get_automatic_captions(self, video_id, player_response, player_config):
         """We need the webpage for getting the captions url, pass it as an
         """We need the webpage for getting the captions url, pass it as an
            argument to speed up the process."""
            argument to speed up the process."""
         self.to_screen('%s: Looking for automatic captions' % video_id)
         self.to_screen('%s: Looking for automatic captions' % video_id)
-        player_config = self._get_ytplayer_config(video_id, webpage)
         err_msg = 'Couldn\'t find automatic captions for %s' % video_id
         err_msg = 'Couldn\'t find automatic captions for %s' % video_id
-        if not player_config:
+        if not (player_response or player_config):
             self._downloader.report_warning(err_msg)
             self._downloader.report_warning(err_msg)
             return {}
             return {}
         try:
         try:
-            args = player_config['args']
+            args = player_config.get('args') if player_config else {}
             caption_url = args.get('ttsurl')
             caption_url = args.get('ttsurl')
             if caption_url:
             if caption_url:
                 timestamp = args['timestamp']
                 timestamp = args['timestamp']
@@ -1391,19 +1390,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 return captions
                 return captions
 
 
             # New captions format as of 22.06.2017
             # New captions format as of 22.06.2017
-            player_response = args.get('player_response')
-            if player_response and isinstance(player_response, compat_str):
-                player_response = self._parse_json(
-                    player_response, video_id, fatal=False)
-                if player_response:
-                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
-                    base_url = renderer['captionTracks'][0]['baseUrl']
-                    sub_lang_list = []
-                    for lang in renderer['translationLanguages']:
-                        lang_code = lang.get('languageCode')
-                        if lang_code:
-                            sub_lang_list.append(lang_code)
-                    return make_captions(base_url, sub_lang_list)
+            if player_response:
+                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
+                base_url = renderer['captionTracks'][0]['baseUrl']
+                sub_lang_list = []
+                for lang in renderer['translationLanguages']:
+                    lang_code = lang.get('languageCode')
+                    if lang_code:
+                        sub_lang_list.append(lang_code)
+                return make_captions(base_url, sub_lang_list)
 
 
             # Some videos don't provide ttsurl but rather caption_tracks and
             # Some videos don't provide ttsurl but rather caption_tracks and
             # caption_translation_languages (e.g. 20LmZk1hakA)
             # caption_translation_languages (e.g. 20LmZk1hakA)
@@ -1652,6 +1647,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         # Get video info
         # Get video info
         video_info = {}
         video_info = {}
         embed_webpage = None
         embed_webpage = None
+        ytplayer_config = None
 
 
         if re.search(r'["\']status["\']\s*:\s*["\']LOGIN_REQUIRED', video_webpage) is not None:
         if re.search(r'["\']status["\']\s*:\s*["\']LOGIN_REQUIRED', video_webpage) is not None:
             age_gate = True
             age_gate = True
@@ -2276,7 +2272,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 
 
         # subtitles
         # subtitles
         video_subtitles = self.extract_subtitles(video_id, video_webpage)
         video_subtitles = self.extract_subtitles(video_id, video_webpage)
-        automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
+        automatic_captions = self.extract_automatic_captions(video_id, player_response, ytplayer_config)
 
 
         video_duration = try_get(
         video_duration = try_get(
             video_info, lambda x: int_or_none(x['length_seconds'][0]))
             video_info, lambda x: int_or_none(x['length_seconds'][0]))