googledrive.py 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
  1. from __future__ import unicode_literals
  2. import re
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. ExtractorError,
  6. int_or_none,
  7. lowercase_escape,
  8. error_to_compat_str,
  9. update_url_query,
  10. )
  class GoogleDriveIE(InfoExtractor):
      """Extractor for videos served from Google Drive / Google Docs.

      Accepted URLs (see ``_VALID_URL``) are ``docs.google.com`` and
      ``drive.google.com`` links of the ``uc?...id=`` and ``file/d/`` kinds, plus
      ``video.google.com/get_player?...docid=`` links.  The video id is the
      trailing token of at least 28 characters.

      Stream data is scraped from the ``docs.google.com/file/d/<id>`` page;
      the caption track list is fetched from ``_BASE_URL_CAPTIONS``.
      """

      _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})'
      _TESTS = [{
          'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
          'md5': 'd109872761f7e7ecf353fa108c0dbe1e',
          'info_dict': {
              'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
              'ext': 'mp4',
              'title': 'Big Buck Bunny.mp4',
              'duration': 45,
          }
      }, {
          # video id is longer than 28 characters
          'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
          'md5': 'c230c67252874fddd8170e3fd1a45886',
          'info_dict': {
              'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ',
              'ext': 'mp4',
              'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4',
              'duration': 189,
          },
          'only_matching': True
      }]

      # Container extension for each format id found in the page's
      # "fmt_stream_map" entries.
      _FORMATS_EXT = {
          '5': 'flv',
          '6': 'flv',
          '13': '3gp',
          '17': '3gp',
          '18': 'mp4',
          '22': 'mp4',
          '34': 'flv',
          '35': 'flv',
          '36': '3gp',
          '37': 'mp4',
          '38': 'mp4',
          '43': 'webm',
          '44': 'webm',
          '45': 'webm',
          '46': 'webm',
          '59': 'mp4',
      }

      _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'

      # XML tag in the caption list that holds the entries of each caption type.
      _CAPTIONS_ENTRY_TAG = {
          'subtitles': 'track',
          'automatic_captions': 'target',
      }

      # Per-video caption state.  These class attributes are read-only defaults:
      # the list is never mutated in place, it is always rebound on the instance,
      # so nothing leaks between instances or between extracted videos.
      _caption_formats_ext = []
      _captions_by_country_xml = None

      @staticmethod
      def _extract_url(webpage):
          """Return a Drive file URL for the first embedded player in *webpage*.

          Looks for an ``<iframe>`` whose ``src`` points at a Google Drive/Docs
          file or a ``video.google.com`` player; returns ``None`` when there is
          no such iframe.
          """
          mobj = re.search(
              r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
              webpage)
          if mobj:
              return 'https://drive.google.com/file/d/%s' % mobj.group('id')

      def _set_captions_data(self, video_id, video_subtitles_id, hl):
          """Download the caption list XML and record the offered caption formats.

          On success the parsed XML is stored in ``_captions_by_country_xml`` and
          ``_caption_formats_ext`` is rebound to the ``fmt_code`` of every
          ``format`` element that has one and is not flagged ``default``.  A
          failed download only produces a warning and leaves the state untouched.
          """
          try:
              self._captions_by_country_xml = self._download_xml(
                  self._BASE_URL_CAPTIONS, video_id, query={
                      'id': video_id,
                      'vid': video_subtitles_id,
                      'hl': hl,
                      'v': video_id,
                      'type': 'list',
                      'tlangs': '1',
                      'fmts': '1',
                      'vssids': '1',
                  })
          except ExtractorError as ee:
              self.report_warning('unable to download video subtitles: %s' % error_to_compat_str(ee))
          if self._captions_by_country_xml is None:
              return
          # Build a fresh list instead of appending to the existing one: the
          # default is a class-level list, and appending to it would pile up
          # format codes across every video and every instance.
          self._caption_formats_ext = [
              fmt_entry.attrib['fmt_code']
              for fmt_entry in self._captions_by_country_xml.findall('format')
              if fmt_entry.attrib.get('fmt_code') and not fmt_entry.attrib.get('default')]

      def _get_captions_by_type(self, video_id, video_subtitles_id, caption_type, caption_original_lang_code=None):
          """Build the ``{lang_code: [{'url': ..., 'ext': ...}]}`` caption mapping.

          *caption_type* is a key of ``_CAPTIONS_ENTRY_TAG``.  When
          *caption_original_lang_code* is given, every URL requests that language
          as ``lang`` and the entry's own language as ``tlang``; otherwise the
          entry's language is requested directly.  The caption list XML must
          already have been loaded.

          Returns ``None`` when *video_subtitles_id* or *caption_type* is empty,
          otherwise the (possibly empty) mapping; an empty result also emits a
          warning.
          """
          if not video_subtitles_id or not caption_type:
              return None
          captions = {}
          entry_tag = self._CAPTIONS_ENTRY_TAG[caption_type]
          for caption_entry in self._captions_by_country_xml.findall(entry_tag):
              caption_lang_code = caption_entry.attrib.get('lang_code')
              if not caption_lang_code:
                  continue
              caption_format_data = []
              for caption_format in self._caption_formats_ext:
                  query = {
                      'vid': video_subtitles_id,
                      'v': video_id,
                      'fmt': caption_format,
                      'lang': caption_lang_code if caption_original_lang_code is None else caption_original_lang_code,
                      'type': 'track',
                      'name': '',
                      'kind': '',
                  }
                  if caption_original_lang_code is not None:
                      query.update({'tlang': caption_lang_code})
                  caption_format_data.append({
                      'url': update_url_query(self._BASE_URL_CAPTIONS, query),
                      'ext': caption_format,
                  })
              captions[caption_lang_code] = caption_format_data
          if not captions:
              self.report_warning('video doesn\'t have %s' % caption_type.replace('_', ' '))
          return captions

      def _captions_data_available(self, video_id, video_subtitles_id, hl):
          """Return True when the caption list XML is loaded, fetching it if needed."""
          if not video_subtitles_id or not hl:
              return False
          if self._captions_by_country_xml is None:
              self._set_captions_data(video_id, video_subtitles_id, hl)
          return self._captions_by_country_xml is not None

      def _get_subtitles(self, video_id, video_subtitles_id, hl):
          """Return the subtitle mapping for the video, or ``None`` if unavailable.

          Presumably invoked through the base class's ``extract_subtitles()``
          (called from ``_real_extract``) -- confirm against ``InfoExtractor``.
          """
          if not self._captions_data_available(video_id, video_subtitles_id, hl):
              return None
          return self._get_captions_by_type(video_id, video_subtitles_id, 'subtitles')

      def _get_automatic_captions(self, video_id, video_subtitles_id, hl):
          """Return the automatic-caption mapping, or ``None`` if unavailable.

          The language of the first ``track`` element in the caption list is used
          as the source language for every entry; without such a track (or
          without its ``lang_code``) there is nothing to offer.
          """
          if not self._captions_data_available(video_id, video_subtitles_id, hl):
              return None
          self.to_screen('%s: Looking for automatic captions' % video_id)
          subtitle_original_track = self._captions_by_country_xml.find('track')
          if subtitle_original_track is None:
              return None
          subtitle_original_lang_code = subtitle_original_track.attrib.get('lang_code')
          if not subtitle_original_lang_code:
              return None
          return self._get_captions_by_type(
              video_id, video_subtitles_id, 'automatic_captions', subtitle_original_lang_code)

      def _real_extract(self, url):
          """Scrape the Drive file page and return the info dict for the video.

          Raises ``ExtractorError`` when the page carries a ``"reason"`` entry;
          the title, ``fmt_stream_map`` and ``fmt_list`` lookups are mandatory
          (no default is passed for them).
          """
          video_id = self._match_id(url)

          # Caption data is cached on the instance so the subtitles and
          # automatic-captions lookups can share one download.  Drop whatever a
          # previously extracted video left behind, otherwise this video would
          # be given the other video's caption tracks.
          self._captions_by_country_xml = None
          self._caption_formats_ext = []

          webpage = self._download_webpage(
              'http://docs.google.com/file/d/%s' % video_id, video_id)

          reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
          if reason:
              # The message comes from the page itself, so flag it as an
              # expected failure rather than a fault in this extractor.
              raise ExtractorError(reason, expected=True)

          title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title')
          duration = int_or_none(self._search_regex(
              r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', default=None))
          fmt_stream_map = self._search_regex(
              r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt stream map').split(',')
          fmt_list = self._search_regex(r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',')

          # format id -> (width, height), parsed from "<id>/<width>x<height>" items
          resolutions = {}
          for fmt in fmt_list:
              mobj = re.search(
                  r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
              if mobj:
                  resolutions[mobj.group('format_id')] = (
                      int(mobj.group('width')), int(mobj.group('height')))

          formats = []
          for fmt_stream in fmt_stream_map:
              # each item is expected to look like "<format id>|<url>"
              fmt_stream_split = fmt_stream.split('|')
              if len(fmt_stream_split) < 2:
                  continue
              format_id, format_url = fmt_stream_split[:2]
              f = {
                  'url': lowercase_escape(format_url),
                  'format_id': format_id,
              }
              # A format id missing from _FORMATS_EXT must not abort the whole
              # extraction with a KeyError; such a format simply carries no
              # explicit extension.
              ext = self._FORMATS_EXT.get(format_id)
              if ext:
                  f['ext'] = ext
              resolution = resolutions.get(format_id)
              if resolution:
                  f.update({
                      'width': resolution[0],
                      'height': resolution[1],
                  })
              formats.append(f)
          self._sort_formats(formats)

          hl = self._search_regex(
              r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None)
          video_subtitles_id = None
          ttsurl = self._search_regex(
              r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None)
          if ttsurl:
              # the video Id for subtitles will be the last value in the ttsurl query string
              video_subtitles_id = ttsurl.encode('utf-8').decode('unicode_escape').split('=')[-1]

          return {
              'id': video_id,
              'title': title,
              'thumbnail': self._og_search_thumbnail(webpage, default=None),
              'duration': duration,
              'formats': formats,
              'subtitles': self.extract_subtitles(video_id, video_subtitles_id, hl),
              'automatic_captions': self.extract_automatic_captions(video_id, video_subtitles_id, hl),
          }
  192. }