googledrive.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. import re
  2. from .common import InfoExtractor
  3. from ..utils import RegexNotFoundError
  4. class GoogleDriveEmbedIE(InfoExtractor):
  5. _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})'
  6. _TEST = {
  7. 'url': 'https://docs.google.com/file/d/0B8KB9DRosYGKMXNoeWxqa3JYclE/preview',
  8. 'info_dict': {
  9. 'id': '0B8KB9DRosYGKMXNoeWxqa3JYclE',
  10. 'ext': 'mp4',
  11. 'title': 'Jimmy Fallon Sings Since You\'ve Been Gone.wmv',
  12. }
  13. }
  14. @staticmethod
  15. def _extract_url(webpage):
  16. mobj = re.search(
  17. r'<iframe src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})',
  18. webpage)
  19. if mobj:
  20. return 'https://drive.google.com/file/d/%s' % mobj.group('id')
  21. def _real_extract(self, url):
  22. video_id = self._match_id(url)
  23. return {
  24. '_type': 'url',
  25. 'ie_key': 'GoogleDrive',
  26. 'url': 'https://drive.google.com/file/d/%s' % video_id
  27. }
  28. class GoogleDriveIE(InfoExtractor):
  29. _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9_-]{28})'
  30. _TEST = {
  31. 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
  32. 'info_dict': {
  33. 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
  34. 'ext': 'mp4',
  35. 'title': 'Big Buck Bunny.mp4',
  36. }
  37. }
  38. _formats = {
  39. '5': {'ext': 'flv'},
  40. '6': {'ext': 'flv'},
  41. '13': {'ext': '3gp'},
  42. '17': {'ext': '3gp'},
  43. '18': {'ext': 'mp4'},
  44. '22': {'ext': 'mp4'},
  45. '34': {'ext': 'flv'},
  46. '35': {'ext': 'flv'},
  47. '36': {'ext': '3gp'},
  48. '37': {'ext': 'mp4'},
  49. '38': {'ext': 'mp4'},
  50. '43': {'ext': 'webm'},
  51. '44': {'ext': 'webm'},
  52. '45': {'ext': 'webm'},
  53. '46': {'ext': 'webm'},
  54. '59': {'ext': 'mp4'}
  55. }
  56. def _real_extract(self, url):
  57. video_id = self._match_id(url)
  58. webpage = self._download_webpage(
  59. 'http://docs.google.com/file/d/' + video_id, video_id, encoding='unicode_escape'
  60. )
  61. try:
  62. title = self._html_search_regex(
  63. r'"title"\s*,\s*"([^"]+)',
  64. webpage,
  65. 'title'
  66. )
  67. fmt_stream_map = self._html_search_regex(
  68. r'"fmt_stream_map"\s*,\s*"([^"]+)',
  69. webpage,
  70. 'fmt_stream_map'
  71. )
  72. fmt_list = self._html_search_regex(
  73. r'"fmt_list"\s*,\s*"([^"]+)',
  74. webpage,
  75. 'fmt_list'
  76. )
  77. # timestamp = self._html_search_regex(
  78. # r'"timestamp"\s*,\s*"([^"]+)',
  79. # webpage,
  80. # 'timestamp'
  81. # )
  82. length_seconds = self._html_search_regex(
  83. r'"length_seconds"\s*,\s*"([^"]+)',
  84. webpage,
  85. 'length_seconds'
  86. )
  87. except RegexNotFoundError:
  88. try:
  89. reason = self._html_search_regex(
  90. r'"reason","([^"]+)',
  91. webpage,
  92. 'reason'
  93. )
  94. self.report_warning(reason)
  95. return
  96. except RegexNotFoundError:
  97. self.report_warning('not a video')
  98. return
  99. fmt_stream_map = fmt_stream_map.split(',')
  100. fmt_list = fmt_list.split(',')
  101. formats = []
  102. for i in range(len(fmt_stream_map)):
  103. fmt_id, fmt_url = fmt_stream_map[i].split('|')
  104. resolution = fmt_list[i].split('/')[1]
  105. width, height = resolution.split('x')
  106. formats.append({
  107. 'url': fmt_url,
  108. 'format_id': fmt_id,
  109. 'resolution': resolution,
  110. 'width': int(width),
  111. 'height': int(height),
  112. 'ext': self._formats[fmt_id]['ext']
  113. })
  114. self._sort_formats(formats)
  115. return {
  116. 'id': video_id,
  117. 'title': title,
  118. # 'timestamp': int(timestamp),
  119. 'duration': int(length_seconds),
  120. 'formats': formats
  121. }