googledrive.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. import re
  2. from .common import InfoExtractor
  3. from ..utils import RegexNotFoundError
  4. class GoogleDriveEmbedIE(InfoExtractor):
  5. _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})'
  6. _TEST = {
  7. 'url': 'https://docs.google.com/file/d/0B8KB9DRosYGKMXNoeWxqa3JYclE/preview',
  8. 'info_dict': {
  9. 'id': '0B8KB9DRosYGKMXNoeWxqa3JYclE',
  10. 'ext': 'mp4',
  11. 'title': 'Jimmy Fallon Sings Since You\'ve Been Gone.wmv',
  12. }
  13. }
  14. @staticmethod
  15. def _extract_url(webpage):
  16. mobj = re.search(
  17. r'<iframe src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})',
  18. webpage)
  19. if mobj:
  20. return 'https://drive.google.com/file/d/%s' % mobj.group('id')
  21. def _real_extract(self, url):
  22. video_id = self._match_id(url)
  23. return {
  24. '_type': 'url',
  25. 'ie-key': 'GoogleDrive',
  26. 'url': 'https://drive.google.com/file/d/%s' % video_id
  27. }
  28. class GoogleDriveIE(InfoExtractor):
  29. _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9_-]{28})'
  30. _TEST = {
  31. 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
  32. 'info_dict': {
  33. 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
  34. 'ext': 'mp4',
  35. 'title': 'Big Buck Bunny.mp4',
  36. }
  37. }
  38. _formats = {
  39. '5': {'ext': 'flv'},
  40. '6': {'ext': 'flv'},
  41. '13': {'ext': '3gp'},
  42. '17': {'ext': '3gp'},
  43. '18': {'ext': 'mp4'},
  44. '22': {'ext': 'mp4'},
  45. '34': {'ext': 'flv'},
  46. '35': {'ext': 'flv'},
  47. '36': {'ext': '3gp'},
  48. '37': {'ext': 'mp4'},
  49. '38': {'ext': 'mp4'},
  50. '43': {'ext': 'webm'},
  51. '44': {'ext': 'webm'},
  52. '45': {'ext': 'webm'},
  53. '46': {'ext': 'webm'},
  54. '59': {'ext': 'mp4'}
  55. }
  56. def _real_extract(self, url):
  57. video_id = self._match_id(url)
  58. webpage = self._download_webpage(
  59. 'http://docs.google.com/file/d/'+video_id, video_id, encoding='unicode_escape'
  60. )
  61. try:
  62. title = self._html_search_regex(
  63. r'"title","(?P<title>.*?)"',
  64. webpage,
  65. 'title',
  66. group='title'
  67. )
  68. fmt_stream_map = self._html_search_regex(
  69. r'"fmt_stream_map","(?P<fmt_stream_map>.*?)"',
  70. webpage,
  71. 'fmt_stream_map',
  72. group='fmt_stream_map'
  73. )
  74. fmt_list = self._html_search_regex(
  75. r'"fmt_list","(?P<fmt_list>.*?)"',
  76. webpage,
  77. 'fmt_list',
  78. group='fmt_list'
  79. )
  80. # timestamp = self._html_search_regex(
  81. # r'"timestamp","(?P<timestamp>.*?)"',
  82. # webpage,
  83. # 'timestamp',
  84. # group='timestamp'
  85. # )
  86. length_seconds = self._html_search_regex(
  87. r'"length_seconds","(?P<length_seconds>.*?)"',
  88. webpage,
  89. 'length_seconds',
  90. group='length_seconds'
  91. )
  92. except RegexNotFoundError:
  93. try:
  94. reason = self._html_search_regex(
  95. r'"reason","(?P<reason>.*?)"',
  96. webpage,
  97. 'reason',
  98. group='reason'
  99. )
  100. self.report_warning(reason)
  101. return
  102. except RegexNotFoundError:
  103. self.report_warning('not a video')
  104. return
  105. fmt_stream_map = fmt_stream_map.split(',')
  106. fmt_list = fmt_list.split(',')
  107. formats = []
  108. for i in range(len(fmt_stream_map)):
  109. fmt_id, fmt_url = fmt_stream_map[i].split('|')
  110. resolution = fmt_list[i].split('/')[1]
  111. width, height = resolution.split('x')
  112. formats.append({
  113. 'url': fmt_url,
  114. 'format_id': fmt_id,
  115. 'resolution': resolution,
  116. 'width': int(width),
  117. 'height': int(height),
  118. 'ext': self._formats[fmt_id]['ext']
  119. })
  120. self._sort_formats(formats)
  121. return {
  122. 'id': video_id,
  123. 'title': title,
  124. # 'timestamp': int(timestamp),
  125. 'duration': int(length_seconds),
  126. 'formats': formats
  127. }