discovery.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. from __future__ import unicode_literals
  2. import re
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. parse_duration,
  6. parse_iso8601,
  7. )
  8. from ..compat import (
  9. compat_str,
  10. compat_urlparse,
  11. )
  12. class DiscoveryIE(InfoExtractor):
  13. _VALID_URL = r'''(?x)https?://(?:www\.)?(?:
  14. discovery|
  15. investigationdiscovery|
  16. discoverylife|
  17. animalplanet|
  18. ahctv|
  19. destinationamerica|
  20. sciencechannel|
  21. tlc|
  22. velocity
  23. )\.com/(?:[^/]+/)*(?P<id>[^./?#]+)'''
  24. _TESTS = [{
  25. 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm',
  26. 'info_dict': {
  27. 'id': '20769',
  28. 'ext': 'mp4',
  29. 'title': 'Mission Impossible Outtakes',
  30. 'description': ('Watch Jamie Hyneman and Adam Savage practice being'
  31. ' each other -- to the point of confusing Jamie\'s dog -- and '
  32. 'don\'t miss Adam moon-walking as Jamie ... behind Jamie\'s'
  33. ' back.'),
  34. 'duration': 156,
  35. 'timestamp': 1302032462,
  36. 'upload_date': '20110405',
  37. },
  38. 'params': {
  39. 'skip_download': True, # requires ffmpeg
  40. }
  41. }, {
  42. 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mythbusters-the-simpsons',
  43. 'info_dict': {
  44. 'id': 'mythbusters-the-simpsons',
  45. 'title': 'MythBusters: The Simpsons',
  46. },
  47. 'playlist_mincount': 10,
  48. }, {
  49. 'url': 'http://www.animalplanet.com/longfin-eels-maneaters/',
  50. 'info_dict': {
  51. 'id': '78326',
  52. 'ext': 'mp4',
  53. 'title': 'Longfin Eels: Maneaters?',
  54. 'description': 'Jeremy Wade tests whether or not New Zealand\'s longfin eels are man-eaters by covering himself in fish guts and getting in the water with them.',
  55. 'upload_date': '20140725',
  56. 'timestamp': 1406246400,
  57. 'duration': 116,
  58. },
  59. }]
  60. def _real_extract(self, url):
  61. display_id = self._match_id(url)
  62. info = self._download_json(url + '?flat=1', display_id)
  63. video_title = info.get('playlist_title') or info.get('video_title')
  64. entries = []
  65. for idx, video_info in enumerate(info['playlist']):
  66. m3u8_url = video_info['src']
  67. formats = m3u8_formats = self._extract_m3u8_formats(
  68. m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls',
  69. note='Download m3u8 information for video %d' % (idx + 1))
  70. qualities_basename = self._search_regex(
  71. '/([^/]+)\.csmil/', m3u8_url, 'qualities basename', default=None)
  72. if qualities_basename:
  73. m3u8_path = compat_urlparse.urlparse(m3u8_url).path
  74. QUALITIES_RE = r'((,\d+k)+,?)'
  75. qualities = self._search_regex(
  76. QUALITIES_RE, qualities_basename,
  77. 'qualities', default=None)
  78. if qualities:
  79. qualities = list(map(lambda q: int(q[:-1]), qualities.strip(',').split(',')))
  80. qualities.sort()
  81. http_path = m3u8_path[1:].split('/', 1)[1]
  82. http_template = re.sub(QUALITIES_RE, r'%dk', http_path)
  83. http_template = http_template.replace('.csmil/master.m3u8', '')
  84. http_template = compat_urlparse.urljoin(
  85. 'http://discsmil.edgesuite.net/', http_template)
  86. if m3u8_formats:
  87. self._sort_formats(m3u8_formats)
  88. m3u8_formats = list(filter(
  89. lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
  90. m3u8_formats))
  91. if len(qualities) == len(m3u8_formats):
  92. for q, m3u8_format in zip(qualities, m3u8_formats):
  93. f = m3u8_format.copy()
  94. f.update({
  95. 'url': http_template % q,
  96. 'format_id': f['format_id'].replace('hls', 'http'),
  97. 'protocol': 'http',
  98. })
  99. formats.append(f)
  100. else:
  101. for q in qualities:
  102. formats.append({
  103. 'url': http_template % q,
  104. 'ext': 'mp4',
  105. 'format_id': 'http-%d' % q,
  106. 'tbr': q,
  107. })
  108. self._sort_formats(formats)
  109. subtitles = []
  110. caption_url = video_info.get('captionsUrl')
  111. if caption_url:
  112. subtitles = {
  113. 'en': [{
  114. 'url': caption_url,
  115. }]
  116. }
  117. entries.append({
  118. 'id': compat_str(video_info['id']),
  119. 'formats': formats,
  120. 'title': video_info['title'],
  121. 'description': video_info.get('description'),
  122. 'duration': parse_duration(video_info.get('video_length')),
  123. 'webpage_url': video_info.get('href') or video_info.get('url'),
  124. 'thumbnail': video_info.get('thumbnailURL'),
  125. 'alt_title': video_info.get('secondary_title'),
  126. 'timestamp': parse_iso8601(video_info.get('publishedDate')),
  127. 'subtitles': subtitles,
  128. })
  129. return self.playlist_result(entries, display_id, video_title)