cbsnews.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  1. # encoding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. import json
  5. from .common import InfoExtractor
  6. from ..utils import remove_start
  7. class CBSNewsIE(InfoExtractor):
  8. IE_DESC = 'CBS News'
  9. _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:[^/]+/)+(?P<id>[\da-z_-]+)'
  10. _TESTS = [
  11. {
  12. 'url': 'http://www.cbsnews.com/news/tesla-and-spacex-elon-musks-industrial-empire/',
  13. 'info_dict': {
  14. 'id': 'tesla-and-spacex-elon-musks-industrial-empire',
  15. 'ext': 'flv',
  16. 'title': 'Tesla and SpaceX: Elon Musk\'s industrial empire',
  17. 'thumbnail': 'http://beta.img.cbsnews.com/i/2014/03/30/60147937-2f53-4565-ad64-1bdd6eb64679/60-0330-pelley-640x360.jpg',
  18. 'duration': 791,
  19. },
  20. 'params': {
  21. # rtmp download
  22. 'skip_download': True,
  23. },
  24. },
  25. {
  26. 'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/',
  27. 'info_dict': {
  28. 'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack',
  29. 'ext': 'flv',
  30. 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
  31. 'thumbnail': 're:^https?://.*\.jpg$',
  32. 'duration': 205,
  33. },
  34. 'params': {
  35. # rtmp download
  36. 'skip_download': True,
  37. },
  38. },
  39. {
  40. 'url': 'http://www.cbsnews.com/videos/mountain-lions-of-l-a/',
  41. 'info_dict': {
  42. 'id': 'Mountain Lions of L.A.',
  43. 'ext': 'flv',
  44. 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
  45. 'thumbnail': 're:^http?://.*\.jpg$',
  46. 'subtitles': 're:^http?://.*\.xml$',
  47. 'duration': 787,
  48. },
  49. 'params': {
  50. # rtmp download
  51. 'skip_download': True,
  52. },
  53. },
  54. ]
  55. def _real_extract(self, url):
  56. mobj = re.match(self._VALID_URL, url)
  57. video_id = mobj.group('id')
  58. webpage = self._download_webpage(url, video_id)
  59. video_info = json.loads(self._html_search_regex(
  60. r'(?:<ul class="media-list items" id="media-related-items"><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'',
  61. webpage, 'video JSON info'))
  62. item = video_info['item'] if 'item' in video_info else video_info
  63. title = item.get('articleTitle') or item.get('hed')
  64. duration = item.get('duration')
  65. thumbnail = item.get('mediaImage') or item.get('thumbnail')
  66. formats = []
  67. for format_id in ['RtmpMobileLow', 'RtmpMobileHigh', 'Hls', 'RtmpDesktop']:
  68. uri = item.get('media' + format_id + 'URI')
  69. if not uri:
  70. continue
  71. uri = remove_start(uri, '{manifest:none}')
  72. fmt = {
  73. 'url': uri,
  74. 'format_id': format_id,
  75. }
  76. if uri.startswith('rtmp'):
  77. play_path = re.sub(
  78. r'{slistFilePath}', '',
  79. uri.split('<break>')[-1].split('{break}')[-1])
  80. play_path = re.sub(
  81. r'{manifest:.+}.*$', '', play_path)
  82. fmt.update({
  83. 'app': 'ondemand?auth=cbs',
  84. 'play_path': 'mp4:' + play_path,
  85. 'player_url': 'http://www.cbsnews.com/[[IMPORT]]/vidtech.cbsinteractive.com/player/3_3_0/CBSI_PLAYER_HD.swf',
  86. 'page_url': 'http://www.cbsnews.com',
  87. 'ext': 'flv',
  88. })
  89. elif uri.endswith('.m3u8'):
  90. fmt['ext'] = 'mp4'
  91. formats.append(fmt)
  92. if 'mpxRefId' in video_info:
  93. cap_url = 'http://www.cbsnews.com/videos/captions/%s.adb_xml' % video_info['mpxRefId']
  94. subtitles = {
  95. 'en': [{
  96. 'url': cap_url,
  97. 'ext': 'xml'
  98. }], }
  99. else:
  100. subtitles = {}
  101. return {
  102. 'id': video_id,
  103. 'title': title,
  104. 'thumbnail': thumbnail,
  105. 'duration': duration,
  106. 'formats': formats,
  107. 'subtitles': subtitles,
  108. }