democracynow.py 3.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. from .common import InfoExtractor
  5. class DemocracynowIE(InfoExtractor):
  6. _VALID_URL = r'https?://(?:www\.)?democracynow.org/?(?P<id>[^\?]*)'
  7. IE_NAME = 'democracynow'
  8. _TESTS = [{
  9. 'url': 'http://www.democracynow.org/shows/2015/7/3',
  10. 'info_dict': {
  11. 'id': '2015-0703-001',
  12. 'ext': 'mp4',
  13. 'title': 'July 03, 2015 - Democracy Now!',
  14. 'description': 'A daily independent global news hour with Amy Goodman & Juan Gonz\xe1lez "What to the Slave is 4th of July?": James Earl Jones Reads Frederick Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag : "We Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs',
  15. 'uploader': 'Democracy Now',
  16. 'upload_date': None,
  17. },
  18. }, {
  19. 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree',
  20. 'info_dict': {
  21. 'id': '2015-0703-001',
  22. 'ext': 'mp4',
  23. 'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag',
  24. 'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21',
  25. 'uploader': 'Democracy Now',
  26. 'upload_date': None,
  27. },
  28. }]
  29. def _real_extract(self, url):
  30. display_id = self._match_id(url)
  31. base_host = re.search(r'^(.+?://[^/]+)', url).group(1)
  32. if display_id == '':
  33. display_id = 'home'
  34. webpage = self._download_webpage(url, display_id)
  35. re_desc = re.search(r'<meta property=.og:description. content=(["\'])(.+?)\1', webpage, re.DOTALL)
  36. description = re_desc.group(2) if re_desc else ''
  37. jstr = self._search_regex(r'({.+?"related_video_xml".+?})', webpage, 'json', default=None)
  38. js = self._parse_json(jstr, display_id)
  39. video_id = None
  40. formats = []
  41. subtitles = {}
  42. for key in ('caption_file', '.......'):
  43. # ....... = pending vtt support that doesn't clobber srt 'chapter_file':
  44. url = js.get(key, '')
  45. if url == '' or url is None:
  46. continue
  47. if not re.match(r'^https?://', url):
  48. url = base_host + url
  49. ext = re.search(r'\.([^\.]+)$', url).group(1)
  50. subtitles['eng'] = [{
  51. 'ext': ext,
  52. 'url': url,
  53. }]
  54. for key in ('file', 'audio'):
  55. url = js.get(key, '')
  56. if url == '' or url is None:
  57. continue
  58. if not re.match(r'^https?://', url):
  59. url = base_host + url
  60. purl = re.search(r'/(?P<dir>[^/]+)/(?:dn)?(?P<fn>[^/]+?)\.(?P<ext>[^\.\?]+)(?P<hasparams>\?|$)', url)
  61. if video_id is None:
  62. video_id = purl.group('fn')
  63. if js.get('start') is not None:
  64. url += '&' if purl.group('hasparams') == '?' else '?'
  65. url = url + 'start=' + str(js.get('start'))
  66. formats.append({
  67. 'format_id': purl.group('dir'),
  68. 'ext': purl.group('ext'),
  69. 'url': url,
  70. })
  71. self._sort_formats(formats)
  72. ret = {
  73. 'id': video_id,
  74. 'title': js.get('title'),
  75. 'description': description,
  76. 'uploader': 'Democracy Now',
  77. 'subtitles': subtitles,
  78. 'formats': formats,
  79. }
  80. return ret