theintercept.py 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. # encoding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. from .common import InfoExtractor
  5. from ..utils import (
  6. ExtractorError,
  7. )
  8. class TheInterceptIE(InfoExtractor):
  9. _VALID_URL = r'https://theintercept.com/fieldofvision/(?P<id>.+?)/'
  10. _TESTS = [{
  11. 'url': 'https://theintercept.com/fieldofvision/thisisacoup-episode-four-surrender-or-die/',
  12. 'info_dict': {
  13. 'id': 'thisisacoup-episode-four-surrender-or-die',
  14. 'ext': 'mp4',
  15. 'title': '#ThisIsACoup – Episode Four: Surrender or Die',
  16. 'upload_date': '20151218',
  17. 'description': 'md5:74dd27f0e2fbd50817829f97eaa33140',
  18. }
  19. }]
  20. def _real_extract(self, url):
  21. display_id = self._match_id(url)
  22. webpage = self._download_webpage(url, display_id)
  23. mobj = re.search(r'initialStoreTree =(?P<json_data>.+})', webpage)
  24. if mobj is None:
  25. raise ExtractorError('Unable to extract initialStoreTree')
  26. json_data = self._parse_json(mobj.group('json_data'), display_id)
  27. info = None
  28. for post in json_data['resources']['posts'].values():
  29. if post['slug'] == display_id:
  30. info = post
  31. break
  32. if info is None:
  33. raise ExtractorError('Unable to find info for %s'%display_id)
  34. title = info['title']
  35. description = info['excerpt']
  36. upload_date = info['date'][:10].replace('-', '')
  37. video_id = info['fov_videoid']
  38. creator = ','.join([a['display_name'] for a in info['authors']])
  39. thumbnail = self._og_search_property('image', webpage)
  40. content_id = thumbnail.split('/')[-1].split('.')[0]
  41. content_url = 'https://content.jwplatform.com/jw6/{content_id}.xml'.format(content_id=content_id)
  42. content = self._download_xml(content_url, video_id)
  43. formats = []
  44. for source in content.findall('.//{http://rss.jwpcdn.com/}source'):
  45. if source.attrib['file'].endswith('.m3u8'):
  46. formats.extend(self._extract_m3u8_formats(
  47. source.attrib['file'], video_id, 'mp4', preference=1, m3u8_id='hls'))
  48. return {
  49. 'creator': creator,
  50. 'description': description,
  51. 'display_id': display_id,
  52. 'formats': formats,
  53. 'id': video_id,
  54. 'id': video_id,
  55. 'thumbnail': thumbnail,
  56. 'title': title,
  57. 'upload_date': upload_date,
  58. }