heise.py 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. from .common import InfoExtractor
  5. from ..utils import (
  6. ExtractorError,
  7. compat_urllib_parse,
  8. get_meta_content,
  9. parse_iso8601,
  10. )
  11. class HeiseIE(InfoExtractor):
  12. _VALID_URL = (
  13. r'^https?://(?:www\.)?heise\.de/video/artikel/' +
  14. r'.+?(?P<id>[0-9]+)\.html$'
  15. )
  16. _TEST = {
  17. 'url': (
  18. 'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-' +
  19. 'Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html'
  20. ),
  21. 'md5': 'ffed432483e922e88545ad9f2f15d30e',
  22. 'info_dict': {
  23. 'id': '2404147',
  24. 'ext': 'mp4',
  25. 'title': (
  26. "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / " +
  27. "Peilsender Smartphone"
  28. ),
  29. 'format_id': 'mp4_720',
  30. 'timestamp': 1411812600,
  31. 'upload_date': '20140927',
  32. }
  33. }
  34. _CONFIG = (
  35. r'".+?\?sequenz=(?P<sequenz>.+?)&container=(?P<container>.+?)' +
  36. r'(?:&hd=(?P<hd>.+?))?(?:&signature=(?P<signature>.+?))?&callback=\?"'
  37. )
  38. _PREFIX = 'http://www.heise.de/videout/info?'
  39. def _warn(self, fmt, *args):
  40. self.report_warning(fmt.format(*args), self._id)
  41. def _parse_config_url(self, html):
  42. m = re.search(self._CONFIG, html)
  43. if not m:
  44. raise ExtractorError('No config found')
  45. qs = compat_urllib_parse.urlencode(dict((k, v) for k, v
  46. in m.groupdict().items()
  47. if v is not None))
  48. return self._PREFIX + qs
  49. def _real_extract(self, url):
  50. mobj = re.match(self._VALID_URL, url)
  51. self._id = mobj.group('id')
  52. html = self._download_webpage(url, self._id)
  53. config = self._download_json(self._parse_config_url(html), self._id)
  54. info = {
  55. 'id': self._id
  56. }
  57. title = get_meta_content('fulltitle', html)
  58. if title:
  59. info['title'] = title
  60. elif config.get('title'):
  61. info['title'] = config['title']
  62. else:
  63. self._warn('title: not found')
  64. info['title'] = 'heise'
  65. if (not config.get('formats') or
  66. not hasattr(config['formats'], 'items')):
  67. raise ExtractorError('No formats found')
  68. formats = []
  69. for t, rs in config['formats'].items():
  70. if not rs or not hasattr(rs, 'items'):
  71. self._warn('formats: {0}: no resolutions', t)
  72. continue
  73. for res, obj in rs.items():
  74. format_id = '{0}_{1}'.format(t, res)
  75. if (not obj or not obj.get('url') or
  76. not isinstance(obj['url'], str)):
  77. self._warn('formats: {0}: no url', format_id)
  78. continue
  79. fmt = {
  80. 'url': obj['url'],
  81. 'format_id': format_id
  82. }
  83. try:
  84. fmt['height'] = int(res)
  85. except ValueError as e:
  86. self._warn('formats: {0}: height: {1}', t, e)
  87. formats.append(fmt)
  88. self._sort_formats(formats)
  89. info['formats'] = formats
  90. if config.get('poster') and isinstance(config['poster'], str):
  91. info['thumbnail'] = config['poster']
  92. date = get_meta_content('date', html)
  93. if date and isinstance(date, str):
  94. try:
  95. info['timestamp'] = parse_iso8601(date)
  96. except ValueError as e:
  97. self._warn('timestamp: {0}', e)
  98. return info