expressen.py 2.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. determine_ext,
  6. int_or_none,
  7. unescapeHTML,
  8. unified_timestamp,
  9. )
  10. class ExpressenIE(InfoExtractor):
  11. _VALID_URL = r'https?://(?:www\.)?expressen\.se/tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'
  12. _TESTS = [{
  13. 'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/',
  14. 'md5': '2fbbe3ca14392a6b1b36941858d33a45',
  15. 'info_dict': {
  16. 'id': '8690962',
  17. 'ext': 'mp4',
  18. 'title': 'Ledarsnack: Om arbetslösheten bland kvinnor i speciellt utsatta områden',
  19. 'description': 'md5:f38c81ff69f3de4d269bbda012fcbbba',
  20. 'thumbnail': r're:^https?://.*\.jpg$',
  21. 'duration': 788,
  22. 'timestamp': 1526639109,
  23. 'upload_date': '20180518',
  24. },
  25. }, {
  26. 'url': 'https://www.expressen.se/tv/kultur/kulturdebatt-med-expressens-karin-olsson/',
  27. 'only_matching': True,
  28. }]
  29. def _real_extract(self, url):
  30. display_id = self._match_id(url)
  31. webpage = self._download_webpage(url, display_id)
  32. def extract_data(name):
  33. return self._parse_json(
  34. self._search_regex(
  35. r'data-%s=(["\'])(?P<value>(?:(?!\1).)+)\1' % name,
  36. webpage, 'info', group='value'),
  37. display_id, transform_source=unescapeHTML)
  38. info = extract_data('video-tracking-info')
  39. video_id = info['videoId']
  40. data = extract_data('article-data')
  41. stream = data['stream']
  42. if determine_ext(stream) == 'm3u8':
  43. formats = self._extract_m3u8_formats(
  44. stream, display_id, 'mp4', entry_protocol='m3u8_native',
  45. m3u8_id='hls')
  46. else:
  47. formats = [{
  48. 'url': stream,
  49. }]
  50. self._sort_formats(formats)
  51. title = info.get('titleRaw') or data['title']
  52. description = info.get('descriptionRaw')
  53. thumbnail = info.get('socialMediaImage') or data.get('image')
  54. duration = int_or_none(info.get('videoTotalSecondsDuration') or
  55. data.get('totalSecondsDuration'))
  56. timestamp = unified_timestamp(info.get('publishDate'))
  57. return {
  58. 'id': video_id,
  59. 'display_id': display_id,
  60. 'title': title,
  61. 'description': description,
  62. 'thumbnail': thumbnail,
  63. 'duration': duration,
  64. 'timestamp': timestamp,
  65. 'formats': formats,
  66. }