applepodcasts.py 2.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. clean_podcast_url,
  6. int_or_none,
  7. parse_iso8601,
  8. try_get,
  9. )
  10. class ApplePodcastsIE(InfoExtractor):
  11. _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)'
  12. _TESTS = [{
  13. 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
  14. 'md5': 'df02e6acb11c10e844946a39e7222b08',
  15. 'info_dict': {
  16. 'id': '1000482637777',
  17. 'ext': 'mp3',
  18. 'title': '207 - Whitney Webb Returns',
  19. 'description': 'md5:13a73bade02d2e43737751e3987e1399',
  20. 'upload_date': '20200705',
  21. 'timestamp': 1593921600,
  22. 'duration': 6425,
  23. 'series': 'The Tim Dillon Show',
  24. }
  25. }, {
  26. 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
  27. 'only_matching': True,
  28. }, {
  29. 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777',
  30. 'only_matching': True,
  31. }, {
  32. 'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777',
  33. 'only_matching': True,
  34. }]
  35. def _real_extract(self, url):
  36. episode_id = self._match_id(url)
  37. webpage = self._download_webpage(url, episode_id)
  38. ember_data = self._parse_json(self._search_regex(
  39. r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
  40. webpage, 'ember data'), episode_id)
  41. ember_data = ember_data.get(episode_id) or ember_data
  42. episode = ember_data['data']['attributes']
  43. description = episode.get('description') or {}
  44. series = None
  45. for inc in (ember_data.get('included') or []):
  46. if inc.get('type') == 'media/podcast':
  47. series = try_get(inc, lambda x: x['attributes']['name'])
  48. return {
  49. 'id': episode_id,
  50. 'title': episode['name'],
  51. 'url': clean_podcast_url(episode['assetUrl']),
  52. 'description': description.get('standard') or description.get('short'),
  53. 'timestamp': parse_iso8601(episode.get('releaseDateTime')),
  54. 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000),
  55. 'series': series,
  56. }