libsyn.py 1.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041
  1. # encoding: utf-8
  2. from .common import InfoExtractor
  3. from ..utils import (
  4. unified_strdate,
  5. )
  6. class LibsynIE(InfoExtractor):
  7. _VALID_URL = r'(?:https?:)?//html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+)(?:/.*)?'
  8. def _real_extract(self, url):
  9. if url.startswith('//'):
  10. url = 'https:' + url
  11. display_id = self._match_id(url)
  12. webpage = self._download_webpage(url, display_id)
  13. podcast_title = self._search_regex(r'<h2>(.*?)</h2>', webpage, 'show title')
  14. podcast_episode_title = self._search_regex(r'<h3>(.*?)</h3>', webpage, 'episode title')
  15. podcast_date = unified_strdate(self._search_regex(r'<div class="release_date">Released: (.*?)</div>', webpage, 'release date'))
  16. podcast_description = self._search_regex(r'<div id="info_text_body">(.*?)</div>', webpage, 'description')
  17. url0 = self._search_regex(r'var mediaURLLibsyn = "(?P<url0>https?://.*)";', webpage, 'first media URL')
  18. url1 = self._search_regex(r'var mediaURL = "(?P<url1>https?://.*)";', webpage, 'second media URL')
  19. if url0 != url1:
  20. formats = [{
  21. 'url': url0
  22. }, {
  23. 'url': url1
  24. }]
  25. else:
  26. formats = [{
  27. 'url': url0
  28. }]
  29. return {
  30. 'id': display_id,
  31. 'title': podcast_episode_title,
  32. 'description': podcast_description,
  33. 'upload_date': podcast_date,
  34. 'formats': formats,
  35. }