ustream.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. from HTMLParser import HTMLParser
  2. import json
  3. import re
  4. from urlparse import urljoin
  5. from .common import InfoExtractor
  6. class UstreamIE(InfoExtractor):
  7. _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
  8. IE_NAME = u'ustream'
  9. _TEST = {
  10. u'url': u'http://www.ustream.tv/recorded/20274954',
  11. u'file': u'20274954.flv',
  12. u'md5': u'088f151799e8f572f84eb62f17d73e5c',
  13. u'info_dict': {
  14. u"uploader": u"Young Americans for Liberty",
  15. u"title": u"Young Americans for Liberty February 7, 2012 2:28 AM"
  16. }
  17. }
  18. def _real_extract(self, url):
  19. m = re.match(self._VALID_URL, url)
  20. video_id = m.group('videoID')
  21. video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
  22. webpage = self._download_webpage(url, video_id)
  23. self.report_extraction(video_id)
  24. video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
  25. webpage, u'title')
  26. uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
  27. webpage, u'uploader', fatal=False, flags=re.DOTALL)
  28. thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
  29. webpage, u'thumbnail', fatal=False)
  30. info = {
  31. 'id': video_id,
  32. 'url': video_url,
  33. 'ext': 'flv',
  34. 'title': video_title,
  35. 'uploader': uploader,
  36. 'thumbnail': thumbnail,
  37. }
  38. return info
  39. # More robust than regular expressions
  40. class ChannelParser(HTMLParser):
  41. """
  42. <meta name="ustream:channel_id" content="1234">
  43. """
  44. channel_id = None
  45. def handle_starttag(self, tag, attrs):
  46. if tag != 'meta':
  47. return
  48. values = dict(attrs)
  49. if values.get('name') != 'ustream:channel_id':
  50. return
  51. value = values.get('content', '')
  52. if value.isdigit():
  53. self.channel_id = value
  54. class SocialstreamParser(HTMLParser):
  55. """
  56. <li class="content123 video" data-content-id="123" data-length="1452"
  57. data-href="/recorded/123" data-og-url="/recorded/123">
  58. """
  59. def __init__(self):
  60. HTMLParser.__init__(self)
  61. self.content_ids = []
  62. def handle_starttag(self, tag, attrs):
  63. if tag != 'li':
  64. return
  65. for (attr, value) in attrs:
  66. if attr == 'data-content-id' and value.isdigit():
  67. self.content_ids.append(value)
  68. class UstreamChannelIE(InfoExtractor):
  69. _VALID_URL = r'https?://www\.ustream\.tv/channel/(?P<slug>.+)'
  70. IE_NAME = u'ustream:channel'
  71. def _real_extract(self, url):
  72. m = re.match(self._VALID_URL, url)
  73. slug = m.group('slug')
  74. # Slugs can be non-ascii, but youtube-dl can't handle non-ascii command lines,
  75. # so if we got this far it's probably percent encoded and we needn't worry.
  76. p = ChannelParser()
  77. p.feed(self._download_webpage(url, slug))
  78. p.close()
  79. channel_id = p.channel_id
  80. p = SocialstreamParser()
  81. BASE = 'http://www.ustream.tv'
  82. next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id
  83. while next_url:
  84. reply = json.loads(self._download_webpage(urljoin(BASE, next_url), channel_id))
  85. p.feed(reply['data'])
  86. next_url = reply['nextUrl']
  87. p.close()
  88. video_ids = p.content_ids
  89. # From YoutubeChannelIE
  90. self._downloader.to_screen(u'[ustream] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
  91. urls = ['http://www.ustream.tv/recorded/' + vid for vid in video_ids]
  92. url_entries = [self.url_result(eurl, 'Ustream') for eurl in urls]
  93. return [self.playlist_result(url_entries, channel_id)]