# viki.py (3.1 KB)
  1. from __future__ import unicode_literals
  2. import re
  3. from ..compat import compat_urlparse
  4. from ..utils import (
  5. ExtractorError,
  6. unescapeHTML,
  7. unified_strdate,
  8. US_RATINGS,
  9. )
  10. from .common import InfoExtractor
  11. class VikiIE(InfoExtractor):
  12. IE_NAME = 'viki'
  13. _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
  14. _TEST = {
  15. 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
  16. 'info_dict': {
  17. 'id': '1023585v',
  18. 'ext': 'mp4',
  19. 'title': 'Heirs Episode 14',
  20. 'uploader': 'SBS',
  21. 'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e',
  22. 'upload_date': '20131121',
  23. 'age_limit': 13,
  24. },
  25. 'skip': 'Blocked in the US',
  26. }
  27. def _real_extract(self, url):
  28. video_id = self._match_id(url)
  29. webpage = self._download_webpage(url, video_id)
  30. title = self._og_search_title(webpage)
  31. description = self._og_search_description(webpage)
  32. thumbnail = self._og_search_thumbnail(webpage)
  33. uploader_m = re.search(
  34. r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage)
  35. if uploader_m is None:
  36. uploader = None
  37. else:
  38. uploader = uploader_m.group(1).strip()
  39. rating_str = self._html_search_regex(
  40. r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
  41. 'rating information', default='').strip()
  42. age_limit = US_RATINGS.get(rating_str)
  43. info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id
  44. info_webpage = self._download_webpage(
  45. info_url, video_id, note='Downloading info page')
  46. if re.match(r'\s*<div\s+class="video-error', info_webpage):
  47. raise ExtractorError(
  48. 'Video %s is blocked from your location.' % video_id,
  49. expected=True)
  50. video_url = self._html_search_regex(
  51. r'<source[^>]+src="([^"]+)"', info_webpage, 'video URL')
  52. upload_date_str = self._html_search_regex(
  53. r'"created_at":"([^"]+)"', info_webpage, 'upload date')
  54. upload_date = (
  55. unified_strdate(upload_date_str)
  56. if upload_date_str is not None
  57. else None
  58. )
  59. # subtitles
  60. video_subtitles = self.extract_subtitles(video_id, info_webpage)
  61. return {
  62. 'id': video_id,
  63. 'title': title,
  64. 'url': video_url,
  65. 'description': description,
  66. 'thumbnail': thumbnail,
  67. 'age_limit': age_limit,
  68. 'uploader': uploader,
  69. 'subtitles': video_subtitles,
  70. 'upload_date': upload_date,
  71. }
  72. def _get_subtitles(self, video_id, info_webpage):
  73. res = {}
  74. for sturl_html in re.findall(r'<track src="([^"]+)"', info_webpage):
  75. sturl = unescapeHTML(sturl_html)
  76. m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)
  77. if not m:
  78. continue
  79. res[m.group('lang')] = [{
  80. 'url': compat_urlparse.urljoin('http://www.viki.com', sturl),
  81. 'ext': 'vtt',
  82. }]
  83. return res