gdcvault.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. from __future__ import unicode_literals
  2. import re
  3. import json
  4. import xml.etree.ElementTree
  5. from .common import InfoExtractor
  6. from ..utils import unified_strdate
  7. class GDCVaultIE(InfoExtractor):
  8. _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)'
  9. _TESTS = [
  10. {
  11. u'url': u'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of',
  12. u'md5': u'05763e5edd1a74776999a12b02ee1c4e',
  13. u'info_dict': {
  14. u"id": u"1015683",
  15. u"ext": u"flv",
  16. u"title": u"Embracing the Dark Art of Mathematical Modeling in AI"
  17. }
  18. },
  19. {
  20. u'url': u'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple',
  21. u'md5': u'7ce8388f544c88b7ac11c7ab1b593704',
  22. u'info_dict': {
  23. u"id": u"1019721",
  24. u"ext": u"mp4",
  25. u"title": u"Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)"
  26. }
  27. },
  28. ]
  29. def _real_extract(self, url):
  30. mobj = re.match(self._VALID_URL, url)
  31. video_id = mobj.group('id')
  32. webpage_url = 'http://www.gdcvault.com/play/' + video_id
  33. start_page = self._download_webpage(webpage_url, video_id)
  34. self.report_extraction(video_id)
  35. xml_root = self._html_search_regex(r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>', start_page, 'xml root')
  36. xml_name = self._html_search_regex(r'<iframe src=".*?\?xml=(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename', None, False)
  37. if xml_name is None:
  38. # Fallback to the older format
  39. xml_name = self._html_search_regex(r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename')
  40. xml_decription_url = xml_root + 'xml/' + xml_name
  41. xml_description = self._download_xml(xml_decription_url, video_id)
  42. video_title = xml_description.find('./metadata/title').text
  43. video_details = {
  44. 'id': video_id,
  45. 'title': video_title,
  46. }
  47. video_formats = []
  48. mp4_video = xml_description.find('./metadata/mp4video')
  49. if mp4_video is not None:
  50. mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video.text)
  51. video_root = mobj.group('root')
  52. formats = xml_description.findall('./metadata/MBRVideos/MBRVideo')
  53. for format in formats:
  54. mobj = re.match(r'mp4\:(?P<path>.*)', format.find('streamName').text)
  55. url = video_root + mobj.group('path')
  56. vbr = format.find('bitrate').text
  57. video_formats.append({
  58. 'url': url,
  59. 'vbr': int(vbr),
  60. })
  61. video_details['formats'] = video_formats
  62. else:
  63. # Fallback to flv
  64. akami_url = xml_description.find('./metadata/akamaiHost').text
  65. slide_video_path = xml_description.find('./metadata/slideVideo').text
  66. video_formats.append({
  67. 'url': 'rtmp://' + akami_url + '/' + slide_video_path,
  68. 'format_note': 'slide deck video',
  69. 'quality': -2,
  70. 'preference': -2,
  71. 'format_id': 'slides',
  72. })
  73. speaker_video_path = xml_description.find('./metadata/speakerVideo').text
  74. video_formats.append({
  75. 'url': 'rtmp://' + akami_url + '/' + speaker_video_path,
  76. 'format_note': 'speaker video',
  77. 'quality': -1,
  78. 'preference': -1,
  79. 'format_id': 'speaker',
  80. })
  81. return [{
  82. 'id': video_id,
  83. 'title': video_title,
  84. 'formats': video_formats,
  85. }]