grooveshark.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import time
  4. import math
  5. import re
  6. from urllib import quote, urlencode
  7. from os.path import basename
  8. from .common import InfoExtractor
  9. from ..utils import ExtractorError, compat_urllib_request, compat_html_parser
  10. from ..utils import compat_urlparse
  11. urlunparse = compat_urlparse.urlunparse
  12. urldefrag = compat_urlparse.urldefrag
  13. class GroovesharkHtmlParser(compat_html_parser.HTMLParser):
  14. def __init__(self):
  15. self._current_object = None
  16. self.objects = []
  17. compat_html_parser.HTMLParser.__init__(self)
  18. def handle_starttag(self, tag, attrs):
  19. attrs = dict((k, v) for k, v in attrs)
  20. if tag == 'object':
  21. self._current_object = {'attrs': attrs, 'params': []}
  22. elif tag == 'param':
  23. self._current_object['params'].append(attrs)
  24. def handle_endtag(self, tag):
  25. if tag == 'object':
  26. self.objects.append(self._current_object)
  27. self._current_object = None
  28. @classmethod
  29. def extract_object_tags(cls, html):
  30. p = cls()
  31. p.feed(html)
  32. p.close()
  33. return p.objects
  34. class GroovesharkIE(InfoExtractor):
  35. _VALID_URL = r'https?://(www\.)?grooveshark\.com/#!/s/([^/]+)/([^/]+)'
  36. _TEST = {
  37. 'url': 'http://grooveshark.com/#!/s/Jolene+Tenth+Key+Remix+Ft+Will+Sessions/6SS1DW?src=5',
  38. 'md5': 'bbccc50b19daca23b8f961152c1dc95b',
  39. 'info_dict': {
  40. 'id': '6SS1DW',
  41. 'title': 'Jolene (Tenth Key Remix ft. Will Sessions)',
  42. 'ext': 'mp3',
  43. 'duration': 227,
  44. }
  45. }
  46. do_playerpage_request = True
  47. do_bootstrap_request = True
  48. def _parse_target(self, target):
  49. uri = compat_urlparse.urlparse(target)
  50. hash = uri.fragment[1:].split('?')[0]
  51. token = basename(hash.rstrip('/'))
  52. return (uri, hash, token)
  53. def _build_bootstrap_url(self, target):
  54. (uri, hash, token) = self._parse_target(target)
  55. query = 'getCommunicationToken=1&hash=%s&%d' % (quote(hash, safe=''), self.ts)
  56. return (urlunparse((uri.scheme, uri.netloc, '/preload.php', None, query, None)), token)
  57. def _build_meta_url(self, target):
  58. (uri, hash, token) = self._parse_target(target)
  59. query = 'hash=%s&%d' % (quote(hash, safe=''), self.ts)
  60. return (urlunparse((uri.scheme, uri.netloc, '/preload.php', None, query, None)), token)
  61. def _build_stream_url(self, meta):
  62. return urlunparse(('http', meta['streamKey']['ip'], '/stream.php', None, None, None))
  63. def _build_swf_referer(self, target, obj):
  64. (uri, _, _) = self._parse_target(target)
  65. return urlunparse((uri.scheme, uri.netloc, obj['attrs']['data'], None, None, None))
  66. def _transform_bootstrap(self, js):
  67. return re.split('(?m)^\s*try\s*{', js)[0] \
  68. .split(' = ', 1)[1].strip().rstrip(';')
  69. def _transform_meta(self, js):
  70. return js.split('\n')[0].split('=')[1].rstrip(';')
  71. def _get_meta(self, target):
  72. (meta_url, token) = self._build_meta_url(target)
  73. self.to_screen('Metadata URL: %s' % meta_url)
  74. headers = {'Referer': urldefrag(target)[0]}
  75. req = compat_urllib_request.Request(meta_url, headers=headers)
  76. res = self._download_json(req, token,
  77. transform_source=self._transform_meta)
  78. if 'getStreamKeyWithSong' not in res:
  79. raise ExtractorError(
  80. 'Metadata not found. URL may be malformed, or Grooveshark API may have changed.')
  81. if res['getStreamKeyWithSong'] is None:
  82. raise ExtractorError(
  83. 'Metadata download failed, probably due to Grooveshark anti-abuse throttling. Wait at least an hour before retrying from this IP.',
  84. expected=True)
  85. return res['getStreamKeyWithSong']
  86. def _get_bootstrap(self, target):
  87. (bootstrap_url, token) = self._build_bootstrap_url(target)
  88. headers = {'Referer': urldefrag(target)[0]}
  89. req = compat_urllib_request.Request(bootstrap_url, headers=headers)
  90. res = self._download_json(req, token, fatal=False,
  91. note='Downloading player bootstrap data',
  92. errnote='Unable to download player bootstrap data',
  93. transform_source=self._transform_bootstrap)
  94. return res
  95. def _get_playerpage(self, target):
  96. (_, _, token) = self._parse_target(target)
  97. webpage = self._download_webpage(
  98. target, token,
  99. note='Downloading player page',
  100. errnote='Unable to download player page',
  101. fatal=False)
  102. if webpage is not None:
  103. # Search (for example German) error message
  104. error_msg = self._html_search_regex(
  105. r'<div id="content">\s*<h2>(.*?)</h2>', webpage,
  106. 'error message', default=None)
  107. if error_msg is not None:
  108. error_msg = error_msg.replace('\n', ' ')
  109. raise ExtractorError('Grooveshark said: %s' % error_msg)
  110. if webpage is not None:
  111. o = GroovesharkHtmlParser.extract_object_tags(webpage)
  112. return (webpage, [x for x in o if x['attrs']['id'] == 'jsPlayerEmbed'])
  113. return (webpage, None)
  114. def _real_initialize(self):
  115. self.ts = int(time.time() * 1000) # timestamp in millis
  116. def _real_extract(self, url):
  117. (target_uri, _, token) = self._parse_target(url)
  118. # 1. Fill cookiejar by making a request to the player page
  119. swf_referer = None
  120. if self.do_playerpage_request:
  121. (_, player_objs) = self._get_playerpage(url)
  122. if player_objs is not None:
  123. swf_referer = self._build_swf_referer(url, player_objs[0])
  124. self.to_screen('SWF Referer: %s' % swf_referer)
  125. # 2. Ask preload.php for swf bootstrap data to better mimic webapp
  126. if self.do_bootstrap_request:
  127. bootstrap = self._get_bootstrap(url)
  128. self.to_screen('CommunicationToken: %s' % bootstrap['getCommunicationToken'])
  129. # 3. Ask preload.php for track metadata.
  130. meta = self._get_meta(url)
  131. # 4. Construct stream request for track.
  132. stream_url = self._build_stream_url(meta)
  133. duration = int(math.ceil(float(meta['streamKey']['uSecs']) / 1000000))
  134. post_dict = {'streamKey': meta['streamKey']['streamKey']}
  135. post_data = urlencode(post_dict).encode('utf-8')
  136. headers = {
  137. 'Content-Length': len(post_data),
  138. 'Content-Type': 'application/x-www-form-urlencoded'
  139. }
  140. if swf_referer is not None:
  141. headers['Referer'] = swf_referer
  142. return {
  143. 'id': token,
  144. 'title': meta['song']['Name'],
  145. 'http_method': 'POST',
  146. 'url': stream_url,
  147. 'ext': 'mp3',
  148. 'format': 'mp3 audio',
  149. 'duration': duration,
  150. 'http_post_data': post_data,
  151. 'http_headers': headers,
  152. }