kuwo.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. import itertools
  5. from .common import InfoExtractor
  6. from ..utils import (
  7. get_element_by_id,
  8. clean_html,
  9. ExtractorError,
  10. remove_start,
  11. )
  12. class KuwoIE(InfoExtractor):
  13. IE_NAME = 'kuwo:song'
  14. _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P<id>[0-9]+?)/'
  15. _TESTS = [{
  16. 'url': 'http://www.kuwo.cn/yinyue/635632/',
  17. 'info_dict': {
  18. 'id': '635632',
  19. 'ext': 'ape',
  20. 'title': '爱我别走',
  21. 'creator': '张震岳',
  22. 'upload_date': '20080122',
  23. 'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c'
  24. },
  25. }, {
  26. 'url': 'http://www.kuwo.cn/yinyue/6446136/',
  27. 'info_dict': {
  28. 'id': '6446136',
  29. 'ext': 'mp3',
  30. 'title': '心',
  31. 'creator': 'IU',
  32. 'upload_date': '20150518',
  33. },
  34. 'params': {
  35. 'format': 'mp3-320'
  36. },
  37. }]
  38. _FORMATS = [
  39. {'format': 'ape', 'ext': 'ape', 'preference': 100},
  40. {'format': 'mp3-320', 'ext': 'mp3', 'br': '320kmp3', 'abr': 320, 'preference': 80},
  41. {'format': 'mp3-192', 'ext': 'mp3', 'br': '192kmp3', 'abr': 192, 'preference': 70},
  42. {'format': 'mp3-128', 'ext': 'mp3', 'br': '128kmp3', 'abr': 128, 'preference': 60},
  43. {'format': 'wma', 'ext': 'wma', 'preference': 20},
  44. {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10}
  45. ]
  46. def _get_formats(self, song_id):
  47. formats = []
  48. for file_format in self._FORMATS:
  49. song_url = self._download_webpage(
  50. "http://antiserver.kuwo.cn/anti.s?format=%s&br=%s&rid=MUSIC_%s&type=convert_url&response=url" %
  51. (file_format['ext'], file_format.get('br', ''), song_id),
  52. song_id, note="Download %s url info" % file_format["format"],
  53. )
  54. if song_url.startswith('http://') or song_url.startswith('https://'):
  55. formats.append({
  56. 'url': song_url,
  57. 'format_id': file_format['format'],
  58. 'format': file_format['format'],
  59. 'preference': file_format['preference'],
  60. 'abr': file_format.get('abr'),
  61. })
  62. self._sort_formats(formats)
  63. return formats
  64. def _real_extract(self, url):
  65. song_id = self._match_id(url)
  66. webpage = self._download_webpage(
  67. url, song_id, note='Download song detail info',
  68. errnote='Unable to get song detail info')
  69. song_name = self._html_search_regex(
  70. r'<h1[^>]+title="([^"]+)">', webpage, 'song name')
  71. singer_name = self._html_search_regex(
  72. r'<div[^>]+class="s_img">\s*<a[^>]+title="([^>]+)"',
  73. webpage, 'singer name', default=None)
  74. lrc_content = clean_html(get_element_by_id("lrcContent", webpage))
  75. if lrc_content == '暂无': # indicates no lyrics
  76. lrc_content = None
  77. formats = self._get_formats(song_id)
  78. album_id = self._html_search_regex(
  79. r'<p[^>]+class="album"[^<]+<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"',
  80. webpage, 'album id', default=None, fatal=False)
  81. publish_time = None
  82. if album_id is not None:
  83. album_info_page = self._download_webpage(
  84. "http://www.kuwo.cn/album/%s/" % album_id, song_id,
  85. note='Download album detail info',
  86. errnote='Unable to get album detail info')
  87. publish_time = self._html_search_regex(
  88. r'发行时间:(\d{4}-\d{2}-\d{2})', album_info_page,
  89. 'publish time', default=None)
  90. if publish_time:
  91. publish_time = publish_time.replace('-', '')
  92. return {
  93. 'id': song_id,
  94. 'title': song_name,
  95. 'creator': singer_name,
  96. 'upload_date': publish_time,
  97. 'description': lrc_content,
  98. 'formats': formats,
  99. }
  100. class KuwoAlbumIE(InfoExtractor):
  101. IE_NAME = 'kuwo:album'
  102. _VALID_URL = r'http://www\.kuwo\.cn/album/(?P<id>[0-9]+?)/'
  103. _TEST = {
  104. 'url': 'http://www.kuwo.cn/album/502294/',
  105. 'info_dict': {
  106. 'id': '502294',
  107. 'title': 'M',
  108. 'description': 'md5:6a7235a84cc6400ec3b38a7bdaf1d60c',
  109. },
  110. 'playlist_count': 2,
  111. }
  112. def _real_extract(self, url):
  113. album_id = self._match_id(url)
  114. webpage = self._download_webpage(
  115. url, album_id, note='Download album info',
  116. errnote='Unable to get album info')
  117. album_name = self._html_search_regex(
  118. r'<div[^>]+class="comm"[^<]+<h1[^>]+title="([^"]+)"', webpage,
  119. 'album name')
  120. album_intro = remove_start(
  121. clean_html(get_element_by_id("intro", webpage)),
  122. '%s简介:' % album_name)
  123. entries = [
  124. self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id)
  125. for song_id in re.findall(
  126. r'<p[^>]+class="listen"><a[^>]+href="http://www\.kuwo\.cn/yinyue/(\d+)/"',
  127. webpage)
  128. ]
  129. return self.playlist_result(entries, album_id, album_name, album_intro)
  130. class KuwoChartIE(InfoExtractor):
  131. IE_NAME = 'kuwo:chart'
  132. _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm'
  133. _TEST = {
  134. 'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm',
  135. 'info_dict': {
  136. 'id': '香港中文龙虎榜',
  137. 'title': '香港中文龙虎榜',
  138. 'description': 're:[0-9]{4}第[0-9]{2}期',
  139. },
  140. 'playlist_mincount': 10,
  141. }
  142. def _real_extract(self, url):
  143. chart_id = self._match_id(url)
  144. webpage = self._download_webpage(
  145. url, chart_id, note='Download chart info',
  146. errnote='Unable to get chart info')
  147. chart_name = self._html_search_regex(
  148. r'<h1[^>]+class="unDis">([^<]+)</h1>', webpage, 'chart name')
  149. chart_desc = self._html_search_regex(
  150. r'<p[^>]+class="tabDef">(\d{4}第\d{2}期)</p>', webpage, 'chart desc')
  151. entries = [
  152. self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id)
  153. for song_id in re.findall(
  154. r'<a[^>]+href="http://www\.kuwo\.cn/yinyue/(\d+)/"', webpage)
  155. ]
  156. return self.playlist_result(entries, chart_id, chart_name, chart_desc)
  157. class KuwoSingerIE(InfoExtractor):
  158. IE_NAME = 'kuwo:singer'
  159. _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P<id>[^/]+)'
  160. _TESTS = [{
  161. 'url': 'http://www.kuwo.cn/mingxing/bruno+mars/',
  162. 'info_dict': {
  163. 'id': 'bruno+mars',
  164. 'title': 'Bruno Mars',
  165. },
  166. 'playlist_count': 10,
  167. }, {
  168. 'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm',
  169. 'info_dict': {
  170. 'id': 'Ali',
  171. 'title': 'Ali',
  172. },
  173. 'playlist_mincount': 95,
  174. }]
  175. def _real_extract(self, url):
  176. singer_id = self._match_id(url)
  177. webpage = self._download_webpage(
  178. url, singer_id, note='Download singer info',
  179. errnote='Unable to get singer info')
  180. singer_name = self._html_search_regex(
  181. r'<div class="title clearfix">\s*<h1>([^<]+)<span', webpage, 'singer name'
  182. )
  183. entries = []
  184. first_page_only = False if re.search(r'/music(?:_[0-9]+)?\.htm', url) else True
  185. for page_num in itertools.count(1):
  186. webpage = self._download_webpage(
  187. 'http://www.kuwo.cn/mingxing/%s/music_%d.htm' % (singer_id, page_num),
  188. singer_id, note='Download song list page #%d' % page_num,
  189. errnote='Unable to get song list page #%d' % page_num)
  190. entries.extend([
  191. self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id)
  192. for song_id in re.findall(
  193. r'<p[^>]+class="m_name"><a[^>]+href="http://www\.kuwo\.cn/yinyue/([0-9]+)/',
  194. webpage)
  195. ][:10 if first_page_only else None])
  196. if first_page_only or not re.search(r'<a[^>]+href="[^"]+">下一页</a>', webpage):
  197. break
  198. return self.playlist_result(entries, singer_id, singer_name)
  199. class KuwoCategoryIE(InfoExtractor):
  200. IE_NAME = 'kuwo:category'
  201. _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>[0-9]+?).htm'
  202. _TEST = {
  203. 'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm',
  204. 'info_dict': {
  205. 'id': '86375',
  206. 'title': '八十年代精选',
  207. 'description': '这些都是属于八十年代的回忆!',
  208. },
  209. 'playlist_count': 30,
  210. }
  211. def _real_extract(self, url):
  212. category_id = self._match_id(url)
  213. webpage = self._download_webpage(
  214. url, category_id, note='Download category info',
  215. errnote='Unable to get category info')
  216. category_name = self._html_search_regex(
  217. r'<h1[^>]+title="([^<>]+?)">[^<>]+?</h1>', webpage, 'category name')
  218. category_desc = remove_start(
  219. get_element_by_id("intro", webpage).strip(),
  220. '%s简介:' % category_name)
  221. jsonm = self._parse_json(self._html_search_regex(
  222. r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id)
  223. entries = [
  224. self.url_result(
  225. "http://www.kuwo.cn/yinyue/%s/" % song['musicrid'],
  226. 'Kuwo', song['musicrid'])
  227. for song in jsonm['musiclist']
  228. ]
  229. return self.playlist_result(entries, category_id, category_name, category_desc)
  230. class KuwoMvIE(KuwoIE):
  231. IE_NAME = 'kuwo:mv'
  232. _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P<id>[0-9]+?)/'
  233. _TESTS = [{
  234. 'url': 'http://www.kuwo.cn/mv/6480076/',
  235. 'info_dict': {
  236. 'id': '6480076',
  237. 'ext': 'mkv',
  238. 'title': '我们家MV',
  239. 'creator': '2PM',
  240. },
  241. }]
  242. _FORMATS = KuwoIE._FORMATS + [
  243. {'format': 'mkv', 'ext': 'mkv', 'preference': 250},
  244. {'format': 'mp4', 'ext': 'mp4', 'preference': 200},
  245. ]
  246. def _real_extract(self, url):
  247. song_id = self._match_id(url)
  248. webpage = self._download_webpage(
  249. url, song_id, note='Download mv detail info: %s' % song_id,
  250. errnote='Unable to get mv detail info: %s' % song_id)
  251. mobj = re.search(
  252. r'<h1[^>]+title="(?P<song>[^"]+)">[^<]+<span[^>]+title="(?P<singer>[^"]+)"',
  253. webpage)
  254. if mobj:
  255. song_name = mobj.group('song')
  256. singer_name = mobj.group('singer')
  257. else:
  258. raise ExtractorError("Unable to find song or singer names")
  259. formats = self._get_formats(song_id)
  260. return {
  261. 'id': song_id,
  262. 'title': song_name,
  263. 'creator': singer_name,
  264. 'formats': formats,
  265. }