  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. import time
  5. import itertools
  6. from .common import InfoExtractor
  7. from ..compat import (
  8. compat_urllib_parse_urlencode,
  9. compat_str,
  10. )
  11. from ..utils import (
  12. dict_get,
  13. ExtractorError,
  14. float_or_none,
  15. int_or_none,
  16. remove_start,
  17. try_get,
  18. urlencode_postdata,
  19. )
class VLiveIE(InfoExtractor):
    """Extract a single vlive.tv video, either an ongoing live broadcast
    or a finished broadcast (VOD replay)."""
    IE_NAME = 'vlive'
    _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<id>[0-9]+)'
    # Credentials may also be supplied via .netrc under this machine name.
    _NETRC_MACHINE = 'vlive'
    _TESTS = [{
        'url': 'http://www.vlive.tv/video/1326',
        'md5': 'cc7314812855ce56de70a06a27314983',
        'info_dict': {
            'id': '1326',
            'ext': 'mp4',
            'title': "[V LIVE] Girl's Day's Broadcast",
            'creator': "Girl's Day",
            'view_count': int,
        },
    }, {
        'url': 'http://www.vlive.tv/video/16937',
        'info_dict': {
            'id': '16937',
            'ext': 'mp4',
            'title': '[V LIVE] 첸백시 걍방',
            'creator': 'EXO',
            'view_count': int,
            'subtitles': 'mincount:12',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.vlive.tv/video/129100',
        'md5': 'ca2569453b79d66e5b919e5d308bff6b',
        'info_dict': {
            'id': '129100',
            'ext': 'mp4',
            'title': "[V LIVE] [BTS+] Run BTS! 2019 - EP.71 :: Behind the scene",
            'creator': "BTS+",
            'view_count': int,
            'subtitles': 'mincount:10',
        },
        'skip': 'This video is only available for CH+ subscribers',
    }]

    @classmethod
    def suitable(cls, url):
        # Playlist URLs match both extractors; defer them to VLivePlaylistIE.
        return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url)

    def _real_initialize(self):
        self._login()

    def _login(self):
        """Log in with the configured email/password, if any.

        Does nothing when no credentials are configured; raises
        ExtractorError when the credentials are rejected.
        """
        email, password = self._get_login_info()
        if None in (email, password):
            return

        def is_logged_in():
            # The loginInfo endpoint reports the session's login state.
            login_info = self._download_json(
                'https://www.vlive.tv/auth/loginInfo', None,
                note='Downloading login info',
                headers={'Referer': 'https://www.vlive.tv/home'})
            return try_get(
                login_info, lambda x: x['message']['login'], bool) or False

        LOGIN_URL = 'https://www.vlive.tv/auth/email/login'
        # The initial GET primes session cookies required by the login POST.
        self._request_webpage(
            LOGIN_URL, None, note='Downloading login cookies')
        self._download_webpage(
            LOGIN_URL, None, note='Logging in',
            data=urlencode_postdata({'email': email, 'pwd': password}),
            headers={
                'Referer': LOGIN_URL,
                'Content-Type': 'application/x-www-form-urlencoded'
            })
        if not is_logged_in():
            raise ExtractorError('Unable to log in', expected=True)

    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage = self._download_webpage(
            'https://www.vlive.tv/video/%s' % video_id, video_id)

        # The page embeds a vlive.video.init(...) call whose positional
        # arguments carry the video status and playback identifiers.
        VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)'
        VIDEO_PARAMS_FIELD = 'video params'

        # First attempt: treat the argument list as a JSON array.
        params = self._parse_json(self._search_regex(
            VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD, default=''), video_id,
            transform_source=lambda s: '[' + s + ']', fatal=False)

        if not params or len(params) < 7:
            # Fallback: split the raw argument list on commas and strip quotes.
            params = self._search_regex(
                VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD)
            params = [p.strip(r'"') for p in re.split(r'\s*,\s*', params)]

        # params[2] = status, params[5] = internal long video id, params[6] = key
        status, long_video_id, key = params[2], params[5], params[6]
        status = remove_start(status, 'PRODUCT_')

        if status in ('LIVE_ON_AIR', 'BIG_EVENT_ON_AIR'):
            return self._live(video_id, webpage)
        elif status in ('VOD_ON_AIR', 'BIG_EVENT_INTRO'):
            return self._replay(video_id, webpage, long_video_id, key)

        # Remaining statuses are non-playable; report why.
        if status == 'LIVE_END':
            raise ExtractorError('Uploading for replay. Please wait...',
                                 expected=True)
        elif status == 'COMING_SOON':
            raise ExtractorError('Coming soon!', expected=True)
        elif status == 'CANCELED':
            raise ExtractorError('We are sorry, '
                                 'but the live broadcast has been canceled.',
                                 expected=True)
        elif status == 'ONLY_APP':
            raise ExtractorError('Unsupported video type', expected=True)
        else:
            raise ExtractorError('Unknown status %s' % status)

    def _get_common_fields(self, webpage):
        """Return metadata (title, creator, thumbnail) common to live and VOD."""
        title = self._og_search_title(webpage)
        creator = self._html_search_regex(
            r'<div[^>]+class="info_area"[^>]*>\s*(?:<em[^>]*>.*?</em\s*>\s*)?<a\s+[^>]*>([^<]+)',
            webpage, 'creator', fatal=False)
        thumbnail = self._og_search_thumbnail(webpage)
        return {
            'title': title,
            'creator': creator,
            'thumbnail': thumbnail,
        }

    def _live(self, video_id, webpage):
        """Build the info dict for an ongoing live broadcast."""
        init_page = self._download_init_page(video_id)

        live_params = self._search_regex(
            r'"liveStreamInfo"\s*:\s*(".*"),',
            init_page, 'live stream info')
        # The value is a JSON string that itself contains JSON, hence the
        # deliberate double decode.
        live_params = self._parse_json(live_params, video_id)
        live_params = self._parse_json(live_params, video_id)

        formats = []
        for vid in live_params.get('resolutions', []):
            formats.extend(self._extract_m3u8_formats(
                vid['cdnUrl'], video_id, 'mp4',
                m3u8_id=vid.get('name'),
                fatal=False, live=True))
        self._sort_formats(formats)

        info = self._get_common_fields(webpage)
        info.update({
            'title': self._live_title(info['title']),
            'id': video_id,
            'formats': formats,
            'is_live': True,
        })
        return info

    def _replay(self, video_id, webpage, long_video_id, key):
        """Build the info dict for a finished (VOD) broadcast.

        long_video_id/key may arrive empty from the embed params; in that
        case they are re-fetched from the video init endpoint.
        """
        if '' in (long_video_id, key):
            init_page = self._download_init_page(video_id)
            video_info = self._parse_json(self._search_regex(
                (r'(?s)oVideoStatus\s*=\s*({.+?})\s*</script',
                 r'(?s)oVideoStatus\s*=\s*({.+})'), init_page, 'video info'),
                video_id)
            if video_info.get('status') == 'NEED_CHANNEL_PLUS':
                # Paywalled content: a CH+ subscription login is required.
                self.raise_login_required(
                    'This video is only available for CH+ subscribers')
            long_video_id, key = video_info['vid'], video_info['inkey']

        playinfo = self._download_json(
            'http://global.apis.naver.com/rmcnmv/rmcnmv/vod_play_videoInfo.json?%s'
            % compat_urllib_parse_urlencode({
                'videoId': long_video_id,
                'key': key,
                'ptc': 'http',
                'doct': 'json',  # document type (xml or json)
                'cpt': 'vtt',  # captions type (vtt or ttml)
            }), video_id)

        formats = [{
            'url': vid['source'],
            'format_id': vid.get('encodingOption', {}).get('name'),
            'abr': float_or_none(vid.get('bitrate', {}).get('audio')),
            'vbr': float_or_none(vid.get('bitrate', {}).get('video')),
            'width': int_or_none(vid.get('encodingOption', {}).get('width')),
            'height': int_or_none(vid.get('encodingOption', {}).get('height')),
            'filesize': int_or_none(vid.get('size')),
        } for vid in playinfo.get('videos', {}).get('list', []) if vid.get('source')]
        self._sort_formats(formats)

        view_count = int_or_none(playinfo.get('meta', {}).get('count'))

        subtitles = {}
        for caption in playinfo.get('captions', {}).get('list', []):
            # The language key varies between responses; take the first present.
            lang = dict_get(caption, ('locale', 'language', 'country', 'label'))
            if lang and caption.get('source'):
                subtitles[lang] = [{
                    'ext': 'vtt',
                    'url': caption['source']}]

        info = self._get_common_fields(webpage)
        info.update({
            'id': video_id,
            'formats': formats,
            'view_count': view_count,
            'subtitles': subtitles,
        })
        return info

    def _download_init_page(self, video_id):
        # POST to the init endpoint used by both the live and replay paths.
        return self._download_webpage(
            'https://www.vlive.tv/video/init/view',
            video_id, note='Downloading live webpage',
            data=urlencode_postdata({'videoSeq': video_id}),
            headers={
                'Referer': 'https://www.vlive.tv/video/%s' % video_id,
                'Content-Type': 'application/x-www-form-urlencoded'
            })
  208. class VLiveChannelIE(InfoExtractor):
  209. IE_NAME = 'vlive:channel'
  210. _VALID_URL = r'https?://channels\.vlive\.tv/(?P<id>[0-9A-Z]+)'
  211. _TEST = {
  212. 'url': 'http://channels.vlive.tv/FCD4B',
  213. 'info_dict': {
  214. 'id': 'FCD4B',
  215. 'title': 'MAMAMOO',
  216. },
  217. 'playlist_mincount': 110
  218. }
  219. _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'
  220. def _real_extract(self, url):
  221. channel_code = self._match_id(url)
  222. webpage = self._download_webpage(
  223. 'http://channels.vlive.tv/%s/video' % channel_code, channel_code)
  224. app_id = None
  225. app_js_url = self._search_regex(
  226. r'<script[^>]+src=(["\'])(?P<url>http.+?/app\.js.*?)\1',
  227. webpage, 'app js', default=None, group='url')
  228. if app_js_url:
  229. app_js = self._download_webpage(
  230. app_js_url, channel_code, 'Downloading app JS', fatal=False)
  231. if app_js:
  232. app_id = self._search_regex(
  233. r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]',
  234. app_js, 'app id', default=None)
  235. app_id = app_id or self._APP_ID
  236. channel_info = self._download_json(
  237. 'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode',
  238. channel_code, note='Downloading decode channel code',
  239. query={
  240. 'app_id': app_id,
  241. 'channelCode': channel_code,
  242. '_': int(time.time())
  243. })
  244. channel_seq = channel_info['result']['channelSeq']
  245. channel_name = None
  246. entries = []
  247. for page_num in itertools.count(1):
  248. video_list = self._download_json(
  249. 'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList',
  250. channel_code, note='Downloading channel list page #%d' % page_num,
  251. query={
  252. 'app_id': app_id,
  253. 'channelSeq': channel_seq,
  254. # Large values of maxNumOfRows (~300 or above) may cause
  255. # empty responses (see [1]), e.g. this happens for [2] that
  256. # has more than 300 videos.
  257. # 1. https://github.com/ytdl-org/youtube-dl/issues/13830
  258. # 2. http://channels.vlive.tv/EDBF.
  259. 'maxNumOfRows': 100,
  260. '_': int(time.time()),
  261. 'pageNo': page_num
  262. }
  263. )
  264. if not channel_name:
  265. channel_name = try_get(
  266. video_list,
  267. lambda x: x['result']['channelInfo']['channelName'],
  268. compat_str)
  269. videos = try_get(
  270. video_list, lambda x: x['result']['videoList'], list)
  271. if not videos:
  272. break
  273. for video in videos:
  274. video_id = video.get('videoSeq')
  275. if not video_id:
  276. continue
  277. video_id = compat_str(video_id)
  278. entries.append(
  279. self.url_result(
  280. 'http://www.vlive.tv/video/%s' % video_id,
  281. ie=VLiveIE.ie_key(), video_id=video_id))
  282. return self.playlist_result(
  283. entries, channel_code, channel_name)
  284. class VLivePlaylistIE(InfoExtractor):
  285. IE_NAME = 'vlive:playlist'
  286. _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)'
  287. _TEST = {
  288. 'url': 'http://www.vlive.tv/video/22867/playlist/22912',
  289. 'info_dict': {
  290. 'id': '22912',
  291. 'title': 'Valentine Day Message from TWICE'
  292. },
  293. 'playlist_mincount': 9
  294. }
  295. def _real_extract(self, url):
  296. mobj = re.match(self._VALID_URL, url)
  297. video_id, playlist_id = mobj.group('video_id', 'id')
  298. VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s'
  299. if self._downloader.params.get('noplaylist'):
  300. self.to_screen(
  301. 'Downloading just video %s because of --no-playlist' % video_id)
  302. return self.url_result(
  303. VIDEO_URL_TEMPLATE % video_id,
  304. ie=VLiveIE.ie_key(), video_id=video_id)
  305. self.to_screen(
  306. 'Downloading playlist %s - add --no-playlist to just download video'
  307. % playlist_id)
  308. webpage = self._download_webpage(
  309. 'http://www.vlive.tv/video/%s/playlist/%s'
  310. % (video_id, playlist_id), playlist_id)
  311. item_ids = self._parse_json(
  312. self._search_regex(
  313. r'playlistVideoSeqs\s*=\s*(\[[^]]+\])', webpage,
  314. 'playlist video seqs'),
  315. playlist_id)
  316. entries = [
  317. self.url_result(
  318. VIDEO_URL_TEMPLATE % item_id, ie=VLiveIE.ie_key(),
  319. video_id=compat_str(item_id))
  320. for item_id in item_ids]
  321. playlist_name = self._html_search_regex(
  322. r'<div[^>]+class="[^"]*multicam_playlist[^>]*>\s*<h3[^>]+>([^<]+)',
  323. webpage, 'playlist title', fatal=False)
  324. return self.playlist_result(entries, playlist_id, playlist_name)