dailymotion.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import base64
  4. import hashlib
  5. import itertools
  6. import json
  7. import random
  8. import re
  9. import string
  10. from .common import InfoExtractor
  11. from ..compat import compat_struct_pack
  12. from ..utils import (
  13. determine_ext,
  14. error_to_compat_str,
  15. ExtractorError,
  16. int_or_none,
  17. parse_iso8601,
  18. sanitized_Request,
  19. str_to_int,
  20. unescapeHTML,
  21. mimetype2ext,
  22. )
  23. class DailymotionBaseInfoExtractor(InfoExtractor):
  24. @staticmethod
  25. def _build_request(url):
  26. """Build a request with the family filter disabled"""
  27. request = sanitized_Request(url)
  28. request.add_header('Cookie', 'family_filter=off; ff=off')
  29. return request
  30. def _download_webpage_handle_no_ff(self, url, *args, **kwargs):
  31. request = self._build_request(url)
  32. return self._download_webpage_handle(request, *args, **kwargs)
  33. def _download_webpage_no_ff(self, url, *args, **kwargs):
  34. request = self._build_request(url)
  35. return self._download_webpage(request, *args, **kwargs)
class DailymotionIE(DailymotionBaseInfoExtractor):
    """Extractor for single Dailymotion videos.

    Extraction strategy, in order:
      1. "player v5" JSON config embedded in the watch page (modern layout),
      2. a Vevo embed redirect, if the page links one,
      3. the legacy embed-page ``var info = {...}`` config.
    """
    # Matches /video/<id> (optionally under /embed/, /swf/ or /#/) and the
    # bare /swf/<id> form, on any dailymotion TLD, case-insensitively.
    _VALID_URL = r'(?i)https?://(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|#)/)?video|swf)/(?P<id>[^/?_]+)'
    IE_NAME = 'dailymotion'

    # (embed-config key, format_id) pairs used by the legacy fallback,
    # ordered from lowest to highest quality.
    _FORMATS = [
        ('stream_h264_ld_url', 'ld'),
        ('stream_h264_url', 'standard'),
        ('stream_h264_hq_url', 'hq'),
        ('stream_h264_hd_url', 'hd'),
        ('stream_h264_hd1080_url', 'hd180'),
    ]

    _TESTS = [{
        'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news',
        'md5': '074b95bdee76b9e3654137aee9c79dfe',
        'info_dict': {
            'id': 'x5kesuj',
            'ext': 'mp4',
            'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller',
            'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. Miller',
            'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
            'duration': 187,
            'timestamp': 1493651285,
            'upload_date': '20170501',
            'uploader': 'Deadline',
            'uploader_id': 'x1xm8ri',
            'age_limit': 0,
        },
    }, {
        'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
        'md5': '2137c41a8e78554bb09225b8eb322406',
        'info_dict': {
            'id': 'x2iuewm',
            'ext': 'mp4',
            'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
            'description': 'Several come bundled with the Steam Controller.',
            'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
            'duration': 74,
            'timestamp': 1425657362,
            'upload_date': '20150306',
            'uploader': 'IGN',
            'uploader_id': 'xijv66',
            'age_limit': 0,
            'view_count': int,
        },
        'skip': 'video gone',
    }, {
        # Vevo video
        'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi',
        'info_dict': {
            'title': 'Roar (Official)',
            'id': 'USUV71301934',
            'ext': 'mp4',
            'uploader': 'Katy Perry',
            'upload_date': '20130905',
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'VEVO is only available in some countries',
    }, {
        # age-restricted video
        'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
        'md5': '0d667a7b9cebecc3c89ee93099c4159d',
        'info_dict': {
            'id': 'xyh2zz',
            'ext': 'mp4',
            'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
            'uploader': 'HotWaves1012',
            'age_limit': 18,
        },
        'skip': 'video gone',
    }, {
        # geo-restricted, player v5
        'url': 'http://www.dailymotion.com/video/xhza0o',
        'only_matching': True,
    }, {
        # with subtitles
        'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news',
        'only_matching': True,
    }, {
        'url': 'http://www.dailymotion.com/swf/video/x3n92nf',
        'only_matching': True,
    }, {
        'url': 'http://www.dailymotion.com/swf/x3ss1m_funny-magic-trick-barry-and-stuart_fun',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return all Dailymotion embed/swf player URLs found in *webpage*."""
        # Look for embedded Dailymotion player
        matches = re.findall(
            r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
        # m[0] is the quote character captured for the backreference;
        # m[1] is the URL itself.
        return list(map(lambda m: unescapeHTML(m[1]), matches))

    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage = self._download_webpage_no_ff(
            'https://www.dailymotion.com/video/%s' % video_id, video_id)

        age_limit = self._rta_search(webpage)

        description = self._og_search_description(
            webpage, default=None) or self._html_search_meta(
            'description', webpage, 'description')

        view_count_str = self._search_regex(
            (r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([\s\d,.]+)"',
             r'video_views_count[^>]+>\s+([\s\d\,.]+)'),
            webpage, 'view count', default=None)
        if view_count_str:
            # The count may contain thousands separators / whitespace.
            view_count_str = re.sub(r'\s', '', view_count_str)
        view_count = str_to_int(view_count_str)
        comment_count = int_or_none(self._search_regex(
            r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserComments:(\d+)"',
            webpage, 'comment count', default=None))

        # Modern layout: JSON player config embedded in the page. Several
        # regexes are tried because the site has shipped multiple layouts.
        player_v5 = self._search_regex(
            [r'buildPlayer\(({.+?})\);\n',  # See https://github.com/rg3/youtube-dl/issues/7826
             r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);',
             r'buildPlayer\(({.+?})\);',
             r'var\s+config\s*=\s*({.+?});',
             # New layout regex (see https://github.com/rg3/youtube-dl/issues/13580)
             r'__PLAYER_CONFIG__\s*=\s*({.+?});'],
            webpage, 'player v5', default=None)
        if player_v5:
            player = self._parse_json(player_v5, video_id)
            metadata = player['metadata']

            if metadata.get('error', {}).get('type') == 'password_protected':
                password = self._downloader.params.get('videopassword')
                if password:
                    # Derive the access token for password-protected videos:
                    # the video id (minus its leading char) is a base-36
                    # number; the token is urlsafe-b64(md5(password+id+salt))
                    # + random salt + urlsafe-b64(packed id).
                    r = int(metadata['id'][1:], 36)
                    us64e = lambda x: base64.urlsafe_b64encode(x).decode().strip('=')
                    t = ''.join(random.choice(string.ascii_letters) for i in range(10))
                    n = us64e(compat_struct_pack('I', r))
                    i = us64e(hashlib.md5(('%s%d%s' % (password, r, t)).encode()).digest())
                    metadata = self._download_json(
                        'http://www.dailymotion.com/player/metadata/video/p' + i + t + n, video_id)

            self._check_error(metadata)

            formats = []
            for quality, media_list in metadata['qualities'].items():
                for media in media_list:
                    media_url = media.get('url')
                    if not media_url:
                        continue
                    type_ = media.get('type')
                    if type_ == 'application/vnd.lumberjack.manifest':
                        # Lumberjack manifests are not downloadable streams.
                        continue
                    ext = mimetype2ext(type_) or determine_ext(media_url)
                    if ext == 'm3u8':
                        m3u8_formats = self._extract_m3u8_formats(
                            media_url, video_id, 'mp4', preference=-1,
                            m3u8_id='hls', fatal=False)
                        for f in m3u8_formats:
                            # Strip URL fragments, which break some downloaders.
                            f['url'] = f['url'].split('#')[0]
                            formats.append(f)
                    elif ext == 'f4m':
                        formats.extend(self._extract_f4m_formats(
                            media_url, video_id, preference=-1, f4m_id='hds', fatal=False))
                    else:
                        f = {
                            'url': media_url,
                            'format_id': 'http-%s' % quality,
                            'ext': ext,
                        }
                        # Progressive URLs encode the resolution as H264-WxH.
                        m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url)
                        if m:
                            f.update({
                                'width': int(m.group('width')),
                                'height': int(m.group('height')),
                            })
                        formats.append(f)
            self._sort_formats(formats)

            title = metadata['title']
            duration = int_or_none(metadata.get('duration'))
            timestamp = int_or_none(metadata.get('created_time'))
            thumbnail = metadata.get('poster_url')
            uploader = metadata.get('owner', {}).get('screenname')
            uploader_id = metadata.get('owner', {}).get('id')

            subtitles = {}
            subtitles_data = metadata.get('subtitles', {}).get('data', {})
            if subtitles_data and isinstance(subtitles_data, dict):
                for subtitle_lang, subtitle in subtitles_data.items():
                    subtitles[subtitle_lang] = [{
                        'ext': determine_ext(subtitle_url),
                        'url': subtitle_url,
                    } for subtitle_url in subtitle.get('urls', [])]

            return {
                'id': video_id,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'timestamp': timestamp,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'age_limit': age_limit,
                'view_count': view_count,
                'comment_count': comment_count,
                'formats': formats,
                'subtitles': subtitles,
            }

        # vevo embed
        vevo_id = self._search_regex(
            r'<link rel="video_src" href="[^"]*?vevo\.com[^"]*?video=(?P<id>[\w]*)',
            webpage, 'vevo embed', default=None)
        if vevo_id:
            # Delegate to the Vevo extractor.
            return self.url_result('vevo:%s' % vevo_id, 'Vevo')

        # fallback old player
        embed_page = self._download_webpage_no_ff(
            'https://www.dailymotion.com/embed/video/%s' % video_id,
            video_id, 'Downloading embed page')

        timestamp = parse_iso8601(self._html_search_meta(
            'video:release_date', webpage, 'upload date'))

        info = self._parse_json(
            self._search_regex(
                r'var info = ({.*?}),$', embed_page,
                'video info', flags=re.MULTILINE),
            video_id)

        self._check_error(info)

        formats = []
        for (key, format_id) in self._FORMATS:
            video_url = info.get(key)
            if video_url is not None:
                # Resolution is embedded in the URL as H264-WxH when present.
                m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
                if m_size is not None:
                    width, height = map(int_or_none, (m_size.group(1), m_size.group(2)))
                else:
                    width, height = None, None
                formats.append({
                    'url': video_url,
                    'ext': 'mp4',
                    'format_id': format_id,
                    'width': width,
                    'height': height,
                })
        self._sort_formats(formats)

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, webpage)

        title = self._og_search_title(webpage, default=None)
        if title is None:
            title = self._html_search_regex(
                r'(?s)<span\s+id="video_title"[^>]*>(.*?)</span>', webpage,
                'title')

        return {
            'id': video_id,
            'formats': formats,
            'uploader': info['owner.screenname'],
            'timestamp': timestamp,
            'title': title,
            'description': description,
            'subtitles': video_subtitles,
            'thumbnail': info['thumbnail_url'],
            'age_limit': age_limit,
            'view_count': view_count,
            'duration': info['duration']
        }

    def _check_error(self, info):
        """Raise an ExtractorError if the metadata dict reports an error."""
        error = info.get('error')
        if error:
            title = error.get('title') or error['message']
            # See https://developer.dailymotion.com/api#access-error
            if error.get('code') == 'DM007':
                self.raise_geo_restricted(msg=title)
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, title), expected=True)

    def _get_subtitles(self, video_id, webpage):
        """Fetch subtitle tracks via the public API; best-effort (returns {} on failure)."""
        try:
            sub_list = self._download_webpage(
                'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
            return {}
        info = json.loads(sub_list)
        if (info['total'] > 0):
            sub_lang_list = dict((l['language'], [{'url': l['url'], 'ext': 'srt'}]) for l in info['list'])
            return sub_lang_list
        self._downloader.report_warning('video doesn\'t have subtitles')
        return {}
  308. class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
  309. IE_NAME = 'dailymotion:playlist'
  310. _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>[^/?#&]+)'
  311. _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
  312. _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s'
  313. _TESTS = [{
  314. 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q',
  315. 'info_dict': {
  316. 'title': 'SPORT',
  317. 'id': 'xv4bw_nqtv_sport',
  318. },
  319. 'playlist_mincount': 20,
  320. }]
  321. def _extract_entries(self, id):
  322. video_ids = set()
  323. processed_urls = set()
  324. for pagenum in itertools.count(1):
  325. page_url = self._PAGE_TEMPLATE % (id, pagenum)
  326. webpage, urlh = self._download_webpage_handle_no_ff(
  327. page_url, id, 'Downloading page %s' % pagenum)
  328. if urlh.geturl() in processed_urls:
  329. self.report_warning('Stopped at duplicated page %s, which is the same as %s' % (
  330. page_url, urlh.geturl()), id)
  331. break
  332. processed_urls.add(urlh.geturl())
  333. for video_id in re.findall(r'data-xid="(.+?)"', webpage):
  334. if video_id not in video_ids:
  335. yield self.url_result(
  336. 'http://www.dailymotion.com/video/%s' % video_id,
  337. DailymotionIE.ie_key(), video_id)
  338. video_ids.add(video_id)
  339. if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
  340. break
  341. def _real_extract(self, url):
  342. mobj = re.match(self._VALID_URL, url)
  343. playlist_id = mobj.group('id')
  344. webpage = self._download_webpage(url, playlist_id)
  345. return {
  346. '_type': 'playlist',
  347. 'id': playlist_id,
  348. 'title': self._og_search_title(webpage),
  349. 'entries': self._extract_entries(playlist_id),
  350. }
  351. class DailymotionUserIE(DailymotionPlaylistIE):
  352. IE_NAME = 'dailymotion:user'
  353. _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)'
  354. _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
  355. _TESTS = [{
  356. 'url': 'https://www.dailymotion.com/user/nqtv',
  357. 'info_dict': {
  358. 'id': 'nqtv',
  359. 'title': 'Rémi Gaillard',
  360. },
  361. 'playlist_mincount': 100,
  362. }, {
  363. 'url': 'http://www.dailymotion.com/user/UnderProject',
  364. 'info_dict': {
  365. 'id': 'UnderProject',
  366. 'title': 'UnderProject',
  367. },
  368. 'playlist_mincount': 1800,
  369. 'expected_warnings': [
  370. 'Stopped at duplicated page',
  371. ],
  372. 'skip': 'Takes too long time',
  373. }]
  374. def _real_extract(self, url):
  375. mobj = re.match(self._VALID_URL, url)
  376. user = mobj.group('user')
  377. webpage = self._download_webpage(
  378. 'https://www.dailymotion.com/user/%s' % user, user)
  379. full_user = unescapeHTML(self._html_search_regex(
  380. r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user),
  381. webpage, 'user'))
  382. return {
  383. '_type': 'playlist',
  384. 'id': user,
  385. 'title': full_user,
  386. 'entries': self._extract_entries(user),
  387. }