# dailymotion.py
# coding: utf-8
from __future__ import unicode_literals

# Standard library
import base64
import hashlib
import itertools
import json
import random
import re
import string

# Project-local helpers
from .common import InfoExtractor
from ..compat import compat_struct_pack
from ..utils import (
    determine_ext,
    error_to_compat_str,
    ExtractorError,
    int_or_none,
    parse_iso8601,
    sanitized_Request,
    str_to_int,
    unescapeHTML,
    mimetype2ext,
)
  23. class DailymotionBaseInfoExtractor(InfoExtractor):
  24. @staticmethod
  25. def _build_request(url):
  26. """Build a request with the family filter disabled"""
  27. request = sanitized_Request(url)
  28. request.add_header('Cookie', 'family_filter=off; ff=off')
  29. return request
  30. def _download_webpage_handle_no_ff(self, url, *args, **kwargs):
  31. request = self._build_request(url)
  32. return self._download_webpage_handle(request, *args, **kwargs)
  33. def _download_webpage_no_ff(self, url, *args, **kwargs):
  34. request = self._build_request(url)
  35. return self._download_webpage(request, *args, **kwargs)
class DailymotionIE(DailymotionBaseInfoExtractor):
    """Extract a single Dailymotion video.

    Extraction strategy, in order:
      1. the "player v5" JSON config embedded in the watch page,
      2. an embedded Vevo player (delegated to the Vevo extractor),
      3. the legacy embed page's ``var info`` JSON object.
    """

    _VALID_URL = r'(?i)https?://(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|#)/)?video|swf)/(?P<id>[^/?_]+)'
    IE_NAME = 'dailymotion'

    # (key in the legacy "var info" object, format_id), worst to best.
    _FORMATS = [
        ('stream_h264_ld_url', 'ld'),
        ('stream_h264_url', 'standard'),
        ('stream_h264_hq_url', 'hq'),
        ('stream_h264_hd_url', 'hd'),
        # NOTE(review): 'hd180' looks like a typo for 'hd1080', but the
        # format_id is user-visible (-f selection) — confirm before renaming.
        ('stream_h264_hd1080_url', 'hd180'),
    ]

    _TESTS = [{
        'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news',
        'md5': '074b95bdee76b9e3654137aee9c79dfe',
        'info_dict': {
            'id': 'x5kesuj',
            'ext': 'mp4',
            'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller',
            'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. Miller',
            'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
            'duration': 187,
            'timestamp': 1493651285,
            'upload_date': '20170501',
            'uploader': 'Deadline',
            'uploader_id': 'x1xm8ri',
            'age_limit': 0,
        },
    }, {
        'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
        'md5': '2137c41a8e78554bb09225b8eb322406',
        'info_dict': {
            'id': 'x2iuewm',
            'ext': 'mp4',
            'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
            'description': 'Several come bundled with the Steam Controller.',
            'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
            'duration': 74,
            'timestamp': 1425657362,
            'upload_date': '20150306',
            'uploader': 'IGN',
            'uploader_id': 'xijv66',
            'age_limit': 0,
            'view_count': int,
        },
        'skip': 'video gone',
    }, {
        # Vevo video
        'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi',
        'info_dict': {
            'title': 'Roar (Official)',
            'id': 'USUV71301934',
            'ext': 'mp4',
            'uploader': 'Katy Perry',
            'upload_date': '20130905',
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'VEVO is only available in some countries',
    }, {
        # age-restricted video
        'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
        'md5': '0d667a7b9cebecc3c89ee93099c4159d',
        'info_dict': {
            'id': 'xyh2zz',
            'ext': 'mp4',
            'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
            'uploader': 'HotWaves1012',
            'age_limit': 18,
        },
        'skip': 'video gone',
    }, {
        # geo-restricted, player v5
        'url': 'http://www.dailymotion.com/video/xhza0o',
        'only_matching': True,
    }, {
        # with subtitles
        'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news',
        'only_matching': True,
    }, {
        'url': 'http://www.dailymotion.com/swf/video/x3n92nf',
        'only_matching': True,
    }, {
        'url': 'http://www.dailymotion.com/swf/x3ss1m_funny-magic-trick-barry-and-stuart_fun',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return all Dailymotion embed URLs found in *webpage*."""
        # Look for embedded Dailymotion player
        matches = re.findall(
            r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
        # Each match tuple is (quote_char, url); index 1 is the URL itself.
        return list(map(lambda m: unescapeHTML(m[1]), matches))

    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage = self._download_webpage_no_ff(
            'https://www.dailymotion.com/video/%s' % video_id, video_id)

        age_limit = self._rta_search(webpage)

        description = self._og_search_description(webpage) or self._html_search_meta(
            'description', webpage, 'description')

        # View counts may contain whitespace/separators; strip whitespace
        # before str_to_int parses them (str_to_int(None) stays None).
        view_count_str = self._search_regex(
            (r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([\s\d,.]+)"',
             r'video_views_count[^>]+>\s+([\s\d\,.]+)'),
            webpage, 'view count', default=None)
        if view_count_str:
            view_count_str = re.sub(r'\s', '', view_count_str)
        view_count = str_to_int(view_count_str)
        comment_count = int_or_none(self._search_regex(
            r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserComments:(\d+)"',
            webpage, 'comment count', default=None))

        # The modern player embeds its whole config as a JSON object; the
        # page markup has changed several times, hence the regex list.
        player_v5 = self._search_regex(
            [r'buildPlayer\(({.+?})\);\n',  # See https://github.com/rg3/youtube-dl/issues/7826
             r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);',
             r'buildPlayer\(({.+?})\);',
             r'var\s+config\s*=\s*({.+?});',
             # New layout regex (see https://github.com/rg3/youtube-dl/issues/13580)
             r'__PLAYER_CONFIG__\s*=\s*({.+?});'],
            webpage, 'player v5', default=None)
        if player_v5:
            player = self._parse_json(player_v5, video_id)
            metadata = player['metadata']

            if metadata.get('error', {}).get('type') == 'password_protected':
                password = self._downloader.params.get('videopassword')
                if password:
                    # Re-request the metadata through a tokenized URL. The
                    # token combines: r = the id (first char dropped) parsed
                    # as base36, a 10-letter random salt t, and an md5 of
                    # password+r+t — each urlsafe-base64'd without padding.
                    r = int(metadata['id'][1:], 36)
                    us64e = lambda x: base64.urlsafe_b64encode(x).decode().strip('=')
                    t = ''.join(random.choice(string.ascii_letters) for i in range(10))
                    n = us64e(compat_struct_pack('I', r))
                    i = us64e(hashlib.md5(('%s%d%s' % (password, r, t)).encode()).digest())
                    metadata = self._download_json(
                        'http://www.dailymotion.com/player/metadata/video/p' + i + t + n, video_id)

            self._check_error(metadata)

            formats = []
            for quality, media_list in metadata['qualities'].items():
                for media in media_list:
                    media_url = media.get('url')
                    if not media_url:
                        continue
                    type_ = media.get('type')
                    # Lumberjack manifests are deliberately skipped.
                    if type_ == 'application/vnd.lumberjack.manifest':
                        continue
                    ext = mimetype2ext(type_) or determine_ext(media_url)
                    if ext == 'm3u8':
                        m3u8_formats = self._extract_m3u8_formats(
                            media_url, video_id, 'mp4', preference=-1,
                            m3u8_id='hls', fatal=False)
                        for f in m3u8_formats:
                            # Strip any URL fragment from each HLS variant.
                            f['url'] = f['url'].split('#')[0]
                            formats.append(f)
                    elif ext == 'f4m':
                        formats.extend(self._extract_f4m_formats(
                            media_url, video_id, preference=-1, f4m_id='hds', fatal=False))
                    else:
                        f = {
                            'url': media_url,
                            'format_id': 'http-%s' % quality,
                            'ext': ext,
                        }
                        # Progressive URLs may encode resolution as H264-WxH.
                        m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url)
                        if m:
                            f.update({
                                'width': int(m.group('width')),
                                'height': int(m.group('height')),
                            })
                        formats.append(f)
            self._sort_formats(formats)

            title = metadata['title']
            duration = int_or_none(metadata.get('duration'))
            timestamp = int_or_none(metadata.get('created_time'))
            thumbnail = metadata.get('poster_url')
            uploader = metadata.get('owner', {}).get('screenname')
            uploader_id = metadata.get('owner', {}).get('id')

            subtitles = {}
            subtitles_data = metadata.get('subtitles', {}).get('data', {})
            if subtitles_data and isinstance(subtitles_data, dict):
                # Each entry maps a language to a list of subtitle URLs.
                for subtitle_lang, subtitle in subtitles_data.items():
                    subtitles[subtitle_lang] = [{
                        'ext': determine_ext(subtitle_url),
                        'url': subtitle_url,
                    } for subtitle_url in subtitle.get('urls', [])]

            return {
                'id': video_id,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'timestamp': timestamp,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'age_limit': age_limit,
                'view_count': view_count,
                'comment_count': comment_count,
                'formats': formats,
                'subtitles': subtitles,
            }

        # vevo embed
        vevo_id = self._search_regex(
            r'<link rel="video_src" href="[^"]*?vevo\.com[^"]*?video=(?P<id>[\w]*)',
            webpage, 'vevo embed', default=None)
        if vevo_id:
            return self.url_result('vevo:%s' % vevo_id, 'Vevo')

        # fallback old player
        embed_page = self._download_webpage_no_ff(
            'https://www.dailymotion.com/embed/video/%s' % video_id,
            video_id, 'Downloading embed page')

        timestamp = parse_iso8601(self._html_search_meta(
            'video:release_date', webpage, 'upload date'))

        info = self._parse_json(
            self._search_regex(
                r'var info = ({.*?}),$', embed_page,
                'video info', flags=re.MULTILINE),
            video_id)

        self._check_error(info)

        formats = []
        for (key, format_id) in self._FORMATS:
            video_url = info.get(key)
            if video_url is not None:
                # Resolution may be encoded in the URL as H264-WxH.
                m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
                if m_size is not None:
                    width, height = map(int_or_none, (m_size.group(1), m_size.group(2)))
                else:
                    width, height = None, None
                formats.append({
                    'url': video_url,
                    'ext': 'mp4',
                    'format_id': format_id,
                    'width': width,
                    'height': height,
                })
        self._sort_formats(formats)

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, webpage)

        title = self._og_search_title(webpage, default=None)
        if title is None:
            title = self._html_search_regex(
                r'(?s)<span\s+id="video_title"[^>]*>(.*?)</span>', webpage,
                'title')

        return {
            'id': video_id,
            'formats': formats,
            'uploader': info['owner.screenname'],
            'timestamp': timestamp,
            'title': title,
            'description': description,
            'subtitles': video_subtitles,
            'thumbnail': info['thumbnail_url'],
            'age_limit': age_limit,
            'view_count': view_count,
            'duration': info['duration']
        }

    def _check_error(self, info):
        """Raise ExtractorError if *info* carries an API error; DM007
        (access error) is reported as geo-restriction."""
        error = info.get('error')
        if error:
            title = error.get('title') or error['message']
            # See https://developer.dailymotion.com/api#access-error
            if error.get('code') == 'DM007':
                self.raise_geo_restricted(msg=title)
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, title), expected=True)

    def _get_subtitles(self, video_id, webpage):
        """Fetch the subtitle list from the public API.

        Returns {language: [{'url': ..., 'ext': 'srt'}]}, or an empty
        dict (after warning) when unavailable or absent.
        """
        try:
            sub_list = self._download_webpage(
                'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
            return {}
        info = json.loads(sub_list)
        if (info['total'] > 0):
            sub_lang_list = dict((l['language'], [{'url': l['url'], 'ext': 'srt'}]) for l in info['list'])
            return sub_lang_list
        self._downloader.report_warning('video doesn\'t have subtitles')
        return {}
  307. class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
  308. IE_NAME = 'dailymotion:playlist'
  309. _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>[^/?#&]+)'
  310. _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
  311. _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s'
  312. _TESTS = [{
  313. 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q',
  314. 'info_dict': {
  315. 'title': 'SPORT',
  316. 'id': 'xv4bw_nqtv_sport',
  317. },
  318. 'playlist_mincount': 20,
  319. }]
  320. def _extract_entries(self, id):
  321. video_ids = set()
  322. processed_urls = set()
  323. for pagenum in itertools.count(1):
  324. page_url = self._PAGE_TEMPLATE % (id, pagenum)
  325. webpage, urlh = self._download_webpage_handle_no_ff(
  326. page_url, id, 'Downloading page %s' % pagenum)
  327. if urlh.geturl() in processed_urls:
  328. self.report_warning('Stopped at duplicated page %s, which is the same as %s' % (
  329. page_url, urlh.geturl()), id)
  330. break
  331. processed_urls.add(urlh.geturl())
  332. for video_id in re.findall(r'data-xid="(.+?)"', webpage):
  333. if video_id not in video_ids:
  334. yield self.url_result(
  335. 'http://www.dailymotion.com/video/%s' % video_id,
  336. DailymotionIE.ie_key(), video_id)
  337. video_ids.add(video_id)
  338. if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
  339. break
  340. def _real_extract(self, url):
  341. mobj = re.match(self._VALID_URL, url)
  342. playlist_id = mobj.group('id')
  343. webpage = self._download_webpage(url, playlist_id)
  344. return {
  345. '_type': 'playlist',
  346. 'id': playlist_id,
  347. 'title': self._og_search_title(webpage),
  348. 'entries': self._extract_entries(playlist_id),
  349. }
  350. class DailymotionUserIE(DailymotionPlaylistIE):
  351. IE_NAME = 'dailymotion:user'
  352. _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)'
  353. _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
  354. _TESTS = [{
  355. 'url': 'https://www.dailymotion.com/user/nqtv',
  356. 'info_dict': {
  357. 'id': 'nqtv',
  358. 'title': 'Rémi Gaillard',
  359. },
  360. 'playlist_mincount': 100,
  361. }, {
  362. 'url': 'http://www.dailymotion.com/user/UnderProject',
  363. 'info_dict': {
  364. 'id': 'UnderProject',
  365. 'title': 'UnderProject',
  366. },
  367. 'playlist_mincount': 1800,
  368. 'expected_warnings': [
  369. 'Stopped at duplicated page',
  370. ],
  371. 'skip': 'Takes too long time',
  372. }]
  373. def _real_extract(self, url):
  374. mobj = re.match(self._VALID_URL, url)
  375. user = mobj.group('user')
  376. webpage = self._download_webpage(
  377. 'https://www.dailymotion.com/user/%s' % user, user)
  378. full_user = unescapeHTML(self._html_search_regex(
  379. r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user),
  380. webpage, 'user'))
  381. return {
  382. '_type': 'playlist',
  383. 'id': user,
  384. 'title': full_user,
  385. 'entries': self._extract_entries(user),
  386. }