youtube-dl 57 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # Author: Ricardo Garcia Gonzalez
  4. # Author: Danny Colligan
  5. # Author: Benjamin Johnson
  6. # License: Public domain code
  7. import htmlentitydefs
  8. import httplib
  9. import locale
  10. import math
  11. import netrc
  12. import os
  13. import os.path
  14. import re
  15. import socket
  16. import string
  17. import subprocess
  18. import sys
  19. import time
  20. import urllib
  21. import urllib2
  22. # parse_qs was moved from the cgi module to the urlparse module recently.
  23. try:
  24. from urlparse import parse_qs
  25. except ImportError:
  26. from cgi import parse_qs
  27. std_headers = {
  28. 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
  29. 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  30. 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
  31. 'Accept-Language': 'en-us,en;q=0.5',
  32. }
  33. simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  34. def preferredencoding():
  35. """Get preferred encoding.
  36. Returns the best encoding scheme for the system, based on
  37. locale.getpreferredencoding() and some further tweaks.
  38. """
  39. def yield_preferredencoding():
  40. try:
  41. pref = locale.getpreferredencoding()
  42. u'TEST'.encode(pref)
  43. except:
  44. pref = 'UTF-8'
  45. while True:
  46. yield pref
  47. return yield_preferredencoding().next()
  48. def htmlentity_transform(matchobj):
  49. """Transforms an HTML entity to a Unicode character.
  50. This function receives a match object and is intended to be used with
  51. the re.sub() function.
  52. """
  53. entity = matchobj.group(1)
  54. # Known non-numeric HTML entity
  55. if entity in htmlentitydefs.name2codepoint:
  56. return unichr(htmlentitydefs.name2codepoint[entity])
  57. # Unicode character
  58. mobj = re.match(ur'(?u)#(x?\d+)', entity)
  59. if mobj is not None:
  60. numstr = mobj.group(1)
  61. if numstr.startswith(u'x'):
  62. base = 16
  63. numstr = u'0%s' % numstr
  64. else:
  65. base = 10
  66. return unichr(long(numstr, base))
  67. # Unknown entity in name, return its literal representation
  68. return (u'&%s;' % entity)
  69. def sanitize_title(utitle):
  70. """Sanitizes a video title so it could be used as part of a filename."""
  71. utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
  72. return utitle.replace(unicode(os.sep), u'%')
  73. def sanitize_open(filename, open_mode):
  74. """Try to open the given filename, and slightly tweak it if this fails.
  75. Attempts to open the given filename. If this fails, it tries to change
  76. the filename slightly, step by step, until it's either able to open it
  77. or it fails and raises a final exception, like the standard open()
  78. function.
  79. It returns the tuple (stream, definitive_file_name).
  80. """
  81. try:
  82. if filename == u'-':
  83. return (sys.stdout, filename)
  84. stream = open(filename, open_mode)
  85. return (stream, filename)
  86. except (IOError, OSError), err:
  87. # In case of error, try to remove win32 forbidden chars
  88. filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)
  89. # An exception here should be caught in the caller
  90. stream = open(filename, open_mode)
  91. return (stream, filename)
  92. class DownloadError(Exception):
  93. """Download Error exception.
  94. This exception may be thrown by FileDownloader objects if they are not
  95. configured to continue on errors. They will contain the appropriate
  96. error message.
  97. """
  98. pass
  99. class SameFileError(Exception):
  100. """Same File exception.
  101. This exception will be thrown by FileDownloader objects if they detect
  102. multiple files would have to be downloaded to the same file on disk.
  103. """
  104. pass
  105. class PostProcessingError(Exception):
  106. """Post Processing exception.
  107. This exception may be raised by PostProcessor's .run() method to
  108. indicate an error in the postprocessing task.
  109. """
  110. pass
  111. class UnavailableFormatError(Exception):
  112. """Unavailable Format exception.
  113. This exception will be thrown when a video is requested
  114. in a format that is not available for that video.
  115. """
  116. pass
  117. class ContentTooShortError(Exception):
  118. """Content Too Short exception.
  119. This exception may be raised by FileDownloader objects when a file they
  120. download is too small for what the server announced first, indicating
  121. the connection was probably interrupted.
  122. """
  123. # Both in bytes
  124. downloaded = None
  125. expected = None
  126. def __init__(self, downloaded, expected):
  127. self.downloaded = downloaded
  128. self.expected = expected
  129. class FileDownloader(object):
  130. """File Downloader class.
  131. File downloader objects are the ones responsible of downloading the
  132. actual video file and writing it to disk if the user has requested
  133. it, among some other tasks. In most cases there should be one per
  134. program. As, given a video URL, the downloader doesn't know how to
  135. extract all the needed information, task that InfoExtractors do, it
  136. has to pass the URL to one of them.
  137. For this, file downloader objects have a method that allows
  138. InfoExtractors to be registered in a given order. When it is passed
  139. a URL, the file downloader handles it to the first InfoExtractor it
  140. finds that reports being able to handle it. The InfoExtractor extracts
  141. all the information about the video or videos the URL refers to, and
  142. asks the FileDownloader to process the video information, possibly
  143. downloading the video.
  144. File downloaders accept a lot of parameters. In order not to saturate
  145. the object constructor with arguments, it receives a dictionary of
  146. options instead. These options are available through the params
  147. attribute for the InfoExtractors to use. The FileDownloader also
  148. registers itself as the downloader in charge for the InfoExtractors
  149. that are added to it, so this is a "mutual registration".
  150. Available options:
  151. username: Username for authentication purposes.
  152. password: Password for authentication purposes.
  153. usenetrc: Use netrc for authentication instead.
  154. quiet: Do not print messages to stdout.
  155. forceurl: Force printing final URL.
  156. forcetitle: Force printing title.
  157. simulate: Do not download the video files.
  158. format: Video format code.
  159. outtmpl: Template for output names.
  160. ignoreerrors: Do not stop on download errors.
  161. ratelimit: Download speed limit, in bytes/sec.
  162. nooverwrites: Prevent overwriting files.
  163. continuedl: Try to continue downloads if possible.
  164. noprogress: Do not print the progress bar.
  165. """
  166. params = None
  167. _ies = []
  168. _pps = []
  169. _download_retcode = None
  170. def __init__(self, params):
  171. """Create a FileDownloader object with the given options."""
  172. self._ies = []
  173. self._pps = []
  174. self._download_retcode = 0
  175. self.params = params
  176. @staticmethod
  177. def pmkdir(filename):
  178. """Create directory components in filename. Similar to Unix "mkdir -p"."""
  179. components = filename.split(os.sep)
  180. aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
  181. aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
  182. for dir in aggregate:
  183. if not os.path.exists(dir):
  184. os.mkdir(dir)
  185. @staticmethod
  186. def format_bytes(bytes):
  187. if bytes is None:
  188. return 'N/A'
  189. if type(bytes) is str:
  190. bytes = float(bytes)
  191. if bytes == 0.0:
  192. exponent = 0
  193. else:
  194. exponent = long(math.log(bytes, 1024.0))
  195. suffix = 'bkMGTPEZY'[exponent]
  196. converted = float(bytes) / float(1024**exponent)
  197. return '%.2f%s' % (converted, suffix)
  198. @staticmethod
  199. def calc_percent(byte_counter, data_len):
  200. if data_len is None:
  201. return '---.-%'
  202. return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
  203. @staticmethod
  204. def calc_eta(start, now, total, current):
  205. if total is None:
  206. return '--:--'
  207. dif = now - start
  208. if current == 0 or dif < 0.001: # One millisecond
  209. return '--:--'
  210. rate = float(current) / dif
  211. eta = long((float(total) - float(current)) / rate)
  212. (eta_mins, eta_secs) = divmod(eta, 60)
  213. if eta_mins > 99:
  214. return '--:--'
  215. return '%02d:%02d' % (eta_mins, eta_secs)
  216. @staticmethod
  217. def calc_speed(start, now, bytes):
  218. dif = now - start
  219. if bytes == 0 or dif < 0.001: # One millisecond
  220. return '%10s' % '---b/s'
  221. return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
  222. @staticmethod
  223. def best_block_size(elapsed_time, bytes):
  224. new_min = max(bytes / 2.0, 1.0)
  225. new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
  226. if elapsed_time < 0.001:
  227. return long(new_max)
  228. rate = bytes / elapsed_time
  229. if rate > new_max:
  230. return long(new_max)
  231. if rate < new_min:
  232. return long(new_min)
  233. return long(rate)
  234. @staticmethod
  235. def parse_bytes(bytestr):
  236. """Parse a string indicating a byte quantity into a long integer."""
  237. matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
  238. if matchobj is None:
  239. return None
  240. number = float(matchobj.group(1))
  241. multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
  242. return long(round(number * multiplier))
  243. @staticmethod
  244. def verify_url(url):
  245. """Verify a URL is valid and data could be downloaded. Return real data URL."""
  246. request = urllib2.Request(url, None, std_headers)
  247. data = urllib2.urlopen(request)
  248. data.read(1)
  249. url = data.geturl()
  250. data.close()
  251. return url
  252. def add_info_extractor(self, ie):
  253. """Add an InfoExtractor object to the end of the list."""
  254. self._ies.append(ie)
  255. ie.set_downloader(self)
  256. def add_post_processor(self, pp):
  257. """Add a PostProcessor object to the end of the chain."""
  258. self._pps.append(pp)
  259. pp.set_downloader(self)
  260. def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
  261. """Print message to stdout if not in quiet mode."""
  262. try:
  263. if not self.params.get('quiet', False):
  264. print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
  265. sys.stdout.flush()
  266. except (UnicodeEncodeError), err:
  267. if not ignore_encoding_errors:
  268. raise
  269. def to_stderr(self, message):
  270. """Print message to stderr."""
  271. print >>sys.stderr, message.encode(preferredencoding())
  272. def fixed_template(self):
  273. """Checks if the output template is fixed."""
  274. return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
  275. def trouble(self, message=None):
  276. """Determine action to take when a download problem appears.
  277. Depending on if the downloader has been configured to ignore
  278. download errors or not, this method may throw an exception or
  279. not when errors are found, after printing the message.
  280. """
  281. if message is not None:
  282. self.to_stderr(message)
  283. if not self.params.get('ignoreerrors', False):
  284. raise DownloadError(message)
  285. self._download_retcode = 1
  286. def slow_down(self, start_time, byte_counter):
  287. """Sleep if the download speed is over the rate limit."""
  288. rate_limit = self.params.get('ratelimit', None)
  289. if rate_limit is None or byte_counter == 0:
  290. return
  291. now = time.time()
  292. elapsed = now - start_time
  293. if elapsed <= 0.0:
  294. return
  295. speed = float(byte_counter) / elapsed
  296. if speed > rate_limit:
  297. time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
  298. def report_destination(self, filename):
  299. """Report destination filename."""
  300. self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
  301. def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
  302. """Report download progress."""
  303. if self.params.get('noprogress', False):
  304. return
  305. self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
  306. (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
  307. def report_resuming_byte(self, resume_len):
  308. """Report attemtp to resume at given byte."""
  309. self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
  310. def report_file_already_downloaded(self, file_name):
  311. """Report file has already been fully downloaded."""
  312. try:
  313. self.to_stdout(u'[download] %s has already been downloaded' % file_name)
  314. except (UnicodeEncodeError), err:
  315. self.to_stdout(u'[download] The file has already been downloaded')
  316. def report_unable_to_resume(self):
  317. """Report it was impossible to resume download."""
  318. self.to_stdout(u'[download] Unable to resume')
  319. def report_finish(self):
  320. """Report download finished."""
  321. if self.params.get('noprogress', False):
  322. self.to_stdout(u'[download] Download completed')
  323. else:
  324. self.to_stdout(u'')
  325. def process_info(self, info_dict):
  326. """Process a single dictionary returned by an InfoExtractor."""
  327. # Do nothing else if in simulate mode
  328. if self.params.get('simulate', False):
  329. # Verify URL if it's an HTTP one
  330. if info_dict['url'].startswith('http'):
  331. try:
  332. self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
  333. except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
  334. raise UnavailableFormatError
  335. # Forced printings
  336. if self.params.get('forcetitle', False):
  337. print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
  338. if self.params.get('forceurl', False):
  339. print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
  340. return
  341. try:
  342. template_dict = dict(info_dict)
  343. template_dict['epoch'] = unicode(long(time.time()))
  344. filename = self.params['outtmpl'] % template_dict
  345. except (ValueError, KeyError), err:
  346. self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
  347. if self.params.get('nooverwrites', False) and os.path.exists(filename):
  348. self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
  349. return
  350. try:
  351. self.pmkdir(filename)
  352. except (OSError, IOError), err:
  353. self.trouble('ERROR: unable to create directories: %s' % str(err))
  354. return
  355. try:
  356. success = self._do_download(filename, info_dict['url'].encode('utf-8'))
  357. except (OSError, IOError), err:
  358. raise UnavailableFormatError
  359. except (urllib2.URLError, httplib.HTTPException, socket.error), err:
  360. self.trouble('ERROR: unable to download video data: %s' % str(err))
  361. return
  362. except (ContentTooShortError, ), err:
  363. self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
  364. return
  365. if success:
  366. try:
  367. self.post_process(filename, info_dict)
  368. except (PostProcessingError), err:
  369. self.trouble('ERROR: postprocessing: %s' % str(err))
  370. return
  371. def download(self, url_list):
  372. """Download a given list of URLs."""
  373. if len(url_list) > 1 and self.fixed_template():
  374. raise SameFileError(self.params['outtmpl'])
  375. for url in url_list:
  376. suitable_found = False
  377. for ie in self._ies:
  378. # Go to next InfoExtractor if not suitable
  379. if not ie.suitable(url):
  380. continue
  381. # Suitable InfoExtractor found
  382. suitable_found = True
  383. # Extract information from URL and process it
  384. ie.extract(url)
  385. # Suitable InfoExtractor had been found; go to next URL
  386. break
  387. if not suitable_found:
  388. self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
  389. return self._download_retcode
  390. def post_process(self, filename, ie_info):
  391. """Run the postprocessing chain on the given file."""
  392. info = dict(ie_info)
  393. info['filepath'] = filename
  394. for pp in self._pps:
  395. info = pp.run(info)
  396. if info is None:
  397. break
  398. def _download_with_rtmpdump(self, filename, url):
  399. self.report_destination(filename)
  400. # Check for rtmpdump first
  401. try:
  402. subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
  403. except (OSError, IOError):
  404. self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
  405. return False
  406. # Download using rtmpdump. rtmpdump returns exit code 2 when
  407. # the connection was interrumpted and resuming appears to be
  408. # possible. This is part of rtmpdump's normal usage, AFAIK.
  409. basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
  410. retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
  411. while retval == 2 or retval == 1:
  412. self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
  413. time.sleep(2.0) # This seems to be needed
  414. retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
  415. if retval == 0:
  416. self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
  417. return True
  418. else:
  419. self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
  420. return False
  421. def _do_download(self, filename, url):
  422. # Attempt to download using rtmpdump
  423. if url.startswith('rtmp'):
  424. return self._download_with_rtmpdump(filename, url)
  425. stream = None
  426. open_mode = 'wb'
  427. basic_request = urllib2.Request(url, None, std_headers)
  428. request = urllib2.Request(url, None, std_headers)
  429. # Establish possible resume length
  430. if os.path.isfile(filename):
  431. resume_len = os.path.getsize(filename)
  432. else:
  433. resume_len = 0
  434. # Request parameters in case of being able to resume
  435. if self.params.get('continuedl', False) and resume_len != 0:
  436. self.report_resuming_byte(resume_len)
  437. request.add_header('Range','bytes=%d-' % resume_len)
  438. open_mode = 'ab'
  439. # Establish connection
  440. try:
  441. data = urllib2.urlopen(request)
  442. except (urllib2.HTTPError, ), err:
  443. if err.code != 416: # 416 is 'Requested range not satisfiable'
  444. raise
  445. # Unable to resume
  446. data = urllib2.urlopen(basic_request)
  447. content_length = data.info()['Content-Length']
  448. if content_length is not None and long(content_length) == resume_len:
  449. # Because the file had already been fully downloaded
  450. self.report_file_already_downloaded(filename)
  451. return True
  452. else:
  453. # Because the server didn't let us
  454. self.report_unable_to_resume()
  455. open_mode = 'wb'
  456. data_len = data.info().get('Content-length', None)
  457. data_len_str = self.format_bytes(data_len)
  458. byte_counter = 0
  459. block_size = 1024
  460. start = time.time()
  461. while True:
  462. # Download and write
  463. before = time.time()
  464. data_block = data.read(block_size)
  465. after = time.time()
  466. data_block_len = len(data_block)
  467. if data_block_len == 0:
  468. break
  469. byte_counter += data_block_len
  470. # Open file just in time
  471. if stream is None:
  472. try:
  473. (stream, filename) = sanitize_open(filename, open_mode)
  474. self.report_destination(filename)
  475. except (OSError, IOError), err:
  476. self.trouble('ERROR: unable to open for writing: %s' % str(err))
  477. return False
  478. stream.write(data_block)
  479. block_size = self.best_block_size(after - before, data_block_len)
  480. # Progress message
  481. percent_str = self.calc_percent(byte_counter, data_len)
  482. eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
  483. speed_str = self.calc_speed(start, time.time(), byte_counter)
  484. self.report_progress(percent_str, data_len_str, speed_str, eta_str)
  485. # Apply rate limit
  486. self.slow_down(start, byte_counter)
  487. self.report_finish()
  488. if data_len is not None and str(byte_counter) != data_len:
  489. raise ContentTooShortError(byte_counter, long(data_len))
  490. return True
  491. class InfoExtractor(object):
  492. """Information Extractor class.
  493. Information extractors are the classes that, given a URL, extract
  494. information from the video (or videos) the URL refers to. This
  495. information includes the real video URL, the video title and simplified
  496. title, author and others. The information is stored in a dictionary
  497. which is then passed to the FileDownloader. The FileDownloader
  498. processes this information possibly downloading the video to the file
  499. system, among other possible outcomes. The dictionaries must include
  500. the following fields:
  501. id: Video identifier.
  502. url: Final video URL.
  503. uploader: Nickname of the video uploader.
  504. title: Literal title.
  505. stitle: Simplified title.
  506. ext: Video filename extension.
  507. format: Video format.
  508. Subclasses of this one should re-define the _real_initialize() and
  509. _real_extract() methods, as well as the suitable() static method.
  510. Probably, they should also be instantiated and added to the main
  511. downloader.
  512. """
  513. _ready = False
  514. _downloader = None
  515. def __init__(self, downloader=None):
  516. """Constructor. Receives an optional downloader."""
  517. self._ready = False
  518. self.set_downloader(downloader)
  519. @staticmethod
  520. def suitable(url):
  521. """Receives a URL and returns True if suitable for this IE."""
  522. return False
  523. def initialize(self):
  524. """Initializes an instance (authentication, etc)."""
  525. if not self._ready:
  526. self._real_initialize()
  527. self._ready = True
  528. def extract(self, url):
  529. """Extracts URL information and returns it in list of dicts."""
  530. self.initialize()
  531. return self._real_extract(url)
  532. def set_downloader(self, downloader):
  533. """Sets the downloader for this IE."""
  534. self._downloader = downloader
  535. def _real_initialize(self):
  536. """Real initialization process. Redefine in subclasses."""
  537. pass
  538. def _real_extract(self, url):
  539. """Real extraction process. Redefine in subclasses."""
  540. pass
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches bare video IDs as well as /v/ and watch-page URLs
	_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in the user's .netrc file for credentials
	_NETRC_MACHINE = 'youtube'
	_available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
	# Maps format code to the filename extension used for it
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
	}
	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if it matches _VALID_URL."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)
	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')
	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')
	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')
	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested video format is not available."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_stdout(u'[youtube] RTMP download detected')
	def _real_initialize(self):
		"""Set language, log in (explicit credentials or .netrc) and confirm age.

		Each step is best-effort: failures emit a warning (or an error for
		the age confirmation) and abort the remaining steps.
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# The login form being present in the response means the login failed
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
  642. def _real_extract(self, url):
  643. # Extract video id from URL
  644. mobj = re.match(self._VALID_URL, url)
  645. if mobj is None:
  646. self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
  647. return
  648. video_id = mobj.group(2)
  649. # Downloader parameters
  650. best_quality = False
  651. all_formats = False
  652. format_param = None
  653. quality_index = 0
  654. if self._downloader is not None:
  655. params = self._downloader.params
  656. format_param = params.get('format', None)
  657. if format_param == '0':
  658. format_param = self._available_formats[quality_index]
  659. best_quality = True
  660. elif format_param == '-1':
  661. format_param = self._available_formats[quality_index]
  662. all_formats = True
  663. while True:
  664. # Extension
  665. video_extension = self._video_extensions.get(format_param, 'flv')
  666. # Get video info
  667. video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=embedded&ps=default&eurl=&gl=US&hl=en' % video_id
  668. request = urllib2.Request(video_info_url, None, std_headers)
  669. try:
  670. self.report_video_info_webpage_download(video_id)
  671. video_info_webpage = urllib2.urlopen(request).read()
  672. video_info = parse_qs(video_info_webpage)
  673. except (urllib2.URLError, httplib.HTTPException, socket.error), err:
  674. self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
  675. return
  676. self.report_information_extraction(video_id)
  677. # "t" param
  678. if 'token' not in video_info:
  679. # Attempt to see if YouTube has issued an error message
  680. if 'reason' not in video_info:
  681. self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
  682. stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
  683. stream.write(video_info_webpage)
  684. stream.close()
  685. else:
  686. reason = urllib.unquote_plus(video_info['reason'][0])
  687. self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
  688. return
  689. token = urllib.unquote_plus(video_info['token'][0])
  690. video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
  691. if format_param is not None:
  692. video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
  693. # Check possible RTMP download
  694. if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
  695. self.report_rtmp_download()
  696. video_real_url = video_info['conn'][0]
  697. # uploader
  698. if 'author' not in video_info:
  699. self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
  700. return
  701. video_uploader = urllib.unquote_plus(video_info['author'][0])
  702. # title
  703. if 'title' not in video_info:
  704. self._downloader.trouble(u'ERROR: unable to extract video title')
  705. return
  706. video_title = urllib.unquote_plus(video_info['title'][0])
  707. video_title = video_title.decode('utf-8')
  708. video_title = sanitize_title(video_title)
  709. # simplified title
  710. simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
  711. simple_title = simple_title.strip(ur'_')
  712. try:
  713. # Process video information
  714. self._downloader.process_info({
  715. 'id': video_id.decode('utf-8'),
  716. 'url': video_real_url.decode('utf-8'),
  717. 'uploader': video_uploader.decode('utf-8'),
  718. 'title': video_title,
  719. 'stitle': simple_title,
  720. 'ext': video_extension.decode('utf-8'),
  721. 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
  722. })
  723. if all_formats:
  724. if quality_index == len(self._available_formats) - 1:
  725. # None left to get
  726. return
  727. else:
  728. quality_index += 1
  729. format_param = self._available_formats[quality_index]
  730. if format_param == None:
  731. return
  732. continue
  733. return
  734. except UnavailableFormatError, err:
  735. if best_quality or all_formats:
  736. if quality_index == len(self._available_formats) - 1:
  737. # I don't ever expect this to happen
  738. if not all_formats:
  739. self._downloader.trouble(u'ERROR: no known formats available for video')
  740. return
  741. else:
  742. self.report_unavailable_format(video_id, format_param)
  743. quality_index += 1
  744. format_param = self._available_formats[quality_index]
  745. if format_param == None:
  746. return
  747. continue
  748. else:
  749. self._downloader.trouble('ERROR: format not available for video')
  750. return
  751. class MetacafeIE(InfoExtractor):
  752. """Information Extractor for metacafe.com."""
  753. _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
  754. _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
  755. _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
  756. _youtube_ie = None
  757. def __init__(self, youtube_ie, downloader=None):
  758. InfoExtractor.__init__(self, downloader)
  759. self._youtube_ie = youtube_ie
  760. @staticmethod
  761. def suitable(url):
  762. return (re.match(MetacafeIE._VALID_URL, url) is not None)
  763. def report_disclaimer(self):
  764. """Report disclaimer retrieval."""
  765. self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
  766. def report_age_confirmation(self):
  767. """Report attempt to confirm age."""
  768. self._downloader.to_stdout(u'[metacafe] Confirming age')
  769. def report_download_webpage(self, video_id):
  770. """Report webpage download."""
  771. self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
  772. def report_extraction(self, video_id):
  773. """Report information extraction."""
  774. self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
  775. def _real_initialize(self):
  776. # Retrieve disclaimer
  777. request = urllib2.Request(self._DISCLAIMER, None, std_headers)
  778. try:
  779. self.report_disclaimer()
  780. disclaimer = urllib2.urlopen(request).read()
  781. except (urllib2.URLError, httplib.HTTPException, socket.error), err:
  782. self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
  783. return
  784. # Confirm age
  785. disclaimer_form = {
  786. 'filters': '0',
  787. 'submit': "Continue - I'm over 18",
  788. }
  789. request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
  790. try:
  791. self.report_age_confirmation()
  792. disclaimer = urllib2.urlopen(request).read()
  793. except (urllib2.URLError, httplib.HTTPException, socket.error), err:
  794. self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
  795. return
  796. def _real_extract(self, url):
  797. # Extract id and simplified title from URL
  798. mobj = re.match(self._VALID_URL, url)
  799. if mobj is None:
  800. self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
  801. return
  802. video_id = mobj.group(1)
  803. # Check if video comes from YouTube
  804. mobj2 = re.match(r'^yt-(.*)$', video_id)
  805. if mobj2 is not None:
  806. self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
  807. return
  808. simple_title = mobj.group(2).decode('utf-8')
  809. video_extension = 'flv'
  810. # Retrieve video webpage to extract further information
  811. request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
  812. try:
  813. self.report_download_webpage(video_id)
  814. webpage = urllib2.urlopen(request).read()
  815. except (urllib2.URLError, httplib.HTTPException, socket.error), err:
  816. self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
  817. return
  818. # Extract URL, uploader and title from webpage
  819. self.report_extraction(video_id)
  820. mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
  821. if mobj is None:
  822. self._downloader.trouble(u'ERROR: unable to extract media URL')
  823. return
  824. mediaURL = urllib.unquote(mobj.group(1))
  825. #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
  826. #if mobj is None:
  827. # self._downloader.trouble(u'ERROR: unable to extract gdaKey')
  828. # return
  829. #gdaKey = mobj.group(1)
  830. #
  831. #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
  832. video_url = mediaURL
  833. mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
  834. if mobj is None:
  835. self._downloader.trouble(u'ERROR: unable to extract title')
  836. return
  837. video_title = mobj.group(1).decode('utf-8')
  838. video_title = sanitize_title(video_title)
  839. mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
  840. if mobj is None:
  841. self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
  842. return
  843. video_uploader = mobj.group(1)
  844. try:
  845. # Process video information
  846. self._downloader.process_info({
  847. 'id': video_id.decode('utf-8'),
  848. 'url': video_url.decode('utf-8'),
  849. 'uploader': video_uploader.decode('utf-8'),
  850. 'title': video_title,
  851. 'stitle': simple_title,
  852. 'ext': video_extension.decode('utf-8'),
  853. 'format': u'NA',
  854. })
  855. except UnavailableFormatError:
  856. self._downloader.trouble(u'ERROR: format not available for video')
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        """Return True if this extractor can handle the given URL."""
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No session setup needed for Google Video.
        return

    def _real_extract(self, url):
        """Extract video information from a video.google.com URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No direct MP4 download URL: fall back to the FLV stream URL,
            # which the page embeds with \xNN-escaped characters.
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = urllib.unquote(mobj.group(1))
            # Undo the JavaScript hex escaping ('=' and '&').
            mediaURL = mediaURL.replace('\\x3d', '\x3d')
            mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
            })
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        """Return True if this extractor can handle the given URL."""
        return (re.match(PhotobucketIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # No session setup needed for Photobucket.
        return

    def _real_extract(self, url):
        """Extract video information from a photobucket.com URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # The "current" query parameter (an .flv filename) serves as the id.
        video_id = mobj.group(1)
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # The <title> pattern captures the uploader name in group 2.
        video_uploader = mobj.group(2).decode('utf-8')

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
            })
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')
  983. class GenericIE(InfoExtractor):
  984. """Generic last-resort information extractor."""
  985. def __init__(self, downloader=None):
  986. InfoExtractor.__init__(self, downloader)
  987. @staticmethod
  988. def suitable(url):
  989. return True
  990. def report_download_webpage(self, video_id):
  991. """Report webpage download."""
  992. self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
  993. self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
  994. def report_extraction(self, video_id):
  995. """Report information extraction."""
  996. self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
  997. def _real_initialize(self):
  998. return
  999. def _real_extract(self, url):
  1000. video_id = url.split('/')[-1]
  1001. request = urllib2.Request(url)
  1002. try:
  1003. self.report_download_webpage(video_id)
  1004. webpage = urllib2.urlopen(request).read()
  1005. except (urllib2.URLError, httplib.HTTPException, socket.error), err:
  1006. self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
  1007. return
  1008. except ValueError, err:
  1009. # since this is the last-resort InfoExtractor, if
  1010. # this error is thrown, it'll be thrown here
  1011. self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
  1012. return
  1013. # Start with something easy: JW Player in SWFObject
  1014. mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
  1015. if mobj is None:
  1016. # Broaden the search a little bit
  1017. mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
  1018. if mobj is None:
  1019. self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
  1020. return
  1021. # It's possible that one of the regexes
  1022. # matched, but returned an empty group:
  1023. if mobj.group(1) is None:
  1024. self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
  1025. return
  1026. video_url = urllib.unquote(mobj.group(1))
  1027. video_id = os.path.basename(video_url)
  1028. # here's a fun little line of code for you:
  1029. video_extension = os.path.splitext(video_id)[1][1:]
  1030. video_id = os.path.splitext(video_id)[0]
  1031. # it's tempting to parse this further, but you would
  1032. # have to take into account all the variations like
  1033. # Video Title - Site Name
  1034. # Site Name | Video Title
  1035. # Video Title - Tagline | Site Name
  1036. # and so on and so forth; it's just not practical
  1037. mobj = re.search(r'<title>(.*)</title>', webpage)
  1038. if mobj is None:
  1039. self._downloader.trouble(u'ERROR: unable to extract title')
  1040. return
  1041. video_title = mobj.group(1).decode('utf-8')
  1042. video_title = sanitize_title(video_title)
  1043. simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
  1044. # video uploader is domain name
  1045. mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
  1046. if mobj is None:
  1047. self._downloader.trouble(u'ERROR: unable to extract title')
  1048. return
  1049. video_uploader = mobj.group(1).decode('utf-8')
  1050. try:
  1051. # Process video information
  1052. self._downloader.process_info({
  1053. 'id': video_id.decode('utf-8'),
  1054. 'url': video_url.decode('utf-8'),
  1055. 'uploader': video_uploader,
  1056. 'title': video_title,
  1057. 'stitle': simple_title,
  1058. 'ext': video_extension.decode('utf-8'),
  1059. 'format': u'NA',
  1060. })
  1061. except UnavailableFormatError:
  1062. self._downloader.trouble(u'ERROR: format not available for video')
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Accepts queries of the form "ytsearch:Q" (first result),
    "ytsearchN:Q" (first N results) or "ytsearchall:Q" (up to
    _max_youtube_results).  Matching videos are delegated to the
    YouTube extractor passed to the constructor.
    """

    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    # YouTube extractor used to download each individual result.
    _youtube_ie = None
    # Hard cap on results; matches YouTube's own result-count limit.
    _max_youtube_results = 1000

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        """Return True if this extractor can handle the given query string."""
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        """Parse the ytsearch prefix and download the requested results."""
        mobj = re.match(self._VALID_QUERY, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading "ytsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url, None, std_headers)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # Matched text looks like href="/watch?v=ID"; take the part
                # after '=' and drop the trailing quote.
                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                        return

            # Stop when the results page has no "Next" link.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Walks every page of a playlist, collects the video ids and delegates
    each one to the YouTube extractor passed to the constructor.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
    _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    # Plain %-template (not a regex): used with a substring test below.
    _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        """Return True if this extractor can handle the given URL."""
        return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Extract all video ids from a playlist and download each video."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download playlist pages
        playlist_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            # Stop when the page carries no link to the next page.
            if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
                break
            pagenum = pagenum + 1

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
        return
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Fetches a user's GData feed, collects the video ids and delegates
    each one to the YouTube extractor passed to the constructor.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        """Return True if this extractor can handle the given URL."""
        return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

    def report_download_page(self, username):
        """Report attempt to download user page."""
        self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Extract all video ids from a user's feed and download each video."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download user page (single request; the feed is not paginated here)
        username = mobj.group(1)
        video_ids = []
        pagenum = 1

        self.report_download_page(username)
        request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
        try:
            page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
            return

        # Extract video identifiers
        ids_in_page = []
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
        return
  1232. class PostProcessor(object):
  1233. """Post Processor class.
  1234. PostProcessor objects can be added to downloaders with their
  1235. add_post_processor() method. When the downloader has finished a
  1236. successful download, it will take its internal chain of PostProcessors
  1237. and start calling the run() method on each one of them, first with
  1238. an initial argument and then with the returned value of the previous
  1239. PostProcessor.
  1240. The chain will be stopped if one of them ever returns None or the end
  1241. of the chain is reached.
  1242. PostProcessor objects follow a "mutual registration" process similar
  1243. to InfoExtractor objects.
  1244. """
  1245. _downloader = None
  1246. def __init__(self, downloader=None):
  1247. self._downloader = downloader
  1248. def set_downloader(self, downloader):
  1249. """Sets the downloader for this PP."""
  1250. self._downloader = downloader
  1251. def run(self, information):
  1252. """Run the PostProcessor.
  1253. The "information" argument is a dictionary like the ones
  1254. composed by InfoExtractors. The only difference is that this
  1255. one has an extra field called "filepath" that points to the
  1256. downloaded file.
  1257. When this method returns None, the postprocessing chain is
  1258. stopped. However, this method may return an information
  1259. dictionary that will be passed to the next postprocessing
  1260. object in the chain. It can be the one it received after
  1261. changing some fields.
  1262. In addition, this method may raise a PostProcessingError
  1263. exception that will be taken into account by the downloader
  1264. it was called from.
  1265. """
  1266. return information # by default, do nothing
  1267. ### MAIN PROGRAM ###
  1268. if __name__ == '__main__':
  1269. try:
  1270. # Modules needed only when running the main program
  1271. import getpass
  1272. import optparse
  1273. # Function to update the program file with the latest version from bitbucket.org
  1274. def update_self(downloader, filename):
  1275. # Note: downloader only used for options
  1276. if not os.access (filename, os.W_OK):
  1277. sys.exit('ERROR: no write permissions on %s' % filename)
  1278. downloader.to_stdout('Updating to latest stable version...')
  1279. latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
  1280. latest_version = urllib.urlopen(latest_url).read().strip()
  1281. prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
  1282. newcontent = urllib.urlopen(prog_url).read()
  1283. stream = open(filename, 'w')
  1284. stream.write(newcontent)
  1285. stream.close()
  1286. downloader.to_stdout('Updated to version %s' % latest_version)
  1287. # General configuration
  1288. urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
  1289. urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
  1290. socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
  1291. # Parse command line
  1292. parser = optparse.OptionParser(
  1293. usage='Usage: %prog [options] url...',
  1294. version='2010.03.13',
  1295. conflict_handler='resolve',
  1296. )
  1297. parser.add_option('-h', '--help',
  1298. action='help', help='print this help text and exit')
  1299. parser.add_option('-v', '--version',
  1300. action='version', help='print program version and exit')
  1301. parser.add_option('-U', '--update',
  1302. action='store_true', dest='update_self', help='update this program to latest stable version')
  1303. parser.add_option('-i', '--ignore-errors',
  1304. action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
  1305. parser.add_option('-r', '--rate-limit',
  1306. dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
  1307. authentication = optparse.OptionGroup(parser, 'Authentication Options')
  1308. authentication.add_option('-u', '--username',
  1309. dest='username', metavar='UN', help='account username')
  1310. authentication.add_option('-p', '--password',
  1311. dest='password', metavar='PW', help='account password')
  1312. authentication.add_option('-n', '--netrc',
  1313. action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
  1314. parser.add_option_group(authentication)
  1315. video_format = optparse.OptionGroup(parser, 'Video Format Options')
  1316. video_format.add_option('-f', '--format',
  1317. action='store', dest='format', metavar='FMT', help='video format code')
  1318. video_format.add_option('-b', '--best-quality',
  1319. action='store_const', dest='format', help='download the best quality video possible', const='0')
  1320. video_format.add_option('-m', '--mobile-version',
  1321. action='store_const', dest='format', help='alias for -f 17', const='17')
  1322. video_format.add_option('-d', '--high-def',
  1323. action='store_const', dest='format', help='alias for -f 22', const='22')
  1324. video_format.add_option('--all-formats',
  1325. action='store_const', dest='format', help='download all available video formats', const='-1')
  1326. parser.add_option_group(video_format)
  1327. verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
  1328. verbosity.add_option('-q', '--quiet',
  1329. action='store_true', dest='quiet', help='activates quiet mode', default=False)
  1330. verbosity.add_option('-s', '--simulate',
  1331. action='store_true', dest='simulate', help='do not download video', default=False)
  1332. verbosity.add_option('-g', '--get-url',
  1333. action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
  1334. verbosity.add_option('-e', '--get-title',
  1335. action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
  1336. verbosity.add_option('--no-progress',
  1337. action='store_true', dest='noprogress', help='do not print progress bar', default=False)
  1338. parser.add_option_group(verbosity)
  1339. filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
  1340. filesystem.add_option('-t', '--title',
  1341. action='store_true', dest='usetitle', help='use title in file name', default=False)
  1342. filesystem.add_option('-l', '--literal',
  1343. action='store_true', dest='useliteral', help='use literal title in file name', default=False)
  1344. filesystem.add_option('-o', '--output',
  1345. dest='outtmpl', metavar='TPL', help='output filename template')
  1346. filesystem.add_option('-a', '--batch-file',
  1347. dest='batchfile', metavar='F', help='file containing URLs to download')
  1348. filesystem.add_option('-w', '--no-overwrites',
  1349. action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
  1350. filesystem.add_option('-c', '--continue',
  1351. action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
  1352. parser.add_option_group(filesystem)
  1353. (opts, args) = parser.parse_args()
  1354. # Batch file verification
  1355. batchurls = []
  1356. if opts.batchfile is not None:
  1357. try:
  1358. batchurls = open(opts.batchfile, 'r').readlines()
  1359. batchurls = [x.strip() for x in batchurls]
  1360. batchurls = [x for x in batchurls if len(x) > 0]
  1361. except IOError:
  1362. sys.exit(u'ERROR: batch file could not be read')
  1363. all_urls = batchurls + args
  1364. # Conflicting, missing and erroneous options
  1365. if opts.usenetrc and (opts.username is not None or opts.password is not None):
  1366. parser.error(u'using .netrc conflicts with giving username/password')
  1367. if opts.password is not None and opts.username is None:
  1368. parser.error(u'account username missing')
  1369. if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
  1370. parser.error(u'using output template conflicts with using title or literal title')
  1371. if opts.usetitle and opts.useliteral:
  1372. parser.error(u'using title conflicts with using literal title')
  1373. if opts.username is not None and opts.password is None:
  1374. opts.password = getpass.getpass(u'Type account password and press return:')
  1375. if opts.ratelimit is not None:
  1376. numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
  1377. if numeric_limit is None:
  1378. parser.error(u'invalid rate limit specified')
  1379. opts.ratelimit = numeric_limit
  1380. # Information extractors
  1381. youtube_ie = YoutubeIE()
  1382. metacafe_ie = MetacafeIE(youtube_ie)
  1383. youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
  1384. youtube_user_ie = YoutubeUserIE(youtube_ie)
  1385. youtube_search_ie = YoutubeSearchIE(youtube_ie)
  1386. google_ie = GoogleIE()
  1387. photobucket_ie = PhotobucketIE()
  1388. generic_ie = GenericIE()
  1389. # File downloader
  1390. fd = FileDownloader({
  1391. 'usenetrc': opts.usenetrc,
  1392. 'username': opts.username,
  1393. 'password': opts.password,
  1394. 'quiet': (opts.quiet or opts.geturl or opts.gettitle),
  1395. 'forceurl': opts.geturl,
  1396. 'forcetitle': opts.gettitle,
  1397. 'simulate': (opts.simulate or opts.geturl or opts.gettitle),
  1398. 'format': opts.format,
  1399. 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
  1400. or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
  1401. or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
  1402. or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
  1403. or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
  1404. or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
  1405. or u'%(id)s.%(ext)s'),
  1406. 'ignoreerrors': opts.ignoreerrors,
  1407. 'ratelimit': opts.ratelimit,
  1408. 'nooverwrites': opts.nooverwrites,
  1409. 'continuedl': opts.continue_dl,
  1410. 'noprogress': opts.noprogress,
  1411. })
  1412. fd.add_info_extractor(youtube_search_ie)
  1413. fd.add_info_extractor(youtube_pl_ie)
  1414. fd.add_info_extractor(youtube_user_ie)
  1415. fd.add_info_extractor(metacafe_ie)
  1416. fd.add_info_extractor(youtube_ie)
  1417. fd.add_info_extractor(google_ie)
  1418. fd.add_info_extractor(photobucket_ie)
  1419. # This must come last since it's the
  1420. # fallback if none of the others work
  1421. fd.add_info_extractor(generic_ie)
  1422. # Update version
  1423. if opts.update_self:
  1424. update_self(fd, sys.argv[0])
  1425. # Maybe do nothing
  1426. if len(all_urls) < 1:
  1427. if not opts.update_self:
  1428. parser.error(u'you must provide at least one URL')
  1429. else:
  1430. sys.exit()
  1431. retcode = fd.download(all_urls)
  1432. sys.exit(retcode)
  1433. except DownloadError:
  1434. sys.exit(1)
  1435. except SameFileError:
  1436. sys.exit(u'ERROR: fixed output name but more than one file to download')
  1437. except KeyboardInterrupt:
  1438. sys.exit(u'\nERROR: Interrupted by user')