Browse Source

New option --restrict-filenames

Philipp Hagemeister 12 years ago
parent
commit
1c469a9480
7 changed files with 77 additions and 41 deletions
  1. 2 0
      README.md
  2. 25 2
      test/test_utils.py
  3. 3 1
      youtube-dl.1
  4. 1 1
      youtube-dl.bash-completion
  5. 33 32
      youtube_dl/FileDownloader.py
  6. 4 0
      youtube_dl/__init__.py
  7. 9 5
      youtube_dl/utils.py

+ 2 - 0
README.md

@@ -47,6 +47,8 @@ which means you can modify it, redistribute it or use it however you like.
                              %(extractor)s for the provider (youtube, metacafe,
                              %(extractor)s for the provider (youtube, metacafe,
                              etc), %(id)s for the video id and %% for a literal
                              etc), %(id)s for the video id and %% for a literal
                              percent. Use - to output to stdout.
                              percent. Use - to output to stdout.
+    --restrict-filenames     Avoid some characters such as "&" and spaces in
+                             filenames
     -a, --batch-file FILE    file containing URLs to download ('-' for stdin)
     -a, --batch-file FILE    file containing URLs to download ('-' for stdin)
     -w, --no-overwrites      do not overwrite files
     -w, --no-overwrites      do not overwrite files
     -c, --continue           resume partially downloaded files
     -c, --continue           resume partially downloaded files

+ 25 - 2
test/test_utils.py

@@ -30,11 +30,34 @@ class TestUtil(unittest.TestCase):
 		self.assertEqual(u'yes no', sanitize_filename(u'yes? no'))
 		self.assertEqual(u'yes no', sanitize_filename(u'yes? no'))
 		self.assertEqual(u'this - that', sanitize_filename(u'this: that'))
 		self.assertEqual(u'this - that', sanitize_filename(u'this: that'))
 
 
+		self.assertEqual(sanitize_filename(u'AT&T'), u'AT&T')
 		self.assertEqual(sanitize_filename(u'ä'), u'ä')
 		self.assertEqual(sanitize_filename(u'ä'), u'ä')
 		self.assertEqual(sanitize_filename(u'кириллица'), u'кириллица')
 		self.assertEqual(sanitize_filename(u'кириллица'), u'кириллица')
 
 
-		for forbidden in u'"\0\\/':
-			self.assertTrue(forbidden not in sanitize_filename(forbidden))
+		forbidden = u'"\0\\/'
+		for fc in forbidden:
+			for fbc in forbidden:
+				self.assertTrue(fbc not in sanitize_filename(fc))
+
+	def test_sanitize_filename_restricted(self):
+		self.assertEqual(sanitize_filename(u'abc', restricted=True), u'abc')
+		self.assertEqual(sanitize_filename(u'abc_d-e', restricted=True), u'abc_d-e')
+
+		self.assertEqual(sanitize_filename(u'123', restricted=True), u'123')
+
+		self.assertEqual(u'abc-de', sanitize_filename(u'abc/de', restricted=True))
+		self.assertFalse(u'/' in sanitize_filename(u'abc/de///', restricted=True))
+
+		self.assertEqual(u'abc-de', sanitize_filename(u'abc/<>\\*|de', restricted=True))
+		self.assertEqual(u'xxx', sanitize_filename(u'xxx/<>\\*|', restricted=True))
+		self.assertEqual(u'yes_no', sanitize_filename(u'yes? no', restricted=True))
+		self.assertEqual(u'this_-_that', sanitize_filename(u'this: that', restricted=True))
+
+		forbidden = u'"\0\\/&: \'\t\n'
+		for fc in forbidden:
+			print('input: ' + fc + ', result: ' + repr(sanitize_filename(fc, restricted=True)))
+			for fbc in forbidden:
+				self.assertTrue(fbc not in sanitize_filename(fc, restricted=True))
 
 
 	def test_ordered_set(self):
 	def test_ordered_set(self):
 		self.assertEqual(orderedSet([1,1,2,3,4,4,5,6,7,3,5]), [1,2,3,4,5,6,7])
 		self.assertEqual(orderedSet([1,1,2,3,4,4,5,6,7,3,5]), [1,2,3,4,5,6,7])

+ 3 - 1
youtube-dl.1

@@ -59,6 +59,8 @@ redistribute it or use it however you like.
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ %(extractor)s\ for\ the\ provider\ (youtube,\ metacafe,
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ %(extractor)s\ for\ the\ provider\ (youtube,\ metacafe,
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ etc),\ %(id)s\ for\ the\ video\ id\ and\ %%\ for\ a\ literal
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ etc),\ %(id)s\ for\ the\ video\ id\ and\ %%\ for\ a\ literal
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ percent.\ Use\ -\ to\ output\ to\ stdout.
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ percent.\ Use\ -\ to\ output\ to\ stdout.
+--restrict-filenames\ \ \ \ \ Avoid\ some\ characters\ such\ as\ "&"\ and\ spaces\ in
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ filenames
 -a,\ --batch-file\ FILE\ \ \ \ file\ containing\ URLs\ to\ download\ (\[aq]-\[aq]\ for\ stdin)
 -a,\ --batch-file\ FILE\ \ \ \ file\ containing\ URLs\ to\ download\ (\[aq]-\[aq]\ for\ stdin)
 -w,\ --no-overwrites\ \ \ \ \ \ do\ not\ overwrite\ files
 -w,\ --no-overwrites\ \ \ \ \ \ do\ not\ overwrite\ files
 -c,\ --continue\ \ \ \ \ \ \ \ \ \ \ resume\ partially\ downloaded\ files
 -c,\ --continue\ \ \ \ \ \ \ \ \ \ \ resume\ partially\ downloaded\ files
@@ -210,7 +212,7 @@ Please note that Python 2.5 is not supported anymore.
 .PP
 .PP
 Since June 2012 (#342) youtube-dl is packed as an executable zipfile,
 Since June 2012 (#342) youtube-dl is packed as an executable zipfile,
 simply unzip it (might need renaming to \f[C]youtube-dl.zip\f[] first on
 simply unzip it (might need renaming to \f[C]youtube-dl.zip\f[] first on
-some systems) or clone the git repo to see the code.
+some systems) or clone the git repository, as laid out above.
 If you modify the code, you can run it by executing the
 If you modify the code, you can run it by executing the
 \f[C]__main__.py\f[] file.
 \f[C]__main__.py\f[] file.
 To recompile the executable, run \f[C]make\ youtube-dl\f[].
 To recompile the executable, run \f[C]make\ youtube-dl\f[].

+ 1 - 1
youtube-dl.bash-completion

@@ -3,7 +3,7 @@ __youtube-dl()
     local cur prev opts
     local cur prev opts
     COMPREPLY=()
     COMPREPLY=()
     cur="${COMP_WORDS[COMP_CWORD]}"
     cur="${COMP_WORDS[COMP_CWORD]}"
-    opts="--all-formats --audio-format --audio-quality --auto-number --batch-file --console-title --continue --cookies --dump-user-agent --extract-audio --format --get-description --get-filename --get-format --get-thumbnail --get-title --get-url --help --id --ignore-errors --keep-video --list-extractors --list-formats --literal --match-title --max-downloads --max-quality --netrc --no-continue --no-mtime --no-overwrites --no-part --no-progress --output --password --playlist-end --playlist-start --prefer-free-formats --quiet --rate-limit --reject-title --retries --simulate --skip-download --srt-lang --title --update --user-agent --username --verbose --version --write-description --write-info-json --write-srt"
+    opts="--all-formats --audio-format --audio-quality --auto-number --batch-file --console-title --continue --cookies --dump-user-agent --extract-audio --format --get-description --get-filename --get-format --get-thumbnail --get-title --get-url --help --id --ignore-errors --keep-video --list-extractors --list-formats --literal --match-title --max-downloads --max-quality --netrc --no-continue --no-mtime --no-overwrites --no-part --no-progress --output --password --playlist-end --playlist-start --prefer-free-formats --quiet --rate-limit --reject-title --restrict-filenames --retries --simulate --skip-download --srt-lang --title --update --user-agent --username --verbose --version --write-description --write-info-json --write-srt"
 
 
     if [[ ${cur} == * ]] ; then
     if [[ ${cur} == * ]] ; then
         COMPREPLY=( $(compgen -W "${opts}" -- ${cur}) )
         COMPREPLY=( $(compgen -W "${opts}" -- ${cur}) )

+ 33 - 32
youtube_dl/FileDownloader.py

@@ -44,37 +44,38 @@ class FileDownloader(object):
 
 
 	Available options:
 	Available options:
 
 
-	username:         Username for authentication purposes.
-	password:         Password for authentication purposes.
-	usenetrc:         Use netrc for authentication instead.
-	quiet:            Do not print messages to stdout.
-	forceurl:         Force printing final URL.
-	forcetitle:       Force printing title.
-	forcethumbnail:   Force printing thumbnail URL.
-	forcedescription: Force printing description.
-	forcefilename:    Force printing final filename.
-	simulate:         Do not download the video files.
-	format:           Video format code.
-	format_limit:     Highest quality format to try.
-	outtmpl:          Template for output names.
-	ignoreerrors:     Do not stop on download errors.
-	ratelimit:        Download speed limit, in bytes/sec.
-	nooverwrites:     Prevent overwriting files.
-	retries:          Number of times to retry for HTTP error 5xx
-	continuedl:       Try to continue downloads if possible.
-	noprogress:       Do not print the progress bar.
-	playliststart:    Playlist item to start at.
-	playlistend:      Playlist item to end at.
-	matchtitle:       Download only matching titles.
-	rejecttitle:      Reject downloads for matching titles.
-	logtostderr:      Log messages to stderr instead of stdout.
-	consoletitle:     Display progress in console window's titlebar.
-	nopart:           Do not use temporary .part files.
-	updatetime:       Use the Last-modified header to set output file timestamps.
-	writedescription: Write the video description to a .description file
-	writeinfojson:    Write the video description to a .info.json file
-	writesubtitles:   Write the video subtitles to a .srt file
-	subtitleslang:    Language of the subtitles to download
+	username:          Username for authentication purposes.
+	password:          Password for authentication purposes.
+	usenetrc:          Use netrc for authentication instead.
+	quiet:             Do not print messages to stdout.
+	forceurl:          Force printing final URL.
+	forcetitle:        Force printing title.
+	forcethumbnail:    Force printing thumbnail URL.
+	forcedescription:  Force printing description.
+	forcefilename:     Force printing final filename.
+	simulate:          Do not download the video files.
+	format:            Video format code.
+	format_limit:      Highest quality format to try.
+	outtmpl:           Template for output names.
+	restrictfilenames: Do not allow "&" and spaces in file names
+	ignoreerrors:      Do not stop on download errors.
+	ratelimit:         Download speed limit, in bytes/sec.
+	nooverwrites:      Prevent overwriting files.
+	retries:           Number of times to retry for HTTP error 5xx
+	continuedl:        Try to continue downloads if possible.
+	noprogress:        Do not print the progress bar.
+	playliststart:     Playlist item to start at.
+	playlistend:       Playlist item to end at.
+	matchtitle:        Download only matching titles.
+	rejecttitle:       Reject downloads for matching titles.
+	logtostderr:       Log messages to stderr instead of stdout.
+	consoletitle:      Display progress in console window's titlebar.
+	nopart:            Do not use temporary .part files.
+	updatetime:        Use the Last-modified header to set output file timestamps.
+	writedescription:  Write the video description to a .description file
+	writeinfojson:     Write the video description to a .info.json file
+	writesubtitles:    Write the video subtitles to a .srt file
+	subtitleslang:     Language of the subtitles to download
 	"""
 	"""
 
 
 	params = None
 	params = None
@@ -349,7 +350,7 @@ class FileDownloader(object):
 	def process_info(self, info_dict):
 	def process_info(self, info_dict):
 		"""Process a single dictionary returned by an InfoExtractor."""
 		"""Process a single dictionary returned by an InfoExtractor."""
 
 
-		info_dict['stitle'] = sanitize_filename(info_dict['title'])
+		info_dict['stitle'] = sanitize_filename(info_dict['title'], self.params.get('restrictfilenames'))
 
 
 		reason = self._match_entry(info_dict)
 		reason = self._match_entry(info_dict)
 		if reason is not None:
 		if reason is not None:

+ 4 - 0
youtube_dl/__init__.py

@@ -272,6 +272,9 @@ def parseOpts():
 			help='number downloaded files starting from 00000', default=False)
 			help='number downloaded files starting from 00000', default=False)
 	filesystem.add_option('-o', '--output',
 	filesystem.add_option('-o', '--output',
 			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(title)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), %(extractor)s for the provider (youtube, metacafe, etc), %(id)s for the video id and %% for a literal percent. Use - to output to stdout.')
 			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(title)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), %(extractor)s for the provider (youtube, metacafe, etc), %(id)s for the video id and %% for a literal percent. Use - to output to stdout.')
+	filesystem.add_option('--restrict-filenames',
+			action='store_true', dest='restrictfilenames',
+			help='Avoid some characters such as "&" and spaces in filenames', default=False)
 	filesystem.add_option('-a', '--batch-file',
 	filesystem.add_option('-a', '--batch-file',
 			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
 			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
 	filesystem.add_option('-w', '--no-overwrites',
 	filesystem.add_option('-w', '--no-overwrites',
@@ -485,6 +488,7 @@ def _real_main():
 			or (opts.useid and u'%(id)s.%(ext)s')
 			or (opts.useid and u'%(id)s.%(ext)s')
 			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
 			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
 			or u'%(id)s.%(ext)s'),
 			or u'%(id)s.%(ext)s'),
+		'restrictfilenames': opts.restrictfilenames,
 		'ignoreerrors': opts.ignoreerrors,
 		'ignoreerrors': opts.ignoreerrors,
 		'ratelimit': opts.ratelimit,
 		'ratelimit': opts.ratelimit,
 		'nooverwrites': opts.nooverwrites,
 		'nooverwrites': opts.nooverwrites,

+ 9 - 5
youtube_dl/utils.py

@@ -194,18 +194,22 @@ def timeconvert(timestr):
 	if timetuple is not None:
 	if timetuple is not None:
 		timestamp = email.utils.mktime_tz(timetuple)
 		timestamp = email.utils.mktime_tz(timetuple)
 	return timestamp
 	return timestamp
-	
-def sanitize_filename(s):
-	"""Sanitizes a string so it could be used as part of a filename."""
+
+def sanitize_filename(s, restricted=False):
+	"""Sanitizes a string so it could be used as part of a filename.
+	If restricted is set, use a stricter subset of allowed characters.
+	"""
 	def replace_insane(char):
 	def replace_insane(char):
 		if char == '?' or ord(char) < 32 or ord(char) == 127:
 		if char == '?' or ord(char) < 32 or ord(char) == 127:
 			return ''
 			return ''
 		elif char == '"':
 		elif char == '"':
-			return '\''
+			return '' if restricted else '\''
 		elif char == ':':
 		elif char == ':':
-			return ' -'
+			return '_-' if restricted else ' -'
 		elif char in '\\/|*<>':
 		elif char in '\\/|*<>':
 			return '-'
 			return '-'
+		if restricted and (char in '&\'' or char.isspace()):
+			return '_'
 		return char
 		return char
 
 
 	result = u''.join(map(replace_insane, s))
 	result = u''.join(map(replace_insane, s))