11 years ago · d05cfe0600
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -40,6 +40,8 @@ from youtube_dl.utils import (
 
				     parse_iso8601,
			
 
				     strip_jsonp,
			
 
				     uppercase_escape,
			
 
				+    escape_rfc3986,
			
 
				+    escape_url,
			
 
				 )
			
 
				 
			
 
				 
			
@@ -286,5 +288,34 @@ class TestUtil(unittest.TestCase):
 
				         self.assertEqual(uppercase_escape('aä'), 'aä')
			
 
				         self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐')
			
 
				 
			
 
				+    def test_escape_rfc3986(self):
			
 
				+        reserved = "!*'();:@&=+$,/?#[]"
			
 
				+        unreserved = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~'
			
 
				+        self.assertEqual(escape_rfc3986(reserved), reserved)  # reserved characters pass through unchanged
			
 
				+        self.assertEqual(escape_rfc3986(unreserved), unreserved)  # unreserved characters pass through unchanged
			
 
				+        self.assertEqual(escape_rfc3986('тест'), '%D1%82%D0%B5%D1%81%D1%82')  # non-ASCII text becomes percent-encoded UTF-8
			
 
				+        self.assertEqual(escape_rfc3986('%D1%82%D0%B5%D1%81%D1%82'), '%D1%82%D0%B5%D1%81%D1%82')  # already-escaped input is not escaped again
			
 
				+        self.assertEqual(escape_rfc3986('foo bar'), 'foo%20bar')  # a space becomes %20
			
 
				+        self.assertEqual(escape_rfc3986('foo%20bar'), 'foo%20bar')  # an existing %20 is kept as-is
			
 
				+
			
 
				+    def test_escape_url(self):
			
 
				+        self.assertEqual(
			
 
				+            escape_url('http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavré_FD.mp4'),
			
 
				+            'http://wowza.imust.org/srv/vod/telemb/new/UPLOAD/UPLOAD/20224_IncendieHavre%CC%81_FD.mp4'  # expects the combining acute accent (U+0301) escaped as %CC%81
			
 
				+        )
			
 
				+        self.assertEqual(
			
 
				+            escape_url('http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erklärt/Das-Erste/Video?documentId=22673108&bcastId=5290'),
			
 
				+            'http://www.ardmediathek.de/tv/Sturm-der-Liebe/Folge-2036-Zu-Mann-und-Frau-erkl%C3%A4rt/Das-Erste/Video?documentId=22673108&bcastId=5290'
			
 
				+        )
			
 
				+        self.assertEqual(
			
 
				+            escape_url('http://тест.рф/фрагмент'),
			
 
				+            'http://тест.рф/%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82'  # the non-ASCII host is expected to stay unescaped
			
 
				+        )
			
 
				+        self.assertEqual(
			
 
				+            escape_url('http://тест.рф/абв?абв=абв#абв'),
			
 
				+            'http://тест.рф/%D0%B0%D0%B1%D0%B2?%D0%B0%D0%B1%D0%B2=%D0%B0%D0%B1%D0%B2#%D0%B0%D0%B1%D0%B2'  # path, query and fragment are all escaped
			
 
				+        )
			
 
				+        self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0')  # a pure-ASCII URL is returned unchanged
			
 
				+
			
 
				 if __name__ == '__main__':
			
 
				     unittest.main()
			
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -28,7 +28,7 @@ from .utils import (
 
				     compat_str,
			
 
				     compat_urllib_error,
			
 
				     compat_urllib_request,
			
 
				-    compat_urllib_parse_urlparse,
			
 
				+    escape_url,
			
 
				     ContentTooShortError,
			
 
				     date_from_str,
			
 
				     DateRange,
			
@@ -1243,20 +1243,14 @@ class YoutubeDL(object):
 
				     def urlopen(self, req):
			
 
				         """ Start an HTTP download """
			
 
				 
			
 
				+        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
			
 
				+        # always respected by websites: some give out URLs with non-ASCII characters that are
			
 
				+        # not percent-encoded (see telemb.py, ard.py [#3412])
			
 
				         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
			
 
				-        # Working around by replacing request's original URL with escaped one
			
 
				-
			
 
				+        # To work around the aforementioned issue we replace the request's original URL with
			
 
				+        # a percent-encoded one
			
 
				         url = req if isinstance(req, compat_str) else req.get_full_url()
			
 
				-
			
 
				-        def escape(component):
			
 
				-            return compat_cookiejar.escape_path(component.encode('utf-8'))
			
 
				-
			
 
				-        url_parsed = compat_urllib_parse_urlparse(url)
			
 
				-        url_escaped = url_parsed._replace(
			
 
				-            path=escape(url_parsed.path),
			
 
				-            query=escape(url_parsed.query),
			
 
				-            fragment=escape(url_parsed.fragment)
			
 
				-        ).geturl()
			
 
				+        url_escaped = escape_url(url)
			
 
				 
			
 
				         # Substitute URL if any change after escaping
			
 
				         if url != url_escaped:
			
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1418,6 +1418,24 @@ def uppercase_escape(s):
 
				         lambda m: unicode_escape(m.group(0))[0],
			
 
				         s)
			
 
				 
			
 
				+
			
 
				+def escape_rfc3986(s):
			
 
				+    """Percent-encode s, leaving RFC 3986 reserved characters and existing '%' escapes untouched"""
			
 
				+    if sys.version_info < (3, 0) and isinstance(s, unicode):  # Python 2 only: hand quote() UTF-8 bytes instead of unicode
			
 
				+        s = s.encode('utf-8')
			
 
				+    return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]")  # second argument: characters quote() must leave unescaped
			
 
				+
			
 
				+
			
 
				+def escape_url(url):
			
 
				+    """Return url with its path, params, query and fragment passed through escape_rfc3986; scheme and netloc are kept as parsed"""
			
 
				+    url_parsed = compat_urllib_parse_urlparse(url)
			
 
				+    return url_parsed._replace(
			
 
				+        path=escape_rfc3986(url_parsed.path),
			
 
				+        params=escape_rfc3986(url_parsed.params),
			
 
				+        query=escape_rfc3986(url_parsed.query),
			
 
				+        fragment=escape_rfc3986(url_parsed.fragment)
			
 
				+    ).geturl()
			
 
				+
			
 
				 try:
			
 
				     struct.pack(u'!I', 0)
			
 
				 except TypeError: