Przeglądaj źródła

[YoutubeDL] Escape non-ASCII characters in URLs

urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991).
Work around this by replacing the request's original URL with an escaped one.
Sergey M․ 11 lat temu
rodzic
commit
37419b4f99
1 zmienionych plików z 26 dodań i 0 usunięć
  1. 26 0
      youtube_dl/YoutubeDL.py

+ 26 - 0
youtube_dl/YoutubeDL.py

@@ -28,6 +28,7 @@ from .utils import (
     compat_str,
     compat_urllib_error,
     compat_urllib_request,
+    compat_urllib_parse_urlparse,
     ContentTooShortError,
     date_from_str,
     DateRange,
@@ -1241,6 +1242,31 @@ class YoutubeDL(object):
 
     def urlopen(self, req):
         """ Start an HTTP download """
+
+        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
+        # Working around by replacing request's original URL with escaped one
+
+        url = req if isinstance(req, compat_str) else req.get_full_url()
+
+        def escape(component):
+            return compat_cookiejar.escape_path(component.encode('utf-8'))
+
+        url_parsed = compat_urllib_parse_urlparse(url)
+        url_escaped = url_parsed._replace(
+            path=escape(url_parsed.path),
+            query=escape(url_parsed.query),
+            fragment=escape(url_parsed.fragment)
+        ).geturl()
+
+        # Substitute URL if any change after escaping
+        if url != url_escaped:
+            if isinstance(req, compat_str):
+                req = url_escaped
+            else:
+                req = compat_urllib_request.Request(
+                    url_escaped, data=req.data, headers=req.headers,
+                    origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
+
         return self._opener.open(req, timeout=self._socket_timeout)
 
     def print_debug_header(self):