Pārlūkot izejas kodu

Add experimental geo restriction bypass mechanism
Based on faking X-Forwarded-For HTTP header

Sergey M․ 8 gadi atpakaļ
vecāks
revīzija
773f291dcb

+ 17 - 0
youtube_dl/YoutubeDL.py

@@ -56,6 +56,8 @@ from .utils import (
     ExtractorError,
     ExtractorError,
     format_bytes,
     format_bytes,
     formatSeconds,
     formatSeconds,
+    GeoRestrictedError,
+    ISO3166Utils,
     locked_file,
     locked_file,
     make_HTTPS_handler,
     make_HTTPS_handler,
     MaxDownloadsReached,
     MaxDownloadsReached,
@@ -272,6 +274,13 @@ class YoutubeDL(object):
                        If it returns None, the video is downloaded.
                        If it returns None, the video is downloaded.
                        match_filter_func in utils.py is one example for this.
                        match_filter_func in utils.py is one example for this.
     no_color:          Do not emit color codes in output.
     no_color:          Do not emit color codes in output.
+    bypass_geo_restriction:
+                       Bypass geographic restriction via faking X-Forwarded-For
+                       HTTP header (experimental)
+    bypass_geo_restriction_as_country:
+                       Two-letter ISO 3166-1 alpha-2 country code that will be
+                       used for explicit geographic restriction bypassing via
+                       faking X-Forwarded-For HTTP header (experimental)
 
 
     The following options determine which downloader is picked:
     The following options determine which downloader is picked:
     external_downloader: Executable of the external downloader to call.
     external_downloader: Executable of the external downloader to call.
@@ -707,6 +716,14 @@ class YoutubeDL(object):
                     return self.process_ie_result(ie_result, download, extra_info)
                     return self.process_ie_result(ie_result, download, extra_info)
                 else:
                 else:
                     return ie_result
                     return ie_result
+            except GeoRestrictedError as e:
+                msg = e.msg
+                if e.countries:
+                    msg += '\nThis video is available in %s.' % ', '.join(
+                        map(ISO3166Utils.short2full, e.countries))
+                msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
+                self.report_error(msg)
+                break
             except ExtractorError as e:  # An error we somewhat expected
             except ExtractorError as e:  # An error we somewhat expected
                 self.report_error(compat_str(e), e.format_traceback())
                 self.report_error(compat_str(e), e.format_traceback())
                 break
                 break

+ 2 - 0
youtube_dl/__init__.py

@@ -414,6 +414,8 @@ def _real_main(argv=None):
         'cn_verification_proxy': opts.cn_verification_proxy,
         'cn_verification_proxy': opts.cn_verification_proxy,
         'geo_verification_proxy': opts.geo_verification_proxy,
         'geo_verification_proxy': opts.geo_verification_proxy,
         'config_location': opts.config_location,
         'config_location': opts.config_location,
+        'bypass_geo_restriction': opts.bypass_geo_restriction,
+        'bypass_geo_restriction_as_country': opts.bypass_geo_restriction_as_country,
     }
     }
 
 
     with YoutubeDL(ydl_opts) as ydl:
     with YoutubeDL(ydl_opts) as ydl:

+ 42 - 6
youtube_dl/extractor/common.py

@@ -6,6 +6,7 @@ import hashlib
 import json
 import json
 import netrc
 import netrc
 import os
 import os
+import random
 import re
 import re
 import socket
 import socket
 import sys
 import sys
@@ -39,6 +40,8 @@ from ..utils import (
     ExtractorError,
     ExtractorError,
     fix_xml_ampersands,
     fix_xml_ampersands,
     float_or_none,
     float_or_none,
+    GeoRestrictedError,
+    GeoUtils,
     int_or_none,
     int_or_none,
     js_to_json,
     js_to_json,
     parse_iso8601,
     parse_iso8601,
@@ -320,17 +323,25 @@ class InfoExtractor(object):
     _real_extract() methods and define a _VALID_URL regexp.
     _real_extract() methods and define a _VALID_URL regexp.
     Probably, they should also be added to the list of extractors.
     Probably, they should also be added to the list of extractors.
 
 
+    The _BYPASS_GEO attribute may be set to False in order to disable
+    the automatic geo restriction bypass mechanism for a particular
+    extractor. Note that this does not disable the explicit geo restriction
+    bypass that is based on the country code provided with
+    bypass_geo_restriction_as_country.
+
     Finally, the _WORKING attribute should be set to False for broken IEs
     Finally, the _WORKING attribute should be set to False for broken IEs
     in order to warn the users and skip the tests.
     in order to warn the users and skip the tests.
     """
     """
 
 
     _ready = False
     _ready = False
     _downloader = None
     _downloader = None
+    _x_forwarded_for_ip = None
+    _BYPASS_GEO = True
     _WORKING = True
     _WORKING = True
 
 
     def __init__(self, downloader=None):
     def __init__(self, downloader=None):
         """Constructor. Receives an optional downloader."""
         """Constructor. Receives an optional downloader."""
         self._ready = False
         self._ready = False
+        self._x_forwarded_for_ip = None
         self.set_downloader(downloader)
         self.set_downloader(downloader)
 
 
     @classmethod
     @classmethod
@@ -359,6 +370,10 @@ class InfoExtractor(object):
 
 
     def initialize(self):
     def initialize(self):
         """Initializes an instance (authentication, etc)."""
         """Initializes an instance (authentication, etc)."""
+        if not self._x_forwarded_for_ip:
+            country_code = self._downloader.params.get('bypass_geo_restriction_as_country', None)
+            if country_code:
+                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
         if not self._ready:
         if not self._ready:
             self._real_initialize()
             self._real_initialize()
             self._ready = True
             self._ready = True
@@ -366,8 +381,22 @@ class InfoExtractor(object):
     def extract(self, url):
     def extract(self, url):
         """Extracts URL information and returns it in list of dicts."""
         """Extracts URL information and returns it in list of dicts."""
         try:
         try:
-            self.initialize()
-            return self._real_extract(url)
+            for _ in range(2):
+                try:
+                    self.initialize()
+                    return self._real_extract(url)
+                except GeoRestrictedError as e:
+                    if (not self._downloader.params.get('bypass_geo_restriction_as_country', None) and
+                            self._BYPASS_GEO and
+                            self._downloader.params.get('bypass_geo_restriction', True) and
+                            not self._x_forwarded_for_ip and
+                            e.countries):
+                        self._x_forwarded_for_ip = GeoUtils.random_ipv4(random.choice(e.countries))
+                        if self._x_forwarded_for_ip:
+                            self.report_warning(
+                                'Video is geo restricted. Retrying extraction with fake %s IP as X-Forwarded-For.' % self._x_forwarded_for_ip)
+                            continue
+                    raise
         except ExtractorError:
         except ExtractorError:
             raise
             raise
         except compat_http_client.IncompleteRead as e:
         except compat_http_client.IncompleteRead as e:
@@ -434,6 +463,15 @@ class InfoExtractor(object):
         if isinstance(url_or_request, (compat_str, str)):
         if isinstance(url_or_request, (compat_str, str)):
             url_or_request = url_or_request.partition('#')[0]
             url_or_request = url_or_request.partition('#')[0]
 
 
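+        # Some sites check the X-Forwarded-For HTTP header in order to figure
+        # out the origin of a client behind a proxy. This allows bypassing geo
+        # restriction by setting this header to an IP address that belongs to
+        # a country where the content is not restricted. The fake IP is set
+        # either up front (explicit country code given by the user) or once
+        # a geo restriction error has been encountered. A header explicitly
+        # passed by the caller is never overridden.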
+        if self._x_forwarded_for_ip:
+            if 'X-Forwarded-For' not in headers:
+                headers['X-Forwarded-For'] = self._x_forwarded_for_ip
+
         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
         if urlh is False:
         if urlh is False:
             assert not fatal
             assert not fatal
@@ -609,10 +647,8 @@ class InfoExtractor(object):
             expected=True)
             expected=True)
 
 
     @staticmethod
     @staticmethod
-    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
-        raise ExtractorError(
-            '%s. You might want to use --proxy to workaround.' % msg,
-            expected=True)
+    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
+        """Raise GeoRestrictedError with the given message.
+
+        countries is forwarded unchanged to the exception; callers that
+        know where the video is available may pass a list of country
+        codes, otherwise it stays None.
+        """
+        raise GeoRestrictedError(msg, countries=countries)
 
 
     # Methods for following #608
     # Methods for following #608
     @staticmethod
     @staticmethod

+ 12 - 0
youtube_dl/options.py

@@ -549,6 +549,18 @@ def parseOpts(overrideArguments=None):
             'Upper bound of a range for randomized sleep before each download '
             'Upper bound of a range for randomized sleep before each download '
             '(maximum possible number of seconds to sleep). Must only be used '
             '(maximum possible number of seconds to sleep). Must only be used '
             'along with --min-sleep-interval.'))
             'along with --min-sleep-interval.'))
+    workarounds.add_option(
+        '--bypass-geo',
+        action='store_true', dest='bypass_geo_restriction', default=True,
+        help='Bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental)')
+    workarounds.add_option(
+        '--no-bypass-geo',
+        action='store_false', dest='bypass_geo_restriction', default=True,
+        help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header (experimental)')
+    workarounds.add_option(
+        '--bypass-geo-as-country', metavar='CODE',
+        dest='bypass_geo_restriction_as_country', default=None,
+        help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code (experimental)')
 
 
     verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
     verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
     verbosity.add_option(
     verbosity.add_option(

+ 267 - 0
youtube_dl/utils.py

@@ -23,6 +23,7 @@ import operator
 import os
 import os
 import pipes
 import pipes
 import platform
 import platform
+import random
 import re
 import re
 import socket
 import socket
 import ssl
 import ssl
@@ -747,6 +748,18 @@ class RegexNotFoundError(ExtractorError):
     pass
     pass
 
 
 
 
+class GeoRestrictedError(ExtractorError):
+    """Geographic restriction Error exception.
+
+    This exception may be thrown when a video is not available from your
+    geographic location due to geographic restrictions imposed by a website.
+
+    Attributes:
+    msg:        The message passed to the constructor.
+    countries:  Optional collection of country codes passed to the
+                constructor (None when not provided).
+    """
+    def __init__(self, msg, countries=None):
+        # Geo restriction is always reported as an expected error.
+        super(GeoRestrictedError, self).__init__(msg, expected=True)
+        # Keep the caller supplied values available to exception handlers.
+        self.msg = msg
+        self.countries = countries
+
+
 class DownloadError(YoutubeDLError):
 class DownloadError(YoutubeDLError):
     """Download Error exception.
     """Download Error exception.
 
 
@@ -3027,6 +3040,260 @@ class ISO3166Utils(object):
         return cls._country_map.get(code.upper())
         return cls._country_map.get(code.upper())
 
 
 
 
+class GeoUtils(object):
+    # Major IPv4 address blocks per country
+    _country_ip_map = {
+        'AD': '85.94.160.0/19',
+        'AE': '94.200.0.0/13',
+        'AF': '149.54.0.0/17',
+        'AG': '209.59.64.0/18',
+        'AI': '204.14.248.0/21',
+        'AL': '46.99.0.0/16',
+        'AM': '46.70.0.0/15',
+        'AO': '105.168.0.0/13',
+        'AP': '159.117.192.0/21',
+        'AR': '181.0.0.0/12',
+        'AS': '202.70.112.0/20',
+        'AT': '84.112.0.0/13',
+        'AU': '1.128.0.0/11',
+        'AW': '181.41.0.0/18',
+        'AZ': '5.191.0.0/16',
+        'BA': '31.176.128.0/17',
+        'BB': '65.48.128.0/17',
+        'BD': '114.130.0.0/16',
+        'BE': '57.0.0.0/8',
+        'BF': '129.45.128.0/17',
+        'BG': '95.42.0.0/15',
+        'BH': '37.131.0.0/17',
+        'BI': '154.117.192.0/18',
+        'BJ': '137.255.0.0/16',
+        'BL': '192.131.134.0/24',
+        'BM': '196.12.64.0/18',
+        'BN': '156.31.0.0/16',
+        'BO': '161.56.0.0/16',
+        'BQ': '161.0.80.0/20',
+        'BR': '152.240.0.0/12',
+        'BS': '24.51.64.0/18',
+        'BT': '119.2.96.0/19',
+        'BW': '168.167.0.0/16',
+        'BY': '178.120.0.0/13',
+        'BZ': '179.42.192.0/18',
+        'CA': '99.224.0.0/11',
+        'CD': '41.243.0.0/16',
+        'CF': '196.32.200.0/21',
+        'CG': '197.214.128.0/17',
+        'CH': '85.0.0.0/13',
+        'CI': '154.232.0.0/14',
+        'CK': '202.65.32.0/19',
+        'CL': '152.172.0.0/14',
+        'CM': '165.210.0.0/15',
+        'CN': '36.128.0.0/10',
+        'CO': '181.240.0.0/12',
+        'CR': '201.192.0.0/12',
+        'CU': '152.206.0.0/15',
+        'CV': '165.90.96.0/19',
+        'CW': '190.88.128.0/17',
+        'CY': '46.198.0.0/15',
+        'CZ': '88.100.0.0/14',
+        'DE': '53.0.0.0/8',
+        'DJ': '197.241.0.0/17',
+        'DK': '87.48.0.0/12',
+        'DM': '192.243.48.0/20',
+        'DO': '152.166.0.0/15',
+        'DZ': '41.96.0.0/12',
+        'EC': '186.68.0.0/15',
+        'EE': '90.190.0.0/15',
+        'EG': '156.160.0.0/11',
+        'ER': '196.200.96.0/20',
+        'ES': '88.0.0.0/11',
+        'ET': '196.188.0.0/14',
+        'EU': '2.16.0.0/13',
+        'FI': '91.152.0.0/13',
+        'FJ': '144.120.0.0/16',
+        'FM': '119.252.112.0/20',
+        'FO': '88.85.32.0/19',
+        'FR': '90.0.0.0/9',
+        'GA': '41.158.0.0/15',
+        'GB': '25.0.0.0/8',
+        'GD': '74.122.88.0/21',
+        'GE': '31.146.0.0/16',
+        'GF': '161.22.64.0/18',
+        'GG': '62.68.160.0/19',
+        'GH': '45.208.0.0/14',
+        'GI': '85.115.128.0/19',
+        'GL': '88.83.0.0/19',
+        'GM': '160.182.0.0/15',
+        'GN': '197.149.192.0/18',
+        'GP': '104.250.0.0/19',
+        'GQ': '105.235.224.0/20',
+        'GR': '94.64.0.0/13',
+        'GT': '168.234.0.0/16',
+        'GU': '168.123.0.0/16',
+        'GW': '197.214.80.0/20',
+        'GY': '181.41.64.0/18',
+        'HK': '113.252.0.0/14',
+        'HN': '181.210.0.0/16',
+        'HR': '93.136.0.0/13',
+        'HT': '148.102.128.0/17',
+        'HU': '84.0.0.0/14',
+        'ID': '39.192.0.0/10',
+        'IE': '87.32.0.0/12',
+        'IL': '79.176.0.0/13',
+        'IM': '5.62.80.0/20',
+        'IN': '117.192.0.0/10',
+        'IO': '203.83.48.0/21',
+        'IQ': '37.236.0.0/14',
+        'IR': '2.176.0.0/12',
+        'IS': '82.221.0.0/16',
+        'IT': '79.0.0.0/10',
+        'JE': '87.244.64.0/18',
+        'JM': '72.27.0.0/17',
+        'JO': '176.29.0.0/16',
+        'JP': '126.0.0.0/8',
+        'KE': '105.48.0.0/12',
+        'KG': '158.181.128.0/17',
+        'KH': '36.37.128.0/17',
+        'KI': '103.25.140.0/22',
+        'KM': '197.255.224.0/20',
+        'KN': '198.32.32.0/19',
+        'KP': '175.45.176.0/22',
+        'KR': '175.192.0.0/10',
+        'KW': '37.36.0.0/14',
+        'KY': '64.96.0.0/15',
+        'KZ': '2.72.0.0/13',
+        'LA': '115.84.64.0/18',
+        'LB': '178.135.0.0/16',
+        'LC': '192.147.231.0/24',
+        'LI': '82.117.0.0/19',
+        'LK': '112.134.0.0/15',
+        'LR': '41.86.0.0/19',
+        'LS': '129.232.0.0/17',
+        'LT': '78.56.0.0/13',
+        'LU': '188.42.0.0/16',
+        'LV': '46.109.0.0/16',
+        'LY': '41.252.0.0/14',
+        'MA': '105.128.0.0/11',
+        'MC': '88.209.64.0/18',
+        'MD': '37.246.0.0/16',
+        'ME': '178.175.0.0/17',
+        'MF': '74.112.232.0/21',
+        'MG': '154.126.0.0/17',
+        'MH': '117.103.88.0/21',
+        'MK': '77.28.0.0/15',
+        'ML': '154.118.128.0/18',
+        'MM': '37.111.0.0/17',
+        'MN': '49.0.128.0/17',
+        'MO': '60.246.0.0/16',
+        'MP': '202.88.64.0/20',
+        'MQ': '109.203.224.0/19',
+        'MR': '41.188.64.0/18',
+        'MS': '208.90.112.0/22',
+        'MT': '46.11.0.0/16',
+        'MU': '105.16.0.0/12',
+        'MV': '27.114.128.0/18',
+        'MW': '105.234.0.0/16',
+        'MX': '187.192.0.0/11',
+        'MY': '175.136.0.0/13',
+        'MZ': '197.218.0.0/15',
+        'NA': '41.182.0.0/16',
+        'NC': '101.101.0.0/18',
+        'NE': '197.214.0.0/18',
+        'NF': '203.17.240.0/22',
+        'NG': '105.112.0.0/12',
+        'NI': '186.76.0.0/15',
+        'NL': '145.96.0.0/11',
+        'NO': '84.208.0.0/13',
+        'NP': '36.252.0.0/15',
+        'NR': '203.98.224.0/19',
+        'NU': '49.156.48.0/22',
+        'NZ': '49.224.0.0/14',
+        'OM': '5.36.0.0/15',
+        'PA': '186.72.0.0/15',
+        'PE': '186.160.0.0/14',
+        'PF': '123.50.64.0/18',
+        'PG': '124.240.192.0/19',
+        'PH': '49.144.0.0/13',
+        'PK': '39.32.0.0/11',
+        'PL': '83.0.0.0/11',
+        'PM': '70.36.0.0/20',
+        'PR': '66.50.0.0/16',
+        'PS': '188.161.0.0/16',
+        'PT': '85.240.0.0/13',
+        'PW': '202.124.224.0/20',
+        'PY': '181.120.0.0/14',
+        'QA': '37.210.0.0/15',
+        'RE': '139.26.0.0/16',
+        'RO': '79.112.0.0/13',
+        'RS': '178.220.0.0/14',
+        'RU': '5.136.0.0/13',
+        'RW': '105.178.0.0/15',
+        'SA': '188.48.0.0/13',
+        'SB': '202.1.160.0/19',
+        'SC': '154.192.0.0/11',
+        'SD': '154.96.0.0/13',
+        'SE': '78.64.0.0/12',
+        'SG': '152.56.0.0/14',
+        'SI': '188.196.0.0/14',
+        'SK': '78.98.0.0/15',
+        'SL': '197.215.0.0/17',
+        'SM': '89.186.32.0/19',
+        'SN': '41.82.0.0/15',
+        'SO': '197.220.64.0/19',
+        'SR': '186.179.128.0/17',
+        'SS': '105.235.208.0/21',
+        'ST': '197.159.160.0/19',
+        'SV': '168.243.0.0/16',
+        'SX': '190.102.0.0/20',
+        'SY': '5.0.0.0/16',
+        'SZ': '41.84.224.0/19',
+        'TC': '65.255.48.0/20',
+        'TD': '154.68.128.0/19',
+        'TG': '196.168.0.0/14',
+        'TH': '171.96.0.0/13',
+        'TJ': '85.9.128.0/18',
+        'TK': '27.96.24.0/21',
+        'TL': '180.189.160.0/20',
+        'TM': '95.85.96.0/19',
+        'TN': '197.0.0.0/11',
+        'TO': '175.176.144.0/21',
+        'TR': '78.160.0.0/11',
+        'TT': '186.44.0.0/15',
+        'TV': '202.2.96.0/19',
+        'TW': '120.96.0.0/11',
+        'TZ': '156.156.0.0/14',
+        'UA': '93.72.0.0/13',
+        'UG': '154.224.0.0/13',
+        'US': '3.0.0.0/8',
+        'UY': '167.56.0.0/13',
+        'UZ': '82.215.64.0/18',
+        'VA': '212.77.0.0/19',
+        'VC': '24.92.144.0/20',
+        'VE': '186.88.0.0/13',
+        'VG': '172.103.64.0/18',
+        'VI': '146.226.0.0/16',
+        'VN': '14.160.0.0/11',
+        'VU': '202.80.32.0/20',
+        'WF': '117.20.32.0/21',
+        'WS': '202.4.32.0/19',
+        'YE': '134.35.0.0/16',
+        'YT': '41.242.116.0/22',
+        'ZA': '41.0.0.0/11',
+        'ZM': '165.56.0.0/13',
+        'ZW': '41.85.192.0/19',
+    }
+
+    @classmethod
+    def random_ipv4(cls, code):
+        """Return a random IPv4 address from the block mapped to a country.
+
+        code is upper-cased before it is looked up in _country_ip_map.
+        Returns the address in dotted-quad notation, or None when the map
+        has no block for the given code.
+        """
+        block = cls._country_ip_map.get(code.upper())
+        if not block:
+            return None
+        # Blocks are stored in CIDR notation: '<network address>/<prefix length>'
+        addr, preflen = block.split('/')
+        # Lowest address of the block as a 32-bit unsigned integer
+        # (compat_struct_unpack is presumably a struct.unpack wrapper - TODO confirm)
+        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
+        # Setting every host bit yields the highest address of the block
+        addr_max = addr_min | (0xffffffff >> int(preflen))
+        # random.randint is inclusive on both ends, so the first and the last
+        # address of the block may be returned as well
+        return socket.inet_ntoa(
+            compat_struct_pack('!I', random.randint(addr_min, addr_max)))
+
+
 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
     def __init__(self, proxies=None):
     def __init__(self, proxies=None):
         # Set default handlers
         # Set default handlers