11 years ago · 3828505646
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -152,86 +152,6 @@ def xpath_text(node, xpath, name=None, fatal=False):
 
															     return n.text
														
 
															-if sys.version_info < (2, 7):
														
 
															-    compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
														
 
															-
														
 
															-class BaseHTMLParser(compat_html_parser.HTMLParser):
														
 
															-    def __init(self):
														
 
															-        compat_html_parser.HTMLParser.__init__(self)
														
 
															-        self.html = None
														
 
															-
														
 
															-    def loads(self, html):
														
 
															-        self.html = html
														
 
															-        self.feed(html)
														
 
															-        self.close()
														
 
															-
														
 
															-class AttrParser(BaseHTMLParser):
														
 
															-    """Modified HTMLParser that isolates a tag with the specified attribute"""
														
 
															-    def __init__(self, attribute, value):
														
 
															-        self.attribute = attribute
														
 
															-        self.value = value
														
 
															-        self.result = None
														
 
															-        self.started = False
														
 
															-        self.depth = {}
														
 
															-        self.watch_startpos = False
														
 
															-        self.error_count = 0
														
 
															-        BaseHTMLParser.__init__(self)
														
 
															-
														
 
															-    def error(self, message):
														
 
															-        if self.error_count > 10 or self.started:
														
 
															-            raise compat_html_parser.HTMLParseError(message, self.getpos())
														
 
															-        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
														
 
															-        self.error_count += 1
														
 
															-        self.goahead(1)
														
 
															-
														
 
															-    def handle_starttag(self, tag, attrs):
														
 
															-        attrs = dict(attrs)
														
 
															-        if self.started:
														
 
															-            self.find_startpos(None)
														
 
															-        if self.attribute in attrs and attrs[self.attribute] == self.value:
														
 
															-            self.result = [tag]
														
 
															-            self.started = True
														
 
															-            self.watch_startpos = True
														
 
															-        if self.started:
														
 
															-            if not tag in self.depth: self.depth[tag] = 0
														
 
															-            self.depth[tag] += 1
														
 
															-
														
 
															-    def handle_endtag(self, tag):
														
 
															-        if self.started:
														
 
															-            if tag in self.depth: self.depth[tag] -= 1
														
 
															-            if self.depth[self.result[0]] == 0:
														
 
															-                self.started = False
														
 
															-                self.result.append(self.getpos())
														
 
															-
														
 
															-    def find_startpos(self, x):
														
 
															-        """Needed to put the start position of the result (self.result[1])
														
 
															-        after the opening tag with the requested id"""
														
 
															-        if self.watch_startpos:
														
 
															-            self.watch_startpos = False
														
 
															-            self.result.append(self.getpos())
														
 
															-    handle_entityref = handle_charref = handle_data = handle_comment = \
														
 
															-    handle_decl = handle_pi = unknown_decl = find_startpos
														
 
															-
														
 
															-    def get_result(self):
														
 
															-        if self.result is None:
														
 
															-            return None
														
 
															-        if len(self.result) != 3:
														
 
															-            return None
														
 
															-        lines = self.html.split('\n')
														
 
															-        lines = lines[self.result[1][0]-1:self.result[2][0]]
														
 
															-        lines[0] = lines[0][self.result[1][1]:]
														
 
															-        if len(lines) == 1:
														
 
															-            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
														
 
															-        lines[-1] = lines[-1][:self.result[2][1]]
														
 
															-        return '\n'.join(lines).strip()
														
 
															-# Hack for https://github.com/rg3/youtube-dl/issues/662
														
 
															-if sys.version_info < (2, 7, 3):
														
 
															-    AttrParser.parse_endtag = (lambda self, i:
														
 
															-        i + len("</scr'+'ipt>")
														
 
															-        if self.rawdata[i:].startswith("</scr'+'ipt>")
														
 
															-        else compat_html_parser.HTMLParser.parse_endtag(self, i))
														
 
															-
														
 
															-
														
 
															 def get_element_by_id(id, html):
														
 
															     """Return the content of the tag with the specified ID in the passed HTML document"""
														
 
															     return get_element_by_attribute("id", id, html)
														
@@ -239,34 +159,25 @@ def get_element_by_id(id, html):
 
															 def get_element_by_attribute(attribute, value, html):
														
 
															     """Return the content of the tag with the specified attribute in the passed HTML document"""
														
 
															-    parser = AttrParser(attribute, value)
														
 
															-    try:
														
 
															-        parser.loads(html)
														
 
															-    except compat_html_parser.HTMLParseError:
														
 
															-        pass
														
 
															-    return parser.get_result()
														
 
															-class MetaParser(BaseHTMLParser):
														
 
															-    """
														
 
															-    Modified HTMLParser that isolates a meta tag with the specified name 
														
 
															-    attribute.
														
 
															-    """
														
 
															-    def __init__(self, name):
														
 
															-        BaseHTMLParser.__init__(self)
														
 
															-        self.name = name
														
 
															-        self.content = None
														
 
															-        self.result = None
														
 
															-
														
 
															-    def handle_starttag(self, tag, attrs):
														
 
															-        if tag != 'meta':
														
 
															-            return
														
 
															-        attrs = dict(attrs)
														
 
															-        if attrs.get('name') == self.name:
														
 
															-            self.result = attrs.get('content')
														
 
															+    m = re.search(r'''(?xs)
														
 
															+        <([a-zA-Z0-9:._-]+)
														
 
															+         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
														
 
															+         \s+%s=['"]?%s['"]?
														
 
															+         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
														
 
															+        \s*>
														
 
															+        (?P<content>.*?)
														
 
															+        </\1>
														
 
															+    ''' % (re.escape(attribute), re.escape(value)), html)
														
 
															+
														
 
															+    if not m:
														
 
															+        return None
														
 
															+    res = m.group('content')
														
 
															-    def get_result(self):
														
 
															-        return self.result
														
 
															+    if res.startswith('"') or res.startswith("'"):
														
 
															+        res = res[1:-1]
														
 
															+    return unescapeHTML(res)
														
 
															 def clean_html(html):