浏览代码

[utils] Use a regexp instead of HTMLParser for get_element_by_attribute

Philipp Hagemeister 10 年之前
父节点
当前提交
3828505646
共有 1 个文件被更改,包括 16 次插入105 次删除
  1. 16 105
      youtube_dl/utils.py

+ 16 - 105
youtube_dl/utils.py

@@ -152,86 +152,6 @@ def xpath_text(node, xpath, name=None, fatal=False):
     return n.text
     return n.text
 
 
 
 
-if sys.version_info < (2, 7):
-    compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
-
-class BaseHTMLParser(compat_html_parser.HTMLParser):
-    def __init(self):
-        compat_html_parser.HTMLParser.__init__(self)
-        self.html = None
-
-    def loads(self, html):
-        self.html = html
-        self.feed(html)
-        self.close()
-
-class AttrParser(BaseHTMLParser):
-    """Modified HTMLParser that isolates a tag with the specified attribute"""
-    def __init__(self, attribute, value):
-        self.attribute = attribute
-        self.value = value
-        self.result = None
-        self.started = False
-        self.depth = {}
-        self.watch_startpos = False
-        self.error_count = 0
-        BaseHTMLParser.__init__(self)
-
-    def error(self, message):
-        if self.error_count > 10 or self.started:
-            raise compat_html_parser.HTMLParseError(message, self.getpos())
-        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
-        self.error_count += 1
-        self.goahead(1)
-
-    def handle_starttag(self, tag, attrs):
-        attrs = dict(attrs)
-        if self.started:
-            self.find_startpos(None)
-        if self.attribute in attrs and attrs[self.attribute] == self.value:
-            self.result = [tag]
-            self.started = True
-            self.watch_startpos = True
-        if self.started:
-            if not tag in self.depth: self.depth[tag] = 0
-            self.depth[tag] += 1
-
-    def handle_endtag(self, tag):
-        if self.started:
-            if tag in self.depth: self.depth[tag] -= 1
-            if self.depth[self.result[0]] == 0:
-                self.started = False
-                self.result.append(self.getpos())
-
-    def find_startpos(self, x):
-        """Needed to put the start position of the result (self.result[1])
-        after the opening tag with the requested id"""
-        if self.watch_startpos:
-            self.watch_startpos = False
-            self.result.append(self.getpos())
-    handle_entityref = handle_charref = handle_data = handle_comment = \
-    handle_decl = handle_pi = unknown_decl = find_startpos
-
-    def get_result(self):
-        if self.result is None:
-            return None
-        if len(self.result) != 3:
-            return None
-        lines = self.html.split('\n')
-        lines = lines[self.result[1][0]-1:self.result[2][0]]
-        lines[0] = lines[0][self.result[1][1]:]
-        if len(lines) == 1:
-            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
-        lines[-1] = lines[-1][:self.result[2][1]]
-        return '\n'.join(lines).strip()
-# Hack for https://github.com/rg3/youtube-dl/issues/662
-if sys.version_info < (2, 7, 3):
-    AttrParser.parse_endtag = (lambda self, i:
-        i + len("</scr'+'ipt>")
-        if self.rawdata[i:].startswith("</scr'+'ipt>")
-        else compat_html_parser.HTMLParser.parse_endtag(self, i))
-
-
 def get_element_by_id(id, html):
 def get_element_by_id(id, html):
     """Return the content of the tag with the specified ID in the passed HTML document"""
     """Return the content of the tag with the specified ID in the passed HTML document"""
     return get_element_by_attribute("id", id, html)
     return get_element_by_attribute("id", id, html)
@@ -239,34 +159,25 @@ def get_element_by_id(id, html):
 
 
 def get_element_by_attribute(attribute, value, html):
 def get_element_by_attribute(attribute, value, html):
     """Return the content of the tag with the specified attribute in the passed HTML document"""
     """Return the content of the tag with the specified attribute in the passed HTML document"""
-    parser = AttrParser(attribute, value)
-    try:
-        parser.loads(html)
-    except compat_html_parser.HTMLParseError:
-        pass
-    return parser.get_result()
 
 
-class MetaParser(BaseHTMLParser):
-    """
-    Modified HTMLParser that isolates a meta tag with the specified name 
-    attribute.
-    """
-    def __init__(self, name):
-        BaseHTMLParser.__init__(self)
-        self.name = name
-        self.content = None
-        self.result = None
-
-    def handle_starttag(self, tag, attrs):
-        if tag != 'meta':
-            return
-        attrs = dict(attrs)
-        if attrs.get('name') == self.name:
-            self.result = attrs.get('content')
+    m = re.search(r'''(?xs)
+        <([a-zA-Z0-9:._-]+)
+         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+         \s+%s=['"]?%s['"]?
+         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+        \s*>
+        (?P<content>.*?)
+        </\1>
+    ''' % (re.escape(attribute), re.escape(value)), html)
+
+    if not m:
+        return None
+    res = m.group('content')
 
 
-    def get_result(self):
-        return self.result
+    if res.startswith('"') or res.startswith("'"):
+        res = res[1:-1]
 
 
+    return unescapeHTML(res)
 
 
 
 
 def clean_html(html):
 def clean_html(html):