Browse Source

Use a wrapper around xml.etree.ElementTree.fromstring in python 2.x (#7178)

On Python 2.x, the attributes of parsed XML nodes are str (bytes) objects rather than unicode objects, so they couldn't be used directly in info_dict fields (for example, '--write-description' doesn't work with bytes).
Jaime Marquínez Ferrándiz 9 years ago
parent
commit
36e6f62cd0

+ 7 - 0
test/test_compat.py

@@ -13,8 +13,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from youtube_dl.utils import get_filesystem_encoding
 from youtube_dl.utils import get_filesystem_encoding
 from youtube_dl.compat import (
 from youtube_dl.compat import (
     compat_getenv,
     compat_getenv,
+    compat_etree_fromstring,
     compat_expanduser,
     compat_expanduser,
     compat_shlex_split,
     compat_shlex_split,
+    compat_str,
     compat_urllib_parse_unquote,
     compat_urllib_parse_unquote,
     compat_urllib_parse_unquote_plus,
     compat_urllib_parse_unquote_plus,
 )
 )
@@ -71,5 +73,10 @@ class TestCompat(unittest.TestCase):
     def test_compat_shlex_split(self):
     def test_compat_shlex_split(self):
         self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two'])
         self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two'])
 
 
+    def test_compat_etree_fromstring(self):
+        xml = '<el foo="bar"></el>'
+        doc = compat_etree_fromstring(xml.encode('utf-8'))
+        self.assertTrue(isinstance(doc.attrib['foo'], compat_str))
+
 if __name__ == '__main__':
 if __name__ == '__main__':
     unittest.main()
     unittest.main()

+ 7 - 4
test/test_utils.py

@@ -68,6 +68,9 @@ from youtube_dl.utils import (
     cli_valueless_option,
     cli_valueless_option,
     cli_bool_option,
     cli_bool_option,
 )
 )
+from youtube_dl.compat import (
+    compat_etree_fromstring,
+)
 
 
 
 
 class TestUtil(unittest.TestCase):
 class TestUtil(unittest.TestCase):
@@ -242,7 +245,7 @@ class TestUtil(unittest.TestCase):
             <node x="b" y="d" />
             <node x="b" y="d" />
             <node x="" />
             <node x="" />
         </root>'''
         </root>'''
-        doc = xml.etree.ElementTree.fromstring(testxml)
+        doc = compat_etree_fromstring(testxml)
 
 
         self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n'), None)
         self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n'), None)
         self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None)
         self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None)
@@ -263,7 +266,7 @@ class TestUtil(unittest.TestCase):
                 <url>http://server.com/download.mp3</url>
                 <url>http://server.com/download.mp3</url>
             </media:song>
             </media:song>
         </root>'''
         </root>'''
-        doc = xml.etree.ElementTree.fromstring(testxml)
+        doc = compat_etree_fromstring(testxml)
         find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'}))
         find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'}))
         self.assertTrue(find('media:song') is not None)
         self.assertTrue(find('media:song') is not None)
         self.assertEqual(find('media:song/media:author').text, 'The Author')
         self.assertEqual(find('media:song/media:author').text, 'The Author')
@@ -285,7 +288,7 @@ class TestUtil(unittest.TestCase):
                 <p>Foo</p>
                 <p>Foo</p>
             </div>
             </div>
         </root>'''
         </root>'''
-        doc = xml.etree.ElementTree.fromstring(testxml)
+        doc = compat_etree_fromstring(testxml)
         self.assertEqual(xpath_text(doc, 'div/p'), 'Foo')
         self.assertEqual(xpath_text(doc, 'div/p'), 'Foo')
         self.assertEqual(xpath_text(doc, 'div/bar', default='default'), 'default')
         self.assertEqual(xpath_text(doc, 'div/bar', default='default'), 'default')
         self.assertTrue(xpath_text(doc, 'div/bar') is None)
         self.assertTrue(xpath_text(doc, 'div/bar') is None)
@@ -297,7 +300,7 @@ class TestUtil(unittest.TestCase):
                 <p x="a">Foo</p>
                 <p x="a">Foo</p>
             </div>
             </div>
         </root>'''
         </root>'''
-        doc = xml.etree.ElementTree.fromstring(testxml)
+        doc = compat_etree_fromstring(testxml)
         self.assertEqual(xpath_attr(doc, 'div/p', 'x'), 'a')
         self.assertEqual(xpath_attr(doc, 'div/p', 'x'), 'a')
         self.assertEqual(xpath_attr(doc, 'div/bar', 'x'), None)
         self.assertEqual(xpath_attr(doc, 'div/bar', 'x'), None)
         self.assertEqual(xpath_attr(doc, 'div/p', 'y'), None)
         self.assertEqual(xpath_attr(doc, 'div/p', 'y'), None)

+ 25 - 0
youtube_dl/compat.py

@@ -14,6 +14,7 @@ import socket
 import subprocess
 import subprocess
 import sys
 import sys
 import itertools
 import itertools
+import xml.etree.ElementTree
 
 
 
 
 try:
 try:
@@ -212,6 +213,29 @@ try:
 except ImportError:  # Python 2.6
 except ImportError:  # Python 2.6
     from xml.parsers.expat import ExpatError as compat_xml_parse_error
     from xml.parsers.expat import ExpatError as compat_xml_parse_error
 
 
+if sys.version_info[0] >= 3:
+    compat_etree_fromstring = xml.etree.ElementTree.fromstring
+else:
+    # on python 2.x the attributes of a node are str objects instead of
+    # unicode
+    etree = xml.etree.ElementTree
+
+    # on 2.6 XML doesn't have a parser argument, function copied from CPython
+    # 2.7 source
+    def _XML(text, parser=None):
+        if not parser:
+            parser = etree.XMLParser(target=etree.TreeBuilder())
+        parser.feed(text)
+        return parser.close()
+
+    def _element_factory(*args, **kwargs):
+        el = etree.Element(*args, **kwargs)
+        for k, v in el.items():
+            el.set(k, v.decode('utf-8'))
+        return el
+
+    def compat_etree_fromstring(text):
+        return _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory)))
 
 
 try:
 try:
     from urllib.parse import parse_qs as compat_parse_qs
     from urllib.parse import parse_qs as compat_parse_qs
@@ -507,6 +531,7 @@ __all__ = [
     'compat_chr',
     'compat_chr',
     'compat_cookiejar',
     'compat_cookiejar',
     'compat_cookies',
     'compat_cookies',
+    'compat_etree_fromstring',
     'compat_expanduser',
     'compat_expanduser',
     'compat_get_terminal_size',
     'compat_get_terminal_size',
     'compat_getenv',
     'compat_getenv',

+ 2 - 2
youtube_dl/downloader/f4m.py

@@ -5,10 +5,10 @@ import io
 import itertools
 import itertools
 import os
 import os
 import time
 import time
-import xml.etree.ElementTree as etree
 
 
 from .fragment import FragmentFD
 from .fragment import FragmentFD
 from ..compat import (
 from ..compat import (
+    compat_etree_fromstring,
     compat_urlparse,
     compat_urlparse,
     compat_urllib_error,
     compat_urllib_error,
     compat_urllib_parse_urlparse,
     compat_urllib_parse_urlparse,
@@ -290,7 +290,7 @@ class F4mFD(FragmentFD):
         man_url = urlh.geturl()
         man_url = urlh.geturl()
         manifest = urlh.read()
         manifest = urlh.read()
 
 
-        doc = etree.fromstring(manifest)
+        doc = compat_etree_fromstring(manifest)
         formats = [(int(f.attrib.get('bitrate', -1)), f)
         formats = [(int(f.attrib.get('bitrate', -1)), f)
                    for f in self._get_unencrypted_media(doc)]
                    for f in self._get_unencrypted_media(doc)]
         if requested_bitrate is None:
         if requested_bitrate is None:

+ 5 - 3
youtube_dl/extractor/bbc.py

@@ -2,7 +2,6 @@
 from __future__ import unicode_literals
 from __future__ import unicode_literals
 
 
 import re
 import re
-import xml.etree.ElementTree
 
 
 from .common import InfoExtractor
 from .common import InfoExtractor
 from ..utils import (
 from ..utils import (
@@ -14,7 +13,10 @@ from ..utils import (
     remove_end,
     remove_end,
     unescapeHTML,
     unescapeHTML,
 )
 )
-from ..compat import compat_HTTPError
+from ..compat import (
+    compat_etree_fromstring,
+    compat_HTTPError,
+)
 
 
 
 
 class BBCCoUkIE(InfoExtractor):
 class BBCCoUkIE(InfoExtractor):
@@ -344,7 +346,7 @@ class BBCCoUkIE(InfoExtractor):
                 url, programme_id, 'Downloading media selection XML')
                 url, programme_id, 'Downloading media selection XML')
         except ExtractorError as ee:
         except ExtractorError as ee:
             if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
             if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
-                media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8'))
+                media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
             else:
             else:
                 raise
                 raise
         return self._process_media_selector(media_selection, programme_id)
         return self._process_media_selector(media_selection, programme_id)

+ 4 - 2
youtube_dl/extractor/bilibili.py

@@ -4,9 +4,11 @@ from __future__ import unicode_literals
 import re
 import re
 import itertools
 import itertools
 import json
 import json
-import xml.etree.ElementTree as ET
 
 
 from .common import InfoExtractor
 from .common import InfoExtractor
+from ..compat import (
+    compat_etree_fromstring,
+)
 from ..utils import (
 from ..utils import (
     int_or_none,
     int_or_none,
     unified_strdate,
     unified_strdate,
@@ -88,7 +90,7 @@ class BiliBiliIE(InfoExtractor):
         except ValueError:
         except ValueError:
             pass
             pass
 
 
-        lq_doc = ET.fromstring(lq_page)
+        lq_doc = compat_etree_fromstring(lq_page)
         lq_durls = lq_doc.findall('./durl')
         lq_durls = lq_doc.findall('./durl')
 
 
         hq_doc = self._download_xml(
         hq_doc = self._download_xml(

+ 2 - 2
youtube_dl/extractor/brightcove.py

@@ -3,10 +3,10 @@ from __future__ import unicode_literals
 
 
 import re
 import re
 import json
 import json
-import xml.etree.ElementTree
 
 
 from .common import InfoExtractor
 from .common import InfoExtractor
 from ..compat import (
 from ..compat import (
+    compat_etree_fromstring,
     compat_parse_qs,
     compat_parse_qs,
     compat_str,
     compat_str,
     compat_urllib_parse,
     compat_urllib_parse,
@@ -119,7 +119,7 @@ class BrightcoveIE(InfoExtractor):
         object_str = fix_xml_ampersands(object_str)
         object_str = fix_xml_ampersands(object_str)
 
 
         try:
         try:
-            object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
+            object_doc = compat_etree_fromstring(object_str.encode('utf-8'))
         except compat_xml_parse_error:
         except compat_xml_parse_error:
             return
             return
 
 

+ 2 - 2
youtube_dl/extractor/common.py

@@ -10,7 +10,6 @@ import re
 import socket
 import socket
 import sys
 import sys
 import time
 import time
-import xml.etree.ElementTree
 
 
 from ..compat import (
 from ..compat import (
     compat_cookiejar,
     compat_cookiejar,
@@ -23,6 +22,7 @@ from ..compat import (
     compat_urllib_request,
     compat_urllib_request,
     compat_urlparse,
     compat_urlparse,
     compat_str,
     compat_str,
+    compat_etree_fromstring,
 )
 )
 from ..utils import (
 from ..utils import (
     NO_DEFAULT,
     NO_DEFAULT,
@@ -461,7 +461,7 @@ class InfoExtractor(object):
             return xml_string
             return xml_string
         if transform_source:
         if transform_source:
             xml_string = transform_source(xml_string)
             xml_string = transform_source(xml_string)
-        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
+        return compat_etree_fromstring(xml_string.encode('utf-8'))
 
 
     def _download_json(self, url_or_request, video_id,
     def _download_json(self, url_or_request, video_id,
                        note='Downloading JSON metadata',
                        note='Downloading JSON metadata',

+ 2 - 2
youtube_dl/extractor/crunchyroll.py

@@ -5,12 +5,12 @@ import re
 import json
 import json
 import base64
 import base64
 import zlib
 import zlib
-import xml.etree.ElementTree
 
 
 from hashlib import sha1
 from hashlib import sha1
 from math import pow, sqrt, floor
 from math import pow, sqrt, floor
 from .common import InfoExtractor
 from .common import InfoExtractor
 from ..compat import (
 from ..compat import (
+    compat_etree_fromstring,
     compat_urllib_parse,
     compat_urllib_parse,
     compat_urllib_parse_unquote,
     compat_urllib_parse_unquote,
     compat_urllib_request,
     compat_urllib_request,
@@ -234,7 +234,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
         return output
         return output
 
 
     def _extract_subtitles(self, subtitle):
     def _extract_subtitles(self, subtitle):
-        sub_root = xml.etree.ElementTree.fromstring(subtitle)
+        sub_root = compat_etree_fromstring(subtitle)
         return [{
         return [{
             'ext': 'srt',
             'ext': 'srt',
             'data': self._convert_subtitles_to_srt(sub_root),
             'data': self._convert_subtitles_to_srt(sub_root),

+ 3 - 3
youtube_dl/extractor/vevo.py

@@ -1,10 +1,10 @@
 from __future__ import unicode_literals
 from __future__ import unicode_literals
 
 
 import re
 import re
-import xml.etree.ElementTree
 
 
 from .common import InfoExtractor
 from .common import InfoExtractor
 from ..compat import (
 from ..compat import (
+    compat_etree_fromstring,
     compat_urllib_request,
     compat_urllib_request,
 )
 )
 from ..utils import (
 from ..utils import (
@@ -97,7 +97,7 @@ class VevoIE(InfoExtractor):
         if last_version['version'] == -1:
         if last_version['version'] == -1:
             raise ExtractorError('Unable to extract last version of the video')
             raise ExtractorError('Unable to extract last version of the video')
 
 
-        renditions = xml.etree.ElementTree.fromstring(last_version['data'])
+        renditions = compat_etree_fromstring(last_version['data'])
         formats = []
         formats = []
         # Already sorted from worst to best quality
         # Already sorted from worst to best quality
         for rend in renditions.findall('rendition'):
         for rend in renditions.findall('rendition'):
@@ -114,7 +114,7 @@ class VevoIE(InfoExtractor):
 
 
     def _formats_from_smil(self, smil_xml):
     def _formats_from_smil(self, smil_xml):
         formats = []
         formats = []
-        smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8'))
+        smil_doc = compat_etree_fromstring(smil_xml.encode('utf-8'))
         els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video')
         els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video')
         for el in els:
         for el in els:
             src = el.attrib['src']
             src = el.attrib['src']

+ 2 - 1
youtube_dl/utils.py

@@ -36,6 +36,7 @@ import zlib
 from .compat import (
 from .compat import (
     compat_basestring,
     compat_basestring,
     compat_chr,
     compat_chr,
+    compat_etree_fromstring,
     compat_html_entities,
     compat_html_entities,
     compat_http_client,
     compat_http_client,
     compat_kwargs,
     compat_kwargs,
@@ -1974,7 +1975,7 @@ def dfxp2srt(dfxp_data):
 
 
         return out
         return out
 
 
-    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
+    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
     out = []
     out = []
     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')