Sfoglia il codice sorgente

Correct XML ampersand fixup

Philipp Hagemeister 11 anni fa
parent
commit
5aafe895fc

+ 14 - 0
test/test_utils.py

@@ -16,6 +16,7 @@ from youtube_dl.utils import (
     DateRange,
     encodeFilename,
     find_xpath_attr,
+    fix_xml_ampersands,
     get_meta_content,
     orderedSet,
     parse_duration,
@@ -200,5 +201,18 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(parse_duration('9:12:43'), 33163)
         self.assertEqual(parse_duration('x:y'), None)
 
+    def test_fix_xml_ampersands(self):
+        self.assertEqual(
+            fix_xml_ampersands('"&x=y&z=a'), '"&amp;x=y&amp;z=a')
+        self.assertEqual(
+            fix_xml_ampersands('"&amp;x=y&wrong;&z=a'),
+            '"&amp;x=y&amp;wrong;&amp;z=a')
+        self.assertEqual(
+            fix_xml_ampersands('&amp;&apos;&gt;&lt;&quot;'),
+            '&amp;&apos;&gt;&lt;&quot;')
+        self.assertEqual(
+            fix_xml_ampersands('&#1234;&#x1abC;'), '&#1234;&#x1abC;')
+        self.assertEqual(fix_xml_ampersands('&#&#'), '&amp;#&amp;#')
+
 if __name__ == '__main__':
     unittest.main()

+ 2 - 2
youtube_dl/extractor/clipsyndicate.py

@@ -3,7 +3,7 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     find_xpath_attr,
-    fix_xml_all_ampersand,
+    fix_xml_ampersands
 )
 
 
@@ -33,7 +33,7 @@ class ClipsyndicateIE(InfoExtractor):
         pdoc = self._download_xml(
             'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
             video_id, u'Downloading video info',
-            transform_source=fix_xml_all_ampersand) 
+            transform_source=fix_xml_ampersands)
 
         track_doc = pdoc.find('trackList/track')
         def find_param(name):

+ 2 - 2
youtube_dl/extractor/metacritic.py

@@ -4,7 +4,7 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
-    fix_xml_all_ampersand,
+    fix_xml_ampersands,
 )
 
 
@@ -27,7 +27,7 @@ class MetacriticIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
         # The xml is not well formatted, there are raw '&'
         info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id,
-            video_id, 'Downloading info xml', transform_source=fix_xml_all_ampersand)
+            video_id, 'Downloading info xml', transform_source=fix_xml_ampersands)
 
         clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
         formats = []

+ 2 - 4
youtube_dl/extractor/mtv.py

@@ -5,6 +5,7 @@ from .common import InfoExtractor
 from ..utils import (
     compat_urllib_parse,
     ExtractorError,
+    fix_xml_ampersands,
 )
 
 def _media_xml_tag(tag):
@@ -83,12 +84,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
         video_id = self._id_from_uri(uri)
         data = compat_urllib_parse.urlencode({'uri': uri})
 
-        def fix_ampersand(s):
-            """ Fix unencoded ampersand in XML """
-            return s.replace(u'& ', '&amp; ')
         idoc = self._download_xml(
             self._FEED_URL + '?' + data, video_id,
-            u'Downloading info', transform_source=fix_ampersand)
+            u'Downloading info', transform_source=fix_xml_ampersands)
         return [self._get_video_info(item) for item in idoc.findall('.//item')]
 
 

+ 5 - 2
youtube_dl/utils.py

@@ -1092,9 +1092,12 @@ def month_by_name(name):
         return None
 
 
-def fix_xml_all_ampersand(xml_str):
+def fix_xml_ampersands(xml_str):
     """Replace all the '&' by '&amp;' in XML"""
-    return xml_str.replace(u'&', u'&amp;')
+    return re.sub(
+        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
+        u'&amp;',
+        xml_str)
 
 
 def setproctitle(title):