Philipp Hagemeister 11 лет назад
Родитель
Commit
ad3bc6acd5
2 измененных файлов с 11 добавлено и 7 удалено
  1. 2 0
      youtube_dl/extractor/common.py
  2. 9 7
      youtube_dl/extractor/youtube.py

+ 2 - 0
youtube_dl/extractor/common.py

@@ -113,6 +113,8 @@ class InfoExtractor(object):
     webpage_url:    The url to the video webpage, if given to youtube-dl it
     webpage_url:    The url to the video webpage, if given to youtube-dl it
                     should allow to get the same result again. (It will be set
                     should allow to get the same result again. (It will be set
                     by YoutubeDL if it's missing)
                     by YoutubeDL if it's missing)
+    categories:     A list of categories that the video falls in, for example
+                    ["Sports", "Berlin"]
 
 
     Unless mentioned otherwise, the fields should be Unicode strings.
     Unless mentioned otherwise, the fields should be Unicode strings.
 
 

+ 9 - 7
youtube_dl/extractor/youtube.py

@@ -242,7 +242,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                 u"uploader": u"Philipp Hagemeister",
                 u"uploader": u"Philipp Hagemeister",
                 u"uploader_id": u"phihag",
                 u"uploader_id": u"phihag",
                 u"upload_date": u"20121002",
                 u"upload_date": u"20121002",
-                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
+                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
+                u"categories": [u'Science & Technology'],
             }
             }
         },
         },
         {
         {
@@ -1136,18 +1137,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 
 
         # upload date
         # upload date
         upload_date = None
         upload_date = None
-        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
+        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
         if mobj is not None:
         if mobj is not None:
             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
             upload_date = unified_strdate(upload_date)
             upload_date = unified_strdate(upload_date)
 
 
-
-        video_categories = []
-        # categories
         m_cat_container = get_element_by_id("eow-category", video_webpage)
         m_cat_container = get_element_by_id("eow-category", video_webpage)
         if m_cat_container:
         if m_cat_container:
-            video_categories = re.findall(r'<a[^<]+>(.*?)</a>',
-                                m_cat_container, re.DOTALL)
+            category = self._html_search_regex(
+                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'cateory',
+                default=None)
+            video_categories = None if category is None else [category]
+        else:
+            video_categories = None
 
 
         # description
         # description
         video_description = get_element_by_id("eow-description", video_webpage)
         video_description = get_element_by_id("eow-description", video_webpage)