瀏覽代碼

Document and test categories (#2923)

Philipp Hagemeister 11 年之前
父節點
當前提交
ad3bc6acd5
共有 2 個文件被更改,包括 11 次插入和 7 次刪除
  1. 2 0
      youtube_dl/extractor/common.py
  2. 9 7
      youtube_dl/extractor/youtube.py

+ 2 - 0
youtube_dl/extractor/common.py

@@ -113,6 +113,8 @@ class InfoExtractor(object):
     webpage_url:    The url to the video webpage, if given to youtube-dl it
     webpage_url:    The url to the video webpage, if given to youtube-dl it
                     should allow to get the same result again. (It will be set
                     should allow to get the same result again. (It will be set
                     by YoutubeDL if it's missing)
                     by YoutubeDL if it's missing)
+    categories:     A list of categories that the video falls in, for example
+                    ["Sports", "Berlin"]
 
 
     Unless mentioned otherwise, the fields should be Unicode strings.
     Unless mentioned otherwise, the fields should be Unicode strings.
 
 

+ 9 - 7
youtube_dl/extractor/youtube.py

@@ -242,7 +242,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                 u"uploader": u"Philipp Hagemeister",
                 u"uploader": u"Philipp Hagemeister",
                 u"uploader_id": u"phihag",
                 u"uploader_id": u"phihag",
                 u"upload_date": u"20121002",
                 u"upload_date": u"20121002",
-                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
+                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
+                u"categories": [u'Science & Technology'],
             }
             }
         },
         },
         {
         {
@@ -1136,18 +1137,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 
 
         # upload date
         # upload date
         upload_date = None
         upload_date = None
-        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
+        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
         if mobj is not None:
         if mobj is not None:
             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
             upload_date = unified_strdate(upload_date)
             upload_date = unified_strdate(upload_date)
 
 
-
-        video_categories = []
-        # categories
         m_cat_container = get_element_by_id("eow-category", video_webpage)
         m_cat_container = get_element_by_id("eow-category", video_webpage)
         if m_cat_container:
         if m_cat_container:
-            video_categories = re.findall(r'<a[^<]+>(.*?)</a>',
-                                m_cat_container, re.DOTALL)
+            category = self._html_search_regex(
+                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'cateory',
+                default=None)
+            video_categories = None if category is None else [category]
+        else:
+            video_categories = None
 
 
         # description
         # description
         video_description = get_element_by_id("eow-description", video_webpage)
         video_description = get_element_by_id("eow-description", video_webpage)