浏览代码

[minhateca] Add extractor (Fixes #4094)

Philipp Hagemeister 10 年之前
父节点
当前提交
4349c07dd7
共有 4 个文件被更改,包括 78 次插入2 次删除
  1. 1 0
      test/test_utils.py
  2. 1 0
      youtube_dl/extractor/__init__.py
  3. 71 0
      youtube_dl/extractor/minhateca.py
  4. 5 2
      youtube_dl/utils.py

+ 1 - 0
test/test_utils.py

@@ -376,6 +376,7 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(parse_filesize('2 MiB'), 2097152)
         self.assertEqual(parse_filesize('2 MiB'), 2097152)
         self.assertEqual(parse_filesize('5 GB'), 5000000000)
         self.assertEqual(parse_filesize('5 GB'), 5000000000)
         self.assertEqual(parse_filesize('1.2Tb'), 1200000000000)
         self.assertEqual(parse_filesize('1.2Tb'), 1200000000000)
+        self.assertEqual(parse_filesize('1,24 KB'), 1240)
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
     unittest.main()
     unittest.main()

+ 1 - 0
youtube_dl/extractor/__init__.py

@@ -217,6 +217,7 @@ from .mdr import MDRIE
 from .metacafe import MetacafeIE
 from .metacafe import MetacafeIE
 from .metacritic import MetacriticIE
 from .metacritic import MetacriticIE
 from .mgoon import MgoonIE
 from .mgoon import MgoonIE
+from .minhateca import MinhatecaIE
 from .ministrygrid import MinistryGridIE
 from .ministrygrid import MinistryGridIE
 from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mitele import MiTeleIE
 from .mitele import MiTeleIE

+ 71 - 0
youtube_dl/extractor/minhateca.py

@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse,
+    compat_urllib_request,
+)
+from ..utils import (
+    int_or_none,
+    parse_filesize,
+)
+
+
+class MinhatecaIE(InfoExtractor):
+    _VALID_URL = r'https?://minhateca\.com\.br/[^?#]+,(?P<id>[0-9]+)\.'
+    _TEST = {
+        'url': 'http://minhateca.com.br/pereba/misc/youtube-dl+test+video,125848331.mp4(video)',
+        'info_dict': {
+            'id': '125848331',
+            'ext': 'mp4',
+            'title': 'youtube-dl test video',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'filesize_approx': 1530000,
+            'duration': 9,
+            'view_count': int,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        token = self._html_search_regex(
+            r'<input name="__RequestVerificationToken".*?value="([^"]+)"',
+            webpage, 'request token')
+        token_data = [
+            ('fileId', video_id),
+            ('__RequestVerificationToken', token),
+        ]
+        req = compat_urllib_request.Request(
+            'http://minhateca.com.br/action/License/Download',
+            data=compat_urllib_parse.urlencode(token_data))
+        req.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        data = self._download_json(
+            req, video_id, note='Downloading metadata')
+
+        video_url = data['redirectUrl']
+        title_str = self._html_search_regex(
+            r'<h1.*?>(.*?)</h1>', webpage, 'title')
+        title, _, ext = title_str.rpartition('.')
+        filesize_approx = parse_filesize(self._html_search_regex(
+            r'<p class="fileSize">(.*?)</p>',
+            webpage, 'file size approximation', fatal=False))
+        duration = int_or_none(self._html_search_regex(
+            r'(?s)<p class="fileLeng[ht][th]">.*?([0-9]+)\s*s',
+            webpage, 'duration', fatal=False))
+        view_count = int_or_none(self._html_search_regex(
+            r'<p class="downloadsCounter">([0-9]+)</p>',
+            webpage, 'view count', fatal=False))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'ext': ext,
+            'filesize_approx': filesize_approx,
+            'duration': duration,
+            'view_count': view_count,
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }

+ 5 - 2
youtube_dl/utils.py

@@ -1090,11 +1090,14 @@ def parse_filesize(s):
     }
     }
 
 
     units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
     units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
-    m = re.match(r'(?P<num>[0-9]+(?:\.[0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
+    m = re.match(
+        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
     if not m:
     if not m:
         return None
         return None
 
 
-    return int(float(m.group('num')) * _UNIT_TABLE[m.group('unit')])
+    num_str = m.group('num').replace(',', '.')
+    mult = _UNIT_TABLE[m.group('unit')]
+    return int(float(num_str) * mult)
 
 
 
 
 def get_term_width():
 def get_term_width():