Pārlūkot izejas kodu

[minhateca] Add extractor (Fixes #4094)

Philipp Hagemeister 11 gadi atpakaļ
vecāks
revīzija
4349c07dd7

+ 1 - 0
test/test_utils.py

@@ -376,6 +376,7 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(parse_filesize('2 MiB'), 2097152)
         self.assertEqual(parse_filesize('2 MiB'), 2097152)
         self.assertEqual(parse_filesize('5 GB'), 5000000000)
         self.assertEqual(parse_filesize('5 GB'), 5000000000)
         self.assertEqual(parse_filesize('1.2Tb'), 1200000000000)
         self.assertEqual(parse_filesize('1.2Tb'), 1200000000000)
+        self.assertEqual(parse_filesize('1,24 KB'), 1240)
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
     unittest.main()
     unittest.main()

+ 1 - 0
youtube_dl/extractor/__init__.py

@@ -217,6 +217,7 @@ from .mdr import MDRIE
 from .metacafe import MetacafeIE
 from .metacafe import MetacafeIE
 from .metacritic import MetacriticIE
 from .metacritic import MetacriticIE
 from .mgoon import MgoonIE
 from .mgoon import MgoonIE
+from .minhateca import MinhatecaIE
 from .ministrygrid import MinistryGridIE
 from .ministrygrid import MinistryGridIE
 from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mitele import MiTeleIE
 from .mitele import MiTeleIE

+ 71 - 0
youtube_dl/extractor/minhateca.py

@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse,
+    compat_urllib_request,
+)
+from ..utils import (
+    int_or_none,
+    parse_filesize,
+)
+
+
+class MinhatecaIE(InfoExtractor):
+    """Extractor for files hosted on minhateca.com.br.
+
+    The file page carries a request-verification token; posting it together
+    with the numeric file id to the site's License/Download action returns
+    JSON whose ``redirectUrl`` is used as the media URL.
+    """
+    _VALID_URL = r'https?://minhateca\.com\.br/[^?#]+,(?P<id>[0-9]+)\.'
+    _TEST = {
+        'url': 'http://minhateca.com.br/pereba/misc/youtube-dl+test+video,125848331.mp4(video)',
+        'info_dict': {
+            'id': '125848331',
+            'ext': 'mp4',
+            'title': 'youtube-dl test video',
+            # Raw string: '\.' is not a valid escape in a plain literal.
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'filesize_approx': 1530000,
+            'duration': 9,
+            'view_count': int,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Build the info dict for the file page at ``url``.
+
+        Downloads the page, exchanges its verification token for the
+        download URL, and scrapes title, approximate size, duration and
+        download counter from the page markup.
+        """
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        token = self._html_search_regex(
+            r'<input name="__RequestVerificationToken".*?value="([^"]+)"',
+            webpage, 'request token')
+        token_data = [
+            ('fileId', video_id),
+            ('__RequestVerificationToken', token),
+        ]
+        # urlencode() yields text on Python 3, but the request body must be
+        # bytes there; the encoded form is pure ASCII, so this is lossless.
+        post_data = compat_urllib_parse.urlencode(token_data).encode('ascii')
+        req = compat_urllib_request.Request(
+            'http://minhateca.com.br/action/License/Download',
+            data=post_data)
+        req.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        data = self._download_json(
+            req, video_id, note='Downloading metadata')
+
+        video_url = data['redirectUrl']
+        title_str = self._html_search_regex(
+            r'<h1.*?>(.*?)</h1>', webpage, 'title')
+        # The heading is "<title>.<ext>". Without a dot, rpartition() returns
+        # ('', '', heading), which would give an empty title and a bogus
+        # extension, so fall back to the whole heading as the title.
+        title, sep, ext = title_str.rpartition('.')
+        if not sep:
+            title, ext = title_str, None
+        filesize_approx = parse_filesize(self._html_search_regex(
+            r'<p class="fileSize">(.*?)</p>',
+            webpage, 'file size approximation', fatal=False))
+        # The character classes accept both "fileLength" and "fileLenght".
+        duration = int_or_none(self._html_search_regex(
+            r'(?s)<p class="fileLeng[ht][th]">.*?([0-9]+)\s*s',
+            webpage, 'duration', fatal=False))
+        view_count = int_or_none(self._html_search_regex(
+            r'<p class="downloadsCounter">([0-9]+)</p>',
+            webpage, 'view count', fatal=False))
+
+        info = {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'filesize_approx': filesize_approx,
+            'duration': duration,
+            'view_count': view_count,
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
+        # Only report an extension when the heading actually contained one.
+        if ext:
+            info['ext'] = ext
+        return info

+ 5 - 2
youtube_dl/utils.py

@@ -1090,11 +1090,14 @@ def parse_filesize(s):
     }
     }
 
 
     units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
     units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
-    m = re.match(r'(?P<num>[0-9]+(?:\.[0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
+    m = re.match(
+        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
     if not m:
     if not m:
         return None
         return None
 
 
-    return int(float(m.group('num')) * _UNIT_TABLE[m.group('unit')])
+    num_str = m.group('num').replace(',', '.')
+    mult = _UNIT_TABLE[m.group('unit')]
+    return int(float(num_str) * mult)
 
 
 
 
 def get_term_width():
 def get_term_width():