9 years ago · 2d19fb5072
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -6,11 +6,18 @@ import json
 
				 import sys
			
 
				 
			
 
				 from .common import InfoExtractor
			
 
				-from ..compat import compat_str
			
 
				+from ..compat import (
			
 
				+    compat_str,
			
 
				+    compat_urlparse,
			
 
				+)
			
 
				 from ..utils import (
			
 
				+    clean_html,
			
 
				     ExtractorError,
			
 
				+    get_element_by_class,
			
 
				     int_or_none,
			
 
				     orderedSet,
			
 
				+    parse_duration,
			
 
				+    remove_start,
			
 
				     str_to_int,
			
 
				     unescapeHTML,
			
 
				     unified_strdate,
			
@@ -20,7 +27,54 @@ from .vimeo import VimeoIE
 
				 from .pladform import PladformIE
			
 
				 
			
 
				 
			
 
				-class VKIE(InfoExtractor):
			
 
				+class VKBaseIE(InfoExtractor):
				+    """Base class for the VK extractors; logs in once during initialization.
				+
				+    Subclasses inherit `_login` and `_real_initialize`, so any of them
				+    authenticates before extraction when credentials are available.
				+    """
				+
				+    _NETRC_MACHINE = 'vk'
				+
				+    def _login(self):
				+        """Log in to VK with the configured credentials, if there are any.
				+
				+        Returns silently when no username is configured.
				+
				+        Raises:
				+            ExtractorError: (expected) when the login response contains
				+                the `onLoginFailed` marker.
				+        """
				+        (username, password) = self._get_login_info()
				+        if username is None:
				+            return
				+
				+        login_page, url_handle = self._download_webpage_handle(
				+            'https://vk.com', None, 'Downloading login page')
				+
				+        # Start from the hidden inputs of the landing page and add the
				+        # credentials on top of them.
				+        login_form = self._hidden_inputs(login_page)
				+
				+        # NOTE(review): presumably the login form expects cp1251-encoded
				+        # values -- confirm.
				+        login_form.update({
				+            'email': username.encode('cp1251'),
				+            'pass': password.encode('cp1251'),
				+        })
				+
				+        # https://new.vk.com/ serves two identical remixlhk cookies in the
				+        # Set-Cookie header and expects the first one to be used rather than
				+        # the second (see
				+        # https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201).
				+        # Per RFC 6265 the newer cookie replaces the older one in the cookie
				+        # store, which is what actually happens.
				+        # Work around this VK issue by manually resetting the remixlhk cookie
				+        # to the first one.
				+        cookies = url_handle.headers.get('Set-Cookie')
				+        # The header may be missing altogether; in that case there is nothing
				+        # to reset, and encoding/decoding None would raise AttributeError.
				+        if cookies:
				+            if sys.version_info[0] >= 3:
				+                # Turn the header text back into bytes so that it can be
				+                # decoded as UTF-8 below.
				+                cookies = cookies.encode('iso-8859-1')
				+            cookies = cookies.decode('utf-8')
				+            remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies)
				+            if remixlhk:
				+                value, domain = remixlhk.groups()
				+                self._set_cookie(domain, 'remixlhk', value)
				+
				+        login_page = self._download_webpage(
				+            'https://login.vk.com/?act=login', None,
				+            note='Logging in as %s' % username,
				+            data=urlencode_postdata(login_form))
				+
				+        if re.search(r'onLoginFailed', login_page):
				+            raise ExtractorError(
				+                'Unable to login, incorrect username and/or password', expected=True)
				+
				+    def _real_initialize(self):
				+        """Initialization hook: perform the (optional) login."""
				+        self._login()
			
 
				+
			
 
				+
			
 
				+class VKIE(VKBaseIE):
			
 
				     IE_NAME = 'vk'
			
 
				     IE_DESC = 'VK'
			
 
				     _VALID_URL = r'''(?x)
			
@@ -38,8 +92,6 @@ class VKIE(InfoExtractor):
 
				                             (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))?
			
 
				                         )
			
 
				                     '''
			
 
				-    _NETRC_MACHINE = 'vk'
			
 
				-
			
 
				     _TESTS = [
			
 
				         {
			
 
				             'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521',
			
@@ -189,49 +241,6 @@ class VKIE(InfoExtractor):
 
				         }
			
 
				     ]
			
 
				 
			
 
				-    def _login(self):
			
 
				-        (username, password) = self._get_login_info()
			
 
				-        if username is None:
			
 
				-            return
			
 
				-
			
 
				-        login_page, url_handle = self._download_webpage_handle(
			
 
				-            'https://vk.com', None, 'Downloading login page')
			
 
				-
			
 
				-        login_form = self._hidden_inputs(login_page)
			
 
				-
			
 
				-        login_form.update({
			
 
				-            'email': username.encode('cp1251'),
			
 
				-            'pass': password.encode('cp1251'),
			
 
				-        })
			
 
				-
			
 
				-        # https://new.vk.com/ serves two same remixlhk cookies in Set-Cookie header
			
 
				-        # and expects the first one to be set rather than second (see
			
 
				-        # https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201).
			
 
				-        # As of RFC6265 the newer one cookie should be set into cookie store
			
 
				-        # what actually happens.
			
 
				-        # We will workaround this VK issue by resetting the remixlhk cookie to
			
 
				-        # the first one manually.
			
 
				-        cookies = url_handle.headers.get('Set-Cookie')
			
 
				-        if sys.version_info[0] >= 3:
			
 
				-            cookies = cookies.encode('iso-8859-1')
			
 
				-        cookies = cookies.decode('utf-8')
			
 
				-        remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies)
			
 
				-        if remixlhk:
			
 
				-            value, domain = remixlhk.groups()
			
 
				-            self._set_cookie(domain, 'remixlhk', value)
			
 
				-
			
 
				-        login_page = self._download_webpage(
			
 
				-            'https://login.vk.com/?act=login', None,
			
 
				-            note='Logging in as %s' % username,
			
 
				-            data=urlencode_postdata(login_form))
			
 
				-
			
 
				-        if re.search(r'onLoginFailed', login_page):
			
 
				-            raise ExtractorError(
			
 
				-                'Unable to login, incorrect username and/or password', expected=True)
			
 
				-
			
 
				-    def _real_initialize(self):
			
 
				-        self._login()
			
 
				-
			
 
				     def _real_extract(self, url):
			
 
				         mobj = re.match(self._VALID_URL, url)
			
 
				         video_id = mobj.group('videoid')
			
@@ -355,7 +364,7 @@ class VKIE(InfoExtractor):
 
				         }
			
 
				 
			
 
				 
			
 
				-class VKUserVideosIE(InfoExtractor):
			
 
				+class VKUserVideosIE(VKBaseIE):
			
 
				     IE_NAME = 'vk:uservideos'
			
 
				     IE_DESC = "VK - User's Videos"
			
 
				     _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)'
			
@@ -396,3 +405,121 @@ class VKUserVideosIE(InfoExtractor):
 
				             webpage, 'title', default=page_id))
			
 
				 
			
 
				         return self.playlist_result(entries, page_id, title)
			
 
				+
			
 
				+
			
 
				+class VKWallPostIE(VKBaseIE):
				+    """Extractor for a VK wall post.
				+
				+    The post is returned as a playlist made of the audio tracks embedded in
				+    the page followed by the videos the page links to.
				+    """
				+
				+    IE_NAME = 'vk:wallpost'
				+    _VALID_URL = r'https?://(?:(?:(?:(?:m|new)\.)?vk\.com/(?:[^?]+\?.*\bw=)?wall(?P<id>-?\d+_\d+)))'
				+    _TESTS = [{
				+        # public page URL, audio playlist
				+        'url': 'https://vk.com/bs.official?w=wall-23538238_35',
				+        'info_dict': {
				+            'id': '23538238_35',
				+            'title': 'Black Shadow - Wall post 23538238_35',
				+            'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c',
				+        },
				+        'playlist': [{
				+            'md5': '5ba93864ec5b85f7ce19a9af4af080f6',
				+            'info_dict': {
				+                'id': '135220665_111806521',
				+                'ext': 'mp3',
				+                'title': 'Black Shadow - Слепое Верование',
				+                'duration': 370,
				+                'uploader': 'Black Shadow',
				+                'artist': 'Black Shadow',
				+                'track': 'Слепое Верование',
				+            },
				+        }, {
				+            'md5': '4cc7e804579122b17ea95af7834c9233',
				+            'info_dict': {
				+                'id': '135220665_111802303',
				+                'ext': 'mp3',
				+                'title': 'Black Shadow - Война - Негасимое Бездны Пламя!',
				+                'duration': 423,
				+                'uploader': 'Black Shadow',
				+                'artist': 'Black Shadow',
				+                'track': 'Война - Негасимое Бездны Пламя!',
				+            },
				+            'params': {
				+                'skip_download': True,
				+            },
				+        }],
				+        'skip': 'Requires vk account credentials',
				+    }, {
				+        # single YouTube embed, no leading -
				+        'url': 'https://vk.com/wall85155021_6319',
				+        'info_dict': {
				+            'id': '85155021_6319',
				+            'title': 'Sergey Gorbunov - Wall post 85155021_6319',
				+        },
				+        'playlist_count': 1,
				+        'skip': 'Requires vk account credentials',
				+    }, {
				+        # wall page URL
				+        'url': 'https://vk.com/wall-23538238_35',
				+        'only_matching': True,
				+    }, {
				+        # mobile wall page URL
				+        'url': 'https://m.vk.com/wall-23538238_35',
				+        'only_matching': True,
				+    }]
				+
				+    def _real_extract(self, url):
				+        """Turn a wall post URL into a playlist of its audio and video items.
				+
				+        Raises:
				+            ExtractorError: (expected) when the page shows a VK error box;
				+                the message carries the text VK reported.
				+        """
				+        matched_id = self._match_id(url)
				+        # The page is requested with the id exactly as matched; a leading
				+        # '-' is dropped only from the id used for reporting and results.
				+        post_id = remove_start(matched_id, '-')
				+
				+        webpage = self._download_webpage(
				+            'https://vk.com/wall%s' % matched_id, post_id)
				+
				+        vk_error = self._html_search_regex(
				+            r'>Error</div>\s*<div[^>]+class=["\']body["\'][^>]*>([^<]+)',
				+            webpage, 'error', default=None)
				+        if vk_error:
				+            raise ExtractorError('VK said: %s' % vk_error, expected=True)
				+
				+        # Post-level metadata shared by the playlist and its audio entries.
				+        description = clean_html(get_element_by_class('wall_post_text', webpage))
				+        uploader = clean_html(get_element_by_class('fw_post_author', webpage))
				+        if not uploader:
				+            # Fall back to the Open Graph description of the page.
				+            uploader = self._og_search_description(webpage)
				+        thumbnail = self._og_search_thumbnail(webpage)
				+
				+        entries = []
				+
				+        # Audio tracks: each match spans from the track's <input> element up
				+        # to the closing </table> of its markup.
				+        audio_matches = re.finditer(r'''(?sx)
				+                            <input[^>]+
				+                                id=(?P<q1>["\'])audio_info(?P<id>\d+_\d+).*?(?P=q1)[^>]+
				+                                value=(?P<q2>["\'])(?P<url>http.+?)(?P=q2)
				+                                .+?
				+                            </table>''', webpage)
				+        for audio_match in audio_matches:
				+            track_html = audio_match.group(0)
				+            audio_id = audio_match.group('id')
				+            duration = parse_duration(get_element_by_class('duration', track_html))
				+            track = self._html_search_regex(
				+                r'<span[^>]+id=["\']title%s[^>]*>([^<]+)' % audio_id,
				+                track_html, 'title', default=None)
				+            artist = self._html_search_regex(
				+                r'>([^<]+)</a></b>\s*&ndash', track_html,
				+                'artist', default=None)
				+
				+            # Use "artist - track" when both are known, the bare id otherwise.
				+            if artist and track:
				+                audio_title = '%s - %s' % (artist, track)
				+            else:
				+                audio_title = audio_id
				+
				+            entries.append({
				+                'id': audio_id,
				+                'url': audio_match.group('url'),
				+                'title': audio_title,
				+                'thumbnail': thumbnail,
				+                'duration': duration,
				+                'uploader': uploader,
				+                'artist': artist,
				+                'track': track,
				+            })
				+
				+        # Videos: every /video... link on the page is delegated to VKIE.
				+        entries.extend(
				+            self.url_result(
				+                compat_urlparse.urljoin(url, video_match.group('url')),
				+                VKIE.ie_key())
				+            for video_match in re.finditer(
				+                r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage))
				+
				+        playlist_title = 'Wall post %s' % post_id
				+        if uploader:
				+            playlist_title = '%s - %s' % (uploader, playlist_title)
				+
				+        return self.playlist_result(
				+            orderedSet(entries), post_id, playlist_title, description)