2
0
Эх сурвалжийг харах

[mixcloud] Added support for user uploads, playlists, favorites and listens.
Fixes #3750 and #5272

Philip Huppert 10 жил өмнө
parent
commit
c96eca426b

+ 5 - 1
youtube_dl/extractor/extractors.py

@@ -411,7 +411,11 @@ from .minoto import MinotoIE
 from .miomio import MioMioIE
 from .miomio import MioMioIE
 from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mitele import MiTeleIE
 from .mitele import MiTeleIE
-from .mixcloud import MixcloudIE
+from .mixcloud import (
+    MixcloudIE,
+    MixcloudUserIE,
+    MixcloudPlaylistIE
+)
 from .mlb import MLBIE
 from .mlb import MLBIE
 from .mnet import MnetIE
 from .mnet import MnetIE
 from .mpora import MporaIE
 from .mpora import MporaIE

+ 198 - 2
youtube_dl/extractor/mixcloud.py

@@ -3,18 +3,22 @@ from __future__ import unicode_literals
 import re
 import re
 
 
 from .common import InfoExtractor
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
+from ..compat import (
+    compat_urllib_parse_unquote,
+    compat_urllib_request
+)
 from ..utils import (
 from ..utils import (
     ExtractorError,
     ExtractorError,
     HEADRequest,
     HEADRequest,
     NO_DEFAULT,
     NO_DEFAULT,
     parse_count,
     parse_count,
     str_to_int,
     str_to_int,
+    clean_html
 )
 )
 
 
 
 
 class MixcloudIE(InfoExtractor):
 class MixcloudIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)'
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)'
     IE_NAME = 'mixcloud'
     IE_NAME = 'mixcloud'
 
 
     _TESTS = [{
     _TESTS = [{
@@ -115,3 +119,195 @@ class MixcloudIE(InfoExtractor):
             'view_count': view_count,
             'view_count': view_count,
             'like_count': like_count,
             'like_count': like_count,
         }
         }
+
+
+class MixcloudUserIE(InfoExtractor):
+    """
+    Information extractor for Mixcloud users.
+
+    Matches a user's profile URL, optionally followed by ``uploads``,
+    ``favorites`` or ``listens``, and returns that list of tracks as a
+    playlist. A bare profile URL is treated as ``uploads``.
+    """
+
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/(?P<type>uploads|favorites|listens)?/?$'
+    IE_NAME = 'mixcloud:user'
+
+    _TESTS = [{
+        'url': 'http://www.mixcloud.com/dholbach/',
+        'info_dict': {
+            'id': 'dholbach/uploads',
+            'title': 'Daniel Holbach (uploads)',
+            'description': 'md5:327af72d1efeb404a8216c27240d1370',
+        },
+        'playlist_mincount': 11
+    }, {
+        'url': 'http://www.mixcloud.com/dholbach/uploads/',
+        'info_dict': {
+            'id': 'dholbach/uploads',
+            'title': 'Daniel Holbach (uploads)',
+            'description': 'md5:327af72d1efeb404a8216c27240d1370',
+        },
+        'playlist_mincount': 11
+    }, {
+        'url': 'http://www.mixcloud.com/dholbach/favorites/',
+        'info_dict': {
+            'id': 'dholbach/favorites',
+            'title': 'Daniel Holbach (favorites)',
+            'description': 'md5:327af72d1efeb404a8216c27240d1370',
+        },
+        'playlist_mincount': 244
+    }, {
+        'url': 'http://www.mixcloud.com/dholbach/listens/',
+        'info_dict': {
+            'id': 'dholbach/listens',
+            'title': 'Daniel Holbach (listens)',
+            'description': 'md5:327af72d1efeb404a8216c27240d1370',
+        },
+        'playlist_mincount': 846
+    }]
+
+    def _fetch_tracks(self, base_url, video_id, dl_note=None, dl_errnote=None):
+        """
+        Collect the absolute URLs of all tracks in a paginated track list.
+
+        The list is fetched fragment by fragment: page N is requested from
+        base_url with an "X-Requested-With: XMLHttpRequest" header, and
+        paging continues for as long as the returned fragment contains an
+        " m-next-page-url=" attribute.
+
+        base_url   -- URL of the list; the paging query string is appended
+        video_id   -- identifier handed to _download_webpage
+        dl_note    -- progress message; " (page N)" is appended to it.
+                      When None, no note is passed to _download_webpage.
+        dl_errnote -- error message handed to _download_webpage
+
+        Returns a list of absolute track URLs in page order.
+        """
+        track_urls = []
+        current_page = 1
+        while True:
+            # fake an AJAX request to retrieve a list fragment. No explicit
+            # method is given: a Request without data is already a GET, and
+            # the "method" keyword is not accepted by Python 2's
+            # urllib2.Request (presumably what compat_urllib_request wraps
+            # there), so passing it raised a TypeError.
+            page_url = base_url + "?page=%d&list=main&_ajax=1" % current_page
+            req = compat_urllib_request.Request(
+                page_url, headers={"X-Requested-With": "XMLHttpRequest"})
+
+            # dl_note defaults to None, so only decorate it when it is set
+            page_note = None
+            if dl_note is not None:
+                page_note = dl_note + " (page %d)" % current_page
+            resp = self._download_webpage(
+                req, video_id, note=page_note, errnote=dl_errnote)
+
+            # extract the relative track URLs from the fragment, clean them
+            # up with clean_html and turn them into absolute URLs
+            track_urls.extend(
+                "https://www.mixcloud.com" + clean_html(u)
+                for u in re.findall(r'm-play-button m-url="(?P<url>[^"]+)"', resp))
+
+            # advance to next fragment, if any
+            if " m-next-page-url=" not in resp:
+                break
+            current_page += 1
+
+        return track_urls
+
+    def _handle_track_urls(self, urls):
+        """
+        Wrap every track URL in a url_result that names the "Mixcloud"
+        extractor as its handler.
+
+        A real list is returned rather than a map object, so the entries
+        can be counted and iterated more than once on Python 3 as well.
+        """
+        return [self.url_result(u, "Mixcloud") for u in urls]
+
+    def _get_user_description(self, page_content):
+        """
+        Extract the text of the first paragraph inside the
+        "description-text" block of page_content.
+
+        The search is non-fatal and uses "" as its default value.
+        """
+        return self._html_search_regex(
+            r'<div class="description-text">.*?<p>(?P<description>.*?)</p></div></div></div>',
+            page_content,
+            "user description",
+            group="description",
+            fatal=False,
+            default="")
+
+    def _get_username(self, page_content):
+        """Return the OpenGraph title of the profile page as the user's name."""
+        return self._og_search_title(page_content)
+
+    def _real_extract(self, url):
+        """
+        Build a playlist from a user's uploads, favorites or listens.
+
+        The playlist id is "<user>/<list type>", the title is
+        "<user name> (<list type>)" and the description is taken from the
+        user's profile page.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        user_id = mobj.group("user")
+        list_type = mobj.group("type")
+
+        # if only a profile URL was supplied, default to download all uploads
+        if list_type is None:
+            list_type = "uploads"
+
+        video_id = "%s/%s" % (user_id, list_type)
+
+        # download the user's profile to retrieve some metadata
+        profile = self._download_webpage("https://www.mixcloud.com/%s/" % user_id,
+                                         video_id,
+                                         note="Downloading user profile",
+                                         errnote="Unable to download user profile")
+
+        username = self._get_username(profile)
+        description = self._get_user_description(profile)
+
+        # retrieve all page fragments of uploads, favorites or listens
+        track_urls = self._fetch_tracks(
+            "https://www.mixcloud.com/%s/%s/" % (user_id, list_type),
+            video_id,
+            dl_note="Downloading list of %s" % list_type,
+            dl_errnote="Unable to download list of %s" % list_type)
+
+        # let MixcloudIE handle each track URL
+        entries = self._handle_track_urls(track_urls)
+
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'title': "%s (%s)" % (username, list_type),
+            'id': video_id,
+            "description": description
+        }
+
+
+class MixcloudPlaylistIE(MixcloudUserIE):
+    """
+    Information extractor for Mixcloud playlists.
+
+    Matches ".../<user>/playlists/<playlist>/" URLs and returns the
+    playlist's tracks, reusing the fragment fetching and description
+    helpers inherited from MixcloudUserIE.
+    """
+
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/playlists/(?P<playlist>[^/]+)/?$'
+    IE_NAME = 'mixcloud:playlist'
+
+    _TESTS = [{
+        'url': 'https://www.mixcloud.com/RedBullThre3style/playlists/tokyo-finalists-2015/',
+        'info_dict': {
+            'id': 'RedBullThre3style/playlists/tokyo-finalists-2015',
+            'title': 'National Champions 2015',
+            'description': 'md5:6ff5fb01ac76a31abc9b3939c16243a3',
+        },
+        'playlist_mincount': 16
+    }, {
+        'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/',
+        'info_dict': {
+            'id': 'maxvibes/playlists/jazzcat-on-ness-radio',
+            'title': 'Jazzcat on Ness Radio',
+            'description': 'md5:c2c51a1f1b8bb5442f2ca67c3dc4af27',
+        },
+        'playlist_mincount': 23
+    }]
+
+    def _get_playlist_title(self, page_content):
+        """
+        Extract the playlist title from the "list-playlist-title" span of
+        page_content. The search is fatal: a missing title is an error.
+        """
+        return self._html_search_regex(
+            r'<span class="main-list-title list-playlist-title ">(?P<title>.*?)</span>',
+            page_content,
+            "playlist title",
+            group="title",
+            fatal=True
+        )
+
+    def _real_extract(self, url):
+        """
+        Build a playlist result for a Mixcloud playlist URL.
+
+        The playlist id is "<user>/playlists/<playlist>"; title and
+        description are read from the playlist page.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        user_id = mobj.group("user")
+        playlist_id = mobj.group("playlist")
+        video_id = "%s/playlists/%s" % (user_id, playlist_id)
+
+        # _VALID_URL also accepts URLs without a scheme, so the page and its
+        # fragments are both fetched from one canonical https URL instead of
+        # the raw input
+        playlist_url = "https://www.mixcloud.com/%s/playlists/%s/" % (user_id, playlist_id)
+
+        # download the playlist page to retrieve some metadata; video_id
+        # (not just the user) identifies the download, matching the
+        # fragment requests below
+        profile = self._download_webpage(playlist_url,
+                                         video_id,
+                                         note="Downloading playlist page",
+                                         errnote="Unable to download playlist page")
+
+        description = self._get_user_description(profile)
+        playlist_title = self._get_playlist_title(profile)
+
+        # retrieve all page fragments of playlist
+        track_urls = self._fetch_tracks(
+            playlist_url,
+            video_id,
+            dl_note="Downloading tracklist of %s" % playlist_title,
+            dl_errnote="Unable to download tracklist of %s" % playlist_title)
+
+        # let MixcloudIE handle each track; materialise the result so the
+        # entries can be counted and iterated more than once
+        entries = list(self._handle_track_urls(track_urls))
+
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'title': playlist_title,
+            'id': video_id,
+            "description": description
+        }