Browse Source

[azmedien] Adapt to major site redesign (closes #17745)

Alexander Seiler 7 years ago
parent
commit
da56fb631f
2 changed files with 55 additions and 177 deletions
  1. 54 172
      youtube_dl/extractor/azmedien.py
  2. 1 5
      youtube_dl/extractor/extractors.py

+ 54 - 172
youtube_dl/extractor/azmedien.py

@@ -1,19 +1,16 @@
 # coding: utf-8
 # coding: utf-8
 from __future__ import unicode_literals
 from __future__ import unicode_literals
 
 
+import json
 import re
 import re
 
 
 from .common import InfoExtractor
 from .common import InfoExtractor
 from .kaltura import KalturaIE
 from .kaltura import KalturaIE
-from ..utils import (
-    get_element_by_class,
-    get_element_by_id,
-    strip_or_none,
-    urljoin,
-)
 
 
 
 
 class AZMedienBaseIE(InfoExtractor):
 class AZMedienBaseIE(InfoExtractor):
+    _PARTNER_ID = '1719221'
+
     def _kaltura_video(self, partner_id, entry_id):
     def _kaltura_video(self, partner_id, entry_id):
         return self.url_result(
         return self.url_result(
             'kaltura:%s:%s' % (partner_id, entry_id), ie=KalturaIE.ie_key(),
             'kaltura:%s:%s' % (partner_id, entry_id), ie=KalturaIE.ie_key(),
@@ -25,189 +22,74 @@ class AZMedienIE(AZMedienBaseIE):
     _VALID_URL = r'''(?x)
     _VALID_URL = r'''(?x)
                     https?://
                     https?://
                         (?:www\.)?
                         (?:www\.)?
-                        (?:
+                        (?P<host>
                             telezueri\.ch|
                             telezueri\.ch|
                             telebaern\.tv|
                             telebaern\.tv|
                             telem1\.ch
                             telem1\.ch
                         )/
                         )/
-                        [0-9]+-show-[^/\#]+
-                        (?:
-                            /[0-9]+-episode-[^/\#]+
-                            (?:
-                                /[0-9]+-segment-(?:[^/\#]+\#)?|
-                                \#
-                            )|
-                            \#
+                        [^/]+/
+                        (?P<id>
+                            [^/]+-(?P<article_id>\d+)
                         )
                         )
-                        (?P<id>[^\#]+)
+                        (?:
+                            \#video=
+                            (?P<kaltura_id>
+                                [_0-9a-z]+
+                            )
+                        )?
                     '''
                     '''
 
 
     _TESTS = [{
     _TESTS = [{
-        # URL with 'segment'
-        'url': 'http://www.telezueri.ch/62-show-zuerinews/13772-episode-sonntag-18-dezember-2016/32419-segment-massenabweisungen-beim-hiltl-club-wegen-pelzboom',
+        'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569',
         'info_dict': {
         'info_dict': {
-            'id': '1_2444peh4',
+            'id': '1_anruz3wy',
             'ext': 'mp4',
             'ext': 'mp4',
-            'title': 'Massenabweisungen beim Hiltl Club wegen Pelzboom',
-            'description': 'md5:9ea9dd1b159ad65b36ddcf7f0d7c76a8',
-            'uploader_id': 'TeleZ?ri',
-            'upload_date': '20161218',
-            'timestamp': 1482084490,
+            'title': 'Bundesrats-Vakanzen / EU-Rahmenabkommen',
+            'description': 'md5:dd9f96751ec9c35e409a698a328402f3',
+            'uploader_id': 'TVOnline',
+            'upload_date': '20180930',
+            'timestamp': 1538328802,
         },
         },
         'params': {
         'params': {
             'skip_download': True,
             'skip_download': True,
         },
         },
     }, {
     }, {
-        # URL with 'segment' and fragment:
-        'url': 'http://www.telebaern.tv/118-show-news/14240-episode-dienstag-17-januar-2017/33666-segment-achtung-gefahr#zu-wenig-pflegerinnen-und-pfleger',
-        'only_matching': True
-    }, {
-        # URL with 'episode' and fragment:
-        'url': 'http://www.telem1.ch/47-show-sonntalk/13986-episode-soldaten-fuer-grenzschutz-energiestrategie-obama-bilanz#soldaten-fuer-grenzschutz-energiestrategie-obama-bilanz',
-        'only_matching': True
-    }, {
-        # URL with 'show' and fragment:
-        'url': 'http://www.telezueri.ch/66-show-sonntalk#burka-plakate-trump-putin-china-besuch',
+        'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1',
         'only_matching': True
         'only_matching': True
     }]
     }]
 
 
     def _real_extract(self, url):
     def _real_extract(self, url):
         video_id = self._match_id(url)
         video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
-
-        partner_id = self._search_regex(
-            r'<script[^>]+src=["\'](?:https?:)?//(?:[^/]+\.)?kaltura\.com(?:/[^/]+)*/(?:p|partner_id)/([0-9]+)',
-            webpage, 'kaltura partner id')
-        entry_id = self._html_search_regex(
-            r'<a[^>]+data-id=(["\'])(?P<id>(?:(?!\1).)+)\1[^>]+data-slug=["\']%s'
-            % re.escape(video_id), webpage, 'kaltura entry id', group='id')
-
-        return self._kaltura_video(partner_id, entry_id)
-
-
-class AZMedienPlaylistIE(AZMedienBaseIE):
-    IE_DESC = 'AZ Medien playlists'
-    _VALID_URL = r'''(?x)
-                    https?://
-                        (?:www\.)?
-                        (?:
-                            telezueri\.ch|
-                            telebaern\.tv|
-                            telem1\.ch
-                        )/
-                        (?P<id>[0-9]+-
-                            (?:
-                                show|
-                                topic|
-                                themen
-                            )-[^/\#]+
-                            (?:
-                                /[0-9]+-episode-[^/\#]+
-                            )?
-                        )$
-                    '''
-
-    _TESTS = [{
-        # URL with 'episode'
-        'url': 'http://www.telebaern.tv/118-show-news/13735-episode-donnerstag-15-dezember-2016',
-        'info_dict': {
-            'id': '118-show-news/13735-episode-donnerstag-15-dezember-2016',
-            'title': 'News - Donnerstag, 15. Dezember 2016',
-        },
-        'playlist_count': 9,
-    }, {
-        # URL with 'themen'
-        'url': 'http://www.telem1.ch/258-themen-tele-m1-classics',
-        'info_dict': {
-            'id': '258-themen-tele-m1-classics',
-            'title': 'Tele M1 Classics',
-        },
-        'playlist_mincount': 15,
-    }, {
-        # URL with 'topic', contains nested playlists
-        'url': 'http://www.telezueri.ch/219-topic-aera-trump-hat-offiziell-begonnen',
-        'only_matching': True,
-    }, {
-        # URL with 'show' only
-        'url': 'http://www.telezueri.ch/86-show-talktaeglich',
-        'only_matching': True
-    }]
-
-    def _real_extract(self, url):
-        show_id = self._match_id(url)
-        webpage = self._download_webpage(url, show_id)
-
-        entries = []
-
-        partner_id = self._search_regex(
-            r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)',
-            webpage, 'kaltura partner id', default=None)
-
-        if partner_id:
-            entries = [
-                self._kaltura_video(partner_id, m.group('id'))
-                for m in re.finditer(
-                    r'data-id=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage)]
-
-        if not entries:
-            entries = [
-                self.url_result(m.group('url'), ie=AZMedienIE.ie_key())
-                for m in re.finditer(
-                    r'<a[^>]+data-real=(["\'])(?P<url>http.+?)\1', webpage)]
-
-        if not entries:
-            entries = [
-                # May contain nested playlists (e.g. [1]) thus no explicit
-                # ie_key
-                # 1. http://www.telezueri.ch/219-topic-aera-trump-hat-offiziell-begonnen)
-                self.url_result(urljoin(url, m.group('url')))
-                for m in re.finditer(
-                    r'<a[^>]+name=[^>]+href=(["\'])(?P<url>/.+?)\1', webpage)]
-
-        title = self._search_regex(
-            r'episodeShareTitle\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
-            webpage, 'title',
-            default=strip_or_none(get_element_by_id(
-                'video-title', webpage)), group='title')
-
-        return self.playlist_result(entries, show_id, title)
-
-
-class AZMedienShowPlaylistIE(AZMedienBaseIE):
-    IE_DESC = 'AZ Medien show playlists'
-    _VALID_URL = r'''(?x)
-                    https?://
-                        (?:www\.)?
-                        (?:
-                            telezueri\.ch|
-                            telebaern\.tv|
-                            telem1\.ch
-                        )/
-                        (?:
-                            all-episodes|
-                            alle-episoden
-                        )/
-                        (?P<id>[^/?#&]+)
-                    '''
-
-    _TEST = {
-        'url': 'http://www.telezueri.ch/all-episodes/astrotalk',
-        'info_dict': {
-            'id': 'astrotalk',
-            'title': 'TeleZüri: AstroTalk - alle episoden',
-            'description': 'md5:4c0f7e7d741d906004266e295ceb4a26',
-        },
-        'playlist_mincount': 13,
-    }
-
-    def _real_extract(self, url):
-        playlist_id = self._match_id(url)
-        webpage = self._download_webpage(url, playlist_id)
-        episodes = get_element_by_class('search-mobile-box', webpage)
-        entries = [self.url_result(
-            urljoin(url, m.group('url'))) for m in re.finditer(
-                r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', episodes)]
-        title = self._og_search_title(webpage, fatal=False)
-        description = self._og_search_description(webpage)
-        return self.playlist_result(entries, playlist_id, title, description)
+        mobj = re.match(self._VALID_URL, url)
+        entry_id = mobj.group('kaltura_id')
+
+        if not entry_id:
+            webpage = self._download_webpage(url, video_id)
+            api_path = self._search_regex(
+                r'["\']apiPath["\']\s*:\s*["\']([^"^\']+)["\']',
+                webpage, 'api path')
+            api_url = 'https://www.%s%s' % (mobj.group('host'), api_path)
+            payload = {
+                'query': '''query VideoContext($articleId: ID!) {
+                    article: node(id: $articleId) {
+                      ... on Article {
+                        mainAssetRelation {
+                          asset {
+                            ... on VideoAsset {
+                              kalturaId
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }''',
+                'variables': {'articleId': 'Article:%s' % mobj.group('article_id')},
+            }
+            json_data = self._download_json(
+                api_url, video_id, headers={
+                    'Content-Type': 'application/json',
+                },
+                data=json.dumps(payload).encode())
+            entry_id = json_data['data']['article']['mainAssetRelation']['asset']['kalturaId']
+
+        return self._kaltura_video(self._PARTNER_ID, entry_id)

+ 1 - 5
youtube_dl/extractor/extractors.py

@@ -88,11 +88,7 @@ from .awaan import (
     AWAANLiveIE,
     AWAANLiveIE,
     AWAANSeasonIE,
     AWAANSeasonIE,
 )
 )
-from .azmedien import (
-    AZMedienIE,
-    AZMedienPlaylistIE,
-    AZMedienShowPlaylistIE,
-)
+from .azmedien import AZMedienIE
 from .baidu import BaiduVideoIE
 from .baidu import BaiduVideoIE
 from .bambuser import BambuserIE, BambuserChannelIE
 from .bambuser import BambuserIE, BambuserChannelIE
 from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE
 from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE