vier.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. import itertools
  5. from .common import InfoExtractor
  6. from ..utils import (
  7. ExtractorError,
  8. urlencode_postdata,
  9. )
  class VierIE(InfoExtractor):
      """Extractor for single video pages and v3 embeds on vier.be and vijf.be.

      Accepts ``/<program>/videos/<slug>[/<numeric id>]`` pages as well as
      ``/video/v3/embed/<numeric id>`` URLs (see ``_VALID_URL``). When
      credentials are available a login is attempted once per extractor
      instance; without a session the regular page shows a login form, in
      which case the embed page is used instead.
      """
      IE_NAME = 'vier'
      IE_DESC = 'vier.be and vijf.be'
      _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))'
      _NETRC_MACHINE = 'vier'
      _TESTS = [{
          'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129',
          'info_dict': {
              'id': '16129',
              'display_id': 'het-wordt-warm-de-moestuin',
              'ext': 'mp4',
              'title': 'Het wordt warm in De Moestuin',
              'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...',
          },
          'params': {
              # m3u8 download
              'skip_download': True,
          },
          # 'skip': 'Requires account credentials',
      }, {
          'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614',
          'info_dict': {
              'id': '2561614',
              'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas',
              'ext': 'mp4',
              'title': 'EXTRA: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s',
              'description': 'Het spel is simpel: Annelien Coorevits en Rick Brandsteder krijgen telkens 2 dilemma\'s voorgeschoteld en ze MOETEN een keuze maken.',
          },
          'params': {
              # m3u8 download
              'skip_download': True,
          },
      }, {
          # Full metadata variant of the episode below; needs a logged-in session.
          'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',
          'info_dict': {
              'id': '2674839',
              'display_id': 'jani-gaat-naar-tokio-aflevering-4',
              'ext': 'mp4',
              'title': 'Jani gaat naar Tokio - Aflevering 4',
              'description': 'Bekijk hier de volledige vierde aflevering van het 2de seizoen van Jani gaat...',
          },
          'params': {
              # m3u8 download
              'skip_download': True,
          },
          'skip': 'Requires account credentials',
      }, {
          # Same episode without credentials: the title falls back to the display id.
          'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',
          'info_dict': {
              'id': '2674839',
              'display_id': 'jani-gaat-naar-tokio-aflevering-4',
              'ext': 'mp4',
              'title': 'jani-gaat-naar-tokio-aflevering-4',
          },
          'params': {
              # m3u8 download
              'skip_download': True,
          },
          'expected_warnings': ['Log in to extract metadata'],
      }, {
          'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen',
          'only_matching': True,
      }, {
          'url': 'http://www.vier.be/video/v3/embed/16129',
          'only_matching': True,
      }]

      def _real_initialize(self):
          """Start every extractor instance in the logged-out state."""
          self._logged_in = False

      def _login(self, site):
          """Try to log in on ``www.<site>.be`` and record success in ``_logged_in``.

          Does nothing when ``_get_login_info()`` yields no username or no
          password. If the response page contains an error message block, a
          warning with that message is reported and the instance stays
          logged out; otherwise ``_logged_in`` is set to True.
          """
          username, password = self._get_login_info()
          if username is None or password is None:
              return

          login_url = 'http://www.%s.be/user/login' % site
          form_data = urlencode_postdata({
              'form_id': 'user_login',
              'name': username,
              'pass': password,
          })
          response_page = self._download_webpage(
              login_url, None,
              note='Logging in', errnote='Unable to log in',
              data=form_data,
              headers={'Content-Type': 'application/x-www-form-urlencoded'})

          # The error banner is only searched for, never required (default=None).
          failure_reason = self._html_search_regex(
              r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<',
              response_page, 'login error', default=None)
          if failure_reason:
              self.report_warning('Unable to log in: %s' % failure_reason)
              return
          self._logged_in = True

      def _real_extract(self, url):
          """Build the info dict (ids, og metadata, formats) for one video URL."""
          site, slug, numeric_id, embed_id = re.match(
              self._VALID_URL, url).group('site', 'display_id', 'id', 'embed_id')
          # Embed URLs carry neither slug nor page id, so the embed id stands in for both.
          display_id = slug or embed_id
          video_id = numeric_id or embed_id

          if not self._logged_in:
              self._login(site)

          webpage = self._download_webpage(url, display_id)

          needs_login = r'id="user-login"' in webpage
          if needs_login:
              # The page shows a login form instead of the player: warn that
              # metadata will be missing and read the player data from the
              # embed page instead.
              self.report_warning(
                  'Log in to extract metadata', video_id=display_id)
              embed_url = 'http://www.%s.be/video/v3/embed/%s' % (site, video_id)
              webpage = self._download_webpage(embed_url, display_id)

          # Each value may appear as an HTML data attribute or as a JSON property.
          nid_patterns = [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"']
          application_patterns = [
              r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"']
          filename_patterns = [
              r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"']

          video_id = self._search_regex(
              nid_patterns, webpage, 'video id', default=video_id)
          application = self._search_regex(
              application_patterns, webpage, 'application',
              default=site + '_vod')
          # No default here: the playlist URL cannot be built without a filename.
          filename = self._search_regex(filename_patterns, webpage, 'filename')

          playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename)
          formats = self._extract_wowza_formats(
              playlist_url, display_id, skip_protocols=['dash'])
          self._sort_formats(formats)

          return {
              'id': video_id,
              'display_id': display_id,
              'title': self._og_search_title(webpage, default=display_id),
              'description': self._og_search_description(webpage, default=None),
              'thumbnail': self._og_search_thumbnail(webpage, default=None),
              'formats': formats,
          }
  class VierVideosIE(InfoExtractor):
      """Playlist extractor for ``/<program>/videos`` listings on vier.be and vijf.be.

      Without a ``page`` query parameter the listing is crawled from page 0
      for as long as a page contains the ``>Meer<`` marker. With an explicit
      ``page=<n>`` only that single page is collected and the playlist id
      becomes ``<program>-page<n>``.
      """
      IE_NAME = 'vier:videos'
      _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)'
      _TESTS = [{
          'url': 'http://www.vier.be/demoestuin/videos',
          'info_dict': {
              'id': 'demoestuin',
          },
          'playlist_mincount': 153,
      }, {
          'url': 'http://www.vijf.be/temptationisland/videos',
          'info_dict': {
              'id': 'temptationisland',
          },
          'playlist_mincount': 159,
      }, {
          'url': 'http://www.vier.be/demoestuin/videos?page=6',
          'info_dict': {
              'id': 'demoestuin-page6',
          },
          'playlist_mincount': 20,
      }, {
          'url': 'http://www.vier.be/demoestuin/videos?page=7',
          'info_dict': {
              'id': 'demoestuin-page7',
          },
          'playlist_mincount': 13,
      }]

      def _real_extract(self, url):
          """Collect the video entries of a program listing into a playlist.

          Returns the result of ``playlist_result`` holding one ``url_result``
          entry (delegated to the 'Vier' extractor) per video link found.
          """
          mobj = re.match(self._VALID_URL, url)
          program = mobj.group('program')
          site = mobj.group('site')
          page_id = mobj.group('page')

          # Remember whether a page was explicitly requested *before* converting
          # to int: page numbers are 0-indexed, so page 0 is a valid request and
          # the integer 0 must not be mistaken for "no page given".
          single_page = page_id is not None
          if single_page:
              page_id = int(page_id)
              start_page = page_id
              playlist_id = '%s-page%d' % (program, page_id)
          else:
              start_page = 0
              playlist_id = program

          entries = []
          for current_page_id in itertools.count(start_page):
              current_page = self._download_webpage(
                  'http://www.%s.be/%s/videos?page=%d' % (site, program, current_page_id),
                  program,
                  # Progress note is 1-based for readability; the query is 0-based.
                  'Downloading page %d' % (current_page_id + 1))
              entries.extend(
                  self.url_result('http://www.' + site + '.be' + video_url, 'Vier')
                  for video_url in re.findall(
                      r'<h[23]><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page))
              # Stop after one page when a page was requested, or when the
              # listing no longer contains the '>Meer<' marker.
              if single_page or '>Meer<' not in current_page:
                  break

          return self.playlist_result(entries, playlist_id)
  187. if page_id or '>Meer<' not in current_page:
  188. break
  189. return self.playlist_result(entries, playlist_id)