teletask.py 2.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. import datetime
  5. from .common import InfoExtractor
  6. class TeleTaskIE(InfoExtractor):
  7. _VALID_URL = r'http?://(?:www\.)?tele-task\.de/archive/video/html5/(?P<id>[0-9]+)/'
  8. _TEST = {
  9. 'url': 'http://www.tele-task.de/archive/video/html5/26168/',
  10. 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
  11. 'info_dict': {
  12. 'id': '26168',
  13. 'ext': 'mp4',
  14. 'title': 'Duplicate Detection',
  15. 'thumbnail': 're:^https?://.*\.jpg$',
  16. 'date': '20141218',
  17. # TODO more properties, either as:
  18. # * A value
  19. # * MD5 checksum; start the string with md5:
  20. # * A regular expression; start the string with re:
  21. # * Any Python type (for example int or float)
  22. }
  23. }
  24. def _real_extract(self, url):
  25. video_id = self._match_id(url)
  26. webpage = self._download_webpage(url, video_id)
  27. lecture_url = self._html_search_regex(
  28. r'href="([^"]+)" itemprop="name">', webpage, 'title')
  29. lecture_id = re.search("([0-9]+)/",lecture_url).group(1)
  30. overview_page = self._download_webpage("http://www.tele-task.de" + lecture_url,
  31. lecture_id)
  32. title = self._html_search_regex(
  33. r'itemprop="name">([^"]+)</a>', webpage, 'title')
  34. url = self._html_search_regex(
  35. r'class="speaker".*?src="([^"]+)"', webpage, 'video_url', flags=re.DOTALL)
  36. description = self._html_search_regex(
  37. r'Description of the series:</p>([^"]+)</div>', overview_page,
  38. 'description',flags=re.DOTALL)
  39. date = self._html_search_regex(
  40. r'<td class="label">Date:</td><td>([^"]+)</td>', webpage, 'date')
  41. date = datetime.datetime.strptime(date, '%d.%m.%Y').strftime('%Y%m%d')
  42. return {
  43. 'id': video_id,
  44. 'title': title,
  45. 'description': description,
  46. 'url': url,
  47. 'upload_date': date,
  48. }