# ninegag.py (4.8 KB)
  1. from __future__ import unicode_literals
  2. import re
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. determine_ext,
  6. url_or_none,
  7. int_or_none,
  8. float_or_none,
  9. ExtractorError
  10. )
  11. class NineGagIE(InfoExtractor):
  12. IE_NAME = '9gag'
  13. _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P<id>[a-zA-Z0-9]+)'
  14. _TESTS = [{
  15. 'url': 'https://9gag.com/gag/an5Qz5b',
  16. 'info_dict': {
  17. 'id': 'an5Qz5b',
  18. 'ext': 'webm',
  19. 'title': 'Dogs playing tetherball',
  20. 'upload_date': '20191108',
  21. 'timestamp': 1573243994,
  22. 'age_limit': 0,
  23. 'categories': [
  24. 'Wholesome'
  25. ],
  26. 'tags': [
  27. 'Dog'
  28. ]
  29. }
  30. }, {
  31. 'url': 'https://9gag.com/gag/ae5Ag7B',
  32. 'info_dict': {
  33. 'id': 'ae5Ag7B',
  34. 'ext': 'webm',
  35. 'title': 'Capybara Agility Training',
  36. 'upload_date': '20191108',
  37. 'timestamp': 1573237208,
  38. 'age_limit': 0,
  39. 'categories': [
  40. 'Awesome'
  41. ],
  42. 'tags': [
  43. 'Weimaraner',
  44. 'American Pit Bull Terrier'
  45. ]
  46. }
  47. }]
  48. _EXTERNAL_VIDEO_PROVIDERS = {
  49. 'Youtube': 'https://youtube.com/watch?v=%s'
  50. }
  51. def _real_extract(self, url):
  52. video_id = self._match_id(url)
  53. webpage = self._download_webpage(url, video_id)
  54. rawJsonData = self._search_regex(
  55. r'window._config\s*=\s*JSON.parse\(["\']({.+?})["\']\);',
  56. webpage,
  57. 'data')
  58. rawJsonData = rawJsonData.replace('\\"', '"').replace('\\\\/', '/')
  59. data = self._parse_json(rawJsonData, video_id)['data']['post']
  60. if data['type'] == 'Video':
  61. vid = data['video']['id']
  62. ie_key = data['video']['source'].capitalize()
  63. return {
  64. '_type': 'url_transparent',
  65. 'url': self._EXTERNAL_VIDEO_PROVIDERS[ie_key] % vid,
  66. 'ie_key': ie_key,
  67. 'id': vid,
  68. 'duration': data['video'].get('duration'),
  69. 'start_time': data['video'].get('startTs')
  70. }
  71. if data['type'] == 'EmbedVideo':
  72. vid = data['video']['id']
  73. ie_key = data['video']['source'].capitalize()
  74. return {
  75. '_type': 'url_transparent',
  76. 'url': data['video']['embedUrl'],
  77. #'ie_key': vid,
  78. 'start_time': data['video'].get('startTs')
  79. }
  80. if data['type'] != 'Animated':
  81. raise ExtractorError(
  82. 'The given url does not contain a video',
  83. expected=True)
  84. duration = None
  85. formats = []
  86. thumbnails = []
  87. for key in data['images']:
  88. image = data['images'][key]
  89. if 'duration' in image and duration is None:
  90. duration = int_or_none(image['duration'])
  91. url = url_or_none(image.get('url'))
  92. if url == None:
  93. continue
  94. ext = determine_ext(url)
  95. if ext == 'jpg' or ext == 'png':
  96. thumbnail = {
  97. 'url': url,
  98. 'width': float_or_none(image.get('width')),
  99. 'height': float_or_none(image.get('height'))
  100. }
  101. thumbnails.append(thumbnail)
  102. elif ext == 'webm' or ext == 'mp4':
  103. formats.append({
  104. 'format_id': re.sub(r'.*_([^\.]+).(.*)', r'\1_\2', url),
  105. 'ext': ext,
  106. 'url': url,
  107. 'width': float_or_none(image.get('width')),
  108. 'height': float_or_none(image.get('height'))
  109. })
  110. section = None
  111. postSection = data.get('postSection')
  112. if postSection != None and 'name' in postSection:
  113. section = re.sub(r'\\[^\\]{5}', '', postSection['name'])
  114. age_limit = int_or_none(data.get('nsfw'))
  115. if age_limit != None:
  116. age_limit = age_limit * 18
  117. tags = None
  118. if 'tags' in data:
  119. tags = []
  120. for tag in data.get('tags') or []:
  121. tags.append(tag.get('key'))
  122. return {
  123. 'id': video_id,
  124. 'title': data['title'],
  125. 'timestamp': int_or_none(data.get('creationTs')),
  126. 'duration': duration,
  127. 'formats': formats,
  128. 'thumbnails': thumbnails,
  129. 'like_count': int_or_none(data.get('upVoteCount')),
  130. 'dislike_count': int_or_none(data.get('downVoteCount')),
  131. 'comment_count': int_or_none(data.get('commentsCount')),
  132. 'age_limit': age_limit,
  133. 'categories': [section],
  134. 'tags': tags,
  135. 'is_live': False
  136. }