xhamster.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
  1. from __future__ import unicode_literals
  2. import re
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. compat_urllib_parse,
  6. unescapeHTML,
  7. determine_ext,
  8. ExtractorError,
  9. )
  10. class XHamsterIE(InfoExtractor):
  11. """Information Extractor for xHamster"""
  12. _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
  13. _TESTS = [{
  14. 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
  15. 'file': '1509445.flv',
  16. 'md5': '9f48e0e8d58e3076bb236ff412ab62fa',
  17. 'info_dict': {
  18. "upload_date": "20121014",
  19. "uploader_id": "Ruseful2011",
  20. "title": "FemaleAgent Shy beauty takes the bait",
  21. "age_limit": 18,
  22. }
  23. },
  24. {
  25. 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
  26. 'file': '2221348.flv',
  27. 'md5': 'e767b9475de189320f691f49c679c4c7',
  28. 'info_dict': {
  29. "upload_date": "20130914",
  30. "uploader_id": "jojo747400",
  31. "title": "Britney Spears Sexy Booty",
  32. "age_limit": 18,
  33. }
  34. }]
  35. def _real_extract(self,url):
  36. def extract_video_url(webpage):
  37. mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
  38. if mobj is None:
  39. raise ExtractorError(u'Unable to extract media URL')
  40. if len(mobj.group('server')) == 0:
  41. return compat_urllib_parse.unquote(mobj.group('file'))
  42. else:
  43. return mobj.group('server')+'/key='+mobj.group('file')
  44. def extract_mp4_video_url(webpage):
  45. mp4 = re.search(r'<a href=\"(.+?)\" class=\"mp4Play\"',webpage)
  46. if mp4 is None:
  47. return None
  48. else:
  49. return mp4.group(1)
  50. def is_hd(webpage):
  51. return webpage.find('<div class=\'icon iconHD\'') != -1
  52. mobj = re.match(self._VALID_URL, url)
  53. video_id = mobj.group('id')
  54. seo = mobj.group('seo')
  55. mrss_url = 'http://xhamster.com/movies/%s/%s.html' % (video_id, seo)
  56. webpage = self._download_webpage(mrss_url, video_id)
  57. video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
  58. webpage, 'title')
  59. # Only a few videos have an description
  60. mobj = re.search('<span>Description: </span>(?P<description>[^<]+)', webpage)
  61. if mobj:
  62. video_description = unescapeHTML(mobj.group('description'))
  63. else:
  64. video_description = None
  65. mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
  66. if mobj:
  67. video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
  68. else:
  69. video_upload_date = None
  70. self._downloader.report_warning(u'Unable to extract upload date')
  71. video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
  72. webpage, 'uploader id', default=u'anonymous')
  73. video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
  74. webpage, 'thumbnail', fatal=False)
  75. age_limit = self._rta_search(webpage)
  76. hd = is_hd(webpage)
  77. video_url = extract_video_url(webpage)
  78. formats = [{
  79. 'url': video_url,
  80. 'ext': determine_ext(video_url),
  81. 'format': 'hd' if hd else 'sd',
  82. 'format_id': 'hd' if hd else 'sd',
  83. }]
  84. video_mp4_url = extract_mp4_video_url(webpage)
  85. if (not video_mp4_url is None) and (formats[0]['ext'] != 'mp4'):
  86. formats.append({
  87. 'url': video_mp4_url,
  88. 'ext': 'mp4',
  89. 'format': 'hd' if hd else 'sd',
  90. 'format_id': 'hd' if hd else 'sd',
  91. })
  92. if not hd:
  93. webpage = self._download_webpage(mrss_url+'?hd', video_id)
  94. if is_hd(webpage):
  95. video_url = extract_video_url(webpage)
  96. formats.append({
  97. 'url': video_url,
  98. 'ext': determine_ext(video_url),
  99. 'format': 'hd',
  100. 'format_id': 'hd',
  101. })
  102. return {
  103. 'id': video_id,
  104. 'title': video_title,
  105. 'formats': formats,
  106. 'description': video_description,
  107. 'upload_date': video_upload_date,
  108. 'uploader_id': video_uploader_id,
  109. 'thumbnail': video_thumbnail,
  110. 'age_limit': age_limit,
  111. }