
Added extractors for 3 porn sites

Jeff Crouse, 12 years ago
parent commit 991ba7fae3
4 changed files with 336 additions and 8 deletions
  1. LATEST_VERSION (+1 -1)
  2. README.md (+7 -7)
  3. youtube-dl (BIN)
  4. youtube_dl/InfoExtractors.py (+328 -0)

+ 1 - 1
LATEST_VERSION

@@ -1 +1 @@
-2012.10.09
+9999.99.99

+ 7 - 7
README.md

@@ -30,7 +30,7 @@ which means you can modify it, redistribute it or use it however you like.
     --list-extractors        List all supported extractors and the URLs they
                              would handle
 
-  Video Selection:
+## Video Selection:
     --playlist-start NUMBER  playlist video to start at (default is 1)
     --playlist-end NUMBER    playlist video to end at (default is last)
     --match-title REGEX      download only matching titles (regex or caseless
@@ -39,7 +39,7 @@ which means you can modify it, redistribute it or use it however you like.
                              caseless sub-string)
     --max-downloads NUMBER   Abort after downloading NUMBER files
 
-  Filesystem Options:
+## Filesystem Options:
     -t, --title              use title in file name
     --id                     use video ID in file name
     -l, --literal            [deprecated] alias of --title
@@ -70,7 +70,7 @@ which means you can modify it, redistribute it or use it however you like.
     --write-description      write video description to a .description file
     --write-info-json        write video metadata to a .info.json file
 
-  Verbosity / Simulation Options:
+## Verbosity / Simulation Options:
     -q, --quiet              activates quiet mode
     -s, --simulate           do not download the video and do not write anything
                              to disk
@@ -85,7 +85,7 @@ which means you can modify it, redistribute it or use it however you like.
     --console-title          display progress in console titlebar
     -v, --verbose            print various debugging information
 
-  Video Format Options:
+## Video Format Options:
     -f, --format FORMAT      video format code
     --all-formats            download all available video formats
     --prefer-free-formats    prefer free video formats unless a specific one is
@@ -97,12 +97,12 @@ which means you can modify it, redistribute it or use it however you like.
     --srt-lang LANG          language of the closed captions to download
                              (optional) use IETF language tags like 'en'
 
-  Authentication Options:
+## Authentication Options:
     -u, --username USERNAME  account username
     -p, --password PASSWORD  account password
     -n, --netrc              use .netrc authentication data
 
-  Post-processing Options:
+## Post-processing Options:
     -x, --extract-audio      convert video files to audio-only files (requires
                              ffmpeg or avconv and ffprobe or avprobe)
     --audio-format FORMAT    "best", "aac", "vorbis", "mp3", "m4a", or "wav";
@@ -172,7 +172,7 @@ youtube requires an additional signature since September 2012 which is not suppo
 The error
 
     File "youtube-dl", line 2
-    SyntaxError: Non-ASCII character '' ...
+    SyntaxError: Non-ASCII character '\x93' ...
 
 means you're using an outdated version of Python. Please update to Python 2.6 or 2.7.
 

BIN
youtube-dl


+ 328 - 0
youtube_dl/InfoExtractors.py

@@ -14,6 +14,10 @@ import email.utils
 import xml.etree.ElementTree
 import random
 import math
+import urllib
+import urllib2
+import httplib
+from urlparse import parse_qs, urlparse
 
 from .utils import *
 
@@ -3735,6 +3739,327 @@ class UstreamIE(InfoExtractor):
         return [info]
 
 
+
+class YouPornIE(InfoExtractor):
+    """Information extractor for youporn.com."""
+
+    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
+    IE_NAME = u'youporn'
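+    # Metadata and the <ul class="downloadList"> of per-format links are scraped
+    # directly from the watch-page HTML using the patterns below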
+    VIDEO_TITLE_RE = r'videoTitleArea">(?P<title>.*)</h1>'
+    VIDEO_DATE_RE = r'Date:</b>(?P<date>.*)</li>'
+    VIDEO_UPLOADER_RE = r'Submitted:</b>(?P<uploader>.*)</li>'
+    DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
+    LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
+
+    def __init__(self, downloader=None):
+        InfoExtractor.__init__(self, downloader)
+
+    def report_id(self, video_id):
+        """Report finding video ID"""
+        self._downloader.to_screen(u'[youporn] Video ID: %s' % video_id)
+
+    def report_webpage(self, url):
+        """Report downloading page"""
+        self._downloader.to_screen(u'[youporn] Downloaded page: %s' % url)
+
+    def report_title(self, video_title):
+        """Report finding title"""
+        self._downloader.to_screen(u'[youporn] Title: %s' % video_title)
+
+    def report_uploader(self, uploader):
+        """Report finding uploader"""
+        self._downloader.to_screen(u'[youporn] Uploader: %s' % uploader)
+
+    def report_upload_date(self, video_date):
+        """Report finding date"""
+        self._downloader.to_screen(u'[youporn] Date: %s' % video_date)
+
+    def _print_formats(self, formats):
+        """Print all available formats"""
+        print u'Available formats:'
+        print u'ext\t\tformat'
+        print u'---------------------------------'
+        for format in formats:
+            print u'%s\t\t%s' % (format['ext'], format['format'])
+
+    def _specific(self, req_format, formats):
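+        """Return the entry in formats whose 'format' value equals req_format, or None."""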
+        for x in formats:
+            if x['format'] == req_format:
+                return x
+        return None
+
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+            return
+
+        video_id = mobj.group('videoid').decode('utf-8')
+        self.report_id(video_id)
+
+        # Get webpage content
+        try:
+            webpage = urllib2.urlopen(url).read()
+        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
+            return
+        self.report_webpage(url)
+
+        # Get the video title
+        result = re.search(self.VIDEO_TITLE_RE, webpage)
+        if result is None:
+            self._downloader.trouble(u'ERROR: unable to extract video title')
+            return
+        video_title = result.group('title').decode('utf-8').strip()
+        self.report_title(video_title)
+
+        # Get the video date
+        result = re.search(self.VIDEO_DATE_RE, webpage)
+        if result is None:
+            self._downloader.trouble(u'ERROR: unable to extract video date')
+            return
+        upload_date = result.group('date').decode('utf-8').strip()
+        self.report_upload_date(upload_date)
+
+        # Get the video uploader
+        result = re.search(self.VIDEO_UPLOADER_RE, webpage)
+        if result is None:
+            self._downloader.trouble(u'ERROR: unable to extract uploader')
+            return
+        video_uploader = result.group('uploader').decode('utf-8').strip()
+        video_uploader = clean_html( video_uploader )
+        self.report_uploader(video_uploader)
+
+        # Get all of the formats available
+        result = re.search(self.DOWNLOAD_LIST_RE, webpage)
+        if result is None:
+            self._downloader.trouble(u'ERROR: unable to extract download list')
+            return
+        download_list_html = result.group('download_list').decode('utf-8').strip()
+
+        # Get all of the links from the page
+        links = re.findall(self.LINK_RE, download_list_html)
+        if not links:
+            self._downloader.trouble(u'ERROR: no known formats available for video')
+            return
+
+        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
+
+        formats = []
+        for link in links:
+
+            # A link looks like this:
+            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
+            # A path looks like this:
+            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
+            video_url = unescapeHTML( link.decode('utf-8') )
+            path = urlparse( video_url ).path
+            extension = os.path.splitext( path )[1][1:]
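+            # The fourth path segment, e.g. '480p_370k_8004515', gives resolution and bitrate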
+            format = path.split('/')[4].split('_')[:2]
+            size = format[0]
+            bitrate = format[1]
+            format = "-".join( format )
+            title = u'%s-%s-%s' % (video_title, size, bitrate)
+
+            formats.append({
+                'id': video_id,
+                'url': video_url,
+                'uploader': video_uploader,
+                'upload_date': upload_date,
+                'title': title,
+                'ext': extension,
+                'format': format,
+                'thumbnail': None,
+                'description': None,
+                'player_url': None
+            })
+
+        if self._downloader.params.get('listformats', None):
+            self._print_formats(formats)
+            return
+
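+        # Requested format from the -f/--format option; 'best', 'worst' and 'all' are
+        # handled directly, anything else must match one of the format strings built above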
+        req_format = self._downloader.params.get('format', None)
+        #format_limit = self._downloader.params.get('format_limit', None)
+        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
+
+
+        if req_format is None or req_format == 'best':
+            return [formats[0]]
+        elif req_format == 'worst':
+            return [formats[-1]]
+        elif req_format in ('-1', 'all'):
+            return formats
+        else:
+            format = self._specific(req_format, formats)
+            if format is None:
+                self._downloader.trouble(u'ERROR: requested format not available')
+                return
+            return [format]
+
+
+
+class PornotubeIE(InfoExtractor):
+    """Information extractor for pornotube.com."""
+
+    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
+    IE_NAME = u'pornotube'
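+    # Both the direct .flv URL and the upload date are embedded in the watch page,
+    # so a single page fetch is enough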
+    VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
+    VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
+
+
+    def __init__(self, downloader=None):
+        InfoExtractor.__init__(self, downloader)
+
+    def report_extract_entry(self, url):
+        """Report downloading entry"""
+        self._downloader.to_screen(u'[pornotube] Downloading entry: %s' % url)
+
+    def report_date(self, upload_date):
+        """Report finding uploaded date"""
+        self._downloader.to_screen(u'[pornotube] Entry date: %s' % upload_date)
+
+    def report_webpage(self, url):
+        """Report downloading page"""
+        self._downloader.to_screen(u'[pornotube] Downloaded page: %s' % url)
+
+    def report_title(self, video_title):
+        """Report video title"""
+        self._downloader.to_screen(u'[pornotube] Title: %s' % video_title)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+            return
+
+        video_id = mobj.group('videoid').decode('utf-8')
+        video_title = mobj.group('title').decode('utf-8')
+        self.report_title(video_title)
+
+        # Get webpage content
+        try:
+            webpage = urllib2.urlopen(url).read()
+        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
+            return
+        self.report_webpage(url)
+
+        # Get the video URL
+        result = re.search(self.VIDEO_URL_RE, webpage)
+        if result is None:
+            self._downloader.trouble(u'ERROR: unable to extract video url')
+            return
+        video_url = urllib.unquote(result.group('url').decode('utf-8'))
+        self.report_extract_entry(video_url)
+
+        # Get the upload date
+        result = re.search(self.VIDEO_UPLOADED_RE, webpage)
+        if result is None:
+            self._downloader.trouble(u'ERROR: unable to extract upload date')
+            return
+        upload_date = result.group('date').decode('utf-8')
+        self.report_date(upload_date)
+
+
+        info = {'id': video_id,
+                'url': video_url,
+                'uploader': None,
+                'upload_date': upload_date,
+                'title': video_title,
+                'ext': 'flv',
+                'format': 'flv',
+                'thumbnail': None,
+                'description': None,
+                'player_url': None}
+
+        return [info]
+
+
+
+class YouJizzIE(InfoExtractor):
+    """Information extractor for youjizz.com."""
+
+    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/([^.]+).html$'
+    IE_NAME = u'youjizz'
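+    # The title comes from <title>, the numeric video ID from the embed-page URL,
+    # and the final video URL from the so.addVariable("file", ...) call on that page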
+    VIDEO_TITLE_RE = r'<title>(?P<title>.*)</title>'
+    EMBED_PAGE_RE = r'http://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)'
+    SOURCE_RE = r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);'
+
+    def __init__(self, downloader=None):
+        InfoExtractor.__init__(self, downloader)
+
+    def report_extract_entry(self, url):
+        """Report downloading entry"""
+        self._downloader.to_screen(u'[youjizz] Downloading entry: %s' % url)
+
+    def report_webpage(self, url):
+        """Report downloading page"""
+        self._downloader.to_screen(u'[youjizz] Downloaded page: %s' % url)
+
+    def report_title(self, video_title):
+        """Report video title"""
+        self._downloader.to_screen(u'[youjizz] Title: %s' % video_title)
+
+    def report_embed_page(self, embed_page):
+        """Report embed page URL"""
+        self._downloader.to_screen(u'[youjizz] Embed Page: %s' % embed_page)
+
+    def _real_extract(self, url):
+        # Get webpage content
+        try:
+            webpage = urllib2.urlopen(url).read()
+        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
+            return
+        self.report_webpage(url)
+
+        # Get the video title
+        result = re.search(self.VIDEO_TITLE_RE, webpage)
+        if result is None:
+            self._downloader.trouble(u'ERROR: unable to extract video title')
+            return
+        video_title = result.group('title').decode('utf-8').strip()
+        self.report_title(video_title)
+
+        # Get the embed page
+        result = re.search(self.EMBED_PAGE_RE, webpage)
+        if result is None:
+            self._downloader.trouble(u'ERROR: unable to extract embed page')
+            return
+
+        embed_page_url = result.group(0).decode('utf-8').strip()
+        video_id = result.group('videoid').decode('utf-8')
+        self.report_embed_page(embed_page_url)
+
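+        # Fetch the embed page; it contains the actual video URL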
+        try:
+            webpage = urllib2.urlopen(embed_page_url).read()
+        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+            self._downloader.trouble(u'ERROR: unable to download video embed page: %s' % err)
+            return
+
+        # Get the video URL
+        result = re.search(self.SOURCE_RE, webpage)
+        if result is None:
+            self._downloader.trouble(u'ERROR: unable to extract video url')
+            return
+        video_url = result.group('source').decode('utf-8')
+        self.report_extract_entry(video_url)
+
+        info = {'id': video_id,
+                'url': video_url,
+                'uploader': None,
+                'upload_date': None,
+                'title': video_title,
+                'ext': 'flv',
+                'format': 'flv',
+                'thumbnail': None,
+                'description': None,
+                'player_url': embed_page_url}
+
+        return [info]
+
+
 def gen_extractors():
     """ Return a list of an instance of every supported extractor.
     The order does matter; the first extractor matched is the one handling the URL.
@@ -3768,6 +4093,9 @@ def gen_extractors():
         MTVIE(),
         YoukuIE(),
         XNXXIE(),
+        YouJizzIE(),
+        PornotubeIE(),
+        YouPornIE(),
         GooglePlusIE(),
         ArteTvIE(),
         NBAIE(),