|
@@ -245,6 +245,10 @@ class InfoExtractor(object):
|
|
specified in the URL.
|
|
specified in the URL.
|
|
end_time: Time in seconds where the reproduction should end, as
|
|
end_time: Time in seconds where the reproduction should end, as
|
|
specified in the URL.
|
|
specified in the URL.
|
|
|
|
+ chapters: A list of dictionaries, with the following entries:
|
|
|
|
+ * "start_time" - The start time of the chapter in seconds
|
|
|
|
+ * "end_time" - The end time of the chapter in seconds
|
|
|
|
+ * "title" (optional, string)
|
|
|
|
|
|
The following fields should only be used when the video belongs to some logical
|
|
The following fields should only be used when the video belongs to some logical
|
|
chapter or section:
|
|
chapter or section:
|
|
@@ -976,6 +980,23 @@ class InfoExtractor(object):
|
|
return info
|
|
return info
|
|
if isinstance(json_ld, dict):
|
|
if isinstance(json_ld, dict):
|
|
json_ld = [json_ld]
|
|
json_ld = [json_ld]
|
|
|
|
+
|
|
|
|
+ def extract_video_object(e):
|
|
|
|
+ assert e['@type'] == 'VideoObject'
|
|
|
|
+ info.update({
|
|
|
|
+ 'url': e.get('contentUrl'),
|
|
|
|
+ 'title': unescapeHTML(e.get('name')),
|
|
|
|
+ 'description': unescapeHTML(e.get('description')),
|
|
|
|
+ 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
|
|
|
|
+ 'duration': parse_duration(e.get('duration')),
|
|
|
|
+ 'timestamp': unified_timestamp(e.get('uploadDate')),
|
|
|
|
+ 'filesize': float_or_none(e.get('contentSize')),
|
|
|
|
+ 'tbr': int_or_none(e.get('bitrate')),
|
|
|
|
+ 'width': int_or_none(e.get('width')),
|
|
|
|
+ 'height': int_or_none(e.get('height')),
|
|
|
|
+ 'view_count': int_or_none(e.get('interactionCount')),
|
|
|
|
+ })
|
|
|
|
+
|
|
for e in json_ld:
|
|
for e in json_ld:
|
|
if e.get('@context') == 'http://schema.org':
|
|
if e.get('@context') == 'http://schema.org':
|
|
item_type = e.get('@type')
|
|
item_type = e.get('@type')
|
|
@@ -1000,18 +1021,11 @@ class InfoExtractor(object):
|
|
'description': unescapeHTML(e.get('articleBody')),
|
|
'description': unescapeHTML(e.get('articleBody')),
|
|
})
|
|
})
|
|
elif item_type == 'VideoObject':
|
|
elif item_type == 'VideoObject':
|
|
- info.update({
|
|
|
|
- 'url': e.get('contentUrl'),
|
|
|
|
- 'title': unescapeHTML(e.get('name')),
|
|
|
|
- 'description': unescapeHTML(e.get('description')),
|
|
|
|
- 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
|
|
|
|
- 'duration': parse_duration(e.get('duration')),
|
|
|
|
- 'timestamp': unified_timestamp(e.get('uploadDate')),
|
|
|
|
- 'filesize': float_or_none(e.get('contentSize')),
|
|
|
|
- 'tbr': int_or_none(e.get('bitrate')),
|
|
|
|
- 'width': int_or_none(e.get('width')),
|
|
|
|
- 'height': int_or_none(e.get('height')),
|
|
|
|
- })
|
|
|
|
|
|
+ extract_video_object(e)
|
|
|
|
+ elif item_type == 'WebPage':
|
|
|
|
+ video = e.get('video')
|
|
|
|
+ if isinstance(video, dict) and video.get('@type') == 'VideoObject':
|
|
|
|
+ extract_video_object(video)
|
|
break
|
|
break
|
|
return dict((k, v) for k, v in info.items() if v is not None)
|
|
return dict((k, v) for k, v in info.items() if v is not None)
|
|
|
|
|
|
@@ -1303,40 +1317,50 @@ class InfoExtractor(object):
|
|
entry_protocol='m3u8', preference=None,
|
|
entry_protocol='m3u8', preference=None,
|
|
m3u8_id=None, note=None, errnote=None,
|
|
m3u8_id=None, note=None, errnote=None,
|
|
fatal=True, live=False):
|
|
fatal=True, live=False):
|
|
-
|
|
|
|
res = self._download_webpage_handle(
|
|
res = self._download_webpage_handle(
|
|
m3u8_url, video_id,
|
|
m3u8_url, video_id,
|
|
note=note or 'Downloading m3u8 information',
|
|
note=note or 'Downloading m3u8 information',
|
|
errnote=errnote or 'Failed to download m3u8 information',
|
|
errnote=errnote or 'Failed to download m3u8 information',
|
|
fatal=fatal)
|
|
fatal=fatal)
|
|
|
|
+
|
|
if res is False:
|
|
if res is False:
|
|
return []
|
|
return []
|
|
|
|
+
|
|
m3u8_doc, urlh = res
|
|
m3u8_doc, urlh = res
|
|
m3u8_url = urlh.geturl()
|
|
m3u8_url = urlh.geturl()
|
|
|
|
|
|
|
|
+ return self._parse_m3u8_formats(
|
|
|
|
+ m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
|
|
|
|
+ preference=preference, m3u8_id=m3u8_id, live=live)
|
|
|
|
+
|
|
|
|
+ def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
|
|
|
|
+ entry_protocol='m3u8', preference=None,
|
|
|
|
+ m3u8_id=None, live=False):
|
|
if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
|
|
if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
|
|
return []
|
|
return []
|
|
|
|
|
|
- formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
|
|
|
|
|
|
+ formats = []
|
|
|
|
|
|
format_url = lambda u: (
|
|
format_url = lambda u: (
|
|
u
|
|
u
|
|
if re.match(r'^https?://', u)
|
|
if re.match(r'^https?://', u)
|
|
else compat_urlparse.urljoin(m3u8_url, u))
|
|
else compat_urlparse.urljoin(m3u8_url, u))
|
|
|
|
|
|
- # We should try extracting formats only from master playlists [1], i.e.
|
|
|
|
- # playlists that describe available qualities. On the other hand media
|
|
|
|
- # playlists [2] should be returned as is since they contain just the media
|
|
|
|
- # without qualities renditions.
|
|
|
|
|
|
+ # References:
|
|
|
|
+ # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
|
|
|
|
+ # 2. https://github.com/rg3/youtube-dl/issues/12211
|
|
|
|
+
|
|
|
|
+ # We should try extracting formats only from master playlists [1, 4.3.4],
|
|
|
|
+ # i.e. playlists that describe available qualities. On the other hand
|
|
|
|
+ # media playlists [1, 4.3.3] should be returned as is since they contain
|
|
|
|
+ # just the media without quality renditions.
|
|
# Fortunately, master playlist can be easily distinguished from media
|
|
# Fortunately, master playlist can be easily distinguished from media
|
|
- # playlist based on particular tags availability. As of [1, 2] master
|
|
|
|
- # playlist tags MUST NOT appear in a media playist and vice versa.
|
|
|
|
- # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
|
|
|
|
- # and MUST NOT appear in master playlist thus we can clearly detect media
|
|
|
|
- # playlist with this criterion.
|
|
|
|
- # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
|
|
|
|
- # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
|
|
|
|
- # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
|
|
|
|
|
|
+ # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
|
|
|
|
+ # master playlist tags MUST NOT appear in a media playlist and vice versa.
|
|
|
|
+ # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
|
|
|
|
+ # media playlist and MUST NOT appear in master playlist thus we can
|
|
|
|
+ # clearly detect media playlist with this criterion.
|
|
|
|
+
|
|
if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
|
|
if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
|
|
return [{
|
|
return [{
|
|
'url': m3u8_url,
|
|
'url': m3u8_url,
|
|
@@ -1345,52 +1369,72 @@ class InfoExtractor(object):
|
|
'protocol': entry_protocol,
|
|
'protocol': entry_protocol,
|
|
'preference': preference,
|
|
'preference': preference,
|
|
}]
|
|
}]
|
|
- audio_in_video_stream = {}
|
|
|
|
- last_info = {}
|
|
|
|
- last_media = {}
|
|
|
|
|
|
+
|
|
|
|
+ groups = {}
|
|
|
|
+ last_stream_inf = {}
|
|
|
|
+
|
|
|
|
+ def extract_media(x_media_line):
|
|
|
|
+ media = parse_m3u8_attributes(x_media_line)
|
|
|
|
+ # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
|
|
|
|
+ media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
|
|
|
|
+ if not (media_type and group_id and name):
|
|
|
|
+ return
|
|
|
|
+ groups.setdefault(group_id, []).append(media)
|
|
|
|
+ if media_type not in ('VIDEO', 'AUDIO'):
|
|
|
|
+ return
|
|
|
|
+ media_url = media.get('URI')
|
|
|
|
+ if media_url:
|
|
|
|
+ format_id = []
|
|
|
|
+ for v in (group_id, name):
|
|
|
|
+ if v:
|
|
|
|
+ format_id.append(v)
|
|
|
|
+ f = {
|
|
|
|
+ 'format_id': '-'.join(format_id),
|
|
|
|
+ 'url': format_url(media_url),
|
|
|
|
+ 'manifest_url': m3u8_url,
|
|
|
|
+ 'language': media.get('LANGUAGE'),
|
|
|
|
+ 'ext': ext,
|
|
|
|
+ 'protocol': entry_protocol,
|
|
|
|
+ 'preference': preference,
|
|
|
|
+ }
|
|
|
|
+ if media_type == 'AUDIO':
|
|
|
|
+ f['vcodec'] = 'none'
|
|
|
|
+ formats.append(f)
|
|
|
|
+
|
|
|
|
+ def build_stream_name():
|
|
|
|
+ # Although the specification does not mention the NAME attribute for
|
|
|
|
+ # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
|
|
|
|
+ # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
|
|
|
|
+ # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
|
|
|
|
+ stream_name = last_stream_inf.get('NAME')
|
|
|
|
+ if stream_name:
|
|
|
|
+ return stream_name
|
|
|
|
+ # If there is no NAME in EXT-X-STREAM-INF it will be obtained
|
|
|
|
+ # from corresponding rendition group
|
|
|
|
+ stream_group_id = last_stream_inf.get('VIDEO')
|
|
|
|
+ if not stream_group_id:
|
|
|
|
+ return
|
|
|
|
+ stream_group = groups.get(stream_group_id)
|
|
|
|
+ if not stream_group:
|
|
|
|
+ return stream_group_id
|
|
|
|
+ rendition = stream_group[0]
|
|
|
|
+ return rendition.get('NAME') or stream_group_id
|
|
|
|
+
|
|
for line in m3u8_doc.splitlines():
|
|
for line in m3u8_doc.splitlines():
|
|
if line.startswith('#EXT-X-STREAM-INF:'):
|
|
if line.startswith('#EXT-X-STREAM-INF:'):
|
|
- last_info = parse_m3u8_attributes(line)
|
|
|
|
|
|
+ last_stream_inf = parse_m3u8_attributes(line)
|
|
elif line.startswith('#EXT-X-MEDIA:'):
|
|
elif line.startswith('#EXT-X-MEDIA:'):
|
|
- media = parse_m3u8_attributes(line)
|
|
|
|
- media_type = media.get('TYPE')
|
|
|
|
- if media_type in ('VIDEO', 'AUDIO'):
|
|
|
|
- group_id = media.get('GROUP-ID')
|
|
|
|
- media_url = media.get('URI')
|
|
|
|
- if media_url:
|
|
|
|
- format_id = []
|
|
|
|
- for v in (group_id, media.get('NAME')):
|
|
|
|
- if v:
|
|
|
|
- format_id.append(v)
|
|
|
|
- f = {
|
|
|
|
- 'format_id': '-'.join(format_id),
|
|
|
|
- 'url': format_url(media_url),
|
|
|
|
- 'language': media.get('LANGUAGE'),
|
|
|
|
- 'ext': ext,
|
|
|
|
- 'protocol': entry_protocol,
|
|
|
|
- 'preference': preference,
|
|
|
|
- }
|
|
|
|
- if media_type == 'AUDIO':
|
|
|
|
- f['vcodec'] = 'none'
|
|
|
|
- if group_id and not audio_in_video_stream.get(group_id):
|
|
|
|
- audio_in_video_stream[group_id] = False
|
|
|
|
- formats.append(f)
|
|
|
|
- else:
|
|
|
|
- # When there is no URI in EXT-X-MEDIA let this tag's
|
|
|
|
- # data be used by regular URI lines below
|
|
|
|
- last_media = media
|
|
|
|
- if media_type == 'AUDIO' and group_id:
|
|
|
|
- audio_in_video_stream[group_id] = True
|
|
|
|
|
|
+ extract_media(line)
|
|
elif line.startswith('#') or not line.strip():
|
|
elif line.startswith('#') or not line.strip():
|
|
continue
|
|
continue
|
|
else:
|
|
else:
|
|
- tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
|
|
|
|
|
|
+ tbr = float_or_none(
|
|
|
|
+ last_stream_inf.get('AVERAGE-BANDWIDTH') or
|
|
|
|
+ last_stream_inf.get('BANDWIDTH'), scale=1000)
|
|
format_id = []
|
|
format_id = []
|
|
if m3u8_id:
|
|
if m3u8_id:
|
|
format_id.append(m3u8_id)
|
|
format_id.append(m3u8_id)
|
|
- # Despite specification does not mention NAME attribute for
|
|
|
|
- # EXT-X-STREAM-INF it still sometimes may be present
|
|
|
|
- stream_name = last_info.get('NAME') or last_media.get('NAME')
|
|
|
|
|
|
+ stream_name = build_stream_name()
|
|
# Bandwidth of live streams may differ over time thus making
|
|
# Bandwidth of live streams may differ over time thus making
|
|
# format_id unpredictable. So it's better to keep provided
|
|
# format_id unpredictable. So it's better to keep provided
|
|
# format_id intact.
|
|
# format_id intact.
|
|
@@ -1400,14 +1444,14 @@ class InfoExtractor(object):
|
|
f = {
|
|
f = {
|
|
'format_id': '-'.join(format_id),
|
|
'format_id': '-'.join(format_id),
|
|
'url': manifest_url,
|
|
'url': manifest_url,
|
|
- 'manifest_url': manifest_url,
|
|
|
|
|
|
+ 'manifest_url': m3u8_url,
|
|
'tbr': tbr,
|
|
'tbr': tbr,
|
|
'ext': ext,
|
|
'ext': ext,
|
|
- 'fps': float_or_none(last_info.get('FRAME-RATE')),
|
|
|
|
|
|
+ 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
|
|
'protocol': entry_protocol,
|
|
'protocol': entry_protocol,
|
|
'preference': preference,
|
|
'preference': preference,
|
|
}
|
|
}
|
|
- resolution = last_info.get('RESOLUTION')
|
|
|
|
|
|
+ resolution = last_stream_inf.get('RESOLUTION')
|
|
if resolution:
|
|
if resolution:
|
|
mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
|
|
mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
|
|
if mobj:
|
|
if mobj:
|
|
@@ -1423,13 +1467,26 @@ class InfoExtractor(object):
|
|
'vbr': vbr,
|
|
'vbr': vbr,
|
|
'abr': abr,
|
|
'abr': abr,
|
|
})
|
|
})
|
|
- f.update(parse_codecs(last_info.get('CODECS')))
|
|
|
|
- if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
|
|
|
|
- # TODO: update acodec for audio only formats with the same GROUP-ID
|
|
|
|
- f['acodec'] = 'none'
|
|
|
|
|
|
+ codecs = parse_codecs(last_stream_inf.get('CODECS'))
|
|
|
|
+ f.update(codecs)
|
|
|
|
+ audio_group_id = last_stream_inf.get('AUDIO')
|
|
|
|
+ # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
|
|
|
|
+ # references a rendition group MUST have a CODECS attribute.
|
|
|
|
+ # However, this is not always respected, for example, [2]
|
|
|
|
+ # contains EXT-X-STREAM-INF tag which references AUDIO
|
|
|
|
+ # rendition group but does not have CODECS and despite
|
|
|
|
+ # referencing an audio group, it represents
|
|
|
|
+ # a complete (with audio and video) format. So, for such cases
|
|
|
|
+ # we will ignore references to rendition groups and treat them
|
|
|
|
+ # as complete formats.
|
|
|
|
+ if audio_group_id and codecs and f.get('vcodec') != 'none':
|
|
|
|
+ audio_group = groups.get(audio_group_id)
|
|
|
|
+ if audio_group and audio_group[0].get('URI'):
|
|
|
|
+ # TODO: update acodec for audio only formats with
|
|
|
|
+ # the same GROUP-ID
|
|
|
|
+ f['acodec'] = 'none'
|
|
formats.append(f)
|
|
formats.append(f)
|
|
- last_info = {}
|
|
|
|
- last_media = {}
|
|
|
|
|
|
+ last_stream_inf = {}
|
|
return formats
|
|
return formats
|
|
|
|
|
|
@staticmethod
|
|
@staticmethod
|
|
@@ -1803,7 +1860,7 @@ class InfoExtractor(object):
|
|
'ext': mimetype2ext(mime_type),
|
|
'ext': mimetype2ext(mime_type),
|
|
'width': int_or_none(representation_attrib.get('width')),
|
|
'width': int_or_none(representation_attrib.get('width')),
|
|
'height': int_or_none(representation_attrib.get('height')),
|
|
'height': int_or_none(representation_attrib.get('height')),
|
|
- 'tbr': int_or_none(bandwidth, 1000),
|
|
|
|
|
|
+ 'tbr': float_or_none(bandwidth, 1000),
|
|
'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
|
|
'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
|
|
'fps': int_or_none(representation_attrib.get('frameRate')),
|
|
'fps': int_or_none(representation_attrib.get('frameRate')),
|
|
'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
|
|
'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
|
|
@@ -2182,7 +2239,7 @@ class InfoExtractor(object):
|
|
|
|
|
|
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
|
|
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
|
|
mobj = re.search(
|
|
mobj = re.search(
|
|
- r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
|
|
|
|
|
|
+ r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
|
|
webpage)
|
|
webpage)
|
|
if mobj:
|
|
if mobj:
|
|
try:
|
|
try:
|
|
@@ -2258,11 +2315,17 @@ class InfoExtractor(object):
|
|
|
|
|
|
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
|
|
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
|
|
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
|
|
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
|
|
|
|
+ urls = []
|
|
formats = []
|
|
formats = []
|
|
for source in jwplayer_sources_data:
|
|
for source in jwplayer_sources_data:
|
|
- source_url = self._proto_relative_url(source['file'])
|
|
|
|
|
|
+ source_url = self._proto_relative_url(source.get('file'))
|
|
|
|
+ if not source_url:
|
|
|
|
+ continue
|
|
if base_url:
|
|
if base_url:
|
|
source_url = compat_urlparse.urljoin(base_url, source_url)
|
|
source_url = compat_urlparse.urljoin(base_url, source_url)
|
|
|
|
+ if source_url in urls:
|
|
|
|
+ continue
|
|
|
|
+ urls.append(source_url)
|
|
source_type = source.get('type') or ''
|
|
source_type = source.get('type') or ''
|
|
ext = mimetype2ext(source_type) or determine_ext(source_url)
|
|
ext = mimetype2ext(source_type) or determine_ext(source_url)
|
|
if source_type == 'hls' or ext == 'm3u8':
|
|
if source_type == 'hls' or ext == 'm3u8':
|