|
@@ -25,6 +25,7 @@ from ..compat import (
|
|
|
compat_getpass,
|
|
|
compat_integer_types,
|
|
|
compat_http_client,
|
|
|
+ compat_kwargs,
|
|
|
compat_map as map,
|
|
|
compat_open as open,
|
|
|
compat_os_name,
|
|
@@ -1102,6 +1103,60 @@ class InfoExtractor(object):
|
|
|
self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
|
|
|
return None
|
|
|
|
|
|
def _search_json(self, start_pattern, string, name, video_id, **kwargs):
    """Search string for the JSON object introduced by start_pattern and parse it.

    Keyword-only style arguments (taken from **kwargs so that the
    signature stays valid on Python 2):

    end_pattern       regex expected after the JSON text; only used to
                      reduce the size of the initial match (default '')
    contains_pattern  regex matching the JSON text itself
                      (default r'{[\\s\\S]+}')
    fatal             whether a failure raises ExtractorError (default
                      True); forced to False when a default is given
    default           value returned on failure; when omitted, {} is
                      returned on a non-fatal failure
    transform_source  optional callable applied once to the matched text
                      before it is parsed

    Any remaining keyword arguments are forwarded to _parse_json().

    Returns the parsed JSON value, or the default when the text can't be
    found or parsed and the failure is not fatal.
    Raises ExtractorError if the matched text can't be parsed and fatal
    is in effect; a failed search is reported by _search_regex() according
    to the same fatal/default settings.
    """
    # Intended signature:
    # self, start_pattern, string, name, video_id, *, end_pattern='',
    # contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT
    end_pattern = kwargs.pop('end_pattern', '')
    # [\s\S] simulates a dot matching newlines, i.e. (?s:.), which is
    # not available as an inline group in older regex engines
    contains_pattern = kwargs.pop('contains_pattern', r'{[\s\S]+}')
    fatal = kwargs.pop('fatal', True)
    default = kwargs.pop('default', NO_DEFAULT)
    transform_source = kwargs.pop('transform_source', None)

    if default is NO_DEFAULT:
        default, has_default = {}, False
    else:
        # an explicit default implies that failure is never fatal
        fatal, has_default = False, True

    json_string = self._search_regex(
        r'(?:{0})\s*(?P<json>{1})\s*(?:{2})'.format(
            start_pattern, contains_pattern, end_pattern),
        string, name, group='json', fatal=fatal,
        default=None if has_default else NO_DEFAULT)
    if not json_string:
        return default

    # yt-dlp has a special JSON parser that allows trailing text.
    # Until that arrives here, the diagnostic from the exception
    # raised by json.loads() is used to extract the wanted text.
    # Either way, it's a problem if a transform_source() can't
    # handle the trailing text.

    # force _parse_json() to raise, so that its diagnostic can be inspected
    kwargs['fatal'] = True

    error = None
    for _ in range(2):
        try:
            if transform_source:
                json_string = transform_source(json_string)
                # the transformation must not be applied again on the retry
                transform_source = None
            return self._parse_json(json_string, video_id, **compat_kwargs(kwargs))
        except ExtractorError as e:
            if error is not None:
                # The truncated text failed as well: give up and report
                # the first (more informative) error below, instead of
                # silently falling out of the loop.
                break
            error = e
            end = int_or_none(self._search_regex(
                r'\(char\s+(\d+)', error_to_compat_str(e), 'end', default=None))
            if end is None:
                # no position in the diagnostic: nothing to retry with
                break
            # keep only the text before the reported error position and retry once
            json_string = json_string[:end]

    # Only reached when parsing failed; `error` is the first failure
    msg = 'Unable to extract {0} - Failed to parse JSON'.format(name)
    if fatal:
        raise ExtractorError(msg, cause=error.cause, video_id=video_id)
    if not has_default:
        self.report_warning(
            '{0}: {1}'.format(msg, error_to_compat_str(error)), video_id=video_id)
    return default
|
|
|
+
|
|
|
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
|
|
|
"""
|
|
|
Like _search_regex, but strips HTML tags and unescapes entities.
|