mirror of
https://github.com/ytdl-org/youtube-dl.git
synced 2024-01-07 17:16:08 +00:00
[InfoExtractor] Add search methods for Next/Nuxt.js from yt-dlp
* add _search_nextjs_data(), from https://github.com/yt-dlp/yt-dlp/pull/1386
thanks selfisekai
* add _search_nuxt_data(), from https://github.com/yt-dlp/yt-dlp/pull/1921,
thanks Lesmiscore, pukkandan
* add tests for the above
* also fix HTML5 type recognition and tests, from commit 222a230871,
thanks Lesmiscore
* update extractors in PR using above, fix tests.
This commit is contained in:
parent
8465222041
commit
b2741f2654
|
@ -7,15 +7,33 @@ import io
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
from test.helper import FakeYDL, expect_dict, expect_value, http_server_port
|
|
||||||
from youtube_dl.compat import compat_etree_fromstring, compat_http_server
|
|
||||||
from youtube_dl.extractor.common import InfoExtractor
|
|
||||||
from youtube_dl.extractor import YoutubeIE, get_info_extractor
|
|
||||||
from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError
|
|
||||||
import threading
|
import threading
|
||||||
|
|
||||||
|
from test.helper import (
|
||||||
|
expect_dict,
|
||||||
|
expect_value,
|
||||||
|
FakeYDL,
|
||||||
|
http_server_port,
|
||||||
|
)
|
||||||
|
from youtube_dl.compat import (
|
||||||
|
compat_etree_fromstring,
|
||||||
|
compat_http_server,
|
||||||
|
)
|
||||||
|
from youtube_dl.extractor.common import InfoExtractor
|
||||||
|
from youtube_dl.extractor import (
|
||||||
|
get_info_extractor,
|
||||||
|
YoutubeIE,
|
||||||
|
)
|
||||||
|
from youtube_dl.utils import (
|
||||||
|
encode_data_uri,
|
||||||
|
ExtractorError,
|
||||||
|
RegexNotFoundError,
|
||||||
|
strip_jsonp,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
TEAPOT_RESPONSE_STATUS = 418
|
TEAPOT_RESPONSE_STATUS = 418
|
||||||
TEAPOT_RESPONSE_BODY = "<h1>418 I'm a teapot</h1>"
|
TEAPOT_RESPONSE_BODY = "<h1>418 I'm a teapot</h1>"
|
||||||
|
@ -100,6 +118,71 @@ class TestInfoExtractor(unittest.TestCase):
|
||||||
self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True)
|
self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True)
|
||||||
self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True)
|
self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True)
|
||||||
|
|
||||||
|
def test_search_nextjs_data(self):
|
||||||
|
html = '''
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta http-equiv="content-type" content=
|
||||||
|
"text/html; charset=utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width">
|
||||||
|
<title>Test _search_nextjs_data()</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div id="__next">
|
||||||
|
<div style="background-color:#17171E" class="FU" dir="ltr">
|
||||||
|
<div class="sc-93de261d-0 dyzzYE">
|
||||||
|
<div>
|
||||||
|
<header class="HD"></header>
|
||||||
|
<main class="MN">
|
||||||
|
<div style="height:0" class="HT0">
|
||||||
|
<div style="width:NaN%" data-testid=
|
||||||
|
"stream-container" class="WDN"></div>
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="sc-6e5faf91-0 dEGaHS"></footer>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<script id="__NEXT_DATA__" type="application/json">
|
||||||
|
{"props":{"pageProps":{"video":{"id":"testid"}}}}
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
'''
|
||||||
|
search = self.ie._search_nextjs_data(html, 'testID')
|
||||||
|
self.assertEqual(search['props']['pageProps']['video']['id'], 'testid')
|
||||||
|
|
||||||
|
def test_search_nuxt_data(self):
|
||||||
|
html = '''
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta http-equiv="content-type" content=
|
||||||
|
"text/html; charset=utf-8">
|
||||||
|
<title>Nuxt.js Test Page</title>
|
||||||
|
<meta name="viewport" content=
|
||||||
|
"width=device-width, initial-scale=1">
|
||||||
|
<meta data-hid="robots" name="robots" content="all">
|
||||||
|
</head>
|
||||||
|
<body class="BD">
|
||||||
|
<div id="__layout">
|
||||||
|
<h1 class="H1">Example heading</h1>
|
||||||
|
<div class="IN">
|
||||||
|
<p>Decoy text</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
window.__NUXT__=(function(a,b,c,d,e,f,g,h){return {decoy:" default",data:[{track:{id:f,title:g}}]}}(null,null,"c",null,null,"testid","Nuxt.js title",null));
|
||||||
|
</script>
|
||||||
|
<script src="/_nuxt/a12345b.js" defer="defer"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
'''
|
||||||
|
search = self.ie._search_nuxt_data(html, 'testID')
|
||||||
|
self.assertEqual(search['track']['id'], 'testid')
|
||||||
|
|
||||||
def test_search_json_ld_realworld(self):
|
def test_search_json_ld_realworld(self):
|
||||||
# https://github.com/ytdl-org/youtube-dl/issues/23306
|
# https://github.com/ytdl-org/youtube-dl/issues/23306
|
||||||
expect_dict(
|
expect_dict(
|
||||||
|
@ -348,6 +431,24 @@ class TestInfoExtractor(unittest.TestCase):
|
||||||
}],
|
}],
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# from https://0000.studio/
|
||||||
|
# with type attribute but without extension in URL
|
||||||
|
expect_dict(
|
||||||
|
self,
|
||||||
|
self.ie._parse_html5_media_entries(
|
||||||
|
'https://0000.studio',
|
||||||
|
r'''
|
||||||
|
<video src="https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92"
|
||||||
|
controls="controls" type="video/mp4" preload="metadata" autoplay="autoplay" playsinline class="object-contain">
|
||||||
|
</video>
|
||||||
|
''', None)[0],
|
||||||
|
{
|
||||||
|
'formats': [{
|
||||||
|
'url': 'https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92',
|
||||||
|
'ext': 'mp4',
|
||||||
|
}],
|
||||||
|
})
|
||||||
|
|
||||||
def test_extract_jwplayer_data_realworld(self):
|
def test_extract_jwplayer_data_realworld(self):
|
||||||
# from http://www.suffolk.edu/sjc/
|
# from http://www.suffolk.edu/sjc/
|
||||||
expect_dict(
|
expect_dict(
|
||||||
|
|
|
@ -35,13 +35,6 @@ class ClipchampIE(InfoExtractor):
|
||||||
_STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s'
|
_STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s'
|
||||||
_STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'}
|
_STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'}
|
||||||
|
|
||||||
def _search_nextjs_data(self, webpage, video_id, **kw):
|
|
||||||
return self._parse_json(
|
|
||||||
self._search_regex(
|
|
||||||
r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
|
|
||||||
webpage, 'next.js data', **kw),
|
|
||||||
video_id, **kw)
|
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
video_id = self._match_id(url)
|
video_id = self._match_id(url)
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
|
|
|
@ -3,6 +3,7 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
import datetime
|
import datetime
|
||||||
|
import functools
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import netrc
|
import netrc
|
||||||
|
@ -23,6 +24,7 @@ from ..compat import (
|
||||||
compat_getpass,
|
compat_getpass,
|
||||||
compat_integer_types,
|
compat_integer_types,
|
||||||
compat_http_client,
|
compat_http_client,
|
||||||
|
compat_map as map,
|
||||||
compat_os_name,
|
compat_os_name,
|
||||||
compat_str,
|
compat_str,
|
||||||
compat_urllib_error,
|
compat_urllib_error,
|
||||||
|
@ -31,6 +33,7 @@ from ..compat import (
|
||||||
compat_urllib_request,
|
compat_urllib_request,
|
||||||
compat_urlparse,
|
compat_urlparse,
|
||||||
compat_xml_parse_error,
|
compat_xml_parse_error,
|
||||||
|
compat_zip as zip,
|
||||||
)
|
)
|
||||||
from ..downloader.f4m import (
|
from ..downloader.f4m import (
|
||||||
get_base_url,
|
get_base_url,
|
||||||
|
@ -70,6 +73,7 @@ from ..utils import (
|
||||||
str_or_none,
|
str_or_none,
|
||||||
str_to_int,
|
str_to_int,
|
||||||
strip_or_none,
|
strip_or_none,
|
||||||
|
traverse_obj,
|
||||||
try_get,
|
try_get,
|
||||||
unescapeHTML,
|
unescapeHTML,
|
||||||
unified_strdate,
|
unified_strdate,
|
||||||
|
@ -1349,6 +1353,44 @@ class InfoExtractor(object):
|
||||||
break
|
break
|
||||||
return dict((k, v) for k, v in info.items() if v is not None)
|
return dict((k, v) for k, v in info.items() if v is not None)
|
||||||
|
|
||||||
|
def _search_nextjs_data(self, webpage, video_id, **kw):
    """Find and parse the Next.js `__NEXT_DATA__` JSON blob in a page.

    Looks for a <script> element whose id attribute is __NEXT_DATA__ and
    parses its text content as JSON.

    webpage   -- HTML text to search
    video_id  -- ID passed on to _parse_json()
    **kw      -- `transform_source` is forwarded to _parse_json() only,
                 `fatal` to both _search_regex() and _parse_json(), and
                 any other keyword (eg `default`) to _search_regex() only

    Returns the parsed data, or {} when no (or an empty) script body was
    found, or when parsing produced nothing.
    """
    # the only keywords forwarded to _parse_json()
    parse_kw = dict((k, v) for k, v in kw.items() if k in ('transform_source', 'fatal'))
    # transform_source is not meant for _search_regex()
    kw.pop('transform_source', None)
    next_data = self._search_regex(
        # the body is matched lazily up to the closing tag, so that a
        # literal `<` inside the JSON text does not defeat the match
        r'''<script[^>]+\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>(?P<nd>[\s\S]+?)</script>''',
        webpage, 'next.js data', group='nd', **kw)
    if not next_data:
        return {}
    # like _search_nuxt_data(), always hand back a dict-like result, so
    # that a non-fatal parse failure looks the same as missing data
    return self._parse_json(next_data, video_id, **parse_kw) or {}
|
||||||
|
|
||||||
|
def _search_nuxt_data(self, webpage, video_id, *args, **kwargs):
    """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function

    The page is expected to contain something like
        window.__NUXT__=(function(a,b){return {...}}(val_a,val_b));
    The call's argument values are substituted for the parameter names in
    the returned object literal, which is then parsed as JSON.

    Emulated signature (no keyword-only arguments in Python 2):
        self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)

    webpage       -- HTML text to search
    video_id      -- ID passed on to _parse_json()
    context_name  -- name of the JS variable holding the data
    fatal         -- when false, a failed search yields {}; it is also
                     passed as `fatal` to _parse_json()
    traverse      -- path applied to the parsed object with traverse_obj()

    Returns the traversed data, or {} if there is none.
    """

    context_name = args[0] if len(args) > 0 else kwargs.get('context_name', '__NUXT__')
    fatal = kwargs.get('fatal', True)
    traverse = kwargs.get('traverse', ('data', 0))

    re_ctx = re.escape(context_name)

    # (function(<arg_keys>){return <js>}(<arg_vals>)
    FUNCTION_RE = (r'\(\s*function\s*\((?P<arg_keys>[\s\S]*?)\)\s*\{\s*'
                   r'return\s+(?P<js>\{[\s\S]*?})\s*;?\s*}\s*\((?P<arg_vals>[\s\S]*?)\)')

    js, arg_keys, arg_vals = self._search_regex(
        (p.format(re_ctx, FUNCTION_RE) for p in
         (r'<script>\s*window\s*\.\s*{0}\s*=\s*{1}\s*\)\s*;?\s*</script>',
          r'{0}\s*\([\s\S]*?{1}')),
        webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
        default=NO_DEFAULT if fatal else (None, None, None))
    if js is None:
        return {}

    # the values the function was called with, in call order
    call_vals = self._parse_json(
        '[{0}]'.format(arg_vals), video_id, transform_source=js_to_json, fatal=fatal) or ()
    # FUNCTION_RE tolerates whitespace in the parameter list, so strip it
    # from each name: otherwise `function(a, b)` would give a key ' b'
    # that can never match the identifier `b` in the object literal
    js_vars = dict(zip(
        (key.strip() for key in arg_keys.split(',')),
        (json.dumps(val) for val in call_vals)))

    ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=js_vars), fatal=fatal)
    return traverse_obj(ret, traverse) or {}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _hidden_inputs(html):
|
def _hidden_inputs(html):
|
||||||
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
|
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
|
||||||
|
@ -2496,7 +2538,8 @@ class InfoExtractor(object):
|
||||||
return f
|
return f
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
def _media_formats(src, cur_media_type, type_info={}):
|
def _media_formats(src, cur_media_type, type_info=None):
|
||||||
|
type_info = type_info or {}
|
||||||
full_url = absolute_url(src)
|
full_url = absolute_url(src)
|
||||||
ext = type_info.get('ext') or determine_ext(full_url)
|
ext = type_info.get('ext') or determine_ext(full_url)
|
||||||
if ext == 'm3u8':
|
if ext == 'm3u8':
|
||||||
|
@ -2514,6 +2557,7 @@ class InfoExtractor(object):
|
||||||
formats = [{
|
formats = [{
|
||||||
'url': full_url,
|
'url': full_url,
|
||||||
'vcodec': 'none' if cur_media_type == 'audio' else None,
|
'vcodec': 'none' if cur_media_type == 'audio' else None,
|
||||||
|
'ext': ext,
|
||||||
}]
|
}]
|
||||||
return is_plain_url, formats
|
return is_plain_url, formats
|
||||||
|
|
||||||
|
@ -2522,7 +2566,7 @@ class InfoExtractor(object):
|
||||||
# so we wll include them right here (see
|
# so we wll include them right here (see
|
||||||
# https://www.ampproject.org/docs/reference/components/amp-video)
|
# https://www.ampproject.org/docs/reference/components/amp-video)
|
||||||
# For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
|
# For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
|
||||||
_MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
|
_MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video(?:-js)?|audio)'
|
||||||
media_tags = [(media_tag, media_tag_name, media_type, '')
|
media_tags = [(media_tag, media_tag_name, media_type, '')
|
||||||
for media_tag, media_tag_name, media_type
|
for media_tag, media_tag_name, media_type
|
||||||
in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
|
in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
|
||||||
|
@ -2540,7 +2584,8 @@ class InfoExtractor(object):
|
||||||
media_attributes = extract_attributes(media_tag)
|
media_attributes = extract_attributes(media_tag)
|
||||||
src = strip_or_none(media_attributes.get('src'))
|
src = strip_or_none(media_attributes.get('src'))
|
||||||
if src:
|
if src:
|
||||||
_, formats = _media_formats(src, media_type)
|
f = parse_content_type(media_attributes.get('type'))
|
||||||
|
_, formats = _media_formats(src, media_type, f)
|
||||||
media_info['formats'].extend(formats)
|
media_info['formats'].extend(formats)
|
||||||
media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
|
media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
|
||||||
if media_content:
|
if media_content:
|
||||||
|
|
|
@ -24,13 +24,6 @@ class GlobalPlayerBaseIE(InfoExtractor):
|
||||||
def _match_valid_url(cls, url):
|
def _match_valid_url(cls, url):
|
||||||
return cls.re.match(cls._VALID_URL, url)
|
return cls.re.match(cls._VALID_URL, url)
|
||||||
|
|
||||||
def _search_nextjs_data(self, webpage, video_id, **kw):
|
|
||||||
return self._parse_json(
|
|
||||||
self._search_regex(
|
|
||||||
r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
|
|
||||||
webpage, 'next.js data', **kw),
|
|
||||||
video_id, **kw)
|
|
||||||
|
|
||||||
def _get_page_props(self, url, video_id):
|
def _get_page_props(self, url, video_id):
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
return self._search_nextjs_data(webpage, video_id)['props']['pageProps']
|
return self._search_nextjs_data(webpage, video_id)['props']['pageProps']
|
||||||
|
@ -39,13 +32,14 @@ class GlobalPlayerBaseIE(InfoExtractor):
|
||||||
return urlhandle_detect_ext(self._request_webpage( # Server rejects HEAD requests
|
return urlhandle_detect_ext(self._request_webpage( # Server rejects HEAD requests
|
||||||
url, video_id, note='Determining source extension'))
|
url, video_id, note='Determining source extension'))
|
||||||
|
|
||||||
def _extract_audio(self, episode, series):
|
@staticmethod
def _clean_desc(x):
    """Pass `x` through clean_html() and turn non-breaking spaces in the result into plain spaces"""
    text = clean_html(x)
    # a falsy result (nothing left to clean up) is handed back unchanged
    return text.replace('\xa0', ' ') if text else text
|
||||||
|
|
||||||
def clean_desc(x):
|
def _extract_audio(self, episode, series):
|
||||||
x = clean_html(x)
|
|
||||||
if x:
|
|
||||||
x = x.replace('\xa0', ' ')
|
|
||||||
return x
|
|
||||||
|
|
||||||
return merge_dicts({
|
return merge_dicts({
|
||||||
'vcodec': 'none',
|
'vcodec': 'none',
|
||||||
|
@ -56,7 +50,7 @@ class GlobalPlayerBaseIE(InfoExtractor):
|
||||||
'uploader': 'itunesAuthor', # podcasts only
|
'uploader': 'itunesAuthor', # podcasts only
|
||||||
}), traverse_obj(episode, {
|
}), traverse_obj(episode, {
|
||||||
'id': 'id',
|
'id': 'id',
|
||||||
'description': ('description', T(clean_desc)),
|
'description': ('description', T(self._clean_desc)),
|
||||||
'duration': ('duration', T(parse_duration)),
|
'duration': ('duration', T(parse_duration)),
|
||||||
'thumbnail': 'imageUrl',
|
'thumbnail': 'imageUrl',
|
||||||
'url': 'streamUrl',
|
'url': 'streamUrl',
|
||||||
|
@ -141,9 +135,9 @@ class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE):
|
||||||
'ext': 'aac',
|
'ext': 'aac',
|
||||||
# 'live_status': 'is_live',
|
# 'live_status': 'is_live',
|
||||||
'is_live': True,
|
'is_live': True,
|
||||||
'description': 'md5:e10f5e10b01a7f2c14ba815509fbb38d',
|
'description': r're:(?s).+\bclassical\b.+\bClassic FM Hall [oO]f Fame\b',
|
||||||
'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=',
|
'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=',
|
||||||
'title': 're:^Classic FM Hall of Fame.+$'
|
'title': 're:Classic FM Hall of Fame.+$'
|
||||||
},
|
},
|
||||||
}]
|
}]
|
||||||
|
|
||||||
|
@ -160,7 +154,7 @@ class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE):
|
||||||
'is_live': True,
|
'is_live': True,
|
||||||
}, traverse_obj(station, {
|
}, traverse_obj(station, {
|
||||||
'title': 'title',
|
'title': 'title',
|
||||||
'description': 'description',
|
'description': ('description', T(self._clean_desc)),
|
||||||
'thumbnail': 'image',
|
'thumbnail': 'image',
|
||||||
}), rev=True)
|
}), rev=True)
|
||||||
|
|
||||||
|
@ -177,7 +171,7 @@ class GlobalPlayerAudioIE(GlobalPlayerBaseIE):
|
||||||
'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
|
'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e',
|
||||||
'categories': ['Society & Culture', 'True Crime'],
|
'categories': ['Society & Culture', 'True Crime'],
|
||||||
'uploader': 'Global',
|
'uploader': 'Global',
|
||||||
'description': 'md5:da5b918eac9ae319454a10a563afacf9',
|
'description': r're:(?s).+\bscam\b.+?\bseries available now\b',
|
||||||
},
|
},
|
||||||
}, {
|
}, {
|
||||||
# radio catchup
|
# radio catchup
|
||||||
|
@ -203,7 +197,7 @@ class GlobalPlayerAudioIE(GlobalPlayerBaseIE):
|
||||||
series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))],
|
series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))],
|
||||||
'categories': traverse_obj(series, ('categories', Ellipsis, 'name')) or None,
|
'categories': traverse_obj(series, ('categories', Ellipsis, 'name')) or None,
|
||||||
}, traverse_obj(series, {
|
}, traverse_obj(series, {
|
||||||
'description': 'description',
|
'description': ('description', T(self._clean_desc)),
|
||||||
'thumbnail': 'imageUrl',
|
'thumbnail': 'imageUrl',
|
||||||
'title': 'title',
|
'title': 'title',
|
||||||
'uploader': 'itunesAuthor', # podcasts only
|
'uploader': 'itunesAuthor', # podcasts only
|
||||||
|
|
|
@ -21,7 +21,7 @@ class WhypIE(InfoExtractor):
|
||||||
'url': 'https://cdn.whyp.it/50eb17cc-e9ff-4e18-b89b-dc9206a95cb1.mp3',
|
'url': 'https://cdn.whyp.it/50eb17cc-e9ff-4e18-b89b-dc9206a95cb1.mp3',
|
||||||
'id': '18337',
|
'id': '18337',
|
||||||
'title': 'Home Page Example Track',
|
'title': 'Home Page Example Track',
|
||||||
'description': 'md5:bd758000fb93f3159339c852b5b9133c',
|
'description': r're:(?s).+\bexample track\b',
|
||||||
'ext': 'mp3',
|
'ext': 'mp3',
|
||||||
'duration': 52.82,
|
'duration': 52.82,
|
||||||
'uploader': 'Brad',
|
'uploader': 'Brad',
|
||||||
|
@ -33,29 +33,6 @@ class WhypIE(InfoExtractor):
|
||||||
'only_matching': True,
|
'only_matching': True,
|
||||||
}]
|
}]
|
||||||
|
|
||||||
def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', fatal=True, traverse=('data', 0)):
|
|
||||||
"""Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
|
|
||||||
|
|
||||||
import functools
|
|
||||||
import json
|
|
||||||
import re
|
|
||||||
from ..utils import (js_to_json, NO_DEFAULT)
|
|
||||||
|
|
||||||
re_ctx = re.escape(context_name)
|
|
||||||
FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
|
|
||||||
js, arg_keys, arg_vals = self._search_regex(
|
|
||||||
(p.format(re_ctx, FUNCTION_RE) for p in (r'<script>\s*window\.{0}={1}\s*\)\s*;?\s*</script>', r'{0}\(.*?{1}')),
|
|
||||||
webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
|
|
||||||
default=NO_DEFAULT if fatal else (None, None, None))
|
|
||||||
if js is None:
|
|
||||||
return {}
|
|
||||||
|
|
||||||
args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
|
|
||||||
'[{0}]'.format(arg_vals), video_id, transform_source=js_to_json, fatal=fatal) or ())))
|
|
||||||
|
|
||||||
ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
|
|
||||||
return traverse_obj(ret, traverse) or {}
|
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
unique_id = self._match_id(url)
|
unique_id = self._match_id(url)
|
||||||
webpage = self._download_webpage(url, unique_id)
|
webpage = self._download_webpage(url, unique_id)
|
||||||
|
|
Loading…
Reference in a new issue