mirror of
				https://github.com/ytdl-org/youtube-dl.git
				synced 2025-10-29 09:26:20 -07:00 
			
		
		
		
	[InfoExtractor] Add search methods for Next/Nuxt.js from yt-dlp
* add _search_nextjs_data(), from https://github.com/yt-dlp/yt-dlp/pull/1386
  thanks selfisekai
* add _search_nuxt_data(), from https://github.com/yt-dlp/yt-dlp/pull/1921,
  thanks Lesmiscore, pukkandan
* add tests for the above
* also fix HTML5 type recognition and tests, from commit
  222a230871,
  thanks Lesmiscore
* update extractors in PR using above, fix tests.
			
			
This commit is contained in:
		| @@ -7,15 +7,33 @@ import io | |||||||
| import os | import os | ||||||
| import sys | import sys | ||||||
| import unittest | import unittest | ||||||
|  |  | ||||||
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | ||||||
|  |  | ||||||
| from test.helper import FakeYDL, expect_dict, expect_value, http_server_port |  | ||||||
| from youtube_dl.compat import compat_etree_fromstring, compat_http_server |  | ||||||
| from youtube_dl.extractor.common import InfoExtractor |  | ||||||
| from youtube_dl.extractor import YoutubeIE, get_info_extractor |  | ||||||
| from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError |  | ||||||
| import threading | import threading | ||||||
|  |  | ||||||
|  | from test.helper import ( | ||||||
|  |     expect_dict, | ||||||
|  |     expect_value, | ||||||
|  |     FakeYDL, | ||||||
|  |     http_server_port, | ||||||
|  | ) | ||||||
|  | from youtube_dl.compat import ( | ||||||
|  |     compat_etree_fromstring, | ||||||
|  |     compat_http_server, | ||||||
|  | ) | ||||||
|  | from youtube_dl.extractor.common import InfoExtractor | ||||||
|  | from youtube_dl.extractor import ( | ||||||
|  |     get_info_extractor, | ||||||
|  |     YoutubeIE, | ||||||
|  | ) | ||||||
|  | from youtube_dl.utils import ( | ||||||
|  |     encode_data_uri, | ||||||
|  |     ExtractorError, | ||||||
|  |     RegexNotFoundError, | ||||||
|  |     strip_jsonp, | ||||||
|  | ) | ||||||
|  |  | ||||||
|  |  | ||||||
| TEAPOT_RESPONSE_STATUS = 418 | TEAPOT_RESPONSE_STATUS = 418 | ||||||
| TEAPOT_RESPONSE_BODY = "<h1>418 I'm a teapot</h1>" | TEAPOT_RESPONSE_BODY = "<h1>418 I'm a teapot</h1>" | ||||||
| @@ -100,6 +118,71 @@ class TestInfoExtractor(unittest.TestCase): | |||||||
|         self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True) |         self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True) | ||||||
|         self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True) |         self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True) | ||||||
|  |  | ||||||
|  |     def test_search_nextjs_data(self): | ||||||
|  |         html = ''' | ||||||
|  | <!DOCTYPE html> | ||||||
|  | <html> | ||||||
|  | <head> | ||||||
|  |   <meta http-equiv="content-type" content= | ||||||
|  |   "text/html; charset=utf-8"> | ||||||
|  |   <meta name="viewport" content="width=device-width"> | ||||||
|  |   <title>Test _search_nextjs_data()</title> | ||||||
|  | </head> | ||||||
|  | <body> | ||||||
|  |   <div id="__next"> | ||||||
|  |     <div style="background-color:#17171E" class="FU" dir="ltr"> | ||||||
|  |       <div class="sc-93de261d-0 dyzzYE"> | ||||||
|  |         <div> | ||||||
|  |           <header class="HD"></header> | ||||||
|  |           <main class="MN"> | ||||||
|  |             <div style="height:0" class="HT0"> | ||||||
|  |               <div style="width:NaN%" data-testid= | ||||||
|  |               "stream-container" class="WDN"></div> | ||||||
|  |             </div> | ||||||
|  |           </main> | ||||||
|  |         </div> | ||||||
|  |         <footer class="sc-6e5faf91-0 dEGaHS"></footer> | ||||||
|  |       </div> | ||||||
|  |     </div> | ||||||
|  |   </div> | ||||||
|  |   <script id="__NEXT_DATA__" type="application/json"> | ||||||
|  |   {"props":{"pageProps":{"video":{"id":"testid"}}}} | ||||||
|  |   </script> | ||||||
|  | </body> | ||||||
|  | </html> | ||||||
|  | ''' | ||||||
|  |         search = self.ie._search_nextjs_data(html, 'testID') | ||||||
|  |         self.assertEqual(search['props']['pageProps']['video']['id'], 'testid') | ||||||
|  |  | ||||||
|  |     def test_search_nuxt_data(self): | ||||||
|  |         html = ''' | ||||||
|  | <!DOCTYPE html> | ||||||
|  | <html> | ||||||
|  | <head> | ||||||
|  |   <meta http-equiv="content-type" content= | ||||||
|  |   "text/html; charset=utf-8"> | ||||||
|  |   <title>Nuxt.js Test Page</title> | ||||||
|  |   <meta name="viewport" content= | ||||||
|  |   "width=device-width, initial-scale=1"> | ||||||
|  |   <meta data-hid="robots" name="robots" content="all"> | ||||||
|  | </head> | ||||||
|  | <body class="BD"> | ||||||
|  |   <div id="__layout"> | ||||||
|  |     <h1 class="H1">Example heading</h1> | ||||||
|  |     <div class="IN"> | ||||||
|  |       <p>Decoy text</p> | ||||||
|  |     </div> | ||||||
|  |   </div> | ||||||
|  |   <script> | ||||||
|  |   window.__NUXT__=(function(a,b,c,d,e,f,g,h){return {decoy:" default",data:[{track:{id:f,title:g}}]}}(null,null,"c",null,null,"testid","Nuxt.js title",null)); | ||||||
|  |   </script> | ||||||
|  |   <script src="/_nuxt/a12345b.js" defer="defer"></script> | ||||||
|  | </body> | ||||||
|  | </html> | ||||||
|  | ''' | ||||||
|  |         search = self.ie._search_nuxt_data(html, 'testID') | ||||||
|  |         self.assertEqual(search['track']['id'], 'testid') | ||||||
|  |  | ||||||
|     def test_search_json_ld_realworld(self): |     def test_search_json_ld_realworld(self): | ||||||
|         # https://github.com/ytdl-org/youtube-dl/issues/23306 |         # https://github.com/ytdl-org/youtube-dl/issues/23306 | ||||||
|         expect_dict( |         expect_dict( | ||||||
| @@ -348,6 +431,24 @@ class TestInfoExtractor(unittest.TestCase): | |||||||
|                 }], |                 }], | ||||||
|             }) |             }) | ||||||
|  |  | ||||||
|  |         # from https://0000.studio/ | ||||||
|  |         # with type attribute but without extension in URL | ||||||
|  |         expect_dict( | ||||||
|  |             self, | ||||||
|  |             self.ie._parse_html5_media_entries( | ||||||
|  |                 'https://0000.studio', | ||||||
|  |                 r''' | ||||||
|  |                 <video src="https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92" | ||||||
|  |                     controls="controls" type="video/mp4" preload="metadata" autoplay="autoplay" playsinline class="object-contain"> | ||||||
|  |                 </video> | ||||||
|  |                 ''', None)[0], | ||||||
|  |             { | ||||||
|  |                 'formats': [{ | ||||||
|  |                     'url': 'https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92', | ||||||
|  |                     'ext': 'mp4', | ||||||
|  |                 }], | ||||||
|  |             }) | ||||||
|  |  | ||||||
|     def test_extract_jwplayer_data_realworld(self): |     def test_extract_jwplayer_data_realworld(self): | ||||||
|         # from http://www.suffolk.edu/sjc/ |         # from http://www.suffolk.edu/sjc/ | ||||||
|         expect_dict( |         expect_dict( | ||||||
|   | |||||||
| @@ -35,13 +35,6 @@ class ClipchampIE(InfoExtractor): | |||||||
|     _STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s' |     _STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s' | ||||||
|     _STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'} |     _STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'} | ||||||
|  |  | ||||||
|     def _search_nextjs_data(self, webpage, video_id, **kw): |  | ||||||
|         return self._parse_json( |  | ||||||
|             self._search_regex( |  | ||||||
|                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>', |  | ||||||
|                 webpage, 'next.js data', **kw), |  | ||||||
|             video_id, **kw) |  | ||||||
|  |  | ||||||
|     def _real_extract(self, url): |     def _real_extract(self, url): | ||||||
|         video_id = self._match_id(url) |         video_id = self._match_id(url) | ||||||
|         webpage = self._download_webpage(url, video_id) |         webpage = self._download_webpage(url, video_id) | ||||||
|   | |||||||
| @@ -3,6 +3,7 @@ from __future__ import unicode_literals | |||||||
|  |  | ||||||
| import base64 | import base64 | ||||||
| import datetime | import datetime | ||||||
|  | import functools | ||||||
| import hashlib | import hashlib | ||||||
| import json | import json | ||||||
| import netrc | import netrc | ||||||
| @@ -23,6 +24,7 @@ from ..compat import ( | |||||||
|     compat_getpass, |     compat_getpass, | ||||||
|     compat_integer_types, |     compat_integer_types, | ||||||
|     compat_http_client, |     compat_http_client, | ||||||
|  |     compat_map as map, | ||||||
|     compat_os_name, |     compat_os_name, | ||||||
|     compat_str, |     compat_str, | ||||||
|     compat_urllib_error, |     compat_urllib_error, | ||||||
| @@ -31,6 +33,7 @@ from ..compat import ( | |||||||
|     compat_urllib_request, |     compat_urllib_request, | ||||||
|     compat_urlparse, |     compat_urlparse, | ||||||
|     compat_xml_parse_error, |     compat_xml_parse_error, | ||||||
|  |     compat_zip as zip, | ||||||
| ) | ) | ||||||
| from ..downloader.f4m import ( | from ..downloader.f4m import ( | ||||||
|     get_base_url, |     get_base_url, | ||||||
| @@ -70,6 +73,7 @@ from ..utils import ( | |||||||
|     str_or_none, |     str_or_none, | ||||||
|     str_to_int, |     str_to_int, | ||||||
|     strip_or_none, |     strip_or_none, | ||||||
|  |     traverse_obj, | ||||||
|     try_get, |     try_get, | ||||||
|     unescapeHTML, |     unescapeHTML, | ||||||
|     unified_strdate, |     unified_strdate, | ||||||
| @@ -1349,6 +1353,44 @@ class InfoExtractor(object): | |||||||
|                     break |                     break | ||||||
|         return dict((k, v) for k, v in info.items() if v is not None) |         return dict((k, v) for k, v in info.items() if v is not None) | ||||||
|  |  | ||||||
|  |     def _search_nextjs_data(self, webpage, video_id, **kw): | ||||||
|  |         nkw = dict((k, v) for k, v in kw.items() if k in ('transform_source', 'fatal')) | ||||||
|  |         kw.pop('transform_source', None) | ||||||
|  |         next_data = self._search_regex( | ||||||
|  |             r'''<script[^>]+\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>(?P<nd>[^<]+)</script>''', | ||||||
|  |             webpage, 'next.js data', group='nd', **kw) | ||||||
|  |         if not next_data: | ||||||
|  |             return {} | ||||||
|  |         return self._parse_json(next_data, video_id, **nkw) | ||||||
|  |  | ||||||
|  |     def _search_nuxt_data(self, webpage, video_id, *args, **kwargs): | ||||||
|  |         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function""" | ||||||
|  |  | ||||||
|  |         # self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0) | ||||||
|  |         context_name = args[0] if len(args) > 0 else kwargs.get('context_name', '__NUXT__') | ||||||
|  |         fatal = kwargs.get('fatal', True) | ||||||
|  |         traverse = kwargs.get('traverse', ('data', 0)) | ||||||
|  |  | ||||||
|  |         re_ctx = re.escape(context_name) | ||||||
|  |  | ||||||
|  |         FUNCTION_RE = (r'\(\s*function\s*\((?P<arg_keys>[\s\S]*?)\)\s*\{\s*' | ||||||
|  |                        r'return\s+(?P<js>\{[\s\S]*?})\s*;?\s*}\s*\((?P<arg_vals>[\s\S]*?)\)') | ||||||
|  |  | ||||||
|  |         js, arg_keys, arg_vals = self._search_regex( | ||||||
|  |             (p.format(re_ctx, FUNCTION_RE) for p in | ||||||
|  |              (r'<script>\s*window\s*\.\s*{0}\s*=\s*{1}\s*\)\s*;?\s*</script>', | ||||||
|  |               r'{0}\s*\([\s\S]*?{1}')), | ||||||
|  |             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), | ||||||
|  |             default=NO_DEFAULT if fatal else (None, None, None)) | ||||||
|  |         if js is None: | ||||||
|  |             return {} | ||||||
|  |  | ||||||
|  |         args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json( | ||||||
|  |             '[{0}]'.format(arg_vals), video_id, transform_source=js_to_json, fatal=fatal) or ()))) | ||||||
|  |  | ||||||
|  |         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal) | ||||||
|  |         return traverse_obj(ret, traverse) or {} | ||||||
|  |  | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def _hidden_inputs(html): |     def _hidden_inputs(html): | ||||||
|         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) |         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) | ||||||
| @@ -2496,7 +2538,8 @@ class InfoExtractor(object): | |||||||
|                 return f |                 return f | ||||||
|             return {} |             return {} | ||||||
|  |  | ||||||
|         def _media_formats(src, cur_media_type, type_info={}): |         def _media_formats(src, cur_media_type, type_info=None): | ||||||
|  |             type_info = type_info or {} | ||||||
|             full_url = absolute_url(src) |             full_url = absolute_url(src) | ||||||
|             ext = type_info.get('ext') or determine_ext(full_url) |             ext = type_info.get('ext') or determine_ext(full_url) | ||||||
|             if ext == 'm3u8': |             if ext == 'm3u8': | ||||||
| @@ -2514,6 +2557,7 @@ class InfoExtractor(object): | |||||||
|                 formats = [{ |                 formats = [{ | ||||||
|                     'url': full_url, |                     'url': full_url, | ||||||
|                     'vcodec': 'none' if cur_media_type == 'audio' else None, |                     'vcodec': 'none' if cur_media_type == 'audio' else None, | ||||||
|  |                     'ext': ext, | ||||||
|                 }] |                 }] | ||||||
|             return is_plain_url, formats |             return is_plain_url, formats | ||||||
|  |  | ||||||
| @@ -2522,7 +2566,7 @@ class InfoExtractor(object): | |||||||
|         # so we will include them right here (see |         # so we will include them right here (see | ||||||
|         # https://www.ampproject.org/docs/reference/components/amp-video) |         # https://www.ampproject.org/docs/reference/components/amp-video) | ||||||
|         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/ |         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/ | ||||||
|         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)' |         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video(?:-js)?|audio)' | ||||||
|         media_tags = [(media_tag, media_tag_name, media_type, '') |         media_tags = [(media_tag, media_tag_name, media_type, '') | ||||||
|                       for media_tag, media_tag_name, media_type |                       for media_tag, media_tag_name, media_type | ||||||
|                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)] |                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)] | ||||||
| @@ -2540,7 +2584,8 @@ class InfoExtractor(object): | |||||||
|             media_attributes = extract_attributes(media_tag) |             media_attributes = extract_attributes(media_tag) | ||||||
|             src = strip_or_none(media_attributes.get('src')) |             src = strip_or_none(media_attributes.get('src')) | ||||||
|             if src: |             if src: | ||||||
|                 _, formats = _media_formats(src, media_type) |                 f = parse_content_type(media_attributes.get('type')) | ||||||
|  |                 _, formats = _media_formats(src, media_type, f) | ||||||
|                 media_info['formats'].extend(formats) |                 media_info['formats'].extend(formats) | ||||||
|             media_info['thumbnail'] = absolute_url(media_attributes.get('poster')) |             media_info['thumbnail'] = absolute_url(media_attributes.get('poster')) | ||||||
|             if media_content: |             if media_content: | ||||||
|   | |||||||
| @@ -24,13 +24,6 @@ class GlobalPlayerBaseIE(InfoExtractor): | |||||||
|     def _match_valid_url(cls, url): |     def _match_valid_url(cls, url): | ||||||
|         return cls.re.match(cls._VALID_URL, url) |         return cls.re.match(cls._VALID_URL, url) | ||||||
|  |  | ||||||
|     def _search_nextjs_data(self, webpage, video_id, **kw): |  | ||||||
|         return self._parse_json( |  | ||||||
|             self._search_regex( |  | ||||||
|                 r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>', |  | ||||||
|                 webpage, 'next.js data', **kw), |  | ||||||
|             video_id, **kw) |  | ||||||
|  |  | ||||||
|     def _get_page_props(self, url, video_id): |     def _get_page_props(self, url, video_id): | ||||||
|         webpage = self._download_webpage(url, video_id) |         webpage = self._download_webpage(url, video_id) | ||||||
|         return self._search_nextjs_data(webpage, video_id)['props']['pageProps'] |         return self._search_nextjs_data(webpage, video_id)['props']['pageProps'] | ||||||
| @@ -39,13 +32,14 @@ class GlobalPlayerBaseIE(InfoExtractor): | |||||||
|         return urlhandle_detect_ext(self._request_webpage(  # Server rejects HEAD requests |         return urlhandle_detect_ext(self._request_webpage(  # Server rejects HEAD requests | ||||||
|             url, video_id, note='Determining source extension')) |             url, video_id, note='Determining source extension')) | ||||||
|  |  | ||||||
|     def _extract_audio(self, episode, series): |     @staticmethod | ||||||
|  |     def _clean_desc(x): | ||||||
|  |         x = clean_html(x) | ||||||
|  |         if x: | ||||||
|  |             x = x.replace('\xa0', ' ') | ||||||
|  |         return x | ||||||
|  |  | ||||||
|         def clean_desc(x): |     def _extract_audio(self, episode, series): | ||||||
|             x = clean_html(x) |  | ||||||
|             if x: |  | ||||||
|                 x = x.replace('\xa0', ' ') |  | ||||||
|             return x |  | ||||||
|  |  | ||||||
|         return merge_dicts({ |         return merge_dicts({ | ||||||
|             'vcodec': 'none', |             'vcodec': 'none', | ||||||
| @@ -56,7 +50,7 @@ class GlobalPlayerBaseIE(InfoExtractor): | |||||||
|             'uploader': 'itunesAuthor',  # podcasts only |             'uploader': 'itunesAuthor',  # podcasts only | ||||||
|         }), traverse_obj(episode, { |         }), traverse_obj(episode, { | ||||||
|             'id': 'id', |             'id': 'id', | ||||||
|             'description': ('description', T(clean_desc)), |             'description': ('description', T(self._clean_desc)), | ||||||
|             'duration': ('duration', T(parse_duration)), |             'duration': ('duration', T(parse_duration)), | ||||||
|             'thumbnail': 'imageUrl', |             'thumbnail': 'imageUrl', | ||||||
|             'url': 'streamUrl', |             'url': 'streamUrl', | ||||||
| @@ -141,9 +135,9 @@ class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE): | |||||||
|             'ext': 'aac', |             'ext': 'aac', | ||||||
|             # 'live_status': 'is_live', |             # 'live_status': 'is_live', | ||||||
|             'is_live': True, |             'is_live': True, | ||||||
|             'description': 'md5:e10f5e10b01a7f2c14ba815509fbb38d', |             'description': r're:(?s).+\bclassical\b.+\bClassic FM Hall [oO]f Fame\b', | ||||||
|             'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=', |             'thumbnail': 'https://images.globalplayer.com/images/551379?width=450&signature=oMLPZIoi5_dBSHnTMREW0Xg76mA=', | ||||||
|             'title': 're:^Classic FM Hall of Fame.+$' |             'title': 're:Classic FM Hall of Fame.+$' | ||||||
|         }, |         }, | ||||||
|     }] |     }] | ||||||
|  |  | ||||||
| @@ -160,7 +154,7 @@ class GlobalPlayerLivePlaylistIE(GlobalPlayerBaseIE): | |||||||
|             'is_live': True, |             'is_live': True, | ||||||
|         }, traverse_obj(station, { |         }, traverse_obj(station, { | ||||||
|             'title': 'title', |             'title': 'title', | ||||||
|             'description': 'description', |             'description': ('description', T(self._clean_desc)), | ||||||
|             'thumbnail': 'image', |             'thumbnail': 'image', | ||||||
|         }), rev=True) |         }), rev=True) | ||||||
|  |  | ||||||
| @@ -177,7 +171,7 @@ class GlobalPlayerAudioIE(GlobalPlayerBaseIE): | |||||||
|             'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e', |             'thumbnail': 'md5:60286e7d12d795bd1bbc9efc6cee643e', | ||||||
|             'categories': ['Society & Culture', 'True Crime'], |             'categories': ['Society & Culture', 'True Crime'], | ||||||
|             'uploader': 'Global', |             'uploader': 'Global', | ||||||
|             'description': 'md5:da5b918eac9ae319454a10a563afacf9', |             'description': r're:(?s).+\bscam\b.+?\bseries available now\b', | ||||||
|         }, |         }, | ||||||
|     }, { |     }, { | ||||||
|         # radio catchup |         # radio catchup | ||||||
| @@ -203,7 +197,7 @@ class GlobalPlayerAudioIE(GlobalPlayerBaseIE): | |||||||
|                         series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))], |                         series, ('episodes', lambda _, v: v['id'] and v['streamUrl']))], | ||||||
|             'categories': traverse_obj(series, ('categories', Ellipsis, 'name')) or None, |             'categories': traverse_obj(series, ('categories', Ellipsis, 'name')) or None, | ||||||
|         }, traverse_obj(series, { |         }, traverse_obj(series, { | ||||||
|             'description': 'description', |             'description': ('description', T(self._clean_desc)), | ||||||
|             'thumbnail': 'imageUrl', |             'thumbnail': 'imageUrl', | ||||||
|             'title': 'title', |             'title': 'title', | ||||||
|             'uploader': 'itunesAuthor',  # podcasts only |             'uploader': 'itunesAuthor',  # podcasts only | ||||||
|   | |||||||
| @@ -21,7 +21,7 @@ class WhypIE(InfoExtractor): | |||||||
|             'url': 'https://cdn.whyp.it/50eb17cc-e9ff-4e18-b89b-dc9206a95cb1.mp3', |             'url': 'https://cdn.whyp.it/50eb17cc-e9ff-4e18-b89b-dc9206a95cb1.mp3', | ||||||
|             'id': '18337', |             'id': '18337', | ||||||
|             'title': 'Home Page Example Track', |             'title': 'Home Page Example Track', | ||||||
|             'description': 'md5:bd758000fb93f3159339c852b5b9133c', |             'description': r're:(?s).+\bexample track\b', | ||||||
|             'ext': 'mp3', |             'ext': 'mp3', | ||||||
|             'duration': 52.82, |             'duration': 52.82, | ||||||
|             'uploader': 'Brad', |             'uploader': 'Brad', | ||||||
| @@ -33,29 +33,6 @@ class WhypIE(InfoExtractor): | |||||||
|         'only_matching': True, |         'only_matching': True, | ||||||
|     }] |     }] | ||||||
|  |  | ||||||
|     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', fatal=True, traverse=('data', 0)): |  | ||||||
|         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function""" |  | ||||||
|  |  | ||||||
|         import functools |  | ||||||
|         import json |  | ||||||
|         import re |  | ||||||
|         from ..utils import (js_to_json, NO_DEFAULT) |  | ||||||
|  |  | ||||||
|         re_ctx = re.escape(context_name) |  | ||||||
|         FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)' |  | ||||||
|         js, arg_keys, arg_vals = self._search_regex( |  | ||||||
|             (p.format(re_ctx, FUNCTION_RE) for p in (r'<script>\s*window\.{0}={1}\s*\)\s*;?\s*</script>', r'{0}\(.*?{1}')), |  | ||||||
|             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), |  | ||||||
|             default=NO_DEFAULT if fatal else (None, None, None)) |  | ||||||
|         if js is None: |  | ||||||
|             return {} |  | ||||||
|  |  | ||||||
|         args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json( |  | ||||||
|             '[{0}]'.format(arg_vals), video_id, transform_source=js_to_json, fatal=fatal) or ()))) |  | ||||||
|  |  | ||||||
|         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal) |  | ||||||
|         return traverse_obj(ret, traverse) or {} |  | ||||||
|  |  | ||||||
|     def _real_extract(self, url): |     def _real_extract(self, url): | ||||||
|         unique_id = self._match_id(url) |         unique_id = self._match_id(url) | ||||||
|         webpage = self._download_webpage(url, unique_id) |         webpage = self._download_webpage(url, unique_id) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user