From b2741f2654e6ddfebc1771b5d5fadb5fd6fe3863 Mon Sep 17 00:00:00 2001 From: dirkf Date: Fri, 5 May 2023 19:25:42 +0100 Subject: [InfoExtractor] Add search methods for Next/Nuxt.js from yt-dlp * add _search_nextjs_data(), from https://github.com/yt-dlp/yt-dlp/pull/1386 thanks selfisekai * add _search_nuxt_data(), from https://github.com/yt-dlp/yt-dlp/pull/1921, thanks Lesmiscore, pukkandan * add tests for the above * also fix HTML5 type recognition and tests, from https://github.com/yt-dlp/yt-dlp/commit/222a230871fe4fe63f35c49590379c9a77116819, thanks Lesmiscore * update extractors in PR using above, fix tests. --- test/test_InfoExtractor.py | 111 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 5 deletions(-) (limited to 'test') diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 6d25441db..34773fbd0 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -7,15 +7,33 @@ import io import os import sys import unittest + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, expect_dict, expect_value, http_server_port -from youtube_dl.compat import compat_etree_fromstring, compat_http_server -from youtube_dl.extractor.common import InfoExtractor -from youtube_dl.extractor import YoutubeIE, get_info_extractor -from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError import threading +from test.helper import ( + expect_dict, + expect_value, + FakeYDL, + http_server_port, +) +from youtube_dl.compat import ( + compat_etree_fromstring, + compat_http_server, +) +from youtube_dl.extractor.common import InfoExtractor +from youtube_dl.extractor import ( + get_info_extractor, + YoutubeIE, +) +from youtube_dl.utils import ( + encode_data_uri, + ExtractorError, + RegexNotFoundError, + strip_jsonp, +) + TEAPOT_RESPONSE_STATUS = 418 TEAPOT_RESPONSE_BODY = "

418 I'm a teapot

" @@ -100,6 +118,71 @@ class TestInfoExtractor(unittest.TestCase): self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True) self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True) + def test_search_nextjs_data(self): + html = ''' + + + + + + Test _search_nextjs_data() + + +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ + + +''' + search = self.ie._search_nextjs_data(html, 'testID') + self.assertEqual(search['props']['pageProps']['video']['id'], 'testid') + + def test_search_nuxt_data(self): + html = ''' + + + + + Nuxt.js Test Page + + + + +
+

Example heading

+
+

Decoy text

+
+
+ + + + +''' + search = self.ie._search_nuxt_data(html, 'testID') + self.assertEqual(search['track']['id'], 'testid') + def test_search_json_ld_realworld(self): # https://github.com/ytdl-org/youtube-dl/issues/23306 expect_dict( @@ -348,6 +431,24 @@ class TestInfoExtractor(unittest.TestCase): }], }) + # from https://0000.studio/ + # with type attribute but without extension in URL + expect_dict( + self, + self.ie._parse_html5_media_entries( + 'https://0000.studio', + r''' + + ''', None)[0], + { + 'formats': [{ + 'url': 'https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92', + 'ext': 'mp4', + }], + }) + def test_extract_jwplayer_data_realworld(self): # from http://www.suffolk.edu/sjc/ expect_dict( -- cgit 1.4.1