Diffstat (limited to 'youtube_dl')
-rw-r--r--  youtube_dl/compat.py                 |  219
-rw-r--r--  youtube_dl/extractor/common.py       |   63
-rw-r--r--  youtube_dl/extractor/extractors.py   |   10
-rw-r--r--  youtube_dl/extractor/youporn.py      |  600
-rw-r--r--  youtube_dl/postprocessor/ffmpeg.py   |   16
-rw-r--r--  youtube_dl/traversal.py              |   10
-rw-r--r--  youtube_dl/utils.py                  |  102
7 files changed, 918 insertions, 102 deletions
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 53ff2a892..ed1a33cf2 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2719,8 +2719,222 @@ if sys.version_info < (2, 7): if isinstance(xpath, compat_str): xpath = xpath.encode('ascii') return xpath + + # further code below based on CPython 2.7 source + import functools + + _xpath_tokenizer_re = re.compile(r'''(?x) + ( # (1) + '[^']*'|"[^"]*"| # quoted strings, or + ::|//?|\.\.|\(\)|[/.*:[\]()@=] # navigation specials + )| # or (2) + ((?:\{[^}]+\})?[^/[\]()@=\s]+)| # token: optional {ns}, no specials + \s+ # or white space + ''') + + def _xpath_tokenizer(pattern, namespaces=None): + for token in _xpath_tokenizer_re.findall(pattern): + tag = token[1] + if tag and tag[0] != "{" and ":" in tag: + try: + if not namespaces: + raise KeyError + prefix, uri = tag.split(":", 1) + yield token[0], "{%s}%s" % (namespaces[prefix], uri) + except KeyError: + raise SyntaxError("prefix %r not found in prefix map" % prefix) + else: + yield token + + def _get_parent_map(context): + parent_map = context.parent_map + if parent_map is None: + context.parent_map = parent_map = {} + for p in context.root.getiterator(): + for e in p: + parent_map[e] = p + return parent_map + + def _select(context, result, filter_fn=lambda *_: True): + for elem in result: + for e in elem: + if filter_fn(e, elem): + yield e + + def _prepare_child(next_, token): + tag = token[1] + return functools.partial(_select, filter_fn=lambda e, _: e.tag == tag) + + def _prepare_star(next_, token): + return _select + + def _prepare_self(next_, token): + return lambda _, result: (e for e in result) + + def _prepare_descendant(next_, token): + token = next(next_) + if token[0] == "*": + tag = "*" + elif not token[0]: + tag = token[1] + else: + raise SyntaxError("invalid descendant") + + def select(context, result): + for elem in result: + for e in elem.getiterator(tag): + if e is not elem: + yield e + return select + + def _prepare_parent(next_, token): + def select(context, result): + # FIXME: raise error if .. is applied at toplevel? 
+ parent_map = _get_parent_map(context) + result_map = {} + for elem in result: + if elem in parent_map: + parent = parent_map[elem] + if parent not in result_map: + result_map[parent] = None + yield parent + return select + + def _prepare_predicate(next_, token): + signature = [] + predicate = [] + for token in next_: + if token[0] == "]": + break + if token[0] and token[0][:1] in "'\"": + token = "'", token[0][1:-1] + signature.append(token[0] or "-") + predicate.append(token[1]) + + def select(context, result, filter_fn=lambda _: True): + for elem in result: + if filter_fn(elem): + yield elem + + signature = "".join(signature) + # use signature to determine predicate type + if signature == "@-": + # [@attribute] predicate + key = predicate[1] + return functools.partial( + select, filter_fn=lambda el: el.get(key) is not None) + if signature == "@-='": + # [@attribute='value'] + key = predicate[1] + value = predicate[-1] + return functools.partial( + select, filter_fn=lambda el: el.get(key) == value) + if signature == "-" and not re.match(r"\d+$", predicate[0]): + # [tag] + tag = predicate[0] + return functools.partial( + select, filter_fn=lambda el: el.find(tag) is not None) + if signature == "-='" and not re.match(r"\d+$", predicate[0]): + # [tag='value'] + tag = predicate[0] + value = predicate[-1] + + def itertext(el): + for e in el.getiterator(): + e = e.text + if e: + yield e + + def select(context, result): + for elem in result: + for e in elem.findall(tag): + if "".join(itertext(e)) == value: + yield elem + break + return select + if signature == "-" or signature == "-()" or signature == "-()-": + # [index] or [last()] or [last()-index] + if signature == "-": + index = int(predicate[0]) - 1 + else: + if predicate[0] != "last": + raise SyntaxError("unsupported function") + if signature == "-()-": + try: + index = int(predicate[2]) - 1 + except ValueError: + raise SyntaxError("unsupported expression") + else: + index = -1 + + def select(context, result): + parent_map = _get_parent_map(context) + for elem in result: + try: + parent = parent_map[elem] + # FIXME: what if the selector is "*" ? + elems = list(parent.findall(elem.tag)) + if elems[index] is elem: + yield elem + except (IndexError, KeyError): + pass + return select + raise SyntaxError("invalid predicate") + + ops = { + "": _prepare_child, + "*": _prepare_star, + ".": _prepare_self, + "..": _prepare_parent, + "//": _prepare_descendant, + "[": _prepare_predicate, + } + + _cache = {} + + class _SelectorContext: + parent_map = None + + def __init__(self, root): + self.root = root + + ## + # Generate all matching objects. + + def compat_etree_iterfind(elem, path, namespaces=None): + # compile selector pattern + if path[-1:] == "/": + path = path + "*" # implicit all (FIXME: keep this?) 
+ try: + selector = _cache[path] + except KeyError: + if len(_cache) > 100: + _cache.clear() + if path[:1] == "/": + raise SyntaxError("cannot use absolute path on element") + tokens = _xpath_tokenizer(path, namespaces) + selector = [] + for token in tokens: + if token[0] == "/": + continue + try: + selector.append(ops[token[0]](tokens, token)) + except StopIteration: + raise SyntaxError("invalid path") + _cache[path] = selector + # execute selector pattern + result = [elem] + context = _SelectorContext(elem) + for select in selector: + result = select(context, result) + return result + + # end of code based on CPython 2.7 source + + else: compat_xpath = lambda xpath: xpath + compat_etree_iterfind = lambda element, match: element.iterfind(match) compat_os_name = os._name if os.name == 'java' else os.name @@ -2756,7 +2970,7 @@ except (AssertionError, UnicodeEncodeError): def compat_ord(c): - if type(c) is int: + if isinstance(c, int): return c else: return ord(c) @@ -2955,7 +3169,7 @@ except ImportError: return self def __exit__(self, exc_type, exc_val, exc_tb): - return exc_val is not None and isinstance(exc_val, self._exceptions or tuple()) + return exc_type is not None and issubclass(exc_type, self._exceptions or tuple()) # subprocess.Popen context manager @@ -3308,6 +3522,7 @@ __all__ = [ 'compat_contextlib_suppress', 'compat_ctypes_WINFUNCTYPE', 'compat_etree_fromstring', + 'compat_etree_iterfind', 'compat_filter', 'compat_get_terminal_size', 'compat_getenv', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 7fae9e57b..b10e84416 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1169,10 +1169,10 @@ class InfoExtractor(object): def _get_netrc_login_info(self, netrc_machine=None): username = None password = None - netrc_machine = netrc_machine or self._NETRC_MACHINE if self._downloader.params.get('usenetrc', False): try: + netrc_machine = netrc_machine or self._NETRC_MACHINE info = netrc.netrc().authenticators(netrc_machine) if info is not None: username = info[0] @@ -1180,7 +1180,7 @@ class InfoExtractor(object): else: raise netrc.NetrcParseError( 'No authenticators for %s' % netrc_machine) - except (IOError, netrc.NetrcParseError) as err: + except (AttributeError, IOError, netrc.NetrcParseError) as err: self._downloader.report_warning( 'parsing .netrc: %s' % error_to_compat_str(err)) @@ -1490,14 +1490,18 @@ class InfoExtractor(object): return dict((k, v) for k, v in info.items() if v is not None) def _search_nextjs_data(self, webpage, video_id, **kw): - nkw = dict((k, v) for k, v in kw.items() if k in ('transform_source', 'fatal')) - kw.pop('transform_source', None) - next_data = self._search_regex( - r'''<script[^>]+\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>(?P<nd>[^<]+)</script>''', - webpage, 'next.js data', group='nd', **kw) - if not next_data: - return {} - return self._parse_json(next_data, video_id, **nkw) + # ..., *, transform_source=None, fatal=True, default=NO_DEFAULT + + # TODO: remove this backward compat + default = kw.get('default', NO_DEFAULT) + if default == '{}': + kw['default'] = {} + kw = compat_kwargs(kw) + + return self._search_json( + r'''<script\s[^>]*?\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>''', + webpage, 'next.js data', video_id, end_pattern='</script>', + **kw) def _search_nuxt_data(self, webpage, video_id, *args, **kwargs): """Parses Nuxt.js metadata. 
This works as long as the function __NUXT__ invokes is a pure function""" @@ -3296,12 +3300,16 @@ class InfoExtractor(object): return ret @classmethod - def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2): - """ Merge two subtitle dictionaries, language by language. """ - ret = dict(subtitle_dict1) - for lang in subtitle_dict2: - ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) - return ret + def _merge_subtitles(cls, subtitle_dict1, *subtitle_dicts, **kwargs): + """ Merge subtitle dictionaries, language by language. """ + + # ..., * , target=None + target = kwargs.get('target') or dict(subtitle_dict1) + + for subtitle_dict in subtitle_dicts: + for lang in subtitle_dict: + target[lang] = cls._merge_subtitle_items(target.get(lang, []), subtitle_dict[lang]) + return target def extract_automatic_captions(self, *args, **kwargs): if (self._downloader.params.get('writeautomaticsub', False) @@ -3334,6 +3342,29 @@ class InfoExtractor(object): def _generic_title(self, url): return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) + def _yes_playlist(self, playlist_id, video_id, *args, **kwargs): + # smuggled_data=None, *, playlist_label='playlist', video_label='video' + smuggled_data = args[0] if len(args) == 1 else kwargs.get('smuggled_data') + playlist_label = kwargs.get('playlist_label', 'playlist') + video_label = kwargs.get('video_label', 'video') + + if not playlist_id or not video_id: + return not video_id + + no_playlist = (smuggled_data or {}).get('force_noplaylist') + if no_playlist is not None: + return not no_playlist + + video_id = '' if video_id is True else ' ' + video_id + noplaylist = self.get_param('noplaylist') + self.to_screen( + 'Downloading just the {0}{1} because of --no-playlist'.format(video_label, video_id) + if noplaylist else + 'Downloading {0}{1} - add --no-playlist to download just the {2}{3}'.format( + playlist_label, '' if playlist_id is True else ' ' + playlist_id, + video_label, video_id)) + return not noplaylist + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a56a7c52f..03d035a27 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1653,7 +1653,15 @@ from .younow import ( YouNowChannelIE, YouNowMomentIE, ) -from .youporn import YouPornIE +from .youporn import ( + YouPornIE, + YouPornCategoryIE, + YouPornChannelIE, + YouPornCollectionIE, + YouPornStarIE, + YouPornTagIE, + YouPornVideosIE, +) from .yourporn import YourPornIE from .yourupload import YourUploadIE from .youtube import ( diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 31e8abb72..ec6125a79 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -1,20 +1,38 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools import re +from time import sleep from .common import InfoExtractor from ..utils import ( + clean_html, extract_attributes, + ExtractorError, + get_element_by_class, + get_element_by_id, int_or_none, - str_to_int, + merge_dicts, + parse_count, + parse_qs, + T, + traverse_obj, unified_strdate, url_or_none, + urljoin, ) class YouPornIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?youporn\.com/(?:watch|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?' 
+ _VALID_URL = ( + r'youporn:(?P<id>\d+)', + r'''(?x) + https?://(?:www\.)?youporn\.com/(?:watch|embed)/(?P<id>\d+) + (?:/(?:(?P<display_id>[^/?#&]+)/?)?)?(?:[#?]|$) + ''' + ) + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)'] _TESTS = [{ 'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', 'md5': '3744d24c50438cf5b6f6d59feb5055c2', @@ -34,7 +52,7 @@ class YouPornIE(InfoExtractor): 'tags': list, 'age_limit': 18, }, - 'skip': 'This video has been disabled', + 'skip': 'This video has been deactivated', }, { # Unknown uploader 'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4', @@ -66,57 +84,104 @@ class YouPornIE(InfoExtractor): }, { 'url': 'https://www.youporn.com/watch/13922959/femdom-principal/', 'only_matching': True, + }, { + 'url': 'https://www.youporn.com/watch/16290308/tinderspecial-trailer1/', + 'info_dict': { + 'id': '16290308', + 'age_limit': 18, + 'categories': [], + 'description': None, # SEO spam using title removed + 'display_id': 'tinderspecial-trailer1', + 'duration': 298.0, + 'ext': 'mp4', + 'upload_date': '20201123', + 'uploader': 'Ersties', + 'tags': [], + 'thumbnail': 'https://fi1.ypncdn.com/m=eaSaaTbWx/202011/23/16290308/original/3.jpg', + 'timestamp': 1606147564, + 'title': 'Tinder In Real Life', + 'view_count': int, + } }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)', - webpage) + @classmethod + def _extract_urls(cls, webpage): + def yield_urls(): + for p in cls._EMBED_REGEX: + for m in re.finditer(p, webpage): + yield m.group('url') + + return list(yield_urls()) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id + # A different video ID (data-video-id) is hidden in the page but + # never seems to be used + video_id, display_id = self._match_valid_url(url).group('id', 'display_id') + url = 'http://www.youporn.com/watch/%s' % (video_id,) + webpage = self._download_webpage( + url, video_id, headers={'Cookie': 'age_verified=1'}) + + watchable = self._search_regex( + r'''(<div\s[^>]*\bid\s*=\s*('|")?watch-container(?(2)\2|(?!-)\b)[^>]*>)''', + webpage, 'watchability', default=None) + if not watchable: + msg = re.split(r'\s{4}', clean_html(get_element_by_id( + 'mainContent', webpage)) or '')[0] + raise ExtractorError( + ('%s says: %s' % (self.IE_NAME, msg)) + if msg else 'Video unavailable: no reason found', + expected=True) + # internal ID ? 
+ # video_id = extract_attributes(watchable).get('data-video-id') + + playervars = self._search_json( + r'\bplayervars\s*:', webpage, 'playervars', video_id) + + def get_fmt(x): + v_url = url_or_none(x.get('videoUrl')) + if v_url: + x['videoUrl'] = v_url + return (x['format'], x) - definitions = self._download_json( - 'https://www.youporn.com/api/video/media_definitions/%s/' % video_id, - display_id) + defs_by_format = dict(traverse_obj(playervars, ( + 'mediaDefinitions', lambda _, v: v.get('format'), T(get_fmt)))) + + def get_format_data(f): + if f not in defs_by_format: + return [] + return self._download_json( + defs_by_format[f]['videoUrl'], video_id, '{0}-formats'.format(f)) formats = [] - for definition in definitions: - if not isinstance(definition, dict): - continue - video_url = url_or_none(definition.get('videoUrl')) - if not video_url: - continue - f = { - 'url': video_url, - 'filesize': int_or_none(definition.get('videoSize')), - } - height = int_or_none(definition.get('quality')) + # Try to extract only the actual master m3u8 first, avoiding the duplicate single resolution "master" m3u8s + for hls_url in traverse_obj( + get_format_data('hls'), + (lambda _, v: not isinstance(v['defaultQuality'], bool), 'videoUrl'), + (Ellipsis, 'videoUrl')): + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', fatal=False, m3u8_id='hls', + entry_protocol='m3u8_native')) + + for f in traverse_obj(get_format_data('mp4'), ( + lambda _, v: v.get('videoUrl'), { + 'url': ('videoUrl', T(url_or_none)), + 'filesize': ('videoSize', T(int_or_none)), + 'height': ('quality', T(int_or_none)), + }, T(lambda x: x.get('videoUrl') and x))): # Video URL's path looks like this: # /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 # /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 # /videos/201703/11/109285532/1080P_4000K_109285532.mp4 # We will benefit from it by extracting some metadata - mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url) + mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', f['videoUrl']) if mobj: - if not height: - height = int(mobj.group('height')) - bitrate = int(mobj.group('bitrate')) - f.update({ - 'format_id': '%dp-%dk' % (height, bitrate), - 'tbr': bitrate, - }) - f['height'] = height + if not f.get('height'): + f['height'] = int(mobj.group('height')) + f['tbr'] = int(mobj.group('bitrate')) + f['format_id'] = '%dp-%dk' % (f['height'], f['tbr']) formats.append(f) self._sort_formats(formats) - webpage = self._download_webpage( - 'http://www.youporn.com/watch/%s' % video_id, display_id, - headers={'Cookie': 'age_verified=1'}) - title = self._html_search_regex( r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>', webpage, 'title', default=None) or self._og_search_title( @@ -131,8 +196,10 @@ class YouPornIE(InfoExtractor): thumbnail = self._search_regex( r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1', webpage, 'thumbnail', fatal=False, group='thumbnail') - duration = int_or_none(self._html_search_meta( - 'video:duration', webpage, 'duration', fatal=False)) + duration = traverse_obj(playervars, ('duration', T(int_or_none))) + if duration is None: + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'duration', fatal=False)) uploader = self._html_search_regex( r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>', @@ -148,11 +215,11 @@ class 
YouPornIE(InfoExtractor): view_count = None views = self._search_regex( - r'(<div[^>]+\bclass=["\']js_videoInfoViews["\']>)', webpage, - 'views', default=None) + r'(<div\s[^>]*\bdata-value\s*=[^>]+>)\s*<label>Views:</label>', + webpage, 'views', default=None) if views: - view_count = str_to_int(extract_attributes(views).get('data-value')) - comment_count = str_to_int(self._search_regex( + view_count = parse_count(extract_attributes(views).get('data-value')) + comment_count = parse_count(self._search_regex( r'>All [Cc]omments? \(([\d,.]+)\)', webpage, 'comment count', default=None)) @@ -168,7 +235,10 @@ class YouPornIE(InfoExtractor): r'(?s)Tags:.*?</div>\s*<div[^>]+class=["\']tagBoxContent["\'][^>]*>(.+?)</div>', 'tags') - return { + data = self._search_json_ld(webpage, video_id, expected_type='VideoObject', fatal=False) or {} + data.pop('url', None) + + result = merge_dicts(data, { 'id': video_id, 'display_id': display_id, 'title': title, @@ -183,4 +253,442 @@ class YouPornIE(InfoExtractor): 'tags': tags, 'age_limit': age_limit, 'formats': formats, - } + }) + # Remove promotional non-description + if result.get('description', '').startswith( + 'Watch %s online' % (result['title'],)): + del result['description'] + return result + + +class YouPornListBase(InfoExtractor): + # pattern in '.title-text' element of page section containing videos + _PLAYLIST_TITLEBAR_RE = r'\s+[Vv]ideos\s*$' + _PAGE_RETRY_COUNT = 0 # ie, no retry + _PAGE_RETRY_DELAY = 2 # seconds + + def _get_next_url(self, url, pl_id, html): + return urljoin(url, self._search_regex( + r'''<a\s[^>]*?\bhref\s*=\s*("|')(?P<url>(?:(?!\1)[^>])+)\1''', + get_element_by_id('next', html) or '', 'next page', + group='url', default=None)) + + @classmethod + def _get_title_from_slug(cls, title_slug): + return re.sub(r'[_-]', ' ', title_slug) + + def _entries(self, url, pl_id, html=None, page_num=None): + + # separates page sections + PLAYLIST_SECTION_RE = ( + r'''<div\s[^>]*\bclass\s*=\s*('|")(?:[\w$-]+\s+|\s)*?title-bar(?:\s+[\w$-]+|\s)*\1[^>]*>''' + ) + # contains video link + VIDEO_URL_RE = r'''(?x) + <div\s[^>]*\bdata-video-id\s*=\s*('|")\d+\1[^>]*>\s* + (?:<div\b[\s\S]+?</div>\s*)* + <a\s[^>]*\bhref\s*=\s*('|")(?P<url>(?:(?!\2)[^>])+)\2 + ''' + + def yield_pages(url, html=html, page_num=page_num): + fatal = not html + for pnum in itertools.count(start=page_num or 1): + if not html: + html = self._download_webpage( + url, pl_id, note='Downloading page %d' % pnum, + fatal=fatal) + if not html: + break + fatal = False + yield (url, html, pnum) + # explicit page: extract just that page + if page_num is not None: + break + next_url = self._get_next_url(url, pl_id, html) + if not next_url or next_url == url: + break + url, html = next_url, None + + def retry_page(msg, tries_left, page_data): + if tries_left <= 0: + return + self.report_warning(msg, pl_id) + sleep(self._PAGE_RETRY_DELAY) + return next( + yield_pages(page_data[0], page_num=page_data[2]), None) + + def yield_entries(html): + for frag in re.split(PLAYLIST_SECTION_RE, html): + if not frag: + continue + t_text = get_element_by_class('title-text', frag or '') + if not (t_text and re.search(self._PLAYLIST_TITLEBAR_RE, t_text)): + continue + for m in re.finditer(VIDEO_URL_RE, frag): + video_url = urljoin(url, m.group('url')) + if video_url: + yield self.url_result(video_url) + + last_first_url = None + for page_data in yield_pages(url, html=html, page_num=page_num): + # page_data: url, html, page_num + first_url = None + tries_left = self._PAGE_RETRY_COUNT + 1 + while tries_left > 
0: + tries_left -= 1 + for from_ in yield_entries(page_data[1]): + # may get the same page twice instead of empty page + # or (site bug) intead of actual next page + if not first_url: + first_url = from_['url'] + if first_url == last_first_url: + # sometimes (/porntags/) the site serves the previous page + # instead but may provide the correct page after a delay + page_data = retry_page( + 'Retrying duplicate page...', tries_left, page_data) + if page_data: + first_url = None + break + continue + yield from_ + else: + if not first_url and 'no-result-paragarph1' in page_data[1]: + page_data = retry_page( + 'Retrying empty page...', tries_left, page_data) + if page_data: + continue + else: + # success/failure + break + # may get an infinite (?) sequence of empty pages + if not first_url: + break + last_first_url = first_url + + def _real_extract(self, url, html=None): + # exceptionally, id may be None + m_dict = self._match_valid_url(url).groupdict() + pl_id, page_type, sort = (m_dict.get(k) for k in ('id', 'type', 'sort')) + + qs = parse_qs(url) + for q, v in qs.items(): + if v: + qs[q] = v[-1] + else: + del qs[q] + + base_id = pl_id or 'YouPorn' + title = self._get_title_from_slug(base_id) + if page_type: + title = '%s %s' % (page_type.capitalize(), title) + base_id = [base_id.lower()] + if sort is None: + title += ' videos' + else: + title = '%s videos by %s' % (title, re.sub(r'[_-]', ' ', sort)) + base_id.append(sort) + if qs: + ps = ['%s=%s' % item for item in sorted(qs.items())] + title += ' (%s)' % ','.join(ps) + base_id.extend(ps) + pl_id = '/'.join(base_id) + + return self.playlist_result( + self._entries(url, pl_id, html=html, + page_num=int_or_none(qs.get('page'))), + playlist_id=pl_id, playlist_title=title) + + +class YouPornCategoryIE(YouPornListBase): + IE_DESC = 'YouPorn category, with sorting, filtering and pagination' + _VALID_URL = r'''(?x) + https?://(?:www\.)?youporn\.com/ + (?P<type>category)/(?P<id>[^/?#&]+) + (?:/(?P<sort>popular|views|rating|time|duration))?/?(?:[#?]|$) + ''' + _TESTS = [{ + 'note': 'Full list with pagination', + 'url': 'https://www.youporn.com/category/lingerie/popular/', + 'info_dict': { + 'id': 'lingerie/popular', + 'title': 'Category lingerie videos by popular', + }, + 'playlist_mincount': 39, + }, { + 'note': 'Filtered paginated list with single page result', + 'url': 'https://www.youporn.com/category/lingerie/duration/?min_minutes=10', + 'info_dict': { + 'id': 'lingerie/duration/min_minutes=10', + 'title': 'Category lingerie videos by duration (min_minutes=10)', + }, + 'playlist_maxcount': 30, + }, { + 'note': 'Single page of full list', + 'url': 'https://www.youporn.com/category/lingerie/popular?page=1', + 'info_dict': { + 'id': 'lingerie/popular/page=1', + 'title': 'Category lingerie videos by popular (page=1)', + }, + 'playlist_count': 30, + }] + + +class YouPornChannelIE(YouPornListBase): + IE_DESC = 'YouPorn channel, with sorting and pagination' + _VALID_URL = r'''(?x) + https?://(?:www\.)?youporn\.com/ + (?P<type>channel)/(?P<id>[^/?#&]+) + (?:/(?P<sort>rating|views|duration))?/?(?:[#?]|$) + ''' + _TESTS = [{ + 'note': 'Full list with pagination', + 'url': 'https://www.youporn.com/channel/x-feeds/', + 'info_dict': { + 'id': 'x-feeds', + 'title': 'Channel X-Feeds videos', + }, + 'playlist_mincount': 37, + }, { + 'note': 'Single page of full list (no filters here)', + 'url': 'https://www.youporn.com/channel/x-feeds/duration?page=1', + 'info_dict': { + 'id': 'x-feeds/duration/page=1', + 'title': 'Channel X-Feeds videos by duration (page=1)', 
+ }, + 'playlist_count': 24, + }] + + @staticmethod + def _get_title_from_slug(title_slug): + return re.sub(r'_', ' ', title_slug).title() + + +class YouPornCollectionIE(YouPornListBase): + IE_DESC = 'YouPorn collection (user playlist), with sorting and pagination' + _VALID_URL = r'''(?x) + https?://(?:www\.)?youporn\.com/ + (?P<type>collection)s/videos/(?P<id>\d+) + (?:/(?P<sort>rating|views|time|duration))?/?(?:[#?]|$) + ''' + _PLAYLIST_TITLEBAR_RE = r'^\s*Videos\s+in\s' + _TESTS = [{ + 'note': 'Full list with pagination', + 'url': 'https://www.youporn.com/collections/videos/33044251/', + 'info_dict': { + 'id': '33044251', + 'title': 'Collection Sexy Lips videos', + 'uploader': 'ph-littlewillyb', + }, + 'playlist_mincount': 50, + }, { + 'note': 'Single page of full list (no filters here)', + 'url': 'https://www.youporn.com/collections/videos/33044251/time?page=1', + 'info_dict': { + 'id': '33044251/time/page=1', + 'title': 'Collection Sexy Lips videos by time (page=1)', + 'uploader': 'ph-littlewillyb', + }, + 'playlist_count': 20, + }] + + def _real_extract(self, url): + pl_id = self._match_id(url) + html = self._download_webpage(url, pl_id) + playlist = super(YouPornCollectionIE, self)._real_extract(url, html=html) + infos = re.sub(r'\s+', ' ', clean_html(get_element_by_class( + 'collection-infos', html)) or '') + title, uploader = self._search_regex( + r'^\s*Collection: (?P<title>.+?) \d+ VIDEOS \d+ VIEWS \d+ days LAST UPDATED From: (?P<uploader>[\w_-]+)', + infos, 'title/uploader', group=('title', 'uploader'), default=(None, None)) + + return merge_dicts({ + 'title': playlist['title'].replace(playlist['id'].split('/')[0], title), + 'uploader': uploader, + }, playlist) if title else playlist + + +class YouPornTagIE(YouPornListBase): + IE_DESC = 'YouPorn tag (porntags), with sorting, filtering and pagination' + _VALID_URL = r'''(?x) + https?://(?:www\.)?youporn\.com/ + porn(?P<type>tag)s/(?P<id>[^/?#&]+) + (?:/(?P<sort>views|rating|time|duration))?/?(?:[#?]|$) + ''' + _PLAYLIST_TITLEBAR_RE = r'^\s*Videos\s+tagged\s' + _PAGE_RETRY_COUNT = 1 + _TESTS = [{ + 'note': 'Full list with pagination', + 'url': 'https://www.youporn.com/porntags/austrian', + 'info_dict': { + 'id': 'austrian', + 'title': 'Tag austrian videos', + }, + 'playlist_mincount': 35, + 'expected_warnings': ['Retrying duplicate page'], + }, { + 'note': 'Filtered paginated list with single page result', + 'url': 'https://www.youporn.com/porntags/austrian/duration/?min_minutes=10', + 'info_dict': { + 'id': 'austrian/duration/min_minutes=10', + 'title': 'Tag austrian videos by duration (min_minutes=10)', + }, + # number of videos per page is (row x col) 2x3 + 6x4 + 2, or + 3, + # or more, varying with number of ads; let's set max as 9x4 + # NB col 1 may not be shown in non-JS page with site CSS and zoom 100% + 'playlist_maxcount': 32, + 'expected_warnings': ['Retrying duplicate page', 'Retrying empty page'], + }, { + 'note': 'Single page of full list', + 'url': 'https://www.youporn.com/porntags/austrian/?page=1', + 'info_dict': { + 'id': 'austrian/page=1', + 'title': 'Tag austrian videos (page=1)', + }, + 'playlist_mincount': 32, + 'playlist_maxcount': 34, + 'expected_warnings': ['Retrying duplicate page', 'Retrying empty page'], + }] + + # YP tag navigation is broken, loses sort + def _get_next_url(self, url, pl_id, html): + next_url = super(YouPornTagIE, self)._get_next_url(url, pl_id, html) + if next_url: + n = self._match_valid_url(next_url) + if n: + s = n.groupdict().get('sort') + if s: + u = self._match_valid_url(url) + 
if u: + u = u.groupdict().get('sort') + if s and not u: + n = n.end('sort') + next_url = next_url[:n] + '/' + u + next_url[n:] + return next_url + + +class YouPornStarIE(YouPornListBase): + IE_DESC = 'YouPorn Pornstar, with description, sorting and pagination' + _VALID_URL = r'''(?x) + https?://(?:www\.)?youporn\.com/ + (?P<type>pornstar)/(?P<id>[^/?#&]+) + (?:/(?P<sort>rating|views|duration))?/?(?:[#?]|$) + ''' + _PLAYLIST_TITLEBAR_RE = r'^\s*Videos\s+[fF]eaturing\s' + _TESTS = [{ + 'note': 'Full list with pagination', + 'url': 'https://www.youporn.com/pornstar/daynia/', + 'info_dict': { + 'id': 'daynia', + 'title': 'Pornstar Daynia videos', + 'description': r're:Daynia Rank \d+ Videos \d+ Views [\d,.]+ .+ Subscribers \d+', + }, + 'playlist_mincount': 45, + }, { + 'note': 'Single page of full list (no filters here)', + 'url': 'https://www.youporn.com/pornstar/daynia/?page=1', + 'info_dict': { + 'id': 'daynia/page=1', + 'title': 'Pornstar Daynia videos (page=1)', + 'description': 're:.{180,}', + }, + 'playlist_count': 26, + }] + + @staticmethod + def _get_title_from_slug(title_slug): + return re.sub(r'_', ' ', title_slug).title() + + def _real_extract(self, url): + pl_id = self._match_id(url) + html = self._download_webpage(url, pl_id) + playlist = super(YouPornStarIE, self)._real_extract(url, html=html) + INFO_ELEMENT_RE = r'''(?x) + <div\s[^>]*\bclass\s*=\s*('|")(?:[\w$-]+\s+|\s)*?pornstar-info-wrapper(?:\s+[\w$-]+|\s)*\1[^>]*> + (?P<info>[\s\S]+?)(?:</div>\s*){6,} + ''' + + infos = self._search_regex(INFO_ELEMENT_RE, html, 'infos', group='info', default='') + if infos: + infos = re.sub( + r'(?:\s*nl=nl)+\s*', ' ', + re.sub(r'(?u)\s+', ' ', clean_html( + re.sub('\n', 'nl=nl', infos)))).replace('ribe Subsc', '') + + return merge_dicts({ + 'description': infos.strip() or None, + }, playlist) + + +class YouPornVideosIE(YouPornListBase): + IE_DESC = 'YouPorn video (browse) playlists, with sorting, filtering and pagination' + _VALID_URL = r'''(?x) + https?://(?:www\.)?youporn\.com/ + (?:(?P<id>browse)/)? 
+ (?P<sort>(?(id) + (?:duration|rating|time|views)| + (?:most_(?:favou?rit|view)ed|recommended|top_rated)?)) + (?:[/#?]|$) + ''' + _PLAYLIST_TITLEBAR_RE = r'\s+(?:[Vv]ideos|VIDEOS)\s*$' + _TESTS = [{ + 'note': 'Full list with pagination (too long for test)', + 'url': 'https://www.youporn.com/', + 'info_dict': { + 'id': 'youporn', + 'title': 'YouPorn videos', + }, + 'only_matching': True, + }, { + 'note': 'Full list with pagination (too long for test)', + 'url': 'https://www.youporn.com/recommended', + 'info_dict': { + 'id': 'youporn/recommended', + 'title': 'YouPorn videos by recommended', + }, + 'only_matching': True, + }, { + 'note': 'Full list with pagination (too long for test)', + 'url': 'https://www.youporn.com/top_rated', + 'info_dict': { + 'id': 'youporn/top_rated', + 'title': 'YouPorn videos by top rated', + }, + 'only_matching': True, + }, { + 'note': 'Full list with pagination (too long for test)', + 'url': 'https://www.youporn.com/browse/time', + 'info_dict': { + 'id': 'browse/time', + 'title': 'YouPorn videos by time', + }, + 'only_matching': True, + }, { + 'note': 'Filtered paginated list with single page result', + 'url': 'https://www.youporn.com/most_favorited/?res=VR&max_minutes=2', + 'info_dict': { + 'id': 'youporn/most_favorited/max_minutes=2/res=VR', + 'title': 'YouPorn videos by most favorited (max_minutes=2,res=VR)', + }, + 'playlist_mincount': 10, + 'playlist_maxcount': 28, + }, { + 'note': 'Filtered paginated list with several pages', + 'url': 'https://www.youporn.com/most_favorited/?res=VR&max_minutes=5', + 'info_dict': { + 'id': 'youporn/most_favorited/max_minutes=5/res=VR', + 'title': 'YouPorn videos by most favorited (max_minutes=5,res=VR)', + }, + 'playlist_mincount': 45, + }, { + 'note': 'Single page of full list', + 'url': 'https://www.youporn.com/browse/time?page=1', + 'info_dict': { + 'id': 'browse/time/page=1', + 'title': 'YouPorn videos by time (page=1)', + }, + 'playlist_count': 36, + }] + + @staticmethod + def _get_title_from_slug(title_slug): + return 'YouPorn' if title_slug == 'browse' else title_slug diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index e5ffdf378..214825aa9 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -74,8 +74,11 @@ class FFmpegPostProcessor(PostProcessor): return FFmpegPostProcessor(downloader)._versions def _determine_executables(self): - programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] + # ordered to match prefer_ffmpeg! 
+ convs = ['ffmpeg', 'avconv'] + probes = ['ffprobe', 'avprobe'] prefer_ffmpeg = True + programs = convs + probes def get_ffmpeg_version(path): ver = get_exe_version(path, args=['-version']) @@ -127,10 +130,13 @@ class FFmpegPostProcessor(PostProcessor): (p, get_ffmpeg_version(self._paths[p])) for p in programs) if x[1] is not None) - for p in ('ffmpeg', 'avconv')[::-1 if prefer_ffmpeg is False else 1]: - if self._versions.get(p): - self.basename = self.probe_basename = p - break + basenames = [None, None] + for i, progs in enumerate((convs, probes)): + for p in progs[::-1 if prefer_ffmpeg is False else 1]: + if self._versions.get(p): + basenames[i] = p + break + self.basename, self.probe_basename = basenames @property def available(self): diff --git a/youtube_dl/traversal.py b/youtube_dl/traversal.py new file mode 100644 index 000000000..834cfef7f --- /dev/null +++ b/youtube_dl/traversal.py @@ -0,0 +1,10 @@ +# coding: utf-8 + +# TODO: move these utils.fns here and move import to utils +# flake8: noqa +from .utils import ( + dict_get, + get_first, + T, + traverse_obj, +) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e1b05b307..cd4303566 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -49,11 +49,14 @@ from .compat import ( compat_cookiejar, compat_ctypes_WINFUNCTYPE, compat_datetime_timedelta_total_seconds, + compat_etree_Element, compat_etree_fromstring, + compat_etree_iterfind, compat_expanduser, compat_html_entities, compat_html_entities_html5, compat_http_client, + compat_http_cookies, compat_integer_types, compat_kwargs, compat_ncompress as ncompress, @@ -6253,15 +6256,16 @@ if __debug__: def traverse_obj(obj, *paths, **kwargs): """ - Safely traverse nested `dict`s and `Iterable`s + Safely traverse nested `dict`s and `Iterable`s, etc >>> obj = [{}, {"key": "value"}] >>> traverse_obj(obj, (1, "key")) - "value" + 'value' Each of the provided `paths` is tested and the first producing a valid result will be returned. The next path will also be tested if the path branched but no results could be found. - Supported values for traversal are `Mapping`, `Iterable` and `re.Match`. + Supported values for traversal are `Mapping`, `Iterable`, `re.Match`, `xml.etree.ElementTree` + (xpath) and `http.cookies.Morsel`. Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded. The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`. @@ -6269,8 +6273,9 @@ def traverse_obj(obj, *paths, **kwargs): The keys in the path can be one of: - `None`: Return the current object. - `set`: Requires the only item in the set to be a type or function, - like `{type}`/`{func}`. If a `type`, returns only values - of this type. If a function, returns `func(obj)`. + like `{type}`/`{type, type, ...}`/`{func}`. If one or more `type`s, + return only values that have one of the types. If a function, + return `func(obj)`. - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`. - `slice`: Branch out and return all values in `obj[key]`. - `Ellipsis`: Branch out and return a list of all values. @@ -6282,8 +6287,10 @@ def traverse_obj(obj, *paths, **kwargs): For `Iterable`s, `key` is the enumeration count of the value. For `re.Match`es, `key` is the group number (0 = full match) as well as additionally any group names, if given. - - `dict` Transform the current object and return a matching dict. + - `dict`: Transform the current object and return a matching dict. 
Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`. + - `any`-builtin: Take the first matching object and return it, resetting branching. + - `all`-builtin: Take all matching objects and return them as a list, resetting branching. `tuple`, `list`, and `dict` all support nested paths and branches. @@ -6299,10 +6306,8 @@ def traverse_obj(obj, *paths, **kwargs): @param get_all If `False`, return the first matching result, otherwise all matching ones. @param casesense If `False`, consider string dictionary keys as case insensitive. - The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API + The following is only meant to be used by YoutubeDL.prepare_outtmpl and is not part of the API - @param _is_user_input Whether the keys are generated from user input. - If `True` strings get converted to `int`/`slice` if needed. @param _traverse_string Whether to traverse into objects as strings. If `True`, any non-compatible object will first be converted into a string and then traversed into. @@ -6322,7 +6327,6 @@ def traverse_obj(obj, *paths, **kwargs): expected_type = kwargs.get('expected_type') get_all = kwargs.get('get_all', True) casesense = kwargs.get('casesense', True) - _is_user_input = kwargs.get('_is_user_input', False) _traverse_string = kwargs.get('_traverse_string', False) # instant compat @@ -6336,10 +6340,8 @@ def traverse_obj(obj, *paths, **kwargs): type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,)) def lookup_or_none(v, k, getter=None): - try: + with compat_contextlib_suppress(LookupError): return getter(v, k) if getter else v[k] - except IndexError: - return None def from_iterable(iterables): # chain.from_iterable(['ABC', 'DEF']) --> A B C D E F @@ -6361,12 +6363,13 @@ def traverse_obj(obj, *paths, **kwargs): result = obj elif isinstance(key, set): - assert len(key) == 1, 'Set should only be used to wrap a single item' - item = next(iter(key)) - if isinstance(item, type): - result = obj if isinstance(obj, item) else None + assert len(key) >= 1, 'At least one item is required in a `set` key' + if all(isinstance(item, type) for item in key): + result = obj if isinstance(obj, tuple(key)) else None else: - result = try_call(item, args=(obj,)) + item = next(iter(key)) + assert len(key) == 1, 'Multiple items in a `set` key must all be types' + result = try_call(item, args=(obj,)) if not isinstance(item, type) else None elif isinstance(key, (list, tuple)): branching = True @@ -6375,9 +6378,11 @@ def traverse_obj(obj, *paths, **kwargs): elif key is Ellipsis: branching = True + if isinstance(obj, compat_http_cookies.Morsel): + obj = dict(obj, key=obj.key, value=obj.value) if isinstance(obj, compat_collections_abc.Mapping): result = obj.values() - elif is_iterable_like(obj): + elif is_iterable_like(obj, (compat_collections_abc.Iterable, compat_etree_Element)): result = obj elif isinstance(obj, compat_re_Match): result = obj.groups() @@ -6389,9 +6394,11 @@ def traverse_obj(obj, *paths, **kwargs): elif callable(key): branching = True + if isinstance(obj, compat_http_cookies.Morsel): + obj = dict(obj, key=obj.key, value=obj.value) if isinstance(obj, compat_collections_abc.Mapping): iter_obj = obj.items() - elif is_iterable_like(obj): + elif is_iterable_like(obj, (compat_collections_abc.Iterable, compat_etree_Element)): iter_obj = enumerate(obj) elif isinstance(obj, compat_re_Match): iter_obj = itertools.chain( @@ -6413,6 +6420,8 @@ def traverse_obj(obj, *paths, **kwargs): if v is not None or default is not NO_DEFAULT) 
or None elif isinstance(obj, compat_collections_abc.Mapping): + if isinstance(obj, compat_http_cookies.Morsel): + obj = dict(obj, key=obj.key, value=obj.value) result = (try_call(obj.get, args=(key,)) if casesense or try_call(obj.__contains__, args=(key,)) else next((v for k, v in obj.items() if casefold(k) == key), None)) @@ -6430,12 +6439,40 @@ def traverse_obj(obj, *paths, **kwargs): else: result = None if isinstance(key, (int, slice)): - if is_iterable_like(obj, compat_collections_abc.Sequence): + if is_iterable_like(obj, (compat_collections_abc.Sequence, compat_etree_Element)): branching = isinstance(key, slice) result = lookup_or_none(obj, key) elif _traverse_string: result = lookup_or_none(str(obj), key) + elif isinstance(obj, compat_etree_Element) and isinstance(key, str): + xpath, _, special = key.rpartition('/') + if not special.startswith('@') and not special.endswith('()'): + xpath = key + special = None + + # Allow abbreviations of relative paths, absolute paths error + if xpath.startswith('/'): + xpath = '.' + xpath + elif xpath and not xpath.startswith('./'): + xpath = './' + xpath + + def apply_specials(element): + if special is None: + return element + if special == '@': + return element.attrib + if special.startswith('@'): + return try_call(element.attrib.get, args=(special[1:],)) + if special == 'text()': + return element.text + raise SyntaxError('apply_specials is missing case for {0!r}'.format(special)) + + if xpath: + result = list(map(apply_specials, compat_etree_iterfind(obj, xpath))) + else: + result = apply_specials(obj) + return branching, result if branching else (result,) def lazy_last(iterable): @@ -6456,17 +6493,18 @@ def traverse_obj(obj, *paths, **kwargs): key = None for last, key in lazy_last(variadic(path, (str, bytes, dict, set))): - if _is_user_input and isinstance(key, str): - if key == ':': - key = Ellipsis - elif ':' in key: - key = slice(*map(int_or_none, key.split(':'))) - elif int_or_none(key) is not None: - key = int(key) - if not casesense and isinstance(key, str): key = compat_casefold(key) + if key in (any, all): + has_branched = False + filtered_objs = (obj for obj in objs if obj not in (None, {})) + if key is any: + objs = (next(filtered_objs, None),) + else: + objs = (list(filtered_objs),) + continue + if __debug__ and callable(key): # Verify function signature _try_bind_args(key, None, None) @@ -6505,9 +6543,9 @@ def traverse_obj(obj, *paths, **kwargs): return None if default is NO_DEFAULT else default -def T(x): - """ For use in yt-dl instead of {type} or set((type,)) """ - return set((x,)) +def T(*x): + """ For use in yt-dl instead of {type, ...} or set((type, ...)) """ + return set(x) def get_first(obj, keys, **kwargs): |
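
The compat.py hunk above backports the CPython 2.7 ElementTree.iterfind selector engine for Python 2.6; on 2.7 and later, compat_etree_iterfind simply delegates to Element.iterfind. A minimal sketch of the matching behaviour both paths are meant to share, written against the Python 3 stdlib:

    import xml.etree.ElementTree as etree

    root = etree.fromstring(
        '<playlist>'
        '<video id="1"><title>first</title></video>'
        '<video id="2"><title>second</title></video>'
        '</playlist>')

    # compat_etree_iterfind(root, "video[@id='2']") is meant to yield the
    # same elements as the stdlib call, via the [@attribute='value']
    # predicate compiled by _prepare_predicate
    for video in root.iterfind("video[@id='2']"):
        print(video.find('title').text)  # -> second

    # descendant search, exercising the '//' operator handled by
    # _prepare_descendant
    print([t.text for t in root.iterfind('.//title')])  # -> ['first', 'second']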
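In common.py, _search_nextjs_data now routes through _search_json: the regex matches only the opening <script id="__NEXT_DATA__" ...> tag and end_pattern='</script>' delimits the JSON payload. The mechanics, modelled here with plain re/json on a made-up page fragment (not code from the tree):

    import json
    import re

    webpage = ('<script id="__NEXT_DATA__" type="application/json">'
               '{"props": {"pageProps": {"videoId": 42}}}</script>')

    m = re.search(
        r'''<script\s[^>]*?\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>(?P<nd>.+?)</script>''',
        webpage, re.DOTALL)
    next_data = json.loads(m.group('nd'))
    print(next_data['props']['pageProps']['videoId'])  # -> 42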
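The reworked _merge_subtitles accepts any number of subtitle dicts plus an optional target to merge into; when target is supplied, the first dict is ignored, exactly as kwargs.get('target') or dict(subtitle_dict1) implies. A simplified standalone model, using Python 3 keyword-only syntax where the diff keeps **kwargs for 2.x, and with plain concatenation standing in for _merge_subtitle_items (which de-duplicates by URL):

    def merge_subtitles(subtitle_dict1, *subtitle_dicts, target=None):
        # target=None: merge the later dicts into a copy of the first one
        if target is None:
            target = dict(subtitle_dict1)
        for subtitle_dict in subtitle_dicts:
            for lang, items in subtitle_dict.items():
                # stand-in for cls._merge_subtitle_items(target.get(lang, []), items)
                target[lang] = target.get(lang, []) + items
        return target

    merged = merge_subtitles({'en': [{'url': 'a.vtt'}]},
                             {'en': [{'url': 'b.vtt'}], 'de': [{'url': 'c.vtt'}]})
    print(sorted(merged))     # -> ['de', 'en']
    print(len(merged['en']))  # -> 2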
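The new _yes_playlist helper centralises the playlist-or-single-video decision: a missing ID short-circuits the choice, smuggled force_noplaylist data overrides everything, and otherwise --no-playlist decides. A rough standalone model of the return value (yes_playlist is a hypothetical stand-in name; the to_screen reporting is omitted):

    def yes_playlist(playlist_id, video_id, smuggled_data=None, noplaylist=False):
        if not playlist_id or not video_id:
            # nothing to choose between: extract as a playlist only when
            # there is no video ID at all
            return not video_id
        no_playlist = (smuggled_data or {}).get('force_noplaylist')
        if no_playlist is not None:
            return not no_playlist       # smuggled override wins
        return not noplaylist            # otherwise honour --no-playlist

    assert yes_playlist('pl1', 'v1') is True                    # whole playlist
    assert yes_playlist('pl1', 'v1', noplaylist=True) is False  # just the video
    assert yes_playlist('pl1', 'v1', {'force_noplaylist': True}) is False
    assert yes_playlist(None, 'v1') is False
    assert yes_playlist('pl1', None) is True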
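In utils.py, traverse_obj path keys gain multi-type sets (hence the T(*x) change) and the builtins any/all, which collapse a branched path to its first surviving match or to the list of all of them. A hedged sketch, assuming the in-tree package is importable (the new youtube_dl.traversal module re-exports these helpers):

    from youtube_dl.traversal import T, traverse_obj

    data = {'formats': [{'height': None}, {'height': 720}, {'height': '1080'}]}

    # a set of several types keeps only values of one of those types
    print(traverse_obj(data, ('formats', Ellipsis, 'height', T(int, float))))
    # -> [720]

    # `any` resets branching and takes the first non-None/non-{} result
    print(traverse_obj(data, ('formats', Ellipsis, 'height', any)))
    # -> 720

    # `all` resets branching and collects the remaining results in a list
    print(traverse_obj(data, ('formats', Ellipsis, 'height', all)))
    # -> [720, '1080']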
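traverse_obj also gains two traversal targets above: xml.etree elements, addressed by a relative-xpath string key with optional @attr, @ and text() specials, and http.cookies.Morsel objects, which traverse like their underlying dict extended with synthetic 'key' and 'value' entries. A sketch under the same import assumption:

    import xml.etree.ElementTree as etree
    from youtube_dl.compat import compat_http_cookies
    from youtube_dl.traversal import traverse_obj

    root = etree.fromstring('<media><fmt height="720">mp4</fmt></media>')
    print(traverse_obj(root, 'fmt/@height'))  # -> ['720'] (one attrib per match)
    print(traverse_obj(root, 'fmt/text()'))   # -> ['mp4']

    cookie = compat_http_cookies.SimpleCookie()
    cookie['session'] = 'abc123'
    print(traverse_obj(cookie, ('session', 'value')))  # -> 'abc123'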