From 45283afdec81af21ba50ff3aca3d86fb6d2584b0 Mon Sep 17 00:00:00 2001 From: Martin Weinelt Date: Sat, 6 Jan 2018 17:33:40 +0100 Subject: [motherless] Add support for groups --- youtube_dl/extractor/motherless.py | 73 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'youtube_dl/extractor/motherless.py') diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 6fe3b6049..90ed91ba6 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -4,8 +4,11 @@ import datetime import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( ExtractorError, + InAdvancePagedList, + orderedSet, str_to_int, unified_strdate, ) @@ -114,3 +117,73 @@ class MotherlessIE(InfoExtractor): 'age_limit': age_limit, 'url': video_url, } + + +class MotherlessGroupIE(InfoExtractor): + _VALID_URL = 'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)' + _TESTS = [{ + 'url': 'http://motherless.com/g/movie_scenes', + 'info_dict': { + 'id': 'movie_scenes', + 'title': 'Movie Scenes', + 'description': 'Hot and sexy scenes from "regular" movies... ' + 'Beautiful actresses fully nude... A looot of ' + 'skin! :)Enjoy!', + }, + 'playlist_mincount': 662, + }, { + 'url': 'http://motherless.com/gv/sex_must_be_funny', + 'info_dict': { + 'id': 'sex_must_be_funny', + 'title': 'Sex must be funny', + 'description': 'Sex can be funny. Wide smiles,laugh, games, fun of ' + 'any kind!' 
+ }, + 'playlist_mincount': 9, + }] + + @classmethod + def suitable(cls, url): + return (False if MotherlessIE.suitable(url) + else super(MotherlessGroupIE, cls).suitable(url)) + + def _extract_entries(self, webpage, base): + return [ + self.url_result( + compat_urlparse.urljoin(base, video_path), + MotherlessIE.ie_key(), video_title=title) + for video_path, title in orderedSet(re.findall( + r'href="/([^"]+)"[^>]+>\s+<img[^>]+alt="[^-]+-\s([^"]+)"', + webpage)) + ] + + def _real_extract(self, url): + group_id = self._match_id(url) + page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id) + webpage = self._download_webpage(page_url, group_id) + title = self._search_regex( + r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False) + description = self._html_search_meta( + 'description', webpage, fatal=False) + page_count = self._int(self._search_regex( + r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT', + webpage, 'page_count'), 'page_count') + PAGE_SIZE = 80 + + def _get_page(idx): + webpage = self._download_webpage( + page_url, group_id, query={'page': idx + 1}, + note='Downloading page %d/%d' % (idx + 1, page_count) + ) + for entry in self._extract_entries(webpage, url): + yield entry + + playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE) + + return { + '_type': 'playlist', + 'id': group_id, + 'title': title, + 'description': description, + 'entries': playlist + } -- cgit 1.4.1 From a133eb7764594b830cb975e3925972214e932704 Mon Sep 17 00:00:00 2001 From: Sergey M․ <dstftw@gmail.com> Date: Sun, 7 Jan 2018 00:02:41 +0700 Subject: [motherless:group] Capture leading slash of video path --- youtube_dl/extractor/motherless.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl/extractor/motherless.py') diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 90ed91ba6..4adac691c 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -153,7 +153,7 @@ class 
MotherlessGroupIE(InfoExtractor): compat_urlparse.urljoin(base, video_path), MotherlessIE.ie_key(), video_title=title) for video_path, title in orderedSet(re.findall( - r'href="/([^"]+)"[^>]+>\s+<img[^>]+alt="[^-]+-\s([^"]+)"', + r'href="(/[^"]+)"[^>]+>\s+<img[^>]+alt="[^-]+-\s([^"]+)"', webpage)) ] -- cgit 1.4.1 From 0a5b1295b7c1aa6395b65ee137087c540b37b32b Mon Sep 17 00:00:00 2001 From: Sergey M․ <dstftw@gmail.com> Date: Sun, 7 Jan 2018 00:31:53 +0700 Subject: [motherless:group] Relax entry extraction and add a fallback scenario --- youtube_dl/extractor/motherless.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'youtube_dl/extractor/motherless.py') diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 4adac691c..e24396e79 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -148,14 +148,27 @@ class MotherlessGroupIE(InfoExtractor): else super(MotherlessGroupIE, cls).suitable(url)) def _extract_entries(self, webpage, base): - return [ - self.url_result( - compat_urlparse.urljoin(base, video_path), - MotherlessIE.ie_key(), video_title=title) - for video_path, title in orderedSet(re.findall( - r'href="(/[^"]+)"[^>]+>\s+<img[^>]+alt="[^-]+-\s([^"]+)"', - webpage)) - ] + entries = [] + for mobj in re.finditer( + r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?', + webpage): + video_url = compat_urlparse.urljoin(base, mobj.group('href')) + if not MotherlessIE.suitable(video_url): + continue + video_id = MotherlessIE._match_id(video_url) + title = mobj.group('title') + entries.append(self.url_result( + video_url, ie=MotherlessIE.ie_key(), video_id=video_id, + video_title=title)) + # Alternative fallback + if not entries: + entries = [ + self.url_result( + compat_urlparse.urljoin(base, '/' + video_id), + ie=MotherlessIE.ie_key(), video_id=video_id) + for video_id in orderedSet(re.findall( + 
r'data-codename=["\']([A-Z0-9]+)', webpage))] + return entries def _real_extract(self, url): group_id = self._match_id(url) -- cgit 1.4.1