From 51342717cddafde83dbf39f2212be40a196a577a Mon Sep 17 00:00:00 2001 From: Timendum Date: Tue, 14 Mar 2017 16:11:09 +0100 Subject: [rai] Fix extraction --- youtube_dl/extractor/rai.py | 355 ++++++++++++++++++++++++-------------------- 1 file changed, 194 insertions(+), 161 deletions(-) (limited to 'youtube_dl/extractor/rai.py') diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 41afbd9af..b67e94f88 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -3,8 +3,8 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( - determine_ext, ExtractorError, + determine_ext, find_xpath_attr, fix_xml_ampersands, int_or_none, @@ -55,181 +55,157 @@ class RaiBaseIE(InfoExtractor): return formats - def _extract_from_content_id(self, content_id, base_url): - media = self._download_json( - 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, - content_id, 'Downloading video JSON') - thumbnails = [] - for image_type in ('image', 'image_medium', 'image_300'): - thumbnail_url = media.get(image_type) - if thumbnail_url: - thumbnails.append({ - 'url': compat_urlparse.urljoin(base_url, thumbnail_url), - }) +class RaiPlayIE(RaiBaseIE): + _VALID_URL = r'https?://(?:www\.)?raiplay\.it/.+?-(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.html' + _TESTS = [{ + 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', + 'md5': '340aa3b7afb54bfd14a8c11786450d76', + 'info_dict': { + 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', + 'ext': 'mp4', + 'title': 'La Casa Bianca', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': r're:^Rai.+', + 'description': 're:^[A-Za-z]+' + } + }, { + 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', + 'md5': 'ed4da3d70ccf8129a33ab16b34d20ab8', + 'info_dict': { + 'id': 'efebe701-969c-4593-92f3-285f0d1ce750', + 'ext': 'mp4', + 'title': 'Gazebo - #gazebotraindesi', + 'thumbnail': r're:^https?://.*\.png$', + 'uploader': r're:^Rai.+', + 'description': r're:^[A-Za-z]+' + } + }, { + 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', + 'md5': '8970abf8caf8aef4696e7b1f2adfc696', + 'info_dict': { + 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', + 'ext': 'mp4', + 'title': 'Report - Report del 07/04/2014', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': r're:^Rai.+', + 'description': r're:^[A-Za-z]+' + } + }] + _RESOLUTION = '600x400' - formats = [] - media_type = media['type'] - if 'Audio' in media_type: - formats.append({ - 'format_id': media.get('formatoAudio'), - 'url': media['audioUrl'], - 'ext': media.get('formatoAudio'), - }) - elif 'Video' in media_type: - formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id)) - self._sort_formats(formats) - else: - raise ExtractorError('not a media file') + def _real_extract(self, url): + video_id = self._match_id(url) - subtitles = {} - captions = media.get('subtitlesUrl') - if captions: - STL_EXT = '.stl' - SRT_EXT = '.srt' - if captions.endswith(STL_EXT): - captions = captions[:-len(STL_EXT)] + SRT_EXT - subtitles['it'] = [{ - 'ext': 'srt', - 'url': captions, - }] + # remove query and fragment part from url + canonical_url = compat_urlparse.urljoin(url, compat_urlparse.urlparse(url).path) + webpage = self._download_webpage(canonical_url, video_id) - return { - 'id': content_id, - 'title': media['name'], - 'description': media.get('desc'), - 'thumbnails': thumbnails, - 'uploader': media.get('author'), - 'upload_date': unified_strdate(media.get('date')), - 'duration': parse_duration(media.get('length')), - 'formats': formats, - 'subtitles': subtitles, - } + media = self._download_json('%s?json' % canonical_url, + video_id, 'Downloading video JSON') + thumbnails = [] + if 'images' in media: + for _, value in media.get('images').items(): + if value: + thumbnails.append({ + 'url': value.replace('[RESOLUTION]', self._RESOLUTION) + }) -class RaiTVIE(RaiBaseIE): - _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+(?:media|ondemand)/.+?-(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' - _TESTS = [ - { - 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', - 'md5': '8970abf8caf8aef4696e7b1f2adfc696', - 'info_dict': { - 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', - 'ext': 'mp4', - 'title': 'Report del 07/04/2014', - 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', - 'upload_date': '20140407', - 'duration': 6160, - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, - { - # no m3u8 stream - 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', - # HDS download, MD5 is unstable - 'info_dict': { - 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', - 'ext': 'flv', - 'title': 'TG PRIMO TEMPO', - 'upload_date': '20140612', - 'duration': 1758, - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'skip': 'Geo-restricted to Italy', - }, - { - 'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', - 'md5': '35cf7c229f22eeef43e48b5cf923bef0', - 'info_dict': { - 'id': '7aafdea9-0e5d-49d5-88a6-7e65da67ae13', - 'ext': 'mp4', - 'title': 'State of the Net, Antonella La Carpia: regole virali', - 'description': 'md5:b0ba04a324126903e3da7763272ae63c', - 'upload_date': '20140613', - }, - 'skip': 'Error 404', - }, - { - 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-b4a49761-e0cc-4b14-8736-2729f6f73132-tg2.html', - 'info_dict': { - 'id': 'b4a49761-e0cc-4b14-8736-2729f6f73132', - 'ext': 'mp4', - 'title': 'Alluvione in Sardegna e dissesto idrogeologico', - 'description': 'Edizione delle ore 20:30 ', - }, - 'skip': 'invalid urls', - }, - { - 'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html', - 'md5': 'e57493e1cb8bc7c564663f363b171847', - 'info_dict': { - 'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6', - 'ext': 'mp4', - 'title': 'Il Candidato - Primo episodio: "Le Primarie"', - 'description': 'md5:364b604f7db50594678f483353164fb8', - 'upload_date': '20140923', - 'duration': 386, - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, - ] + if 'video' not in media: + raise ExtractorError('No video found') - def _real_extract(self, url): - video_id = self._match_id(url) + video = media.get('video') + duration = parse_duration(video.get('duration')), + formats = self._extract_relinker_formats(video.get('contentUrl'), video_id) + self._sort_formats(formats) - return self._extract_from_content_id(video_id, url) + return { + 'id': video_id, + 'title': self._og_search_title(webpage).replace(' - video - RaiPlay', ''), + 'description': self._og_search_description(webpage), + 'uploader': media.get('channel'), + 'duration': duration, + 'thumbnails': thumbnails, + 'formats': formats + } class RaiIE(RaiBaseIE): - _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' - _TESTS = [ - { - 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': '2dd727e61114e1ee9c47f0da6914e178', - 'info_dict': { - 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', - 'ext': 'mp4', - 'title': 'Il pacco', - 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', - 'upload_date': '20141221', - }, + _VALID_URL = r'https?://.+\.(?:rai|rainews)\.it/dl/.+?-(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' + _TESTS = [{ + # subdomain test case + 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', + 'info_dict': { + 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', + 'ext': 'mp4', + 'title': 'TG PRIMO TEMPO', + 'upload_date': '20140612', + 'duration': 1758, + 'thumbnail': r're:^https?://.*\.jpg$' + } + }, { + # rainews test case + 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', + 'info_dict': { + 'id': '1632c009-c843-4836-bb65-80c33084a64b', + 'ext': 'mp4', + 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor \"La ragazza del treno\" ', + 'upload_date': '20161103', + 'thumbnail': r're:^https?://.*\.png$', + 'description': r're:^[A-Za-z]+' + } + }, { + # with media information + 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', + 'md5': '11959b4e44fa74de47011b5799490adf', + 'info_dict': { + 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', + 'ext': 'mp4', + 'title': 'TG1 ore 20:00 del 03/11/2016', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20161103', + 'description': r're:^[A-Za-z]+' + } + }, { + # drawMediaRaiTV test case + 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', + 'md5': '2dd727e61114e1ee9c47f0da6914e178', + 'info_dict': { + 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', + 'ext': 'mp4', + 'title': 'Il pacco', + 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', + 'upload_date': '20141221', }, - { - # Direct relinker URL - 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', - # HDS live stream, MD5 is unstable - 'info_dict': { - 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', - 'ext': 'flv', - 'title': 'EuroNews', - }, - 'skip': 'Geo-restricted to Italy', + }, { + # Direct relinker URL + 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', + # HDS live stream, MD5 is unstable + 'info_dict': { + 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', + 'ext': 'flv', + 'title': 'EuroNews', }, - { - # Embedded content item ID - 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', - 'md5': '84c1135ce960e8822ae63cec34441d63', - 'info_dict': { - 'id': '0960e765-62c8-474a-ac4b-7eb3e2be39c8', - 'ext': 'mp4', - 'title': 'TG1 ore 20:00 del 02/07/2016', - 'upload_date': '20160702', - }, + }, { + # Embedded content item ID + 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', + 'info_dict': { + 'id': 'd80d4b70-3812-4501-a888-92edec729f00', + 'ext': 'mp4', + 'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', + 'upload_date': r're:\d{8}', + 'description': r're:.+', }, - { - 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', - # HDS live stream, MD5 is unstable - 'info_dict': { - 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', - 'ext': 'flv', - 'title': 'La diretta di Rainews24', - }, + }, { + 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', + # HDS live stream, MD5 is unstable + 'info_dict': { + 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', + 'ext': 'mp4', + 'title': 'La diretta di Rainews24', }, - ] - - @classmethod - def suitable(cls, url): - return False if RaiTVIE.suitable(url) else super(RaiIE, cls).suitable(url) + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -250,6 +226,12 @@ class RaiIE(RaiBaseIE): if content_item_id: return self._extract_from_content_id(content_item_id, url) + try: + return self._extract_from_content_id(video_id, url) + except ExtractorError: + # no media data, only direct relinker + pass + relinker_url = compat_urlparse.urljoin(url, self._search_regex( r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P[\'"])(?P(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', webpage, 'relinker URL', group='url')) @@ -265,3 +247,54 @@ class RaiIE(RaiBaseIE): 'title': title, 'formats': formats, } + + def _extract_from_content_id(self, content_id, url): + media = self._download_json( + 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, + content_id, 'Downloading video JSON') + + thumbnails = [] + for image_type in ('image', 'image_medium', 'image_300'): + thumbnail_url = media.get(image_type) + if thumbnail_url: + thumbnails.append({ + 'url': compat_urlparse.urljoin(url, thumbnail_url), + }) + + formats = [] + media_type = media['type'] + if 'Audio' in media_type: + formats.append({ + 'format_id': media.get('formatoAudio'), + 'url': media['audioUrl'], + 'ext': media.get('formatoAudio'), + }) + elif 'Video' in media_type: + formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id)) + self._sort_formats(formats) + else: + raise ExtractorError('not a media file') + + subtitles = {} + captions = media.get('subtitlesUrl') + if captions: + STL_EXT = '.stl' + SRT_EXT = '.srt' + if captions.endswith(STL_EXT): + captions = captions[:-len(STL_EXT)] + SRT_EXT + subtitles['it'] = [{ + 'ext': 'srt', + 'url': captions, + }] + + return { + 'id': content_id, + 'title': media['name'], + 'description': media.get('desc'), + 'thumbnails': thumbnails, + 'uploader': media.get('author'), + 'upload_date': unified_strdate(media.get('date')), + 'duration': parse_duration(media.get('length')), + 'formats': formats, + 'subtitles': subtitles, + } -- cgit 1.4.1 From b8d8cced9b55c57f3b09e83972be9d6318a459ee Mon Sep 17 00:00:00 2001 From: Sergey M․ Date: Sun, 2 Apr 2017 02:14:42 +0700 Subject: [rai] Improve extraction (closes #11790) * Fix georestriction detection * Detect live streams + Extract relinker metadata * Improve ContentItem detection + Extract series metadata * Fix tests --- youtube_dl/extractor/rai.py | 359 ++++++++++++++++++++++++++++---------------- 1 file changed, 233 insertions(+), 126 deletions(-) (limited to 'youtube_dl/extractor/rai.py') diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index b67e94f88..b77b0a08e 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -1,23 +1,40 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_urlparse, + compat_str, +) from ..utils import ( ExtractorError, determine_ext, find_xpath_attr, fix_xml_ampersands, + GeoRestrictedError, int_or_none, parse_duration, + strip_or_none, + try_get, unified_strdate, + unified_timestamp, update_url_query, + urljoin, xpath_text, ) class RaiBaseIE(InfoExtractor): - def _extract_relinker_formats(self, relinker_url, video_id): + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _GEO_COUNTRIES = ['IT'] + _GEO_BYPASS = False + + def _extract_relinker_info(self, relinker_url, video_id): formats = [] + geoprotection = None + is_live = None + duration = None for platform in ('mon', 'flash', 'native'): relinker = self._download_xml( @@ -27,9 +44,27 @@ class RaiBaseIE(InfoExtractor): query={'output': 45, 'pl': platform}, headers=self.geo_verification_headers()) - media_url = find_xpath_attr(relinker, './url', 'type', 'content').text + if not geoprotection: + geoprotection = xpath_text( + relinker, './geoprotection', default=None) == 'Y' + + if not is_live: + is_live = xpath_text( + relinker, './is_live', default=None) == 'Y' + if not duration: + duration = parse_duration(xpath_text( + relinker, './duration', default=None)) + + url_elem = find_xpath_attr(relinker, './url', 'type', 'content') + if url_elem is None: + continue + + media_url = url_elem.text + + # This does not imply geo restriction (e.g. + # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) if media_url == 'http://download.rai.it/video_no_available.mp4': - self.raise_geo_restricted() + continue ext = determine_ext(media_url) if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): @@ -53,11 +88,18 @@ class RaiBaseIE(InfoExtractor): 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', }) - return formats + if not formats and geoprotection is True: + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + + return dict((k, v) for k, v in { + 'is_live': is_live, + 'duration': duration, + 'formats': formats, + }.items() if v is not None) class RaiPlayIE(RaiBaseIE): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/.+?-(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.html' + _VALID_URL = r'(?Phttps?://(?:www\.)?raiplay\.it/.+?-(?P%s)\.html)' % RaiBaseIE._UUID_RE _TESTS = [{ 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', 'md5': '340aa3b7afb54bfd14a8c11786450d76', @@ -65,110 +107,130 @@ class RaiPlayIE(RaiBaseIE): 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', 'ext': 'mp4', 'title': 'La Casa Bianca', + 'alt_title': 'S2016 - Puntata del 23/10/2016', + 'description': 'md5:a09d45890850458077d1f68bb036e0a5', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': r're:^Rai.+', - 'description': 're:^[A-Za-z]+' - } - }, { - 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', - 'md5': 'ed4da3d70ccf8129a33ab16b34d20ab8', - 'info_dict': { - 'id': 'efebe701-969c-4593-92f3-285f0d1ce750', - 'ext': 'mp4', - 'title': 'Gazebo - #gazebotraindesi', - 'thumbnail': r're:^https?://.*\.png$', - 'uploader': r're:^Rai.+', - 'description': r're:^[A-Za-z]+' - } + 'uploader': 'Rai 3', + 'creator': 'Rai 3', + 'duration': 3278, + 'timestamp': 1477764300, + 'upload_date': '20161029', + 'series': 'La Casa Bianca', + 'season': '2016', + }, }, { 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', 'ext': 'mp4', - 'title': 'Report - Report del 07/04/2014', + 'title': 'Report del 07/04/2014', + 'alt_title': 'S2013/14 - Puntata del 07/04/2014', + 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': r're:^Rai.+', - 'description': r're:^[A-Za-z]+' - } + 'uploader': 'Rai 5', + 'creator': 'Rai 5', + 'duration': 6160, + 'series': 'Report', + 'season_number': 5, + 'season': '2013/14', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', + 'only_matching': True, }] - _RESOLUTION = '600x400' def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + url, video_id = mobj.group('url', 'id') - # remove query and fragment part from url - canonical_url = compat_urlparse.urljoin(url, compat_urlparse.urlparse(url).path) - webpage = self._download_webpage(canonical_url, video_id) + media = self._download_json( + '%s?json' % url, video_id, 'Downloading video JSON') - media = self._download_json('%s?json' % canonical_url, - video_id, 'Downloading video JSON') + title = media['name'] + + video = media['video'] + + relinker_info = self._extract_relinker_info(video['contentUrl'], video_id) + self._sort_formats(relinker_info['formats']) thumbnails = [] if 'images' in media: for _, value in media.get('images').items(): if value: thumbnails.append({ - 'url': value.replace('[RESOLUTION]', self._RESOLUTION) + 'url': value.replace('[RESOLUTION]', '600x400') }) - if 'video' not in media: - raise ExtractorError('No video found') + timestamp = unified_timestamp(try_get( + media, lambda x: x['availabilities'][0]['start'], compat_str)) - video = media.get('video') - duration = parse_duration(video.get('duration')), - formats = self._extract_relinker_formats(video.get('contentUrl'), video_id) - self._sort_formats(formats) - - return { + info = { 'id': video_id, - 'title': self._og_search_title(webpage).replace(' - video - RaiPlay', ''), - 'description': self._og_search_description(webpage), + 'title': title, + 'alt_title': media.get('subtitle'), + 'description': media.get('description'), 'uploader': media.get('channel'), - 'duration': duration, + 'creator': media.get('editor'), + 'duration': parse_duration(video.get('duration')), + 'timestamp': timestamp, 'thumbnails': thumbnails, - 'formats': formats + 'series': try_get( + media, lambda x: x['isPartOf']['name'], compat_str), + 'season_number': int_or_none(try_get( + media, lambda x: x['isPartOf']['numeroStagioni'])), + 'season': media.get('stagione') or None, } + info.update(relinker_info) + + return info + class RaiIE(RaiBaseIE): - _VALID_URL = r'https?://.+\.(?:rai|rainews)\.it/dl/.+?-(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' + _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE _TESTS = [{ - # subdomain test case + # var uniquename = "ContentItem-..." + # data-id="ContentItem-..." 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', 'info_dict': { 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', 'ext': 'mp4', 'title': 'TG PRIMO TEMPO', - 'upload_date': '20140612', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1758, - 'thumbnail': r're:^https?://.*\.jpg$' + 'upload_date': '20140612', } }, { - # rainews test case + # with ContentItem in many metas 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', 'info_dict': { 'id': '1632c009-c843-4836-bb65-80c33084a64b', 'ext': 'mp4', - 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor \"La ragazza del treno\" ', - 'upload_date': '20161103', + 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"', + 'description': 'I film in uscita questa settimana.', 'thumbnail': r're:^https?://.*\.png$', - 'description': r're:^[A-Za-z]+' + 'duration': 833, + 'upload_date': '20161103', } }, { - # with media information + # with ContentItem in og:url 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', 'md5': '11959b4e44fa74de47011b5799490adf', 'info_dict': { 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', 'ext': 'mp4', 'title': 'TG1 ore 20:00 del 03/11/2016', + 'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2214, 'upload_date': '20161103', - 'description': r're:^[A-Za-z]+' } }, { - # drawMediaRaiTV test case + # drawMediaRaiTV(...) 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', 'md5': '2dd727e61114e1ee9c47f0da6914e178', 'info_dict': { @@ -176,83 +238,67 @@ class RaiIE(RaiBaseIE): 'ext': 'mp4', 'title': 'Il pacco', 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20141221', }, }, { - # Direct relinker URL + # initEdizione('ContentItem-...' + 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', + 'info_dict': { + 'id': 'c2187016-8484-4e3a-8ac8-35e475b07303', + 'ext': 'mp4', + 'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', + 'duration': 2274, + 'upload_date': '20170401', + }, + 'skip': 'Changes daily', + }, { + # HDS live stream with only relinker URL 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', - # HDS live stream, MD5 is unstable 'info_dict': { 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', 'ext': 'flv', 'title': 'EuroNews', }, - }, { - # Embedded content item ID - 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', - 'info_dict': { - 'id': 'd80d4b70-3812-4501-a888-92edec729f00', - 'ext': 'mp4', - 'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', - 'upload_date': r're:\d{8}', - 'description': r're:.+', + 'params': { + 'skip_download': True, }, }, { + # HLS live stream with ContentItem in og:url 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', - # HDS live stream, MD5 is unstable 'info_dict': { 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', 'ext': 'mp4', 'title': 'La diretta di Rainews24', }, + 'params': { + 'skip_download': True, + }, }] - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - iframe_url = self._search_regex( - [r']+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', - r'drawMediaRaiTV\(["\'](.+?)["\']'], - webpage, 'iframe', default=None) - if iframe_url: - if not iframe_url.startswith('http'): - iframe_url = compat_urlparse.urljoin(url, iframe_url) - return self.url_result(iframe_url) - - content_item_id = self._search_regex( - r'initEdizione\((?P[\'"])ContentItem-(?P[^\'"]+)(?P=q1)', - webpage, 'content item ID', group='content_id', default=None) - if content_item_id: - return self._extract_from_content_id(content_item_id, url) - - try: - return self._extract_from_content_id(video_id, url) - except ExtractorError: - # no media data, only direct relinker - pass - - relinker_url = compat_urlparse.urljoin(url, self._search_regex( - r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P[\'"])(?P(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', - webpage, 'relinker URL', group='url')) - formats = self._extract_relinker_formats(relinker_url, video_id) - self._sort_formats(formats) - - title = self._search_regex( - r'var\s+videoTitolo\s*=\s*([\'"])(?P[^\'"]+)\1', - webpage, 'title', group='title', default=None) or self._og_search_title(webpage) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - } - def _extract_from_content_id(self, content_id, url): media = self._download_json( 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, content_id, 'Downloading video JSON') + title = media['name'].strip() + + media_type = media['type'] + if 'Audio' in media_type: + relinker_info = { + 'formats': { + 'format_id': media.get('formatoAudio'), + 'url': media['audioUrl'], + 'ext': media.get('formatoAudio'), + } + } + elif 'Video' in media_type: + relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) + else: + raise ExtractorError('not a media file') + + self._sort_formats(relinker_info['formats']) + thumbnails = [] for image_type in ('image', 'image_medium', 'image_300'): thumbnail_url = media.get(image_type) @@ -261,20 +307,6 @@ class RaiIE(RaiBaseIE): 'url': compat_urlparse.urljoin(url, thumbnail_url), }) - formats = [] - media_type = media['type'] - if 'Audio' in media_type: - formats.append({ - 'format_id': media.get('formatoAudio'), - 'url': media['audioUrl'], - 'ext': media.get('formatoAudio'), - }) - elif 'Video' in media_type: - formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id)) - self._sort_formats(formats) - else: - raise ExtractorError('not a media file') - subtitles = {} captions = media.get('subtitlesUrl') if captions: @@ -287,14 +319,89 @@ class RaiIE(RaiBaseIE): 'url': captions, }] - return { + info = { 'id': content_id, - 'title': media['name'], - 'description': media.get('desc'), + 'title': title, + 'description': strip_or_none(media.get('desc')), 'thumbnails': thumbnails, 'uploader': media.get('author'), 'upload_date': unified_strdate(media.get('date')), 'duration': parse_duration(media.get('length')), - 'formats': formats, 'subtitles': subtitles, } + + info.update(relinker_info) + + return info + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + content_item_id = None + + content_item_url = self._html_search_meta( + ('og:url', 'og:video', 'og:video:secure_url', 'twitter:url', + 'twitter:player', 'jsonlink'), webpage, default=None) + if content_item_url: + content_item_id = self._search_regex( + r'ContentItem-(%s)' % self._UUID_RE, content_item_url, + 'content item id', default=None) + + if not content_item_id: + content_item_id = self._search_regex( + r'''(?x) + (?: + (?:initEdizione|drawMediaRaiTV)\(| + <(?:[^>]+\bdata-id|var\s+uniquename)= + ) + (["\']) + (?:(?!\1).)*\bContentItem-(?P<id>%s) + ''' % self._UUID_RE, + webpage, 'content item id', default=None, group='id') + + content_item_ids = set() + content_item_ids.add(content_item_id) + if video_id not in content_item_ids: + content_item_ids.add(video_id) + + for content_item_id in content_item_ids: + try: + return self._extract_from_content_id(content_item_id, url) + except GeoRestrictedError: + raise + except ExtractorError: + pass + + relinker_url = self._search_regex( + r'''(?x) + (?: + var\s+videoURL| + mediaInfo\.mediaUri + )\s*=\s* + ([\'"]) + (?P<url> + (?:https?:)? + //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? + (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 + ''', + webpage, 'relinker URL', group='url') + + relinker_info = self._extract_relinker_info( + urljoin(url, relinker_url), video_id) + self._sort_formats(relinker_info['formats']) + + title = self._search_regex( + r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', + webpage, 'title', group='title', + default=None) or self._og_search_title(webpage) + + info = { + 'id': video_id, + 'title': title, + } + + info.update(relinker_info) + + return info -- cgit 1.4.1 From 361f293ab85c29ab62cb91577d2be34814d5c552 Mon Sep 17 00:00:00 2001 From: Sergey M․ <dstftw@gmail.com> Date: Sun, 2 Apr 2017 02:24:13 +0700 Subject: [rai] Skip not found content item id --- youtube_dl/extractor/rai.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor/rai.py') diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index b77b0a08e..077546a73 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -362,7 +362,8 @@ class RaiIE(RaiBaseIE): webpage, 'content item id', default=None, group='id') content_item_ids = set() - content_item_ids.add(content_item_id) + if content_item_id: + content_item_ids.add(content_item_id) if video_id not in content_item_ids: content_item_ids.add(video_id) -- cgit 1.4.1 From 1b3feca0a700c4d4d7c4d8b01fc5f033cf48f421 Mon Sep 17 00:00:00 2001 From: Sergey M․ <dstftw@gmail.com> Date: Sat, 8 Apr 2017 14:11:03 +0700 Subject: [raiplay] Extract subtitles --- youtube_dl/extractor/rai.py | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'youtube_dl/extractor/rai.py') diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 077546a73..81eb9db85 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -97,6 +97,25 @@ class RaiBaseIE(InfoExtractor): 'formats': formats, }.items() if v is not None) + @staticmethod + def _extract_subtitles(url, subtitle_url): + subtitles = {} + if subtitle_url and isinstance(subtitle_url, compat_str): + subtitle_url = urljoin(url, subtitle_url) + STL_EXT = '.stl' + SRT_EXT = '.srt' + subtitles['it'] = [{ + 'ext': 'stl', + 'url': subtitle_url, + }] + if subtitle_url.endswith(STL_EXT): + srt_url = subtitle_url[:-len(STL_EXT)] + SRT_EXT + subtitles['it'].append({ + 'ext': 'srt', + 'url': srt_url, + }) + return subtitles + class RaiPlayIE(RaiBaseIE): _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.html)' % RaiBaseIE._UUID_RE @@ -168,6 +187,8 @@ class RaiPlayIE(RaiBaseIE): timestamp = unified_timestamp(try_get( media, lambda x: x['availabilities'][0]['start'], compat_str)) + subtitles = self._extract_subtitles(url, video.get('subtitles')) + info = { 'id': video_id, 'title': title, @@ -183,6 +204,7 @@ class RaiPlayIE(RaiBaseIE): 'season_number': int_or_none(try_get( media, lambda x: x['isPartOf']['numeroStagioni'])), 'season': media.get('stagione') or None, + 'subtitles': subtitles, } info.update(relinker_info) @@ -307,17 +329,7 @@ class RaiIE(RaiBaseIE): 'url': compat_urlparse.urljoin(url, thumbnail_url), }) - subtitles = {} - captions = media.get('subtitlesUrl') - if captions: - STL_EXT = '.stl' - SRT_EXT = '.srt' - if captions.endswith(STL_EXT): - captions = captions[:-len(STL_EXT)] + SRT_EXT - subtitles['it'] = [{ - 'ext': 'srt', - 'url': captions, - }] + subtitles = self._extract_subtitles(url, media.get('subtitlesUrl')) info = { 'id': content_id, -- cgit 1.4.1 From 449c66577640a0c3f0b383204a1e7284429a61c3 Mon Sep 17 00:00:00 2001 From: james <jrabblac@gmail.com> Date: Sat, 17 Jun 2017 17:15:41 +0200 Subject: [raiplay:live] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/rai.py | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'youtube_dl/extractor/rai.py') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e97691daa..a263c88b3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -824,6 +824,7 @@ from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE from .rai import ( RaiPlayIE, + RaiPlayLiveIE, RaiIE, ) from .rbmaradio import RBMARadioIE diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 81eb9db85..ed15a5f10 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -208,10 +208,27 @@ class RaiPlayIE(RaiBaseIE): } info.update(relinker_info) - return info +class RaiPlayLiveIE(RaiBaseIE): + _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>\w*)' + _TEST = { + 'url': 'http://www.raiplay.it/dirette/rai3', + 'only_matching': True, + } + + def _real_extract(self, url): + channel = self._match_id(url) + + webpage = self._download_webpage(url, channel) + re_id = r'<div([^>]*)data-uniquename=(["\'])[\w-]*(?P<id>%s)(\2)([^>]*?)>' % RaiBaseIE._UUID_RE + video_id = self._html_search_regex(re_id, webpage, 'livestream-id', group='id') + + return self.url_result('http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id, + RaiPlayIE.ie_key(), video_id) + + class RaiIE(RaiBaseIE): _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE _TESTS = [{ -- cgit 1.4.1 From 9c48b5a193de754403f4a1ced78f6cf6b3893676 Mon Sep 17 00:00:00 2001 From: Sergey M․ <dstftw@gmail.com> Date: Sun, 25 Jun 2017 01:48:02 +0700 Subject: [raiplay:live] Improve and add test (closes #13414) --- youtube_dl/extractor/rai.py | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) (limited to 'youtube_dl/extractor/rai.py') diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index ed15a5f10..e11bf8f9a 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -191,11 +191,12 @@ class RaiPlayIE(RaiBaseIE): info = { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if relinker_info.get( + 'is_live') else title, 'alt_title': media.get('subtitle'), 'description': media.get('description'), - 'uploader': media.get('channel'), - 'creator': media.get('editor'), + 'uploader': strip_or_none(media.get('channel')), + 'creator': strip_or_none(media.get('editor')), 'duration': parse_duration(video.get('duration')), 'timestamp': timestamp, 'thumbnails': thumbnails, @@ -212,21 +213,40 @@ class RaiPlayIE(RaiBaseIE): class RaiPlayLiveIE(RaiBaseIE): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>\w*)' + _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+)' _TEST = { - 'url': 'http://www.raiplay.it/dirette/rai3', - 'only_matching': True, + 'url': 'http://www.raiplay.it/dirette/rainews24', + 'info_dict': { + 'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', + 'display_id': 'rainews24', + 'ext': 'mp4', + 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:6eca31500550f9376819f174e5644754', + 'uploader': 'Rai News 24', + 'creator': 'Rai News 24', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, } def _real_extract(self, url): - channel = self._match_id(url) + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) - webpage = self._download_webpage(url, channel) - re_id = r'<div([^>]*)data-uniquename=(["\'])[\w-]*(?P<id>%s)(\2)([^>]*?)>' % RaiBaseIE._UUID_RE - video_id = self._html_search_regex(re_id, webpage, 'livestream-id', group='id') + video_id = self._search_regex( + r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE, + webpage, 'content id') - return self.url_result('http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id, - RaiPlayIE.ie_key(), video_id) + return { + '_type': 'url_transparent', + 'ie_key': RaiPlayIE.ie_key(), + 'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id, + 'id': video_id, + 'display_id': display_id, + } class RaiIE(RaiBaseIE): -- cgit 1.4.1 From 085d9dd9bebfd1692cfe07e8bcb844780bfe4700 Mon Sep 17 00:00:00 2001 From: Sergey M․ <dstftw@gmail.com> Date: Sat, 26 Aug 2017 22:02:49 +0700 Subject: [rai] Fix audio formats extraction (closes #14024) --- youtube_dl/extractor/rai.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl/extractor/rai.py') diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index e11bf8f9a..5bf64a56b 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -345,11 +345,11 @@ class RaiIE(RaiBaseIE): media_type = media['type'] if 'Audio' in media_type: relinker_info = { - 'formats': { + 'formats': [{ 'format_id': media.get('formatoAudio'), 'url': media['audioUrl'], 'ext': media.get('formatoAudio'), - } + }] } elif 'Video' in media_type: relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) -- cgit 1.4.1 From d21d0ba6c14e8a6696130090641da4e2028e1bb3 Mon Sep 17 00:00:00 2001 From: Timendum <timedum@gmail.com> Date: Mon, 23 Oct 2017 15:32:45 +0200 Subject: [raiplay:playlist] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/rai.py | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'youtube_dl/extractor/rai.py') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 9c9739ad2..d8f9f94cc 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -857,6 +857,7 @@ from .rai import ( RaiPlayIE, RaiPlayLiveIE, RaiIE, + RaiPlaylistIE, ) from .rbmaradio import RBMARadioIE from .rds import RDSIE diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 5bf64a56b..625458380 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -455,3 +455,29 @@ class RaiIE(RaiBaseIE): info.update(relinker_info) return info + + +class RaiPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', + 'info_dict': { + 'id': 'nondirloalmiocapo', + 'title': 'Non dirlo al mio capo', + }, + 'playlist_mincount': 12, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + title = self._html_search_meta('programma', webpage, default=None) + video_urls = re.findall(' href="(/raiplay/video.+)"', webpage) + video_urls = [urljoin(url, video_url) for video_url in video_urls] + entries = [ + self.url_result( + video_url, + RaiPlayIE.ie_key()) + for video_url in video_urls if RaiPlayIE.suitable(video_url) + ] + return self.playlist_result(entries, playlist_id, title) -- cgit 1.4.1 From 1115271ac61b89cc4ac1ca922eff8a4bed0fbf57 Mon Sep 17 00:00:00 2001 From: Sergey M․ <dstftw@gmail.com> Date: Sat, 9 Dec 2017 00:46:28 +0700 Subject: [raiplay:playlist] Fix issues and improve (closes #14563) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/rai.py | 62 ++++++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 27 deletions(-) (limited to 'youtube_dl/extractor/rai.py') diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d8f9f94cc..b9c97fac4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -856,8 +856,8 @@ from .radiofrance import RadioFranceIE from .rai import ( RaiPlayIE, RaiPlayLiveIE, + RaiPlayPlaylistIE, RaiIE, - RaiPlaylistIE, ) from .rbmaradio import RBMARadioIE from .rds import RDSIE diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 625458380..d22311031 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -17,6 +17,7 @@ from ..utils import ( parse_duration, strip_or_none, try_get, + unescapeHTML, unified_strdate, unified_timestamp, update_url_query, @@ -249,6 +250,41 @@ class RaiPlayLiveIE(RaiBaseIE): } +class RaiPlayPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', + 'info_dict': { + 'id': 'nondirloalmiocapo', + 'title': 'Non dirlo al mio capo', + 'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86', + }, + 'playlist_mincount': 12, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + title = self._html_search_meta( + ('programma', 'nomeProgramma'), webpage, 'title') + description = unescapeHTML(self._html_search_meta( + ('description', 'og:description'), webpage, 'description')) + print(description) + + entries = [] + for mobj in re.finditer( + r'<a\b[^>]+\bhref=(["\'])(?P<path>/raiplay/video/.+?)\1', + webpage): + video_url = urljoin(url, mobj.group('path')) + entries.append(self.url_result( + video_url, ie=RaiPlayIE.ie_key(), + video_id=RaiPlayIE._match_id(video_url))) + + return self.playlist_result(entries, playlist_id, title, description) + + class RaiIE(RaiBaseIE): _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE _TESTS = [{ @@ -455,29 +491,3 @@ class RaiIE(RaiBaseIE): info.update(relinker_info) return info - - -class RaiPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/]+)' - _TESTS = [{ - 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', - 'info_dict': { - 'id': 'nondirloalmiocapo', - 'title': 'Non dirlo al mio capo', - }, - 'playlist_mincount': 12, - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - title = self._html_search_meta('programma', webpage, default=None) - video_urls = re.findall(' href="(/raiplay/video.+)"', webpage) - video_urls = [urljoin(url, video_url) for video_url in video_urls] - entries = [ - self.url_result( - video_url, - RaiPlayIE.ie_key()) - for video_url in video_urls if RaiPlayIE.suitable(video_url) - ] - return self.playlist_result(entries, playlist_id, title) -- cgit 1.4.1