Fix MIT extractor for Python 2.6

The HTML of the MIT TechTV page does not parse cleanly under Python 2.6 because it contains script tags nested inside an actual script element. The offending markup sits inside an HTML comment block (`<!-- ... -->`), so stripping all HTML comments from the page before parsing it fixes the problem.
author: Jeff Smith <whydoubt@yahoo.com> 2013-08-28 14:00:59 -0500
committer: Jeff Smith <whydoubt@yahoo.com> 2013-08-28 14:24:42 -0500
commit: b5ba7b9dcfed5ded96c841a0ebbbf12132de838f (patch)
tree: 2622b78cd616c051fa1dadcef5571f97b72998be /youtube_dl/extractor/mit.py
parent: 2891932bf0a01acc025246438f890dca57f91c6b (diff)
download: youtube-dl-b5ba7b9dcfed5ded96c841a0ebbbf12132de838f.tar.gz youtube-dl-b5ba7b9dcfed5ded96c841a0ebbbf12132de838f.tar.xz youtube-dl-b5ba7b9dcfed5ded96c841a0ebbbf12132de838f.zip
1 file changed, 7 insertions, 9 deletions
diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py
index d09d03e36..52be9232f 100644
--- a/youtube_dl/extractor/mit.py
+++ b/youtube_dl/extractor/mit.py
@@ -25,23 +25,21 @@ class TechTVMITIE(InfoExtractor):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
-        webpage = self._download_webpage(
+        raw_page = self._download_webpage(
             'http://techtv.mit.edu/videos/%s' % video_id, video_id)
-        embed_page = self._download_webpage(
-            'http://techtv.mit.edu/embeds/%s/' % video_id, video_id,
-            note=u'Downloading embed page')
+        clean_page = re.compile(u'<!--.*?-->', re.S).sub(u'', raw_page)
 
         base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)',
-            embed_page, u'base url')
-        formats_json = self._search_regex(r'bitrates: (\[.+?\])', embed_page,
+            raw_page, u'base url')
+        formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page,
             u'video formats')
         formats = json.loads(formats_json)
         formats = sorted(formats, key=lambda f: f['bitrate'])
 
-        title = get_element_by_id('edit-title', webpage)
-        description = clean_html(get_element_by_id('edit-description', webpage))
+        title = get_element_by_id('edit-title', clean_page)
+        description = clean_html(get_element_by_id('edit-description', clean_page))
         thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'',
-            embed_page, u'thumbnail', flags=re.DOTALL)
+            raw_page, u'thumbnail', flags=re.DOTALL)
 
         return {'id': video_id,
                 'title': title,
author	Jeff Smith <whydoubt@yahoo.com>	2013-08-28 14:00:59 -0500
committer	Jeff Smith <whydoubt@yahoo.com>	2013-08-28 14:24:42 -0500
commit	b5ba7b9dcfed5ded96c841a0ebbbf12132de838f (patch)
tree	2622b78cd616c051fa1dadcef5571f97b72998be /youtube_dl/extractor/mit.py
parent	2891932bf0a01acc025246438f890dca57f91c6b (diff)
download	youtube-dl-b5ba7b9dcfed5ded96c841a0ebbbf12132de838f.tar.gz youtube-dl-b5ba7b9dcfed5ded96c841a0ebbbf12132de838f.tar.xz youtube-dl-b5ba7b9dcfed5ded96c841a0ebbbf12132de838f.zip