author     Philipp Hagemeister <phihag@phihag.de>  2014-10-26 17:05:44 +0100
committer  Philipp Hagemeister <phihag@phihag.de>  2014-10-26 17:05:44 +0100
commit     23be51d8ce132dbb967f460e1225fdaaa43dff39 (patch)
tree       1a4e5404654e8f63d80bb662836ab05bb80cb340
parent     488447455d3d90e1d83a7ebc2f9ce552e031e0d8 (diff)
[generic] Handle audio streams that do not implement HEAD (Fixes #4032)
-rw-r--r--  youtube_dl/extractor/common.py    7
-rw-r--r--  youtube_dl/extractor/generic.py  53
2 files changed, 32 insertions, 28 deletions
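
The change below teaches the generic extractor to cope with servers that do not implement HEAD: when the HEAD probe fails, it retries with a plain GET and later reuses that response body instead of downloading the page a second time. As a minimal standalone sketch of the same probe-then-fall-back pattern (standard-library urllib only; the probe_url helper is a made-up name for illustration, not youtube-dl API):

import urllib.error
import urllib.request


def probe_url(url, timeout=10):
    """Try a HEAD request first; fall back to GET if the server rejects it.

    Returns (response, used_get). When used_get is True the caller can read
    the body from `response` instead of issuing a second request.
    """
    head_req = urllib.request.Request(url, method='HEAD')
    try:
        return urllib.request.urlopen(head_req, timeout=timeout), False
    except urllib.error.HTTPError as err:
        # Servers without HEAD support typically answer 405 or 501.
        if err.code not in (405, 501):
            raise
    # Fall back to a full GET; the body is now available for later parsing.
    return urllib.request.urlopen(url, timeout=timeout), True
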
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index cf3781cd6..e1bd6bb49 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -242,7 +242,6 @@ class InfoExtractor(object):
 
     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
         """ Returns a tuple (page content as string, URL handle) """
-
         # Strip hashes from the URL (#1038)
         if isinstance(url_or_request, (compat_str, str)):
             url_or_request = url_or_request.partition('#')[0]
@@ -251,6 +250,10 @@ class InfoExtractor(object):
         if urlh is False:
             assert not fatal
             return False
+        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
+        return (content, urlh)
+
+    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
         content_type = urlh.headers.get('Content-Type', '')
         webpage_bytes = urlh.read()
         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
@@ -309,7 +312,7 @@ class InfoExtractor(object):
                 msg += ' Visit %s for more details' % blocked_iframe
             raise ExtractorError(msg, expected=True)
 
-        return (content, urlh)
+        return content
 
     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
         """ Returns the data of the page as a string """
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 524215408..51dbbc8db 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -503,14 +503,14 @@ class GenericIE(InfoExtractor):
         self.to_screen('%s: Requesting header' % video_id)
 
         head_req = HEADRequest(url)
-        response = self._request_webpage(
+        head_response = self._request_webpage(
             head_req, video_id,
             note=False, errnote='Could not send HEAD request to %s' % url,
             fatal=False)
 
-        if response is not False:
+        if head_response is not False:
             # Check for redirect
-            new_url = response.geturl()
+            new_url = head_response.geturl()
             if url != new_url:
                 self.report_following_redirect(new_url)
                 if force_videoid:
@@ -518,34 +518,35 @@ class GenericIE(InfoExtractor):
                         new_url, {'force_videoid': force_videoid})
                 return self.url_result(new_url)
 
-            # Check for direct link to a video
-            content_type = response.headers.get('Content-Type', '')
-            m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
-            if m:
-                upload_date = response.headers.get('Last-Modified')
-                if upload_date:
-                    upload_date = unified_strdate(upload_date)
-                return {
-                    'id': video_id,
-                    'title': os.path.splitext(url_basename(url))[0],
-                    'formats': [{
-                        'format_id': m.group('format_id'),
-                        'url': url,
-                        'vcodec': 'none' if m.group('type') == 'audio' else None
-                    }],
-                    'upload_date': upload_date,
-                }
+        full_response = None
+        if head_response is False:
+            full_response = self._request_webpage(url, video_id)
+            head_response = full_response
+
+        # Check for direct link to a video
+        content_type = head_response.headers.get('Content-Type', '')
+        m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
+        if m:
+            upload_date = unified_strdate(
+                head_response.headers.get('Last-Modified'))
+            return {
+                'id': video_id,
+                'title': os.path.splitext(url_basename(url))[0],
+                'formats': [{
+                    'format_id': m.group('format_id'),
+                    'url': url,
+                    'vcodec': 'none' if m.group('type') == 'audio' else None
+                }],
+                'upload_date': upload_date,
+            }
 
         if not self._downloader.params.get('test', False) and not is_intentional:
             self._downloader.report_warning('Falling back on generic information extractor.')
 
-        try:
+        if full_response:
+            webpage = self._webpage_read_content(full_response, url, video_id)
+        else:
             webpage = self._download_webpage(url, video_id)
-        except ValueError:
-            # since this is the last-resort InfoExtractor, if
-            # this error is thrown, it'll be thrown here
-            raise ExtractorError('Failed to download URL: %s' % url)
-
         self.report_extraction(video_id)
 
         # Is it an RSS feed?
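
After the patch, the relevant flow in GenericIE._real_extract reads roughly as follows (condensed from the hunk above; the direct audio/video branch and the RSS handling are omitted):

head_response = self._request_webpage(
    HEADRequest(url), video_id, note=False,
    errnote='Could not send HEAD request to %s' % url, fatal=False)

full_response = None
if head_response is False:
    # The HEAD probe failed (e.g. an audio stream whose server only
    # implements GET): fetch the resource once and keep the handle.
    full_response = self._request_webpage(url, video_id)
    head_response = full_response

# ... direct audio/video links are detected from head_response.headers ...

if full_response:
    # Reuse the body of the GET already issued instead of fetching twice.
    webpage = self._webpage_read_content(full_response, url, video_id)
else:
    webpage = self._download_webpage(url, video_id)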