[extractor/common] Improve JSON-LD interaction statistic extraction (refs #23306)

author: Sergey M․ <dstftw@gmail.com> 2020-12-13 20:24:13 +0700
committer: Sergey M․ <dstftw@gmail.com> 2020-12-13 20:24:13 +0700
commit: 172754131578f6042efa7c47a57c6e8531e3d190 (patch)
tree: dca561e62d722c96e05ff57a43daba160a401257
parent: 45b0a0d11b9bc67895507629f8f0a81c5835f51f (diff)
download: youtube-dl-172754131578f6042efa7c47a57c6e8531e3d190.tar.gz
youtube-dl-172754131578f6042efa7c47a57c6e8531e3d190.tar.xz
youtube-dl-172754131578f6042efa7c47a57c6e8531e3d190.zip
2 files changed, 60 insertions, 2 deletions
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index 644b3759c..8745f3aac 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -98,6 +98,56 @@ class TestInfoExtractor(unittest.TestCase):
         self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True)
         self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True)
 
+    def test_search_json_ld_realworld(self):
+        # https://github.com/ytdl-org/youtube-dl/issues/23306
+        expect_dict(
+            self,
+            self.ie._search_json_ld(r'''<script type="application/ld+json">
+{
+"@context": "http://schema.org/",
+"@type": "VideoObject",
+"name": "1 On 1 With Kleio",
+"url": "https://www.eporner.com/hd-porn/xN49A1cT3eB/1-On-1-With-Kleio/",
+"duration": "PT0H12M23S",
+"thumbnailUrl": ["https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg", "https://imggen.eporner.com/780814/1920/1080/9.jpg"],
+"contentUrl": "https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4",
+"embedUrl": "https://www.eporner.com/embed/xN49A1cT3eB/1-On-1-With-Kleio/",
+"image": "https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg",
+"width": "1920",
+"height": "1080",
+"encodingFormat": "mp4",
+"bitrate": "6617kbps",
+"isFamilyFriendly": "False",
+"description": "Kleio Valentien",
+"uploadDate": "2015-12-05T21:24:35+01:00",
+"interactionStatistic": {
+"@type": "InteractionCounter",
+"interactionType": { "@type": "http://schema.org/WatchAction" },
+"userInteractionCount": 1120958
+}, "aggregateRating": {
+"@type": "AggregateRating",
+"ratingValue": "88",
+"ratingCount": "630",
+"bestRating": "100",
+"worstRating": "0"
+}, "actor": [{
+"@type": "Person",
+"name": "Kleio Valentien",
+"url": "https://www.eporner.com/pornstar/kleio-valentien/"
+}]}
+</script>''', None),
+            {
+                'title': '1 On 1 With Kleio',
+                'description': 'Kleio Valentien',
+                'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4',
+                'timestamp': 1449347075,
+                'duration': 743.0,
+                'view_count': 1120958,
+                'width': 1920,
+                'height': 1080,
+            })
+
+
     def test_download_json(self):
         uri = encode_data_uri(b'{"foo": "blah"}', 'application/json')
         self.assertEqual(self.ie._download_json(uri, None), {'foo': 'blah'})
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 460758ab8..79138f346 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1237,8 +1237,16 @@ class InfoExtractor(object):
             'ViewAction': 'view',
         }
 
+        def extract_interaction_type(e):
+            interaction_type = e.get('interactionType')
+            if isinstance(interaction_type, dict):
+                interaction_type = interaction_type.get('@type')
+            return str_or_none(interaction_type)
+
         def extract_interaction_statistic(e):
             interaction_statistic = e.get('interactionStatistic')
+            if isinstance(interaction_statistic, dict):
+                interaction_statistic = [interaction_statistic]
             if not isinstance(interaction_statistic, list):
                 return
             for is_e in interaction_statistic:
@@ -1246,8 +1254,8 @@ class InfoExtractor(object):
                     continue
                 if is_e.get('@type') != 'InteractionCounter':
                     continue
-                interaction_type = is_e.get('interactionType')
-                if not isinstance(interaction_type, compat_str):
+                interaction_type = extract_interaction_type(is_e)
+                if not interaction_type:
                     continue
                 # For interaction count some sites provide string instead of
                 # an integer (as per spec) with non digit characters (e.g. ",")
author	Sergey M․ <dstftw@gmail.com>	2020-12-13 20:24:13 +0700
committer	Sergey M․ <dstftw@gmail.com>	2020-12-13 20:24:13 +0700
commit	172754131578f6042efa7c47a57c6e8531e3d190 (patch)
tree	dca561e62d722c96e05ff57a43daba160a401257
parent	45b0a0d11b9bc67895507629f8f0a81c5835f51f (diff)
download	youtube-dl-172754131578f6042efa7c47a57c6e8531e3d190.tar.gz youtube-dl-172754131578f6042efa7c47a57c6e8531e3d190.tar.xz youtube-dl-172754131578f6042efa7c47a57c6e8531e3d190.zip