diff options
author | df <fieldhouse@gmx.net> | 2021-11-13 22:03:19 +0000 |
---|---|---|
committer | df <fieldhouse@gmx.net> | 2021-11-13 22:03:19 +0000 |
commit | 7d1059d6b8c817cc37d7598c7372ba832e6b55e2 (patch) | |
tree | 48fca3033b19a3484a4b25b8d6017e884275ad6f | |
parent | a941a68d5f9fa3d785713b73c854190a1b23d345 (diff) | |
download | youtube-dl-7d1059d6b8c817cc37d7598c7372ba832e6b55e2.tar.gz youtube-dl-7d1059d6b8c817cc37d7598c7372ba832e6b55e2.tar.xz youtube-dl-7d1059d6b8c817cc37d7598c7372ba832e6b55e2.zip |
Don't match a class name that is only part of a longer one, as in class="... x-classname ...", etc.
E.g., in [1], the class named 'plist-info' was found when searching for 'info'. [1] https://github.com/ytdl-org/youtube-dl/issues/30230
-rw-r--r-- | test/test_utils.py | 5 | ||||
-rw-r--r-- | youtube_dl/utils.py | 12 |
2 files changed, 12 insertions, 5 deletions
diff --git a/test/test_utils.py b/test/test_utils.py index 50fb5f101..7c6a382fd 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1463,7 +1463,10 @@ Line 1 def test_get_elements_by_class(self): html = ''' - <span class="foo bar">nice</span><span class="foo bar">also nice</span> + <span class="not-foo bar">nasty</span> + <span class="foo bar">nice</span> + <span class="bar foo">"also nice"</span> + <span class="bar foo-impostor">also nasty</span> ''' self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice']) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index fcb787570..0bc6be509 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1962,7 +1962,9 @@ def get_element_by_attribute(attribute, value, html, escape_value=True): def get_elements_by_class(class_name, html): """Return the content of all tags with the specified class in the passed HTML document as a list""" return get_elements_by_attribute( - 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), + # class names can contain alphanumeric, -, _ and \ for escapes + # don't allow a word break at - + 'class', r'(?:[\w\s\\-]*?[\w\s])?\b%s\b(?:[\w\s\\][\w\s\\-]*?)?' % re.escape(class_name), html, escape_value=False) @@ -1973,11 +1975,13 @@ def get_elements_by_attribute(attribute, value, html, escape_value=True): retlist = [] for m in re.finditer(r'''(?xs) - <([a-zA-Z0-9:._-]+) + <([a-zA-Z0-9:._-]+) # conservative pattern: HTML tags don't have :._- + # (?:\s[^>]+) # this seems to be simpler than the below and work the same? (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? - \s+%s=['"]?%s['"]? + \s*\b%s\s*=\s*(?P<__q>'|"|\b)%s(?P=__q) + # (?:\s[^>]+)? # as above (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? - \s*> + \s*> (?P<content>.*?) </\1> ''' % (re.escape(attribute), value), html): |