From 040271022709c4d20d33c604d1dbc72dc2da472d Mon Sep 17 00:00:00 2001 From: dirkf Date: Sun, 5 Mar 2023 23:07:07 +0000 Subject: [jsinterp] Fix regexp parsing and .replace[All] method * For performance, make regexp object instantiation lazy * Other small performance improvements --- youtube_dl/jsinterp.py | 84 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 27 deletions(-) (limited to 'youtube_dl/jsinterp.py') diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index e28670a3f..ab7d6f926 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -12,9 +12,11 @@ from .utils import ( js_to_json, remove_quotes, unified_timestamp, + variadic, ) from .compat import ( compat_basestring, + compat_chr, compat_collections_chain_map as ChainMap, compat_itertools_zip_longest as zip_longest, compat_str, @@ -205,10 +207,10 @@ class JSInterpreter(object): super(JSInterpreter.Exception, self).__init__(msg, *args, **kwargs) class JS_RegExp(object): - _RE_FLAGS = { + RE_FLAGS = { # special knowledge: Python's re flags are bitmask values, current max 128 # invent new bitmask values well above that for literal parsing - # TODO: new pattern class to execute matches with these flags + # TODO: execute matches with these flags (remaining: d, y) 'd': 1024, # Generate indices for substring matches 'g': 2048, # Global search 'i': re.I, # Case-insensitive search @@ -218,12 +220,19 @@ class JSInterpreter(object): 'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string } - def __init__(self, pattern_txt, flags=''): + def __init__(self, pattern_txt, flags=0): if isinstance(flags, compat_str): flags, _ = self.regex_flags(flags) - # Thx: https://stackoverflow.com/questions/44773522/setattr-on-python2-sre-sre-pattern # First, avoid https://github.com/python/cpython/issues/74534 - self.__self = re.compile(pattern_txt.replace('[[', r'[\['), flags) + self.__self = None + self.__pattern_txt = 
pattern_txt.replace('[[', r'[\[') + self.__flags = flags + + def __instantiate(self): + if self.__self: + return + self.__self = re.compile(self.__pattern_txt, self.__flags) + # Thx: https://stackoverflow.com/questions/44773522/setattr-on-python2-sre-sre-pattern for name in dir(self.__self): # Only these? Obviously __class__, __init__. # PyPy creates a __weakref__ attribute with value None @@ -232,15 +241,21 @@ class JSInterpreter(object): continue setattr(self, name, getattr(self.__self, name)) + def __getattr__(self, name): + self.__instantiate() + if hasattr(self, name): + return getattr(self, name) + return super(JSInterpreter.JS_RegExp, self).__getattr__(name) + @classmethod def regex_flags(cls, expr): flags = 0 if not expr: return flags, expr for idx, ch in enumerate(expr): - if ch not in cls._RE_FLAGS: + if ch not in cls.RE_FLAGS: break - flags |= cls._RE_FLAGS[ch] + flags |= cls.RE_FLAGS[ch] return flags, expr[idx + 1:] @classmethod @@ -265,17 +280,17 @@ class JSInterpreter(object): counters = dict((k, 0) for k in _MATCHING_PARENS.values()) start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 in_quote, escaping, skipping = None, False, 0 - after_op, in_regex_char_group, skip_re = True, False, 0 + after_op, in_regex_char_group = True, False for idx, char in enumerate(expr): - if skip_re > 0: - skip_re -= 1 - continue + paren_delta = 0 if not in_quote: if char in _MATCHING_PARENS: counters[_MATCHING_PARENS[char]] += 1 + paren_delta = 1 elif char in counters: counters[char] -= 1 + paren_delta = -1 if not escaping: if char in _QUOTES and in_quote in (char, None): if in_quote or after_op or char != '/': @@ -283,7 +298,7 @@ class JSInterpreter(object): elif in_quote == '/' and char in '[]': in_regex_char_group = char == '[' escaping = not escaping and in_quote and char == '\\' - after_op = not in_quote and (char in cls.OP_CHARS or (char.isspace() and after_op)) + after_op = not in_quote and (char in cls.OP_CHARS or paren_delta > 0 or (after_op and 
char.isspace())) if char != delim[pos] or any(counters.values()) or in_quote: pos = skipping = 0 @@ -293,7 +308,7 @@ class JSInterpreter(object): continue elif pos == 0 and skip_delims: here = expr[idx:] - for s in skip_delims if isinstance(skip_delims, (list, tuple)) else [skip_delims]: + for s in variadic(skip_delims): if here.startswith(s) and s: skipping = len(s) - 1 break @@ -316,7 +331,7 @@ class JSInterpreter(object): separated = list(cls._separate(expr, delim, 1)) if len(separated) < 2: - raise cls.Exception('No terminating paren {delim} in {expr}'.format(**locals())) + raise cls.Exception('No terminating paren {delim} in {expr!r:.5500}'.format(**locals())) return separated[0][1:].strip(), separated[1].strip() @staticmethod @@ -361,6 +376,20 @@ class JSInterpreter(object): except TypeError: return self._named_object(namespace, obj) + # used below + _VAR_RET_THROW_RE = re.compile(r'''(?x) + (?P<var>(?:var|const|let)\s)|return(?:\s+|(?=["'])|$)|(?P<throw>throw\s+) + ''') + _COMPOUND_RE = re.compile(r'''(?x) + (?P<try>try)\s*\{| + (?P<if>if)\s*\(| + (?P<switch>switch)\s*\(| + (?P<for>for)\s*\(| + (?P<while>while)\s*\( + ''') + _FINALLY_RE = re.compile(r'finally\s*\{') + _SWITCH_RE = re.compile(r'switch\s*\(') + def interpret_statement(self, stmt, local_vars, allow_recursion=100): if allow_recursion < 0: raise self.Exception('Recursion limit reached') @@ -375,7 +404,7 @@ class JSInterpreter(object): if should_return: return ret, should_return - m = re.match(r'(?P<var>(?:var|const|let)\s)|return(?:\s+|(?=["\'])|$)|(?P<throw>throw\s+)', stmt) + m = self._VAR_RET_THROW_RE.match(stmt) if m: expr = stmt[len(m.group(0)):].strip() if m.group('throw'): @@ -447,13 +476,7 @@ class JSInterpreter(object): for item in self._separate(inner)]) expr = name + outer - m = re.match(r'''(?x) - (?P<try>try)\s*\{| - (?P<if>if)\s*\(| - (?P<switch>switch)\s*\(| - (?P<for>for)\s*\(| - (?P<while>while)\s*\( - ''', expr) + m = self._COMPOUND_RE.match(expr) md = m.groupdict() if m else {} if md.get('if'): cndn, expr = self._separate_at_paren(expr[m.end() - 1:]) @@ 
-512,7 +535,7 @@ class JSInterpreter(object): err = None pending = self.interpret_statement(sub_expr, catch_vars, allow_recursion) - m = re.match(r'finally\s*\{', expr) + m = self._FINALLY_RE.match(expr) if m: sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) ret, should_abort = self.interpret_statement(sub_expr, local_vars, allow_recursion) @@ -531,7 +554,7 @@ class JSInterpreter(object): if remaining.startswith('{'): body, expr = self._separate_at_paren(remaining) else: - switch_m = re.match(r'switch\s*\(', remaining) # FIXME + switch_m = self._SWITCH_RE.match(remaining) # FIXME if switch_m: switch_val, remaining = self._separate_at_paren(remaining[switch_m.end() - 1:]) body, expr = self._separate_at_paren(remaining, '}') @@ -735,7 +758,7 @@ class JSInterpreter(object): if obj == compat_str: if member == 'fromCharCode': assertion(argvals, 'takes one or more arguments') - return ''.join(map(chr, argvals)) + return ''.join(map(compat_chr, argvals)) raise self.Exception('Unsupported string method ' + member, expr=expr) elif obj == float: if member == 'pow': @@ -808,10 +831,17 @@ class JSInterpreter(object): if idx >= len(obj): return None return ord(obj[idx]) - elif member == 'replace': + elif member in ('replace', 'replaceAll'): assertion(isinstance(obj, compat_str), 'must be applied on a string') assertion(len(argvals) == 2, 'takes exactly two arguments') - return re.sub(argvals[0], argvals[1], obj) + # TODO: argvals[1] callable, other Py vs JS edge cases + if isinstance(argvals[0], self.JS_RegExp): + count = 0 if argvals[0].flags & self.JS_RegExp.RE_FLAGS['g'] else 1 + assertion(member != 'replaceAll' or count == 0, + 'replaceAll must be called with a global RegExp') + return argvals[0].sub(argvals[1], obj, count=count) + count = ('replaceAll', 'replace').index(member) + return re.sub(re.escape(argvals[0]), argvals[1], obj, count=count) idx = int(member) if isinstance(obj, list) else member return obj[idx](argvals, allow_recursion=allow_recursion) -- 
cgit 1.4.1