[jsinterp] Clean up and pull yt-dlp style

* add compat_re_Pattern
* improve compat_collections_chain_map
* use class JS_Undefined
* remove unused code
This commit is contained in:
dirkf 2022-08-19 15:34:33 +01:00
parent 538ec65ba7
commit 46b8ae2f52
4 changed files with 77 additions and 90 deletions

View file

@ -11,8 +11,9 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import math import math
import re import re
from youtube_dl.jsinterp import JSInterpreter from youtube_dl.compat import compat_re_Pattern
undefined = JSInterpreter.undefined
from youtube_dl.jsinterp import JS_Undefined, JSInterpreter
class TestJSInterpreter(unittest.TestCase): class TestJSInterpreter(unittest.TestCase):
@ -261,12 +262,12 @@ class TestJSInterpreter(unittest.TestCase):
jsi = JSInterpreter(''' jsi = JSInterpreter('''
function x() { return undefined; } function x() { return undefined; }
''') ''')
self.assertIs(jsi.call_function('x'), undefined) self.assertIs(jsi.call_function('x'), JS_Undefined)
jsi = JSInterpreter(''' jsi = JSInterpreter('''
function x() { let v; return v; } function x() { let v; return v; }
''') ''')
self.assertIs(jsi.call_function('x'), undefined) self.assertIs(jsi.call_function('x'), JS_Undefined)
jsi = JSInterpreter(''' jsi = JSInterpreter('''
function x() { return [undefined === undefined, undefined == undefined, undefined < undefined, undefined > undefined]; } function x() { return [undefined === undefined, undefined == undefined, undefined < undefined, undefined > undefined]; }
@ -307,7 +308,7 @@ class TestJSInterpreter(unittest.TestCase):
jsi = JSInterpreter(''' jsi = JSInterpreter('''
function x() { let v; return [v>42, v<=42, v&&42, 42&&v]; } function x() { let v; return [v>42, v<=42, v&&42, 42&&v]; }
''') ''')
self.assertEqual(jsi.call_function('x'), [False, False, undefined, undefined]) self.assertEqual(jsi.call_function('x'), [False, False, JS_Undefined, JS_Undefined])
jsi = JSInterpreter('function x(){return undefined ?? 42; }') jsi = JSInterpreter('function x(){return undefined ?? 42; }')
self.assertEqual(jsi.call_function('x'), 42) self.assertEqual(jsi.call_function('x'), 42)
@ -326,12 +327,12 @@ class TestJSInterpreter(unittest.TestCase):
jsi = JSInterpreter(''' jsi = JSInterpreter('''
function x() { let a; return a?.qq; } function x() { let a; return a?.qq; }
''') ''')
self.assertIs(jsi.call_function('x'), undefined) self.assertIs(jsi.call_function('x'), JS_Undefined)
jsi = JSInterpreter(''' jsi = JSInterpreter('''
function x() { let a = {m1: 42, m2: 0 }; return a?.qq; } function x() { let a = {m1: 42, m2: 0 }; return a?.qq; }
''') ''')
self.assertIs(jsi.call_function('x'), undefined) self.assertIs(jsi.call_function('x'), JS_Undefined)
def test_regex(self): def test_regex(self):
jsi = JSInterpreter(''' jsi = JSInterpreter('''
@ -342,13 +343,12 @@ class TestJSInterpreter(unittest.TestCase):
jsi = JSInterpreter(''' jsi = JSInterpreter('''
function x() { let a=/,,[/,913,/](,)}/; return a; } function x() { let a=/,,[/,913,/](,)}/; return a; }
''') ''')
# Pythons disagree on the type of a pattern self.assertIsInstance(jsi.call_function('x'), compat_re_Pattern)
self.assertTrue(isinstance(jsi.call_function('x'), type(re.compile(''))))
jsi = JSInterpreter(''' jsi = JSInterpreter('''
function x() { let a=/,,[/,913,/](,)}/i; return a; } function x() { let a=/,,[/,913,/](,)}/i; return a; }
''') ''')
self.assertEqual(jsi.call_function('x').flags & re.I, re.I) self.assertEqual(jsi.call_function('x').flags & ~re.U, re.I)
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -12,10 +12,11 @@ import io
import re import re
import string import string
from youtube_dl.compat import compat_str, compat_urlretrieve
from test.helper import FakeYDL from test.helper import FakeYDL
from youtube_dl.extractor import YoutubeIE from youtube_dl.extractor import YoutubeIE
from youtube_dl.jsinterp import JSInterpreter from youtube_dl.jsinterp import JSInterpreter
from youtube_dl.compat import compat_str, compat_urlretrieve
_SIG_TESTS = [ _SIG_TESTS = [
( (

View file

@ -3023,18 +3023,34 @@ except ImportError:
self.maps[0].__setitem__(k, v) self.maps[0].__setitem__(k, v)
return return
def __delitem__(self, k): def __contains__(self, k):
return any((k in m) for m in self.maps)
def __delitem(self, k):
if k in self.maps[0]: if k in self.maps[0]:
del self.maps[0][k] del self.maps[0][k]
return return
raise KeyError(k) raise KeyError(k)
def __delitem__(self, k):
self.__delitem(k)
def __iter__(self): def __iter__(self):
return itertools.chain(*reversed(self.maps)) return itertools.chain(*reversed(self.maps))
def __len__(self): def __len__(self):
return len(iter(self)) return len(iter(self))
# to match Py3, pop() must not go through __delitem__ (which a subclass may override), so use the private helper
def pop(self, k, *args):
if self.__contains__(k):
off = self.__getitem__(k)
self.__delitem(k)
return off
elif len(args) > 0:
return args[0]
raise KeyError(k)
def new_child(self, m=None, **kwargs): def new_child(self, m=None, **kwargs):
m = m or {} m = m or {}
m.update(kwargs) m.update(kwargs)
@ -3044,6 +3060,8 @@ except ImportError:
def parents(self): def parents(self):
return compat_collections_chain_map(*(self.maps[1:])) return compat_collections_chain_map(*(self.maps[1:]))
# Python versions disagree on the type of a compiled pattern (RegexObject, _sre.SRE_Pattern, Pattern, ...?)
compat_re_Pattern = type(re.compile(''))
if sys.version_info < (3, 3): if sys.version_info < (3, 3):
def compat_b64decode(s, *args, **kwargs): def compat_b64decode(s, *args, **kwargs):
@ -3110,6 +3128,7 @@ __all__ = [
'compat_os_name', 'compat_os_name',
'compat_parse_qs', 'compat_parse_qs',
'compat_print', 'compat_print',
'compat_re_Pattern',
'compat_realpath', 'compat_realpath',
'compat_setenv', 'compat_setenv',
'compat_shlex_quote', 'compat_shlex_quote',

View file

@ -19,16 +19,12 @@ from .compat import (
compat_str, compat_str,
) )
_NAME_RE = r'[a-zA-Z_$][\w$]*'
_UNDEFINED = object()
def _js_bit_op(op): def _js_bit_op(op):
def wrapped(a, b): def wrapped(a, b):
def zeroise(x): def zeroise(x):
return 0 if x in (None, _UNDEFINED) else x return 0 if x in (None, JS_Undefined) else x
return op(zeroise(a), zeroise(b)) return op(zeroise(a), zeroise(b))
return wrapped return wrapped
@ -37,7 +33,7 @@ def _js_bit_op(op):
def _js_arith_op(op): def _js_arith_op(op):
def wrapped(a, b): def wrapped(a, b):
if _UNDEFINED in (a, b): if JS_Undefined in (a, b):
return float('nan') return float('nan')
return op(a or 0, b or 0) return op(a or 0, b or 0)
@ -45,22 +41,21 @@ def _js_arith_op(op):
def _js_div(a, b): def _js_div(a, b):
if _UNDEFINED in (a, b) or not (a and b): if JS_Undefined in (a, b) or not (a and b):
return float('nan') return float('nan')
return float('inf') if not b else operator.truediv(a or 0, b) return float('inf') if not b else operator.truediv(a or 0, b)
def _js_mod(a, b): def _js_mod(a, b):
if _UNDEFINED in (a, b) or not b: if JS_Undefined in (a, b) or not b:
return float('nan') return float('nan')
return (a or 0) % b return (a or 0) % b
def _js_exp(a, b): def _js_exp(a, b):
if not b: if not b:
# even 0 ** 0 !! return 1 # even 0 ** 0 !!
return 1 elif JS_Undefined in (a, b):
if _UNDEFINED in (a, b):
return float('nan') return float('nan')
return (a or 0) ** b return (a or 0) ** b
@ -68,7 +63,7 @@ def _js_exp(a, b):
def _js_eq_op(op): def _js_eq_op(op):
def wrapped(a, b): def wrapped(a, b):
if set((a, b)) <= set((None, _UNDEFINED)): if set((a, b)) <= set((None, JS_Undefined)):
return op(a, a) return op(a, a)
return op(a, b) return op(a, b)
@ -78,21 +73,28 @@ def _js_eq_op(op):
def _js_comp_op(op): def _js_comp_op(op):
def wrapped(a, b): def wrapped(a, b):
if _UNDEFINED in (a, b): if JS_Undefined in (a, b):
return False return False
return op(a or 0, b or 0) return op(a or 0, b or 0)
return wrapped return wrapped
def _js_ternary(cndn, if_true=True, if_false=False):
"""Simulate JS's ternary operator (cndn?if_true:if_false)"""
if cndn in (False, None, 0, '', JS_Undefined):
return if_false
try:
if math.isnan(cndn): # NB: NaN cannot be checked by membership
return if_false
except TypeError:
pass
return if_true
# (op, definition) in order of binding priority, tightest first # (op, definition) in order of binding priority, tightest first
# avoid dict to maintain order # avoid dict to maintain order
# definition None => Defined in JSInterpreter._operator # definition None => Defined in JSInterpreter._operator
_DOT_OPERATORS = (
('.', None),
# TODO: ('?.', None),
)
_OPERATORS = ( _OPERATORS = (
('>>', _js_bit_op(operator.rshift)), ('>>', _js_bit_op(operator.rshift)),
('<<', _js_bit_op(operator.lshift)), ('<<', _js_bit_op(operator.lshift)),
@ -130,20 +132,13 @@ _SC_OPERATORS = (
_OPERATOR_RE = '|'.join(map(lambda x: re.escape(x[0]), _OPERATORS + _LOG_OPERATORS)) _OPERATOR_RE = '|'.join(map(lambda x: re.escape(x[0]), _OPERATORS + _LOG_OPERATORS))
_NAME_RE = r'[a-zA-Z_$][\w$]*'
_MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]'))) _MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]')))
_QUOTES = '\'"/' _QUOTES = '\'"/'
def _ternary(cndn, if_true=True, if_false=False): class JS_Undefined(object):
"""Simulate JS's ternary operator (cndn?if_true:if_false)""" pass
if cndn in (False, None, 0, '', _UNDEFINED):
return if_false
try:
if math.isnan(cndn): # NB: NaN cannot be checked by membership
return if_false
except TypeError:
pass
return if_true
class JS_Break(ExtractorError): class JS_Break(ExtractorError):
@ -167,7 +162,7 @@ class LocalNameSpace(ChainMap):
try: try:
return super(LocalNameSpace, self).__getitem__(key) return super(LocalNameSpace, self).__getitem__(key)
except KeyError: except KeyError:
return _UNDEFINED return JS_Undefined
def __setitem__(self, key, value): def __setitem__(self, key, value):
for scope in self.maps: for scope in self.maps:
@ -179,24 +174,6 @@ class LocalNameSpace(ChainMap):
def __delitem__(self, key): def __delitem__(self, key):
raise NotImplementedError('Deleting is not supported') raise NotImplementedError('Deleting is not supported')
# except
def pop(self, key, *args):
try:
off = self.__getitem__(key)
super(LocalNameSpace, self).__delitem__(key)
return off
except KeyError:
if len(args) > 0:
return args[0]
raise
def __contains__(self, key):
try:
super(LocalNameSpace, self).__getitem__(key)
return True
except KeyError:
return False
def __repr__(self): def __repr__(self):
return 'LocalNameSpace%s' % (self.maps, ) return 'LocalNameSpace%s' % (self.maps, )
@ -204,9 +181,7 @@ class LocalNameSpace(ChainMap):
class JSInterpreter(object): class JSInterpreter(object):
__named_object_counter = 0 __named_object_counter = 0
undefined = _UNDEFINED _RE_FLAGS = {
RE_FLAGS = {
# special knowledge: Python's re flags are bitmask values, current max 128 # special knowledge: Python's re flags are bitmask values, current max 128
# invent new bitmask values well above that for literal parsing # invent new bitmask values well above that for literal parsing
# TODO: new pattern class to execute matches with these flags # TODO: new pattern class to execute matches with these flags
@ -257,10 +232,10 @@ class JSInterpreter(object):
if not expr: if not expr:
return flags, expr return flags, expr
for idx, ch in enumerate(expr): for idx, ch in enumerate(expr):
if ch not in cls.RE_FLAGS: if ch not in cls._RE_FLAGS:
break break
flags |= cls.RE_FLAGS[ch] flags |= cls._RE_FLAGS[ch]
return flags, expr[idx:] if idx > 0 else expr return flags, expr[idx + 1:]
@classmethod @classmethod
def _separate(cls, expr, delim=',', max_split=None, skip_delims=None): def _separate(cls, expr, delim=',', max_split=None, skip_delims=None):
@ -283,14 +258,6 @@ class JSInterpreter(object):
if not escaping and char in _QUOTES and in_quote in (char, None): if not escaping and char in _QUOTES and in_quote in (char, None):
if in_quote or after_op or char != '/': if in_quote or after_op or char != '/':
in_quote = None if in_quote and not in_regex_char_group else char in_quote = None if in_quote and not in_regex_char_group else char
if in_quote is None and char == '/' and delim != '/':
# regexp flags
n_idx = idx + 1
while n_idx < len(expr) and expr[n_idx] in cls.RE_FLAGS:
n_idx += 1
skip_re = n_idx - idx - 1
if skip_re > 0:
continue
elif in_quote == '/' and char in '[]': elif in_quote == '/' and char in '[]':
in_regex_char_group = char == '[' in_regex_char_group = char == '['
escaping = not escaping and in_quote and char == '\\' escaping = not escaping and in_quote and char == '\\'
@ -336,13 +303,13 @@ class JSInterpreter(object):
def _operator(self, op, left_val, right_expr, expr, local_vars, allow_recursion): def _operator(self, op, left_val, right_expr, expr, local_vars, allow_recursion):
if op in ('||', '&&'): if op in ('||', '&&'):
if (op == '&&') ^ _ternary(left_val): if (op == '&&') ^ _js_ternary(left_val):
return left_val # short circuiting return left_val # short circuiting
elif op == '??': elif op == '??':
if left_val not in (None, self.undefined): if left_val not in (None, JS_Undefined):
return left_val return left_val
elif op == '?': elif op == '?':
right_expr = _ternary(left_val, *self._separate(right_expr, ':', 1)) right_expr = _js_ternary(left_val, *self._separate(right_expr, ':', 1))
right_val = self.interpret_expression(right_expr, local_vars, allow_recursion) right_val = self.interpret_expression(right_expr, local_vars, allow_recursion)
opfunc = op and next((v for k, v in self._all_operators() if k == op), None) opfunc = op and next((v for k, v in self._all_operators() if k == op), None)
@ -361,7 +328,7 @@ class JSInterpreter(object):
return obj[int(idx)] if isinstance(obj, list) else obj[idx] return obj[int(idx)] if isinstance(obj, list) else obj[idx]
except Exception as e: except Exception as e:
if allow_undefined: if allow_undefined:
return self.undefined return JS_Undefined
raise self.Exception('Cannot get index {idx}'.format(**locals()), expr=repr(obj), cause=e) raise self.Exception('Cannot get index {idx}'.format(**locals()), expr=repr(obj), cause=e)
def _dump(self, obj, namespace): def _dump(self, obj, namespace):
@ -395,9 +362,8 @@ class JSInterpreter(object):
if expr[0] in _QUOTES: if expr[0] in _QUOTES:
inner, outer = self._separate(expr, expr[0], 1) inner, outer = self._separate(expr, expr[0], 1)
if expr[0] == '/': if expr[0] == '/':
flags, _ = self._regex_flags(outer) flags, outer = self._regex_flags(outer)
inner, outer = inner.replace('"', r'\"'), '' inner = re.compile(inner[1:], flags=flags) # , strict=True))
inner = re.compile(js_to_json(inner + expr[0]), flags=flags) # , strict=True))
else: else:
inner = json.loads(js_to_json(inner + expr[0])) # , strict=True)) inner = json.loads(js_to_json(inner + expr[0])) # , strict=True))
if not outer: if not outer:
@ -422,7 +388,7 @@ class JSInterpreter(object):
if expr.startswith('{'): if expr.startswith('{'):
inner, outer = self._separate_at_paren(expr, '}') inner, outer = self._separate_at_paren(expr, '}')
# try for object expression # try for object expression (Map)
sub_expressions = [list(self._separate(sub_expr.strip(), ':', 1)) for sub_expr in self._separate(inner)] sub_expressions = [list(self._separate(sub_expr.strip(), ':', 1)) for sub_expr in self._separate(inner)]
if all(len(sub_expr) == 2 for sub_expr in sub_expressions): if all(len(sub_expr) == 2 for sub_expr in sub_expressions):
return dict( return dict(
@ -455,7 +421,8 @@ class JSInterpreter(object):
(?P<try>try|finally)\s*| (?P<try>try|finally)\s*|
(?P<catch>catch\s*(?P<err>\(\s*{_NAME_RE}\s*\)))| (?P<catch>catch\s*(?P<err>\(\s*{_NAME_RE}\s*\)))|
(?P<switch>switch)\s*\(| (?P<switch>switch)\s*\(|
(?P<for>for)\s*\(|'''.format(**globals()), expr) (?P<for>for)\s*\(|
'''.format(**globals()), expr)
md = m.groupdict() if m else {} md = m.groupdict() if m else {}
if md.get('try'): if md.get('try'):
if expr[m.end()] == '{': if expr[m.end()] == '{':
@ -500,7 +467,7 @@ class JSInterpreter(object):
start, cndn, increment = self._separate(constructor, ';') start, cndn, increment = self._separate(constructor, ';')
self.interpret_expression(start, local_vars, allow_recursion) self.interpret_expression(start, local_vars, allow_recursion)
while True: while True:
if not _ternary(self.interpret_expression(cndn, local_vars, allow_recursion)): if not _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)):
break break
try: try:
ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion) ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion)
@ -587,7 +554,7 @@ class JSInterpreter(object):
local_vars[m.group('out')] = self._operator( local_vars[m.group('out')] = self._operator(
m.group('op'), left_val, m.group('expr'), expr, local_vars, allow_recursion) m.group('op'), left_val, m.group('expr'), expr, local_vars, allow_recursion)
return local_vars[m.group('out')], should_return return local_vars[m.group('out')], should_return
elif left_val in (None, self.undefined): elif left_val in (None, JS_Undefined):
raise self.Exception('Cannot index undefined variable ' + m.group('out'), expr=expr) raise self.Exception('Cannot index undefined variable ' + m.group('out'), expr=expr)
idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion) idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion)
@ -607,7 +574,7 @@ class JSInterpreter(object):
raise JS_Continue() raise JS_Continue()
elif expr == 'undefined': elif expr == 'undefined':
return self.undefined, should_return return JS_Undefined, should_return
elif md.get('return'): elif md.get('return'):
return local_vars[m.group('name')], should_return return local_vars[m.group('name')], should_return
@ -663,9 +630,9 @@ class JSInterpreter(object):
'Math': float, 'Math': float,
} }
obj = local_vars.get(variable) obj = local_vars.get(variable)
if obj in (self.undefined, None): if obj in (JS_Undefined, None):
obj = types.get(variable, self.undefined) obj = types.get(variable, JS_Undefined)
if obj is self.undefined: if obj is JS_Undefined:
try: try:
if variable not in self._objects: if variable not in self._objects:
self._objects[variable] = self.extract_object(variable) self._objects[variable] = self.extract_object(variable)
@ -674,8 +641,8 @@ class JSInterpreter(object):
if not nullish: if not nullish:
raise raise
if nullish and obj is self.undefined: if nullish and obj is JS_Undefined:
return self.undefined return JS_Undefined
# Member access # Member access
if arg_str is None: if arg_str is None: