Feature or enhancement
Proposal:
import ast
body = '섀' + 'S' * 8000000 # non ASCII!
ast.literal_eval('Y(' + body + ')' ) # pretty fast
ast.literal_eval('Y(' + body + '=!') # noticeably slower due to re-normalization
After some profiling, it is pretty clear that we are normalising the identifier many times:
Instead of normalising every time in _PyPegen_new_identifier:
if (!PyUnicode_IS_ASCII(id)) {
    if (!init_normalization(p)) {
        Py_DECREF(id);
        goto error;
    }
    PyObject *form = PyUnicode_InternFromString("NFKC");
    if (form == NULL) {
        Py_DECREF(id);
        goto error;
    }
    PyObject *args[2] = {form, id};
    PyObject *id2 = PyObject_Vectorcall(p->normalize, args, 2, NULL);
    Py_DECREF(id);
    Py_DECREF(form);
    if (!id2) {
        goto error;
    }

    if (!PyUnicode_Check(id2)) {
        PyErr_Format(PyExc_TypeError,
                     "unicodedata.normalize() must return a string, not "
                     "%.200s",
                     _PyType_Name(Py_TYPE(id2)));
        Py_DECREF(id2);
        goto error;
    }
    id = id2;
}
Maybe we could cache it in the Token (I could write a patch with something like that :-)?
CC @pablogsal @lysnikolaou
This was found by OSS-Fuzz
Has this already been discussed elsewhere?
This is a minor feature, which does not need previous discussion elsewhere
Links to previous discussion of this feature:
No response
Feature or enhancement
Proposal:
After some profiling, it is pretty clear that we are normalising the identifier many times:
Instead of normalising every time in
_PyPegen_new_identifier: cpython/Parser/pegen.c
Lines 502 to 533 in 448d7b9
Maybe we could cache it in the
Token (I could write a patch with something like that :-)? CC @pablogsal @lysnikolaou
This was found by OSS-Fuzz
Has this already been discussed elsewhere?
This is a minor feature, which does not need previous discussion elsewhere
Links to previous discussion of this feature:
No response