Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Include/codecs.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,12 @@ PyAPI_FUNC(PyObject *) PyCodec_NameReplaceErrors(PyObject *exc);
PyAPI_DATA(const char *) Py_hexdigits;
#endif

#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) _PyCodec_LookupTextEncoding(
const char *encoding,
const char *alternate_command);
#endif

#ifdef __cplusplus
}
#endif
Expand Down
2 changes: 1 addition & 1 deletion Include/internal/pycore_codecs.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ extern int _PyCodec_UnregisterError(const char *name);
in Python 3.5+?
*/
extern PyObject* _PyCodec_LookupTextEncoding(
PyAPI_FUNC(PyObject*) _PyCodec_LookupTextEncoding(
const char *encoding,
const char *alternate_command);

Expand Down
5 changes: 4 additions & 1 deletion Lib/codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ class CodecInfo(tuple):

def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
incrementalencoder=None, incrementaldecoder=None, name=None,
*, _is_text_encoding=None):
*, _is_text_encoding=None,
_is_single_byte=None):
self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
self.name = name
self.encode = encode
Expand All @@ -104,6 +105,8 @@ def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
self.streamreader = streamreader
if _is_text_encoding is not None:
self._is_text_encoding = _is_text_encoding
if _is_single_byte is not None:
self._is_single_byte = _is_single_byte
return self

def __repr__(self):
Expand Down
1 change: 1 addition & 0 deletions Lib/encodings/big5.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/big5hkscs.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/cp932.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/cp949.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/cp950.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/euc_jis_2004.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/euc_jisx0213.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/euc_jp.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/euc_kr.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/gb18030.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/gb2312.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/gbk.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/hz.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/idna.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,4 +385,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/iso2022_jp.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/iso2022_jp_1.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/iso2022_jp_2.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/iso2022_jp_2004.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/iso2022_jp_3.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/iso2022_jp_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/iso2022_kr.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/johab.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/punycode.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,4 +250,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/raw_unicode_escape.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/shift_jis.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/shift_jis_2004.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/shift_jisx0213.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/unicode_escape.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/utf_16.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,4 +152,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/utf_16_be.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/utf_16_le.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/utf_32.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,4 +147,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/utf_32_be.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/utf_32_le.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/utf_7.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/utf_8.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
1 change: 1 addition & 0 deletions Lib/encodings/utf_8_sig.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,4 +127,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
_is_single_byte=False,
)
3 changes: 3 additions & 0 deletions Lib/test/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1892,6 +1892,7 @@ def test_copy(self):
self.assertIsNot(dup, orig)
self.assertEqual(dup, orig)
self.assertTrue(orig._is_text_encoding)
self.assertFalse(orig._is_single_byte)
self.assertEqual(dup.encode, orig.encode)
self.assertEqual(dup.name, orig.name)
self.assertEqual(dup.incrementalencoder, orig.incrementalencoder)
Expand All @@ -1912,6 +1913,7 @@ def test_deepcopy(self):
self.assertIsNot(dup, orig)
self.assertEqual(dup, orig)
self.assertTrue(orig._is_text_encoding)
self.assertFalse(orig._is_single_byte)
self.assertEqual(dup.encode, orig.encode)
self.assertEqual(dup.name, orig.name)
self.assertEqual(dup.incrementalencoder, orig.incrementalencoder)
Expand Down Expand Up @@ -1940,6 +1942,7 @@ def test_pickle(self):
unpickled_codec_info.incrementalencoder
)
self.assertTrue(unpickled_codec_info._is_text_encoding)
self.assertFalse(unpickled_codec_info._is_single_byte)

# Test a CodecInfo with _is_text_encoding equal to false.
codec_info = codecs.lookup('base64')
Expand Down
47 changes: 46 additions & 1 deletion Lib/test/test_pyexpat.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def _verify_parse_output(self, operations):
"Character data: '\xb5'",
"End element: 'root'",
]
for operation, expected_operation in zip(operations, expected_operations):
for operation, expected_operation in zip(operations, expected_operations, strict=True):
self.assertEqual(operation, expected_operation)

def test_parse_bytes(self):
Expand Down Expand Up @@ -276,6 +276,51 @@ def test_parse_again(self):
self.assertEqual(expat.ErrorString(cm.exception.code),
expat.errors.XML_ERROR_FINISHED)

# Parse a document declared in each listed encoding and check that the
# callbacks receive correctly decoded text.
# The document uses Cyrillic element names, attribute names/values and
# character data, so the non-ASCII part of each encoding is exercised.
# NOTE(review): the method name contains typos ("supportes", "ecodings");
# presumably "supported_encodings" was intended.
@support.subTests('enc', ['UTF-8', 'utf-8', 'utf-16', 'koi8-u',
'cp1125', 'cp1251', 'iso8859-5',
'mac_cyrillic'])
def test_supportes_ecodings(self, enc):
# Outputter records every parser event in its ``out`` list.
out = self.Outputter()
parser = expat.ParserCreate()
self._hookup_callbacks(parser, out)
# Build the document as text, then encode it with the same encoding that
# is named in its XML declaration.
data = (f'<?xml version="1.0" encoding="{enc}"?>\n'
'<корінь атрибут="значення">зміст</корінь>').encode(enc)
# True marks this chunk as the final one.
parser.Parse(data, True)
# The declared encoding name is expected back verbatim (same spelling
# and case as ``enc``) in the XML declaration event.
self.assertEqual(out.out, [
('XML declaration', ('1.0', enc, -1)),
"Start element: 'корінь' {'атрибут': 'значення'}",
"Character data: 'зміст'",
"End element: 'корінь'",
])

# Check that parsing a document declared in each listed encoding raises
# ValueError, even though the document body is plain ASCII markup.
# NOTE(review): presumably these names all reach pyexpat's unknown-encoding
# handler and resolve to codecs flagged ``_is_single_byte=False`` (note that
# 'UTF8' is listed here while 'UTF-8' is not) -- confirm against
# Modules/pyexpat.c.
# NOTE(review): the method name contains typos ("unsupportes", "ecodings").
@support.subTests('enc', [
'UTF8', 'UTF-7',
"unicode-escape", "raw-unicode-escape",
"Big5-HKSCS", "Big5",
"cp932", "cp949", "cp950",
"EUC_JIS-2004", "EUC_JISX0213", "EUC-JP", "EUC-KR",
"GB18030", "GB2312", "GBK",
"HZ-GB-2312",
"ISO-2022-JP", "ISO-2022-JP-1", "ISO-2022-JP-2004",
"ISO-2022-JP-2", "ISO-2022-JP-3", "ISO-2022-JP-EXT",
"ISO-2022-KR",
"johab",
"Shift_JIS", "Shift_JIS-2004", "Shift_JISX0213",
])
def test_unsupportes_ecodings(self, enc):
parser = expat.ParserCreate()
# Only ASCII characters are used, so the error cannot come from
# undecodable content in the document itself.
data = (f'<?xml version="1.0" encoding="{enc}"?>\n'
'<root></root>').encode(enc)
with self.assertRaises(ValueError):
parser.Parse(data, True)

# Check that a document declaring an encoding name with no registered
# codec ("xyz") makes Parse() raise LookupError.
# NOTE(review): the method name contains a typo ("ecoding").
def test_unknown_ecoding(self):
parser = expat.ParserCreate()
data = b'<?xml version="1.0" encoding="xyz"?>\n<root></root>'
with self.assertRaises(LookupError):
parser.Parse(data, True)


class NamespaceSeparatorTest(unittest.TestCase):
def test_legal(self):
# Tests that make sure we get errors when the namespace_separator value
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
The :mod:`XML parser <pyexpat>` now raises :exc:`ValueError` for known
unsupported multi-byte encodings such as "UTF8", "ISO-2022-JP" or
"raw-unicode-escape" instead of failing later, when encountering non-ASCII
data.
26 changes: 26 additions & 0 deletions Modules/pyexpat.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include "Python.h"
#include "pycore_ceval.h" // _Py_EnterRecursiveCall()
#include "pycore_codecs.h" // _PyCodec_LookupTextEncoding()
#include "pycore_import.h" // _PyImport_SetModule()
#include "pycore_pyhash.h" // _Py_HashSecret
#include "pycore_traceback.h" // _PyTraceback_Add()
Expand Down Expand Up @@ -1465,6 +1466,31 @@ PyUnknownEncodingHandler(void *encodingHandlerData,
if (PyErr_Occurred())
return XML_STATUS_ERROR;

PyObject *codec = _PyCodec_LookupTextEncoding(name, NULL);
if (codec == NULL) {
return XML_STATUS_ERROR;
}
if (!PyTuple_CheckExact(codec)) {
PyObject *attr;
if (PyObject_GetOptionalAttrString(codec, "_is_single_byte", &attr) < 0) {
Py_DECREF(codec);
return XML_STATUS_ERROR;
}
if (attr != NULL) {
int is_single_byte = PyObject_IsTrue(attr);
Py_DECREF(attr);
if (is_single_byte <= 0) {
Py_DECREF(codec);
if (is_single_byte == 0) {
PyErr_SetString(PyExc_ValueError,
"multi-byte encodings are not supported");
}
return XML_STATUS_ERROR;
}
}
}
Py_DECREF(codec);

u = PyUnicode_Decode((const char*) template_buffer, 256, name, "replace");
if (u == NULL) {
Py_XDECREF(u);
Expand Down
Loading
Loading