From a0ccbb728b2d36df0c9f129cabec5d0361280fc6 Mon Sep 17 00:00:00 2001 From: James Parrott <80779630+JamesParrott@users.noreply.github.com> Date: Tue, 23 Jun 2026 23:49:33 +0100 Subject: [PATCH] v3.1.1 Warn if text encoding ends in pad bytes. Minimise pad bytes used in decodings. v3.1.1 Bump version Only catch warnings in dbf record tests that encode ascii to utf-16-le Trim all trailing null bytes, not just the first one (lots appear in test files) Check string encodings end in pad bytes, and use minimum of them in decoding. Remove unnecessary stream.seek(0)s and warning captures Stop corruption! Of b" " in encoded field names (only corrupt U+0020 in strings) Add test for unicode field string corruption --- README.md | 9 ++- changelog.txt | 5 ++ src/shapefile.py | 164 +++++++++++++++++++++++++++----------- tests/hypothesis_tests.py | 9 +-- tests/test_shapefile.py | 121 +++++++++++++++++++++++----- 5 files changed, 234 insertions(+), 74 deletions(-) diff --git a/README.md b/README.md index ad06d55..8ca6841 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,8 @@ The Python Shapefile Library (PyShp) reads and writes ESRI Shapefiles in pure Py - **Author**: [Joel Lawhead](https://github.com/GeospatialPython) - **Maintainers**: [James Parrott](https://github.com/JamesParrott) & [Karim Bahgat](https://github.com/karimbahgat) -- **Version**: 3.0.13 -- **Date**: 19th June 2026 +- **Version**: 3.1.1 +- **Date**: 24th June 2026 - **License**: [MIT](https://github.com/GeospatialPython/pyshp/blob/master/LICENSE.TXT) ## Contents @@ -93,6 +93,11 @@ part of your geospatial project. # Version Changes +## 3.1.1 +### Unicode support made even more robust and yet another encoding bug fixed! + - When reading, only use minimum number of pad bytes to decode text successfully (fixes issue 423). + - When writing, warn (or raise in strict mode) if the text's encoding ends in pad bytes. 
+ ## 3.1.0 ### Unicode support made more robust and encoding bugs fixed - Truncation of field names and text fields now respects unicode code point boundaries (fixes issues - diff --git a/changelog.txt b/changelog.txt index 71560b9..258c2c5 100644 --- a/changelog.txt +++ b/changelog.txt @@ -1,3 +1,8 @@ +VERSION 3.1.1 + Unicode support made even more robust and yet another encoding bug fixed! + * When reading, only use minimum number of pad bytes to decode text successfully (fixes issue 423). + * When writing, warn (or raise in strict mode) if the text's encoding ends in pad bytes. + VERSION 3.1.0 2026-06-23 diff --git a/src/shapefile.py b/src/shapefile.py index d080e37..e90dd99 100644 --- a/src/shapefile.py +++ b/src/shapefile.py @@ -8,7 +8,7 @@ from __future__ import annotations -__version__ = "3.1.0" +__version__ = "3.1.1" import abc import array @@ -251,6 +251,38 @@ def __call__( ) -> str: ... +def _warn_if_string_ends_with_decoded_pad_bytes( + s: str, + pad_byte: bytes, + encoding: str = "utf-8", + encodingErrors: str = "strict", +) -> None: + """Warns if e.g. the encoding is utf-16-le, and the + decoded text ends in "†", which encodes to a pair of + ascii spaces (b" ", the pad byte for C and M fields). + """ + # Max code unit size under UTF-8, UTF-16, and UTF-32 is 4 bytes. + for n in range(1, 5): + # TODO: test for encodings ending in a null terminator preceded + # by pad bytes, that are exactly the field's size (length). + pad_bytes = pad_byte * n + try: + decoded_pad_bytes: str = pad_bytes.decode(encoding, encodingErrors) + except UnicodeDecodeError: + continue + if decoded_pad_bytes and s.endswith(decoded_pad_bytes): + msg = ( + f"Under the given encoding: {encoding}, " + f"the text (field name or 'C' or 'M' field): {s!r} " + f"ends with {decoded_pad_bytes!r}, which coincidentally " + f"encodes to the pad bytes: {pad_bytes!r}. " + "The real end of the actual data may be earlier. 
" + ) + + warnings.warn(msg, category=PossibleDataLoss) + break + + def _encode_dbf_string( s: str, size: int, @@ -273,6 +305,8 @@ def _encode_dbf_string( N = len(s) trimmed: str encoded: bytes + + # i - num of characters to keep. Starts by trying to keep all N. for i in reversed(range(0, N + 1)): trimmed = s[:i] encoded = trimmed.encode(encoding, encodingErrors) @@ -300,16 +334,27 @@ def _encode_dbf_string( f"to a short enough byte string, using {encoding=}, {encodingErrors=}" ) + if pad_byte is not None: + _warn_if_string_ends_with_decoded_pad_bytes( + s=trimmed, + pad_byte=pad_byte, + encoding=encoding, + encodingErrors=encodingErrors, + ) + if len(encoded) < size and pad_byte is not None: padded = encoded.ljust(size, pad_byte) else: padded = encoded - decoded = decode( - b=padded, - encoding=encoding, - encodingErrors=encodingErrors, - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + decoded = decode( + b=padded, + encoding=encoding, + encodingErrors=encodingErrors, + ) + if decoded != trimmed: msg = f"Padded value: {padded!r} does not decode to {trimmed!r} using PyShp's decoder: {decode.__name__}" if len(trimmed) < len(s): @@ -324,20 +369,67 @@ def _encode_dbf_string( return padded, trimmed +def _try_to_decode_dbf_name_or_text_field( + b: bytes, + pad_bytes: bytes, # Pad bytes will be trimmed (from the R of b) in their order in the byte-string + encoding: str = "utf8", + encodingErrors: str = "strict", +) -> str: + N = len(b) + decoded: str + trimmed = b + num_trailing_pad_bytes = N - len(b.rstrip(pad_bytes)) + + # Test if we need to restore any pad_bytes to + # correctly decode the remaining bytes to a string. 
+ # num_to_trim starts from num_trailing_pad_bytes + # - initially trimming all trailing pad bytes + for num_to_trim in reversed(range(num_trailing_pad_bytes + 1)): + i = N - num_to_trim + trimmed = b[:i] + try: + decoded = trimmed.decode(encoding, encodingErrors) + except UnicodeDecodeError: + continue + if num_to_trim < num_trailing_pad_bytes: + warnings.warn( + f"Used {num_trailing_pad_bytes - num_to_trim} pad bytes ({pad_bytes!r}) " + f"from padding to decode raw field: {b!r} " + f"to: {decoded!r} ({encoding=}, {encodingErrors=}) ", + category=PossibleDataLoss, + ) + return decoded + + raise dbfFileException( + f"Could not decode field name or text/memo field: {b!r} using {encoding=} and {encodingErrors=}" + f" no matter how many trailing pad bytes (if any) ({pad_bytes!r}) were used. " + ) + + def _decode_C_or_M_field( b: bytes, encoding: str = "utf8", encodingErrors: str = "strict", strict: bool = True, ) -> str: - retval = b.decode(encoding, encodingErrors).rstrip("\x00").rstrip(" ") - if retval.rstrip("\x00") != retval and strict: + retval = _try_to_decode_dbf_name_or_text_field( + b=b, + pad_bytes=b" \x00", + encoding=encoding, + encodingErrors=encodingErrors, + ) + + if not strict: + return retval + + if retval.rstrip("\x00") != retval: msg = ( - f"More Trailing Null chars in: {b!r}" - " after removing trailing null chars and ascii spaces" - f", resulting in {retval!r}" + f"More trailing null chars in: {retval!r}" + " after removing one trailing null char and ascii spaces" + f" from {b!r}, and decoding (codec: {encoding}, errors: {encodingErrors}). " ) warnings.warn(msg, category=PossibleDataLoss) + return retval @@ -360,34 +452,15 @@ def decode_name( encodingErrors: str = "strict", strict: bool = True, ) -> str: - N = len(b) - decoded: str - num_trailing_null_bytes = N - len(b.rstrip(b"\x00")) - - # Test if we need to restore any of those null bytes to - # correctly decode the remaining bytes to a string. 
- for num_to_trim in reversed(range(num_trailing_null_bytes + 1)): - i = N - num_to_trim - trimmed = b[:i] - try: - decoded = trimmed.decode(encoding, encodingErrors) - except UnicodeDecodeError: - continue - if strict and num_to_trim < num_trailing_null_bytes: - warnings.warn( - f"Used {num_trailing_null_bytes - num_to_trim} null bytes " - f"from padding to decode {b!r} " - f"to: {decoded!r} ({encoding=}, {encodingErrors=}) ", - category=PossibleDataLoss, - ) - if not strict: - decoded = decoded.lstrip() - return decoded - - raise dbfFileException( - f"Could not decode field name: {b!r} using {encoding=} and {encodingErrors=}" - " no matter how many trailing null-bytes (if any) were used. " + decoded = _try_to_decode_dbf_name_or_text_field( + b=b, + pad_bytes=b"\x00", + encoding=encoding, + encodingErrors=encodingErrors, ) + if not strict: + decoded = decoded.lstrip() + return decoded @classmethod def from_byte_stream( @@ -445,6 +518,14 @@ def from_unchecked( size = 1 decimal = 0 + if not strict and " " in name: + warnings.warn( + f"Replacing ascii spaces (0x20, ' 's) with underscores ('_'s) in {name!r}. " + "Use a Writer(file, strict=True) to preserve the field name as it is. ", + category=PossibleDataLoss, + ) + name = name.replace(" ", "_") + # Only use the portion of the name that we are able to encode to # 10 bytes or less. 
_encoded_name, trimmed_name = cls.trim_name_until_encodable( @@ -502,13 +583,6 @@ def encode_field_descriptor( encodingErrors=encodingErrors, strict=strict, ) - if not strict and b" " in encoded_name: - warnings.warn( - "Replacing ascii spaces (0x20) with underscores " - f"in encoded bytes: {encoded_name!r}", - category=PossibleDataLoss, - ) - encoded_name = encoded_name.replace(b" ", b"_") encoded_field_type = self.field_type.encode("ascii") return self.get_struct().pack( diff --git a/tests/hypothesis_tests.py b/tests/hypothesis_tests.py index d7424fb..eb4e79c 100644 --- a/tests/hypothesis_tests.py +++ b/tests/hypothesis_tests.py @@ -461,12 +461,14 @@ def code_and_shape_strat_from_triple(t): @pytest.mark.hypothesis @given(codes_and_shapes=codes_and_shapes) def test_shp_reader_writer_roundtrip(codes_and_shapes)-> None: + code_ex, expected_shapes = codes_and_shapes stream = io.BytesIO() + with shp.ShpWriter(shp=stream, shapeType=code_ex) as w: for shape in expected_shapes: w.shape(shape) - stream.seek(0) + with shp.ShpReader(shp=stream) as r: assert r.shapeType == code_ex @@ -495,8 +497,6 @@ def test_shp_reader_writer_roundtrip(codes_and_shapes)-> None: assert not hasattr(expected, "partTypes") - - @pytest.mark.hypothesis @given(codes_and_shapes=codes_and_shapes) def test_shx_reader_writer_roundtrip(codes_and_shapes)-> None: @@ -516,8 +516,6 @@ def test_shx_reader_writer_roundtrip(codes_and_shapes)-> None: offsets_B.append(offset_B) shx_w._shx_record(offset_B, size_B) - shx_stream.seek(0) - with shp.ShxReader(shx=shx_stream) as r: assert r.numShapes == len(expected_shapes) assert r.offsets == offsets_B @@ -655,7 +653,6 @@ def test_dbf_reader_writer_roundtrip(fields_and_records)-> None: written_records.append(record) - stream.seek(0) with shp.DbfReader(dbf=stream) as r: actual_fields = iter(r.fields) next(actual_fields) # skip deletion flag diff --git a/tests/test_shapefile.py b/tests/test_shapefile.py index c49ff70..3675dd7 100644 --- a/tests/test_shapefile.py +++ 
b/tests/test_shapefile.py @@ -2,6 +2,7 @@ This module tests the functionality of shapefile.py. """ +import contextlib import datetime import io import json @@ -2044,33 +2045,111 @@ def test_write_multipatch(tmpdir): @pytest.mark.parametrize("expected_date", DATES) def test_round_trip_dbf_date_record(expected_date): stream = io.BytesIO() - with shapefile.DbfWriter(dbf=stream) as dbf_w: - dbf_w.field("Date","D") - dbf_w.record(expected_date) - stream.seek(0) - with shapefile.DbfReader(dbf=stream) as dbf_r: - assert dbf_r.record(0)[0] == expected_date + dbf_w = shapefile.DbfWriter(dbf=stream) + dbf_w.field("Date","D") + dbf_w.record(expected_date) + dbf_w.close() + dbf_r = shapefile.DbfReader(dbf=stream) + assert dbf_r.record(0)[0] == expected_date + dbf_r.close() -LONG_FIELD_NAME_TESTS = [ - ("ÀÀÀÀ०", 8, "utf-8", "strict"), + +LONG_FIELD_NAMES = [ + ("ÀÀÀÀ०", 8, "utf-8", "strict"), # Encoded bytes are corrupted if truncated to 10 bytes +] + +@pytest.mark.parametrize("name,encoded_len,codec,errors", LONG_FIELD_NAMES) +def test_encode_dbf_field_name_truncation(name,encoded_len,codec,errors): + stream = io.BytesIO() + w = shapefile.DbfWriter( + stream, + encoding=codec, + encodingErrors=errors, + strict = False, + ) + with pytest.warns(shapefile.PossibleDataLoss): + w.field(name=name) + field = w.fields[0] + assert name.startswith(field.name) + assert len(w.fields[0].name.encode(codec, errors)) == encoded_len + w.close() + + r = shapefile.DbfReader(stream, encoding=codec, encodingErrors=errors, strict=False) + assert r.fields[1].name == field.name + r.close() + + +TEST_ENCODING_WARNINGS_FIELD_NAMES = [ + ("A", 2, "utf-16-le", "strict"), # Encoded bytes end in null byte (second byte in low end UTF16 code unit) + ("ABC", 6, "utf-16-le", "strict"), # Encoded bytes end in null byte (second byte in low end UTF16 code unit) + ("ABCDE", 10, "utf-16-le", "strict"), # Encoded bytes end in null byte (second byte in low end UTF16 code unit) +] + 
+@pytest.mark.parametrize("name,encoded_len,codec,errors", TEST_ENCODING_WARNINGS_FIELD_NAMES) +def test_encode_dbf_field_name_padding(name,encoded_len,codec,errors): + stream = io.BytesIO() + w = shapefile.DbfWriter( + stream, + encoding=codec, + encodingErrors=errors, + strict = True, + ) + w.field(name=name) + field = w.fields[0] + assert name.startswith(field.name) + assert len(w.fields[0].name.encode(codec, errors)) == encoded_len + w.close() + + with pytest.warns(shapefile.PossibleDataLoss): + r = shapefile.DbfReader(stream, encoding=codec, encodingErrors=errors, strict=False) + assert r.fields[1].name == field.name + r.close() + +NON_ASCII_FIELD_NAMES = [ + ("囊萤映雪", 8, 'utf-16-be', "strict"), # Issue 421. Encoded bytes contain an ascii space (0x20) so by applying + # encoded.replace(b" ",b"_") the text is corrupted from + # "囊萤映雪" ("Studying by the light of fireflies and snow") + # to: "囊萤晟雪" ("Gathering Fireflies and Flourishing Snow") + # (English translation from Google Translate). 
] -@pytest.mark.parametrize("name,encoded_len,codec,errors", LONG_FIELD_NAME_TESTS) -def test_encode_dbf_field_too_long_names(name,encoded_len,codec,errors): +@pytest.mark.parametrize("name,encoded_len,codec,errors", NON_ASCII_FIELD_NAMES) +def test_encode_dbf_field_name_corruption(name,encoded_len,codec,errors): stream = io.BytesIO() - with shapefile.DbfWriter( + w = shapefile.DbfWriter( + stream, + encoding=codec, + encodingErrors=errors, + strict = True, + ) + w.field(name=name) + field = w.fields[0] + assert name.startswith(field.name) + assert len(w.fields[0].name.encode(codec, errors)) == encoded_len + w.close() + + r = shapefile.DbfReader(stream, encoding=codec, encodingErrors=errors, strict=False) + assert r.fields[1].name == field.name + r.close() + +TEST_STR_VALUES = LONG_FIELD_NAMES + TEST_ENCODING_WARNINGS_FIELD_NAMES + NON_ASCII_FIELD_NAMES + +@pytest.mark.parametrize("value,encoded_len,codec,errors", TEST_STR_VALUES) +def test_encode_dbf_field_values(value,encoded_len,codec,errors): + stream = io.BytesIO() + w = shapefile.DbfWriter( stream, encoding=codec, encodingErrors=errors, strict = False, - ) as w: - with pytest.warns(shapefile.PossibleDataLoss): - w.field(name=name) - field = w.fields[0] - assert name.startswith(field.name) - assert len(w.fields[0].name.encode(codec, errors)) == encoded_len - - stream.seek(0) - with shapefile.DbfReader(stream) as r: - assert r.fields[1].name == field.name \ No newline at end of file + ) + w.field("name", "C") + w.record(value) + w.close() + WARNS = codec.lower() == "utf-16-le" and value.isascii() + context = pytest.warns(shapefile.PossibleDataLoss) if WARNS else contextlib.nullcontext() + with context: + r = shapefile.DbfReader(stream, encoding=codec, encodingErrors=errors, strict=False) + assert r.record(0)[0] == value + r.close() \ No newline at end of file