diff --git a/README.md b/README.md index ad06d55..8ca6841 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,8 @@ The Python Shapefile Library (PyShp) reads and writes ESRI Shapefiles in pure Py - **Author**: [Joel Lawhead](https://github.com/GeospatialPython) - **Maintainers**: [James Parrott](https://github.com/JamesParrott) & [Karim Bahgat](https://github.com/karimbahgat) -- **Version**: 3.0.13 -- **Date**: 19th June 2026 +- **Version**: 3.1.1 +- **Date**: 24th June 2026 - **License**: [MIT](https://github.com/GeospatialPython/pyshp/blob/master/LICENSE.TXT) ## Contents @@ -93,6 +93,11 @@ part of your geospatial project. # Version Changes +## 3.1.1 +### Unicode support made even more robust and yet another encoding bug fixed! + - When reading, only use minimum number of pad bytes to decode text successfully (fixes issue 423). + - When writing, warn (or raise in strict mode) if the text's encoding ends in pad bytes. + ## 3.1.0 ### Unicode support made more robust and encoding bugs fixed - Truncation of field names and text fields now respects unicode code point boundaries (fixes issues - diff --git a/changelog.txt b/changelog.txt index 71560b9..258c2c5 100644 --- a/changelog.txt +++ b/changelog.txt @@ -1,3 +1,8 @@ +VERSION 3.1.1 + Unicode support made even more robust and yet another encoding bug fixed! + * When reading, only use minimum number of pad bytes to decode text successfully (fixes issue 423). + * When writing, warn (or raise in strict mode) if the text's encoding ends in pad bytes. + VERSION 3.1.0 2026-06-23 diff --git a/src/shapefile.py b/src/shapefile.py index d080e37..e90dd99 100644 --- a/src/shapefile.py +++ b/src/shapefile.py @@ -8,7 +8,7 @@ from __future__ import annotations -__version__ = "3.1.0" +__version__ = "3.1.1" import abc import array @@ -251,6 +251,38 @@ def __call__( ) -> str: ... 
+def _warn_if_string_ends_with_decoded_pad_bytes( + s: str, + pad_byte: bytes, + encoding: str = "utf-8", + encodingErrors: str = "strict", +) -> None: + """Warns if e.g. the encoding is utf-16-le, and the + decoded text ends in "†", which encodes to a pair of + ascii spaces (b" ", the pad byte for C and M fields). + """ + # Max code unit size under UTF-8, UTF-16, and UTF-32 is 4 bytes. + for n in range(1, 5): + # TODO: test for encodings ending in a null terminator preceded + # by pad bytes, that are exactly the field's size (length). + pad_bytes = pad_byte * n + try: + decoded_pad_bytes: str = pad_bytes.decode(encoding, encodingErrors) + except UnicodeDecodeError: + continue + if s.endswith(decoded_pad_bytes): + msg = ( + f"Under the given encoding: {encoding}, " + f" the text (field name or 'C' or 'M' field): {s!r} " + f" ends with {decoded_pad_bytes!r}, which coincidentally " + f"encodes to the pad bytes: {pad_bytes!r}. " + "The real end of the actual data may be earlier. " + ) + + warnings.warn(msg, category=PossibleDataLoss) + break + + def _encode_dbf_string( s: str, size: int, @@ -273,6 +305,8 @@ def _encode_dbf_string( N = len(s) trimmed: str encoded: bytes + + # i - num of characters to keep. Starts by trying to keep all N. 
for i in reversed(range(0, N + 1)): trimmed = s[:i] encoded = trimmed.encode(encoding, encodingErrors) @@ -300,16 +334,27 @@ def _encode_dbf_string( f"to a short enough byte string, using {encoding=}, {encodingErrors=}" ) + if pad_byte is not None: + _warn_if_string_ends_with_decoded_pad_bytes( + s=trimmed, + pad_byte=pad_byte, + encoding=encoding, + encodingErrors=encodingErrors, + ) + if len(encoded) < size and pad_byte is not None: padded = encoded.ljust(size, pad_byte) else: padded = encoded - decoded = decode( - b=padded, - encoding=encoding, - encodingErrors=encodingErrors, - ) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + decoded = decode( + b=padded, + encoding=encoding, + encodingErrors=encodingErrors, + ) + if decoded != trimmed: msg = f"Padded value: {padded!r} does not decode to {trimmed!r} using PyShp's decoder: {decode.__name__}" if len(trimmed) < len(s): @@ -324,20 +369,67 @@ def _encode_dbf_string( return padded, trimmed +def _try_to_decode_dbf_name_or_text_field( + b: bytes, + pad_bytes: bytes, # Pad bytes will be trimmed (from the R of b) in their order in the byte-string + encoding: str = "utf8", + encodingErrors: str = "strict", +) -> str: + N = len(b) + decoded: str + trimmed = b + num_trailing_pad_bytes = N - len(b.rstrip(pad_bytes)) + + # Test if we need to restore any pad_bytes to + # correctly decode the remaining bytes to a string. 
+ # num_to_trim starts from num_trailing_pad_bytes + # - initially trimming all trailing pad bytes + for num_to_trim in reversed(range(num_trailing_pad_bytes + 1)): + i = N - num_to_trim + trimmed = b[:i] + try: + decoded = trimmed.decode(encoding, encodingErrors) + except UnicodeDecodeError: + continue + if num_to_trim < num_trailing_pad_bytes: + warnings.warn( + f"Used {num_trailing_pad_bytes - num_to_trim} pad bytes ({pad_bytes!r}) " + f"from padding to decode raw field: {b!r} " + f"to: {decoded!r} ({encoding=}, {encodingErrors=}) ", + category=PossibleDataLoss, + ) + return decoded + + raise dbfFileException( + f"Could not decode field name or text/memo field: {b!r} using {encoding=} and {encodingErrors=}" + f" no matter how many trailing pad bytes (if any) ({pad_bytes!r}) were used. " + ) + + def _decode_C_or_M_field( b: bytes, encoding: str = "utf8", encodingErrors: str = "strict", strict: bool = True, ) -> str: - retval = b.decode(encoding, encodingErrors).rstrip("\x00").rstrip(" ") - if retval.rstrip("\x00") != retval and strict: + retval = _try_to_decode_dbf_name_or_text_field( + b=b, + pad_bytes=b" \x00", + encoding=encoding, + encodingErrors=encodingErrors, + ) + + if not strict: + return retval + + if retval.rstrip("\x00") != retval: msg = ( - f"More Trailing Null chars in: {b!r}" - " after removing trailing null chars and ascii spaces" - f", resulting in {retval!r}" + f"More trailing null chars in: {retval!r}" + " after removing one trailing null char and ascii spaces" + f" from {b!r}, and decoding (codec: {encoding}, errors: {encodingErrors}). " ) warnings.warn(msg, category=PossibleDataLoss) + return retval @@ -360,34 +452,15 @@ def decode_name( encodingErrors: str = "strict", strict: bool = True, ) -> str: - N = len(b) - decoded: str - num_trailing_null_bytes = N - len(b.rstrip(b"\x00")) - - # Test if we need to restore any of those null bytes to - # correctly decode the remaining bytes to a string. 
- for num_to_trim in reversed(range(num_trailing_null_bytes + 1)): - i = N - num_to_trim - trimmed = b[:i] - try: - decoded = trimmed.decode(encoding, encodingErrors) - except UnicodeDecodeError: - continue - if strict and num_to_trim < num_trailing_null_bytes: - warnings.warn( - f"Used {num_trailing_null_bytes - num_to_trim} null bytes " - f"from padding to decode {b!r} " - f"to: {decoded!r} ({encoding=}, {encodingErrors=}) ", - category=PossibleDataLoss, - ) - if not strict: - decoded = decoded.lstrip() - return decoded - - raise dbfFileException( - f"Could not decode field name: {b!r} using {encoding=} and {encodingErrors=}" - " no matter how many trailing null-bytes (if any) were used. " + decoded = _try_to_decode_dbf_name_or_text_field( + b=b, + pad_bytes=b"\x00", + encoding=encoding, + encodingErrors=encodingErrors, ) + if not strict: + decoded = decoded.lstrip() + return decoded @classmethod def from_byte_stream( @@ -445,6 +518,14 @@ def from_unchecked( size = 1 decimal = 0 + if not strict and " " in name: + warnings.warn( + f"Replacing ascii spaces (0x20, ' 's) with underscores ('_'s) in {name!r}. " + "Use a Writer(file, strict=True) to preserve the field name as it is. ", + category=PossibleDataLoss, + ) + name = name.replace(" ", "_") + # Only use the portion of the name that we are able to encode to # 10 bytes or less. 
_encoded_name, trimmed_name = cls.trim_name_until_encodable( @@ -502,13 +583,6 @@ def encode_field_descriptor( encodingErrors=encodingErrors, strict=strict, ) - if not strict and b" " in encoded_name: - warnings.warn( - "Replacing ascii spaces (0x20) with underscores " - f"in encoded bytes: {encoded_name!r}", - category=PossibleDataLoss, - ) - encoded_name = encoded_name.replace(b" ", b"_") encoded_field_type = self.field_type.encode("ascii") return self.get_struct().pack( diff --git a/tests/hypothesis_tests.py b/tests/hypothesis_tests.py index d7424fb..eb4e79c 100644 --- a/tests/hypothesis_tests.py +++ b/tests/hypothesis_tests.py @@ -461,12 +461,14 @@ def code_and_shape_strat_from_triple(t): @pytest.mark.hypothesis @given(codes_and_shapes=codes_and_shapes) def test_shp_reader_writer_roundtrip(codes_and_shapes)-> None: + code_ex, expected_shapes = codes_and_shapes stream = io.BytesIO() + with shp.ShpWriter(shp=stream, shapeType=code_ex) as w: for shape in expected_shapes: w.shape(shape) - stream.seek(0) + with shp.ShpReader(shp=stream) as r: assert r.shapeType == code_ex @@ -495,8 +497,6 @@ def test_shp_reader_writer_roundtrip(codes_and_shapes)-> None: assert not hasattr(expected, "partTypes") - - @pytest.mark.hypothesis @given(codes_and_shapes=codes_and_shapes) def test_shx_reader_writer_roundtrip(codes_and_shapes)-> None: @@ -516,8 +516,6 @@ def test_shx_reader_writer_roundtrip(codes_and_shapes)-> None: offsets_B.append(offset_B) shx_w._shx_record(offset_B, size_B) - shx_stream.seek(0) - with shp.ShxReader(shx=shx_stream) as r: assert r.numShapes == len(expected_shapes) assert r.offsets == offsets_B @@ -655,7 +653,6 @@ def test_dbf_reader_writer_roundtrip(fields_and_records)-> None: written_records.append(record) - stream.seek(0) with shp.DbfReader(dbf=stream) as r: actual_fields = iter(r.fields) next(actual_fields) # skip deletion flag diff --git a/tests/test_shapefile.py b/tests/test_shapefile.py index c49ff70..3675dd7 100644 --- a/tests/test_shapefile.py +++ 
b/tests/test_shapefile.py @@ -2,6 +2,7 @@ This module tests the functionality of shapefile.py. """ +import contextlib import datetime import io import json @@ -2044,33 +2045,111 @@ def test_write_multipatch(tmpdir): @pytest.mark.parametrize("expected_date", DATES) def test_round_trip_dbf_date_record(expected_date): stream = io.BytesIO() - with shapefile.DbfWriter(dbf=stream) as dbf_w: - dbf_w.field("Date","D") - dbf_w.record(expected_date) - stream.seek(0) - with shapefile.DbfReader(dbf=stream) as dbf_r: - assert dbf_r.record(0)[0] == expected_date + dbf_w = shapefile.DbfWriter(dbf=stream) + dbf_w.field("Date","D") + dbf_w.record(expected_date) + dbf_w.close() + dbf_r = shapefile.DbfReader(dbf=stream) + assert dbf_r.record(0)[0] == expected_date + dbf_r.close() -LONG_FIELD_NAME_TESTS = [ - ("ÀÀÀÀ०", 8, "utf-8", "strict"), + +LONG_FIELD_NAMES = [ + ("ÀÀÀÀ०", 8, "utf-8", "strict"), # Encoded bytes are corrupted if truncated to 10 bytes +] + +@pytest.mark.parametrize("name,encoded_len,codec,errors", LONG_FIELD_NAMES) +def test_encode_dbf_field_name_truncation(name,encoded_len,codec,errors): + stream = io.BytesIO() + w = shapefile.DbfWriter( + stream, + encoding=codec, + encodingErrors=errors, + strict = False, + ) + with pytest.warns(shapefile.PossibleDataLoss): + w.field(name=name) + field = w.fields[0] + assert name.startswith(field.name) + assert len(w.fields[0].name.encode(codec, errors)) == encoded_len + w.close() + + r = shapefile.DbfReader(stream, encoding=codec, encodingErrors=errors, strict=False) + assert r.fields[1].name == field.name + r.close() + + +TEST_ENCODING_WARNINGS_FIELD_NAMES = [ + ("A", 2, "utf-16-le", "strict"), # Encoded bytes end in null byte (second byte in low end UTF16 code unit) + ("ABC", 6, "utf-16-le", "strict"), # Encoded bytes end in null byte (second byte in low end UTF16 code unit) + ("ABCDE", 10, "utf-16-le", "strict"), # Encoded bytes end in null byte (second byte in low end UTF16 code unit) +] + 
+@pytest.mark.parametrize("name,encoded_len,codec,errors", TEST_ENCODING_WARNINGS_FIELD_NAMES) +def test_encode_dbf_field_name_padding(name,encoded_len,codec,errors): + stream = io.BytesIO() + w = shapefile.DbfWriter( + stream, + encoding=codec, + encodingErrors=errors, + strict = True, + ) + w.field(name=name) + field = w.fields[0] + assert name.startswith(field.name) + assert len(w.fields[0].name.encode(codec, errors)) == encoded_len + w.close() + + with pytest.warns(shapefile.PossibleDataLoss): + r = shapefile.DbfReader(stream, encoding=codec, encodingErrors=errors, strict=False) + assert r.fields[1].name == field.name + r.close() + +NON_ASCII_FIELD_NAMES = [ + ("囊萤映雪", 8, 'utf-16-be', "strict"), # Issue 421. Encoded bytes contain an ascii space (0x20) so by applying + # encoded.replace(b" ",b"_") the text is corrupted from + # "囊萤映雪" ("Studying by the light of fireflies and snow") + # to: "囊萤晟雪" ("Gathering Fireflies and Flourishing Snow") + # (English translation from Google Translate). 
] -@pytest.mark.parametrize("name,encoded_len,codec,errors", LONG_FIELD_NAME_TESTS) -def test_encode_dbf_field_too_long_names(name,encoded_len,codec,errors): +@pytest.mark.parametrize("name,encoded_len,codec,errors", NON_ASCII_FIELD_NAMES) +def test_encode_dbf_field_name_corruption(name,encoded_len,codec,errors): stream = io.BytesIO() - with shapefile.DbfWriter( + w = shapefile.DbfWriter( + stream, + encoding=codec, + encodingErrors=errors, + strict = True, + ) + w.field(name=name) + field = w.fields[0] + assert name.startswith(field.name) + assert len(w.fields[0].name.encode(codec, errors)) == encoded_len + w.close() + + r = shapefile.DbfReader(stream, encoding=codec, encodingErrors=errors, strict=False) + assert r.fields[1].name == field.name + r.close() + +TEST_STR_VALUES = LONG_FIELD_NAMES + TEST_ENCODING_WARNINGS_FIELD_NAMES + NON_ASCII_FIELD_NAMES + +@pytest.mark.parametrize("value,encoded_len,codec,errors", TEST_STR_VALUES) +def test_encode_dbf_field_values(value,encoded_len,codec,errors): + stream = io.BytesIO() + w = shapefile.DbfWriter( stream, encoding=codec, encodingErrors=errors, strict = False, - ) as w: - with pytest.warns(shapefile.PossibleDataLoss): - w.field(name=name) - field = w.fields[0] - assert name.startswith(field.name) - assert len(w.fields[0].name.encode(codec, errors)) == encoded_len - - stream.seek(0) - with shapefile.DbfReader(stream) as r: - assert r.fields[1].name == field.name \ No newline at end of file + ) + w.field("name", "C") + w.record(value) + w.close() + WARNS = codec.lower() == "utf-16-le" and value.isascii() + context = pytest.warns(shapefile.PossibleDataLoss) if WARNS else contextlib.nullcontext() + with context: + r = shapefile.DbfReader(stream, encoding=codec, encodingErrors=errors, strict=False) + assert r.record(0)[0] == value + r.close() \ No newline at end of file