Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ The Python Shapefile Library (PyShp) reads and writes ESRI Shapefiles in pure Py

- **Author**: [Joel Lawhead](https://github.com/GeospatialPython)
- **Maintainers**: [James Parrott](https://github.com/JamesParrott) & [Karim Bahgat](https://github.com/karimbahgat)
- **Version**: 3.0.13
- **Date**: 19th June 2026
- **Version**: 3.1.1
- **Date**: 24th June 2026
- **License**: [MIT](https://github.com/GeospatialPython/pyshp/blob/master/LICENSE.TXT)

## Contents
Expand Down Expand Up @@ -93,6 +93,11 @@ part of your geospatial project.

# Version Changes

## 3.1.1
### Unicode support made even more robust and yet another encoding bug fixed!
- When reading, use only the minimum number of pad bytes needed to decode the text successfully (fixes issue 423).
- When writing, warn (or raise in strict mode) if the encoded text ends in pad bytes.

## 3.1.0
### Unicode support made more robust and encoding bugs fixed
- Truncation of field names and text fields now respects unicode code point boundaries (fixes issues -
Expand Down
5 changes: 5 additions & 0 deletions changelog.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
VERSION 3.1.1
Unicode support made even more robust and yet another encoding bug fixed!
* When reading, use only the minimum number of pad bytes needed to decode the text successfully (fixes issue 423).
* When writing, warn (or raise in strict mode) if the encoded text ends in pad bytes.

VERSION 3.1.0

2026-06-23
Expand Down
164 changes: 119 additions & 45 deletions src/shapefile.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from __future__ import annotations

__version__ = "3.1.0"
__version__ = "3.1.1"

import abc
import array
Expand Down Expand Up @@ -251,6 +251,38 @@ def __call__(
) -> str: ...


def _warn_if_string_ends_with_decoded_pad_bytes(
s: str,
pad_byte: bytes,
encoding: str = "utf-8",
encodingErrors: str = "strict",
) -> None:
"""Warns if e.g. the encoding is utf-16-le, and the
decoded text ends in "†", which encodes to a pair of
ascii spaces (b" ", the pad byte for C and M fields).
"""
# Max code unit size under UTF-8, UTF-16, and UTF-32 is 4 bytes.
for n in range(1, 5):
# TODO: test for encodings ending in a null terminator preceded
# by pad bytes, that are exactly the field's size (length).
pad_bytes = pad_byte * n
try:
decoded_pad_bytes: str = pad_bytes.decode(encoding, encodingErrors)
except UnicodeDecodeError:
continue
if s.endswith(decoded_pad_bytes):
msg = (
f"Under the given encoding: {encoding}, "
f" the text (field name or 'C' or 'M' field): {s!r} "
f" ends with {decoded_pad_bytes!r}, which coincidentally"
f"encodes to the pad bytes: {pad_bytes!r}. "
"The real end of the actual data may be earlier. "
)

warnings.warn(msg, category=PossibleDataLoss)
break


def _encode_dbf_string(
s: str,
size: int,
Expand All @@ -273,6 +305,8 @@ def _encode_dbf_string(
N = len(s)
trimmed: str
encoded: bytes

# i - num of characters to keep. Starts by trying to keep all N.
for i in reversed(range(0, N + 1)):
trimmed = s[:i]
encoded = trimmed.encode(encoding, encodingErrors)
Expand Down Expand Up @@ -300,16 +334,27 @@ def _encode_dbf_string(
f"to a short enough byte string, using {encoding=}, {encodingErrors=}"
)

if pad_byte is not None:
_warn_if_string_ends_with_decoded_pad_bytes(
s=trimmed,
pad_byte=pad_byte,
encoding=encoding,
encodingErrors=encodingErrors,
)

if len(encoded) < size and pad_byte is not None:
padded = encoded.ljust(size, pad_byte)
else:
padded = encoded

decoded = decode(
b=padded,
encoding=encoding,
encodingErrors=encodingErrors,
)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
decoded = decode(
b=padded,
encoding=encoding,
encodingErrors=encodingErrors,
)

if decoded != trimmed:
msg = f"Padded value: {padded!r} does not decode to {trimmed!r} using PyShp's decoder: {decode.__name__}"
if len(trimmed) < len(s):
Expand All @@ -324,20 +369,67 @@ def _encode_dbf_string(
return padded, trimmed


def _try_to_decode_dbf_name_or_text_field(
b: bytes,
pad_bytes: bytes, # Pad bytes will be trimmed (from the R of b) in their order in the byte-string
encoding: str = "utf8",
encodingErrors: str = "strict",
) -> str:
N = len(b)
decoded: str
trimmed = b
num_trailing_pad_bytes = N - len(b.rstrip(pad_bytes))

# Test if we need to restore any pad_bytes to
# correctly decode the remaining bytes to a string.
# num_to_trim starts from num_trailing_pad_bytes
# - initially trimming all trailing pad bytes
for num_to_trim in reversed(range(num_trailing_pad_bytes + 1)):
i = N - num_to_trim
trimmed = b[:i]
try:
decoded = trimmed.decode(encoding, encodingErrors)
except UnicodeDecodeError:
continue
if num_to_trim < num_trailing_pad_bytes:
warnings.warn(
f"Used {num_trailing_pad_bytes - num_to_trim} pad bytes ({pad_bytes!r}) "
f"from padding to decode raw field: {b!r} "
f"to: {decoded!r} ({encoding=}, {encodingErrors=}) ",
category=PossibleDataLoss,
)
return decoded

raise dbfFileException(
f"Could not decode field name or text/memo field: {b!r} using {encoding=} and {encodingErrors=}"
" no matter how many trailing pad bytes (if any) ({pad_bytes!r}) were used. "
)


def _decode_C_or_M_field(
b: bytes,
encoding: str = "utf8",
encodingErrors: str = "strict",
strict: bool = True,
) -> str:
retval = b.decode(encoding, encodingErrors).rstrip("\x00").rstrip(" ")
if retval.rstrip("\x00") != retval and strict:
retval = _try_to_decode_dbf_name_or_text_field(
b=b,
pad_bytes=b" \x00",
encoding=encoding,
encodingErrors=encodingErrors,
)

if not strict:
return retval

if retval.rstrip("\x00") != retval:
msg = (
f"More Trailing Null chars in: {b!r}"
" after removing trailing null chars and ascii spaces"
f", resulting in {retval!r}"
f"More trailing null chars in: {retval!r}"
" after removing one trailing null char and ascii spaces"
f" from {b!r}, and decoding (codec: {encoding}, errors: {encodingErrors}). "
)
warnings.warn(msg, category=PossibleDataLoss)

return retval


Expand All @@ -360,34 +452,15 @@ def decode_name(
encodingErrors: str = "strict",
strict: bool = True,
) -> str:
N = len(b)
decoded: str
num_trailing_null_bytes = N - len(b.rstrip(b"\x00"))

# Test if we need to restore any of those null bytes to
# correctly decode the remaining bytes to a string.
for num_to_trim in reversed(range(num_trailing_null_bytes + 1)):
i = N - num_to_trim
trimmed = b[:i]
try:
decoded = trimmed.decode(encoding, encodingErrors)
except UnicodeDecodeError:
continue
if strict and num_to_trim < num_trailing_null_bytes:
warnings.warn(
f"Used {num_trailing_null_bytes - num_to_trim} null bytes "
f"from padding to decode {b!r} "
f"to: {decoded!r} ({encoding=}, {encodingErrors=}) ",
category=PossibleDataLoss,
)
if not strict:
decoded = decoded.lstrip()
return decoded

raise dbfFileException(
f"Could not decode field name: {b!r} using {encoding=} and {encodingErrors=}"
" no matter how many trailing null-bytes (if any) were used. "
decoded = _try_to_decode_dbf_name_or_text_field(
b=b,
pad_bytes=b"\x00",
encoding=encoding,
encodingErrors=encodingErrors,
)
if not strict:
decoded = decoded.lstrip()
return decoded

@classmethod
def from_byte_stream(
Expand Down Expand Up @@ -445,6 +518,14 @@ def from_unchecked(
size = 1
decimal = 0

if not strict and " " in name:
warnings.warn(
f"Replacing ascii spaces (0x20, ' 's) with underscores ('_'s) in {name!r}. "
"Use a Writer(file, strict=True) to preserve the field name as it is. ",
category=PossibleDataLoss,
)
name = name.replace(" ", "_")

# Only use the portion of the name that we are able to encode to
# 10 bytes or less.
_encoded_name, trimmed_name = cls.trim_name_until_encodable(
Expand Down Expand Up @@ -502,13 +583,6 @@ def encode_field_descriptor(
encodingErrors=encodingErrors,
strict=strict,
)
if not strict and b" " in encoded_name:
warnings.warn(
"Replacing ascii spaces (0x20) with underscores "
f"in encoded bytes: {encoded_name!r}",
category=PossibleDataLoss,
)
encoded_name = encoded_name.replace(b" ", b"_")

encoded_field_type = self.field_type.encode("ascii")
return self.get_struct().pack(
Expand Down
9 changes: 3 additions & 6 deletions tests/hypothesis_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,12 +461,14 @@ def code_and_shape_strat_from_triple(t):
@pytest.mark.hypothesis
@given(codes_and_shapes=codes_and_shapes)
def test_shp_reader_writer_roundtrip(codes_and_shapes)-> None:

code_ex, expected_shapes = codes_and_shapes
stream = io.BytesIO()

with shp.ShpWriter(shp=stream, shapeType=code_ex) as w:
for shape in expected_shapes:
w.shape(shape)
stream.seek(0)

with shp.ShpReader(shp=stream) as r:
assert r.shapeType == code_ex

Expand Down Expand Up @@ -495,8 +497,6 @@ def test_shp_reader_writer_roundtrip(codes_and_shapes)-> None:
assert not hasattr(expected, "partTypes")




@pytest.mark.hypothesis
@given(codes_and_shapes=codes_and_shapes)
def test_shx_reader_writer_roundtrip(codes_and_shapes)-> None:
Expand All @@ -516,8 +516,6 @@ def test_shx_reader_writer_roundtrip(codes_and_shapes)-> None:
offsets_B.append(offset_B)
shx_w._shx_record(offset_B, size_B)

shx_stream.seek(0)

with shp.ShxReader(shx=shx_stream) as r:
assert r.numShapes == len(expected_shapes)
assert r.offsets == offsets_B
Expand Down Expand Up @@ -655,7 +653,6 @@ def test_dbf_reader_writer_roundtrip(fields_and_records)-> None:
written_records.append(record)


stream.seek(0)
with shp.DbfReader(dbf=stream) as r:
actual_fields = iter(r.fields)
next(actual_fields) # skip deletion flag
Expand Down
Loading
Loading