diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 76e7b2127..068fbd80d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -31,12 +31,14 @@ v17.1.0 (next) Features: - Use FFmpeg 8.1.1 in the binary wheels. - Expose ``AVCodecContext.global_quality`` by :gh-user:`WyattBlue` in (:pr:`2246`). +- Expose ``Stream.discard`` so demuxing and seeking can skip unwanted streams (:issue:`2272`). Fixes: - Add ``cython.final`` to leaf classes, ensuring that they are not subclassed. - Warn that ``CodecContext.decode()`` is not memory safe in some cases. - Fix ``enumerate_input_devices`` and ``enumerate_output_devices`` raising ``AttributeError`` (:issue:`2264`). - Map HTTP 429 to ``HTTPTooManyRequestsError`` instead of ``UndefinedError`` (:issue:`2267`). +- Fix crash in ``VideoFrame.to_ndarray()`` and ``to_image()`` on bottom-up frames with a negative ``line_size`` (:issue:`2213`). v17.0.1 ------- diff --git a/av/stream.py b/av/stream.py index 23749e699..f96030e26 100644 --- a/av/stream.py +++ b/av/stream.py @@ -1,4 +1,4 @@ -from enum import IntFlag +from enum import IntEnum, IntFlag import cython from cython.cimports import libav as lib @@ -34,6 +34,16 @@ class Disposition(IntFlag): multilayer = 1 << 21 +class Discard(IntEnum): + none = lib.AVDISCARD_NONE + default = lib.AVDISCARD_DEFAULT + nonref = lib.AVDISCARD_NONREF + bidir = lib.AVDISCARD_BIDIR + nonintra = lib.AVDISCARD_NONINTRA + nonkey = lib.AVDISCARD_NONKEY + all = lib.AVDISCARD_ALL + + _cinit_bypass_sentinel = cython.declare(object, object()) @@ -132,6 +142,9 @@ def __setattr__(self, name, value): if name == "disposition": self.ptr.disposition = value return + if name == "discard": + self.ptr.discard = Discard(value).value + return if name == "time_base": to_avrational(value, cython.address(self.ptr.time_base)) return @@ -268,6 +281,19 @@ def language(self): def disposition(self): return Disposition(self.ptr.disposition) + @property + def discard(self): + """ + Controls which packets of this stream are discarded by the demuxer. + + Set this to e.g. :attr:`Discard.all` on streams you don't need so that + :meth:`.Container.demux` and :meth:`.Container.seek` skip them, avoiding + the cost of synchronizing streams you never read. + + :type: Discard + """ + return Discard(self.ptr.discard) + @property def type(self): """ diff --git a/av/stream.pyi b/av/stream.pyi index 680166fd6..f9148021f 100644 --- a/av/stream.pyi +++ b/av/stream.pyi @@ -1,4 +1,4 @@ -from enum import IntFlag +from enum import IntEnum, IntFlag from fractions import Fraction from typing import Literal, cast @@ -27,6 +27,15 @@ class Disposition(IntFlag): still_image = cast(int, ...) multilayer = cast(int, ...) +class Discard(IntEnum): + none = cast(int, ...) + default = cast(int, ...) + nonref = cast(int, ...) + bidir = cast(int, ...) + nonintra = cast(int, ...) + nonkey = cast(int, ...) + all = cast(int, ...) + class Stream: name: str | None container: Container @@ -46,6 +55,7 @@ class Stream: start_time: int | None duration: int | None disposition: Disposition + discard: Discard frames: int language: str | None type: Literal["video", "audio", "data", "subtitle", "attachment", "unknown"] diff --git a/av/video/frame.py b/av/video/frame.py index 605ec9a9c..1a7b5364d 100644 --- a/av/video/frame.py +++ b/av/video/frame.py @@ -447,7 +447,8 @@ def useful_array( import numpy as np dtype_obj = np.dtype(dtype) - total_line_size = abs(plane.frame.ptr.linesize[plane.index]) + line_size = plane.frame.ptr.linesize[plane.index] + total_line_size = abs(line_size) itemsize = dtype_obj.itemsize channels = bytes_per_pixel // itemsize @@ -458,6 +459,13 @@ def useful_array( shape = (plane.height, plane.width, channels) strides = (total_line_size, bytes_per_pixel, itemsize) + if line_size < 0: + offset = (plane.height - 1) * total_line_size + strides = (-total_line_size, *strides[1:]) + return np.ndarray( + shape, dtype=dtype_obj, buffer=plane, offset=offset, strides=strides + ) + return np.ndarray(shape, dtype=dtype_obj, buffer=plane, strides=strides) @@ -704,17 +712,24 @@ def to_image(self, **kwargs): plane: VideoPlane = self.reformat(format="rgb24", **kwargs).planes[0] i_buf: cython.const[uint8_t][:] = plane - i_pos: cython.size_t = 0 - i_stride: cython.size_t = plane.line_size + line_size: cython.int = plane.line_size + i_stride: cython.size_t = abs(line_size) o_pos: cython.size_t = 0 o_stride: cython.size_t = plane.width * 3 o_size: cython.size_t = plane.height * o_stride o_buf: bytearray = bytearray(o_size) + # For bottom-up frames (negative line_size) the buffer protocol exposes + # rows from the lowest address, so the top display row is at the far end. + i_pos: cython.size_t = (plane.height - 1) * i_stride if line_size < 0 else 0 + while o_pos < o_size: o_buf[o_pos : o_pos + o_stride] = i_buf[i_pos : i_pos + o_stride] - i_pos += i_stride + if line_size < 0: + i_pos -= i_stride + else: + i_pos += i_stride o_pos += o_stride return Image.frombytes( diff --git a/av/video/plane.py b/av/video/plane.py index 0652f108f..15fe70a94 100644 --- a/av/video/plane.py +++ b/av/video/plane.py @@ -78,7 +78,14 @@ def __getbuffer__(self, view: cython.pointer[Py_buffer], flags: cython.int): ) if flags & PyBUF_WRITABLE and not self._buffer_writable(): raise ValueError("buffer is not writable") - PyBuffer_FillInfo(view, self, self._buffer_ptr(), self._buffer_size(), 0, flags) + + ptr: cython.p_void = self._buffer_ptr() + line_size: cython.int = self.frame.ptr.linesize[self.index] + if line_size < 0: + height: cython.int = self.height + ptr = cython.cast(cython.p_char, ptr) + (height - 1) * line_size + + PyBuffer_FillInfo(view, self, ptr, self._buffer_size(), 0, flags) def __dlpack_device__(self): if self.frame.ptr.hw_frames_ctx: diff --git a/docs/api/stream.rst b/docs/api/stream.rst index 99a30b136..98d8fc43d 100644 --- a/docs/api/stream.rst +++ b/docs/api/stream.rst @@ -92,5 +92,7 @@ Others .. autoattribute:: Stream.language +.. autoattribute:: Stream.discard + diff --git a/include/avformat.pxd b/include/avformat.pxd index 5aca5e287..1cd07d814 100644 --- a/include/avformat.pxd +++ b/include/avformat.pxd @@ -25,6 +25,7 @@ cdef extern from "libavformat/avformat.h" nogil: int index int id int disposition + AVDiscard discard AVCodecParameters *codecpar AVRational time_base int64_t start_time diff --git a/tests/test_streams.py b/tests/test_streams.py index a9f5e5bdc..1afc67222 100644 --- a/tests/test_streams.py +++ b/tests/test_streams.py @@ -136,6 +136,45 @@ def test_selection(self) -> None: data = container.streams.data[0] assert data == container.streams.best("data") + def test_discard(self) -> None: + from av.stream import Discard + + container = av.open( + fate_suite("amv/MTV_high_res_320x240_sample_Penguin_Joke_MTV_from_WMV.amv") + ) + audio = container.streams.audio[0] + + # Default discard policy. + assert audio.discard == Discard.default + + # Setter accepts the enum and round-trips. + audio.discard = Discard.all + assert audio.discard == Discard.all + + audio.discard = Discard.nonkey + assert audio.discard == Discard.nonkey + container.close() + + # Discarding a stream makes demux skip (almost) all of its packets. + def audio_packets(discard: Discard | None) -> int: + c = av.open( + fate_suite( + "amv/MTV_high_res_320x240_sample_Penguin_Joke_MTV_from_WMV.amv" + ) + ) + if discard is not None: + c.streams.audio[0].discard = discard + count = sum( + 1 for p in c.demux() if p.dts is not None and p.stream.type == "audio" + ) + c.close() + return count + + baseline = audio_packets(None) + discarded = audio_packets(Discard.all) + assert baseline > 0 + assert discarded < baseline + def test_printing_video_stream(self) -> None: input_ = av.open( fate_suite("amv/MTV_high_res_320x240_sample_Penguin_Joke_MTV_from_WMV.amv") diff --git a/tests/test_videoframe.py b/tests/test_videoframe.py index 8552683e8..26386adb0 100644 --- a/tests/test_videoframe.py +++ b/tests/test_videoframe.py @@ -161,6 +161,49 @@ def test_basic_to_ndarray() -> None: assert array.shape == (480, 640, 3) +def _vflip(frame: VideoFrame) -> VideoFrame: + """Vertically flip a frame, which yields a bottom-up frame with a negative + ``line_size`` (the same layout DirectShow produces, see GH-2213).""" + graph = av.filter.Graph() + src = graph.add_buffer( + template=None, + width=frame.width, + height=frame.height, + format=frame.format, + time_base=Fraction(1, 1000), + ) + vflip = graph.add("vflip") + sink = graph.add("buffersink") + src.link_to(vflip) + vflip.link_to(sink) + graph.configure() + graph.push(frame) + out = graph.pull() + assert isinstance(out, VideoFrame) + return out + + +@pytest.mark.parametrize("format", ["rgb24", "bgr24", "gray"]) +def test_negative_linesize_to_ndarray(format: str) -> None: + # Bottom-up packed frames have a negative line_size; to_ndarray() must read + # them without crashing (GH-2213) and in the correct top-down order. + height, width = 6, 4 + if format == "gray": + array = numpy.arange(height * width, dtype=numpy.uint8).reshape(height, width) + else: + array = numpy.zeros((height, width, 3), dtype=numpy.uint8) + for row in range(height): + array[row, :, :] = row * 10 + + frame = _vflip(VideoFrame.from_ndarray(array, format=format)) + assert frame.planes[0].line_size < 0 + + result = frame.to_ndarray(format=format) + assertNdarraysEqual(result, array[::-1]) + # Fully materializing the array used to segfault on a bottom-up frame. + assert result.copy().sum() == int(array.sum()) + + def test_ndarray_gray() -> None: array = numpy.random.randint(0, 256, size=(480, 640), dtype=numpy.uint8) for format in ("gray", "gray8"):