From 772a896a56e0e3ef9424a025cec9176f9d8f4552 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 11 Apr 2026 14:06:00 +0200 Subject: [PATCH 001/972] scsi: target: configfs: Bound snprintf() return in tg_pt_gp_members_show() target_tg_pt_gp_members_show() formats LUN paths with snprintf() into a 256-byte stack buffer, then will memcpy() cur_len bytes from that buffer. snprintf() returns the length the output would have had, which can exceed the buffer size when the fabric WWN is long because iSCSI IQN names can be up to 223 bytes. The check at the memcpy() site only guards the destination page write, not the source read, so memcpy() will read past the stack buffer and copy adjacent stack contents to the sysfs reader, which when CONFIG_FORTIFY_SOURCE is enabled, fortify_panic() will be triggered. Commit 27e06650a5ea ("scsi: target: target_core_configfs: Add length check to avoid buffer overflow") added the same bound to the target_lu_gp_members_show() but the tg_pt_gp variant was missed so resolve that here. Cc: Martin K. Petersen Fixes: c66ac9db8d4a ("[SCSI] target: Add LIO target core v4.0.0-rc6") Assisted-by: gregkh_clanker_t1000 Signed-off-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2026041159-garter-theft-3be0@gregkh Signed-off-by: Martin K. Petersen --- drivers/target/target_core_configfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index d93773b3227c3e..2b19a956007b73 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -3249,7 +3249,7 @@ static ssize_t target_tg_pt_gp_members_show(struct config_item *item, config_item_name(&lun->lun_group.cg_item)); cur_len++; /* Extra byte for NULL terminator */ - if ((cur_len + len) > PAGE_SIZE) { + if (cur_len > TG_PT_GROUP_NAME_BUF || (cur_len + len) > PAGE_SIZE) { pr_warn("Ran out of lu_gp_show_attr" "_members buffer\n"); break; From bc0fcb9823cd0894934cf968b525c575833d7078 Mon Sep 17 00:00:00 2001 From: Yilin Zhu Date: Sun, 12 Apr 2026 13:07:54 +0800 Subject: [PATCH 002/972] ipv6: xfrm6: release dst on error in xfrm6_rcv_encap() xfrm6_rcv_encap() performs an IPv6 route lookup when the skb does not already have a dst attached. ip6_route_input_lookup() returns a referenced dst entry even when the lookup resolves to an error route. If dst->error is set, xfrm6_rcv_encap() drops the skb without attaching the dst to the skb and without releasing the reference returned by the lookup. Repeated packets hitting this path therefore leak dst entries. Release the dst before jumping to the drop path. Fixes: 0146dca70b87 ("xfrm: add support for UDPv6 encapsulation of ESP") Cc: stable@kernel.org Reported-by: Yifan Wu Reported-by: Juefei Pu Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Tested-by: Ruide Cao Signed-off-by: Yilin Zhu Signed-off-by: Ren Wei Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_protocol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c index ea2f805d3b014c..9b586fcec4850b 100644 --- a/net/ipv6/xfrm6_protocol.c +++ b/net/ipv6/xfrm6_protocol.c @@ -88,8 +88,10 @@ int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, dst = ip6_route_input_lookup(dev_net(skb->dev), skb->dev, &fl6, skb, flags); - if (dst->error) + if (dst->error) { + dst_release(dst); goto drop; + } skb_dst_set(skb, dst); } From 16d990a15491cf76cd6eef0846e1b4100e63261a Mon Sep 17 00:00:00 2001 From: Junrui Luo Date: Wed, 15 Apr 2026 17:26:55 +0800 Subject: [PATCH 003/972] KVM: s390: pci: fix GAIT table indexing due to double-scaling pointer arithmetic kvm_s390_pci_aif_enable(), kvm_s390_pci_aif_disable(), and aen_host_forward() index the GAIT by manually multiplying the index with sizeof(struct zpci_gaite). Since aift->gait is already a struct zpci_gaite pointer, this double-scales the offset, accessing element aisb*16 instead of aisb. This causes out-of-bounds accesses when aisb >= 32 (with ZPCI_NR_DEVICES=512) Fix by removing the erroneous sizeof multiplication. Fixes: 3c5a1b6f0a18 ("KVM: s390: pci: provide routines for enabling/disabling interrupt forwarding") Fixes: 73f91b004321 ("KVM: s390: pci: enable host forwarding of Adapter Event Notifications") Reported-by: Yuhao Jiang Cc: stable@vger.kernel.org Signed-off-by: Junrui Luo Reviewed-by: Christian Borntraeger Reviewed-by: Matthew Rosato Tested-by: Matthew Rosato Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 3 +-- arch/s390/kvm/pci.c | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 7cb8ce833b6254..f48f25c7dc8fe4 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -3307,8 +3307,7 @@ static void aen_host_forward(unsigned long si) struct zpci_gaite *gaite; struct kvm *kvm; - gaite = (struct zpci_gaite *)aift->gait + - (si * sizeof(struct zpci_gaite)); + gaite = aift->gait + si; if (gaite->count == 0) return; if (gaite->aisb != 0) diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c index 86d93e8dddae3e..eed45af1a92d48 100644 --- a/arch/s390/kvm/pci.c +++ b/arch/s390/kvm/pci.c @@ -290,8 +290,7 @@ static int kvm_s390_pci_aif_enable(struct zpci_dev *zdev, struct zpci_fib *fib, phys_to_virt(fib->fmt0.aibv)); spin_lock_irq(&aift->gait_lock); - gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * - sizeof(struct zpci_gaite)); + gaite = aift->gait + zdev->aisb; /* If assist not requested, host will get all alerts */ if (assist) @@ -357,8 +356,7 @@ static int kvm_s390_pci_aif_disable(struct zpci_dev *zdev, bool force) if (zdev->kzdev->fib.fmt0.aibv == 0) goto out; spin_lock_irq(&aift->gait_lock); - gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * - sizeof(struct zpci_gaite)); + gaite = aift->gait + zdev->aisb; isc = gaite->gisc; gaite->count--; if (gaite->count == 0) { From ec54093e6a8f87e800bb6aa15eb7fc1e33faa524 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Sun, 19 Apr 2026 18:35:42 -0400 Subject: [PATCH 004/972] xfrm: ah: account for ESN high bits in async callbacks AH allocates its temporary auth/ICV layout differently when ESN is enabled: the async ahash setup appends a 4-byte seqhi slot before the ICV or auth_data area, but the async completion callbacks still reconstruct the temporary layout as if seqhi were absent. With an async AH implementation selected, that makes AH copy or compare the wrong bytes on both the IPv4 and IPv6 paths. In UML repro on IPv4 AH with ESN and forced async hmac(sha1), ping fails with 100% packet loss, and the callback logs show the pre-fix drift: ah4 output_done: esn=1 err=0 icv_off=20 expected_off=24 ah4 input_done: esn=1 auth_off=20 expected_auth_off=24 icv_off=32 expected_icv_off=36 Reconstruct the callback-side layout the same way the setup path built it by skipping the ESN seqhi slot before locating the saved auth_data or ICV. Per RFC 4302, the ESN high-order 32 bits participate in the AH ICV computation, so the async callbacks must account for the seqhi slot. Post-fix, the same IPv4 AH+ESN+forced-async-hmac(sha1) UML repro shows the corrected offset (ah4 output_done: esn=1 err=0 icv_off=24 expected_off=24) and ping succeeds; net/ipv4/ah4.o and net/ipv6/ah6.o build clean at W=1. IPv6 AH+ESN was not exercised at runtime, and the change has not been tested against a real async hardware AH engine. Fixes: d4d573d0334d ("{IPv4,xfrm} Add ESN support for AH egress part") Fixes: d8b2a8600b0e ("{IPv4,xfrm} Add ESN support for AH ingress part") Fixes: 26dd70c3fad3 ("{IPv6,xfrm} Add ESN support for AH egress part") Fixes: 8d6da6f32557 ("{IPv6,xfrm} Add ESN support for AH ingress part") Cc: stable@vger.kernel.org Assisted-by: Codex:gpt-5-4 Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Michael Bommarito Signed-off-by: Steffen Klassert --- net/ipv4/ah4.c | 14 ++++++++++++-- net/ipv6/ah6.c | 14 ++++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 5fb812443a08f2..4366cbac3f06c5 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -124,9 +124,14 @@ static void ah_output_done(void *data, int err) struct iphdr *top_iph = ip_hdr(skb); struct ip_auth_hdr *ah = ip_auth_hdr(skb); int ihl = ip_hdrlen(skb); + int seqhi_len = 0; + __be32 *seqhi; + if (x->props.flags & XFRM_STATE_ESN) + seqhi_len = sizeof(*seqhi); iph = AH_SKB_CB(skb)->tmp; - icv = ah_tmp_icv(iph, ihl); + seqhi = (__be32 *)((char *)iph + ihl); + icv = ah_tmp_icv(seqhi, seqhi_len); memcpy(ah->auth_data, icv, ahp->icv_trunc_len); top_iph->tos = iph->tos; @@ -270,12 +275,17 @@ static void ah_input_done(void *data, int err) struct ip_auth_hdr *ah = ip_auth_hdr(skb); int ihl = ip_hdrlen(skb); int ah_hlen = (ah->hdrlen + 2) << 2; + int seqhi_len = 0; + __be32 *seqhi; if (err) goto out; + if (x->props.flags & XFRM_STATE_ESN) + seqhi_len = sizeof(*seqhi); work_iph = AH_SKB_CB(skb)->tmp; - auth_data = ah_tmp_auth(work_iph, ihl); + seqhi = (__be32 *)((char *)work_iph + ihl); + auth_data = ah_tmp_auth(seqhi, seqhi_len); icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len); err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index cb26beea439824..de1e68199a0145 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -317,14 +317,19 @@ static void ah6_output_done(void *data, int err) struct ipv6hdr *top_iph = ipv6_hdr(skb); struct ip_auth_hdr *ah = ip_auth_hdr(skb); struct tmp_ext *iph_ext; + int seqhi_len = 0; + __be32 *seqhi; extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr); if (extlen) extlen += sizeof(*iph_ext); + if (x->props.flags & XFRM_STATE_ESN) + seqhi_len = sizeof(*seqhi); iph_base = AH_SKB_CB(skb)->tmp; iph_ext = ah_tmp_ext(iph_base); - icv = ah_tmp_icv(iph_ext, extlen); + seqhi = (__be32 *)((char *)iph_ext + extlen); + icv = ah_tmp_icv(seqhi, seqhi_len); memcpy(ah->auth_data, icv, ahp->icv_trunc_len); memcpy(top_iph, iph_base, IPV6HDR_BASELEN); @@ -471,13 +476,18 @@ static void ah6_input_done(void *data, int err) struct ip_auth_hdr *ah = ip_auth_hdr(skb); int hdr_len = skb_network_header_len(skb); int ah_hlen = ipv6_authlen(ah); + int seqhi_len = 0; + __be32 *seqhi; if (err) goto out; + if (x->props.flags & XFRM_STATE_ESN) + seqhi_len = sizeof(*seqhi); work_iph = AH_SKB_CB(skb)->tmp; auth_data = ah_tmp_auth(work_iph, hdr_len); - icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len); + seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len); + icv = ah_tmp_icv(seqhi, seqhi_len); err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) From 7b03c93d2beb91c6abae322a1f25447b5b3bb9e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Apr 2026 08:08:06 +0200 Subject: [PATCH 005/972] scsi: sg: Don't use GFP_ATOMIC in sg_start_req() sg_start_req() is called from normal user context and can sleep when waiting for memory. Switch it to use GFP_KERNEL, which fixes allocation failures seen with the bio_alloc rework. Fixes: b520c4eef83d ("block: split bio_alloc_bioset more clearly into a fast and slowpath") Reported-by: Shin'ichiro Kawasaki Signed-off-by: Christoph Hellwig Tested-by: Shin'ichiro Kawasaki Reviewed-by: John Garry Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche Link: https://patch.msgid.link/20260415060813.807659-2-hch@lst.de Signed-off-by: Martin K. Petersen --- drivers/scsi/sg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 2b4b2a1a8e442c..74cd4e8a61c2a0 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1801,7 +1801,7 @@ sg_start_req(Sg_request *srp, unsigned char *cmd) } res = blk_rq_map_user_io(rq, md, hp->dxferp, hp->dxfer_len, - GFP_ATOMIC, iov_count, iov_count, 1, rw); + GFP_KERNEL, iov_count, iov_count, 1, rw); if (!res) { srp->bio = rq->bio; From 04631f55afc543d5431a2bdee7f6cc0f2c0debe7 Mon Sep 17 00:00:00 2001 From: Ranjan Kumar Date: Tue, 14 Apr 2026 16:38:11 +0530 Subject: [PATCH 006/972] scsi: mpt3sas: Limit NVMe request size to 2 MiB The HBA firmware reports NVMe MDTS values based on the underlying drive capability. However, because the driver allocates a fixed 4K buffer for the PRP list, accommodating at most 512 entries, the driver supports a maximum I/O transfer size of 2 MiB. Limit max_hw_sectors to the smaller of the reported MDTS and the 2 MiB driver limit to prevent issuing oversized I/O that may lead to a kernel oops. Cc: stable@vger.kernel.org Fixes: 9b8b84879d4a ("block: Increase BLK_DEF_MAX_SECTORS_CAP") Reported-by: Mira Limbeck Closes: https://lore.kernel.org/r/291f78bf-4b4a-40dd-867d-053b36c564b3@proxmox.com Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9b8b84879d4a Suggested-by: Keith Busch Signed-off-by: Ranjan Kumar Tested-by: Mira Limbeck Link: https://patch.msgid.link/20260414110811.85156-1-ranjan.kumar@broadcom.com Signed-off-by: Martin K. Petersen --- drivers/scsi/mpt3sas/mpt3sas_scsih.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 6ff78855729424..12caffeed3a0d2 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -2738,8 +2738,20 @@ scsih_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) pcie_device->enclosure_level, pcie_device->connector_name); + /* + * The HBA firmware passes the NVMe drive's MDTS + * (Maximum Data Transfer Size) up to the driver. However, + * the driver hardcodes a 4K buffer size for the PRP list, + * accommodating at most 512 entries. This strictly limits + * the maximum supported NVMe I/O transfer to 2 MiB. + * + * Cap max_hw_sectors to the smaller of the drive's reported + * MDTS or the 2 MiB driver limit to prevent kernel oopses. + */ + lim->max_hw_sectors = SZ_2M >> SECTOR_SHIFT; if (pcie_device->nvme_mdts) - lim->max_hw_sectors = pcie_device->nvme_mdts / 512; + lim->max_hw_sectors = min(lim->max_hw_sectors, + pcie_device->nvme_mdts >> SECTOR_SHIFT); pcie_device_put(pcie_device); spin_unlock_irqrestore(&ioc->pcie_device_lock, flags); From d65efdf467ff935e35dfe6aa9a7ab93f17ac07ee Mon Sep 17 00:00:00 2001 From: Tomas Henzl Date: Tue, 14 Apr 2026 14:41:18 +0200 Subject: [PATCH 007/972] scsi: smartpqi: Silence a recursive lock warning On systems with multiple controllers debug kernel shows WARNING: possible recursive locking detected during shutdown. Each controller does have its own ctrl_info (and mutex) and that isn't correctly recognized by debug kernel. Suppress the warning by releasing the mutex at the end of pqi_shutdown(). Signed-off-by: Tomas Henzl Acked-by: Don Brace Link: https://patch.msgid.link/20260414124118.23661-1-thenzl@redhat.com Signed-off-by: Martin K. Petersen --- drivers/scsi/smartpqi/smartpqi_init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index b4ed991976d065..2026ac645d6ab4 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -9427,6 +9427,7 @@ static void pqi_shutdown(struct pci_dev *pci_dev) pqi_crash_if_pending_command(ctrl_info); pqi_reset(ctrl_info); + pqi_ctrl_unblock_device_reset(ctrl_info); } static void pqi_process_lockup_action_param(void) From 68c3a65a5a8e85643745fdde02cb63904e165620 Mon Sep 17 00:00:00 2001 From: Brian Bunker Date: Thu, 16 Apr 2026 09:55:12 -0700 Subject: [PATCH 008/972] scsi: scsi_dh_alua: Increase default ALUA timeout to maximum spec value The ALUA handler maps a 0 value (no implicit transition timeout provided by the target) to the ALUA_FAILOVER_TIMEOUT constant, currently 60 seconds. This means the kernel already does not accept an infinite transition time. However, 60 seconds is insufficient for some arrays that may take longer to complete ALUA transitions. Since the highest value allowed by the SCSI specification for the implicit transition timeout is a single byte (255 seconds), change the default to 255. This way, when a target does not provide an explicit transition timeout, we default to the maximum value the spec allows rather than an arbitrary 60 second limit. Co-developed-by: Krishna Kant Signed-off-by: Krishna Kant Co-developed-by: Riya Savla Signed-off-by: Riya Savla Signed-off-by: Brian Bunker Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20260416165512.26497-2-brian@purestorage.com Signed-off-by: Martin K. Petersen --- drivers/scsi/device_handler/scsi_dh_alua.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c index efb08b9b145a14..80ab0ff921d430 100644 --- a/drivers/scsi/device_handler/scsi_dh_alua.c +++ b/drivers/scsi/device_handler/scsi_dh_alua.c @@ -37,7 +37,7 @@ #define TPGS_MODE_EXPLICIT 0x2 #define ALUA_RTPG_SIZE 128 -#define ALUA_FAILOVER_TIMEOUT 60 +#define ALUA_FAILOVER_TIMEOUT 255 /* max 255 (8-bit value) */ #define ALUA_FAILOVER_RETRIES 5 #define ALUA_RTPG_DELAY_MSECS 5 #define ALUA_RTPG_RETRY_DELAY 2 From 1dc39ed655750d6c679d3ada4adf4a937f2a63fc Mon Sep 17 00:00:00 2001 From: Hugo Villeneuve Date: Fri, 17 Apr 2026 16:07:31 -0400 Subject: [PATCH 009/972] scsi: pmcraid: Fix typo in comments Fix typo in structure comment. Signed-off-by: Hugo Villeneuve Link: https://patch.msgid.link/20260417200738.3920001-1-hugo@hugovil.com Signed-off-by: Martin K. Petersen --- drivers/scsi/pmcraid.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/pmcraid.h b/drivers/scsi/pmcraid.h index 9f59930e8b4fdd..cd059b7599b4ce 100644 --- a/drivers/scsi/pmcraid.h +++ b/drivers/scsi/pmcraid.h @@ -657,7 +657,7 @@ struct pmcraid_hostrcb { */ struct pmcraid_instance { /* Array of allowed-to-be-exposed resources, initialized from - * Configutation Table, later updated with CCNs + * Configuration Table, later updated with CCNs */ struct pmcraid_resource_entry *res_entries; From 47e66bec3edaebd7c52d8ee981065a4c83b3072f Mon Sep 17 00:00:00 2001 From: Yihang Li Date: Mon, 20 Apr 2026 10:10:44 +0800 Subject: [PATCH 010/972] scsi: hisi_sas: Fix sparse warnings in prep_ata_v3_hw() In prep_ata_v3_hw(), add cpu_to_le32() to fix warning: drivers/scsi/hisi_sas/hisi_sas_v3_hw.c:1448:26: sparse: sparse: invalid assignment: |= drivers/scsi/hisi_sas/hisi_sas_v3_hw.c:1448:26: sparse: left side has type restricted __le32 drivers/scsi/hisi_sas/hisi_sas_v3_hw.c:1448:26: sparse: right side has type unsigned int Fixes: 8aa580cd9284 ("scsi: hisi_sas: Enable force phy when SATA disk directly connected") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202604191850.IVYPTaML-lkp@intel.com/ Signed-off-by: Yihang Li Link: https://patch.msgid.link/20260420021044.3339459-1-liyihang9@huawei.com Signed-off-by: Martin K. Petersen --- drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index fda07b193137af..14d563e82d2085 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -1491,7 +1491,7 @@ static void prep_ata_v3_hw(struct hisi_hba *hisi_hba, phy_id = device->phy->identify.phy_identifier; hdr->dw0 |= cpu_to_le32((1U << phy_id) << CMD_HDR_PHY_ID_OFF); - hdr->dw0 |= CMD_HDR_FORCE_PHY_MSK; + hdr->dw0 |= cpu_to_le32(CMD_HDR_FORCE_PHY_MSK); hdr->dw0 |= cpu_to_le32(4U << CMD_HDR_CMD_OFF); } From 36920f30e78e69df01f9691c470b6f3ba8aebf98 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Mon, 20 Apr 2026 12:50:09 -0500 Subject: [PATCH 011/972] ipmi: Check event message buffer response for bad data The event message buffer response data size got checked later when processing, but check it right after the response comes back. It appears some BMCs may return an empty message instead of an error when fetching events. There are apparently some new BMCs that make this error, so we need to compensate. Reported-by: Matt Fleming Closes: https://lore.kernel.org/lkml/20260415115930.3428942-1-matt@readmodwrite.com/ Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_si_intf.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 4a9e9de4d684f9..08c208cc64c56b 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -630,7 +630,13 @@ static void handle_transaction_done(struct smi_info *smi_info) */ msg = smi_info->curr_msg; smi_info->curr_msg = NULL; - if (msg->rsp[2] != 0) { + /* + * It appears some BMCs, with no event data, return no + * data in the message and not a 0x80 error as the + * spec says they should. Shut down processing if + * the data is not the right length. + */ + if (msg->rsp[2] != 0 || msg->rsp_size != 19) { /* Error getting event, probably done. */ msg->done(msg); From b06cf63d83d3b3744d3aefdd2f3ced25e99d7ec1 Mon Sep 17 00:00:00 2001 From: Wang Shuaiwei Date: Tue, 14 Apr 2026 11:37:18 +0800 Subject: [PATCH 012/972] scsi: ufs: core: Fix bRefClkFreq write failure in HS-LSS mode According to the UFS spec, the bRefClkFreq attribute can only be written when both sub-links are in LS-MODE. However, in HS LSS mode with resetmode = HS_MODE, if the UFS device's default bRefClkFreq value differs from the host controller's dev_ref_clk_freq setting, the write operation will fail. To fix this issue, introduce ufshcd_get_op_mode() function to detect the current link operational mode. Call ufshcd_set_dev_ref_clk() only when both sub-links are in LS-MODE to ensure the attribute can be written successfully. Signed-off-by: Wang Shuaiwei Link: https://patch.msgid.link/20260414033718.1459540-1-wangshuaiwei1@xiaomi.com Reviewed-by: Peter Wang Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 30 ++++++++++++++++++++++++++++-- include/ufs/unipro.h | 5 +++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 4805e40ed4d78c..c3f08957d179ac 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -9259,6 +9259,30 @@ static void ufshcd_config_mcq(struct ufs_hba *hba) hba->nutrs); } +/** + * ufshcd_get_op_mode - get UFS operating mode. + * @hba: per-adapter instance + * + * Use the PA_PWRMODE value to represent the operating mode of UFS. + * + */ +static enum ufs_op_mode ufshcd_get_op_mode(struct ufs_hba *hba) +{ + u32 mode; + u8 rx_mode; + u8 tx_mode; + + ufshcd_dme_get(hba, UIC_ARG_MIB(PA_PWRMODE), &mode); + rx_mode = (mode >> PWRMODE_RX_OFFSET) & PWRMODE_MASK; + tx_mode = mode & PWRMODE_MASK; + + if ((rx_mode == SLOW_MODE || rx_mode == SLOWAUTO_MODE) && + (tx_mode == SLOW_MODE || tx_mode == SLOWAUTO_MODE)) + return LS_MODE; + + return HS_MODE; +} + static int ufshcd_post_device_init(struct ufs_hba *hba) { int ret; @@ -9281,11 +9305,13 @@ static int ufshcd_post_device_init(struct ufs_hba *hba) return 0; /* - * Set the right value to bRefClkFreq before attempting to + * Set the right value to bRefClkFreq in LS_MODE before attempting to * switch to HS gears. */ - if (hba->dev_ref_clk_freq != REF_CLK_FREQ_INVAL) + if (ufshcd_get_op_mode(hba) == LS_MODE && + hba->dev_ref_clk_freq != REF_CLK_FREQ_INVAL) ufshcd_set_dev_ref_clk(hba); + /* Gear up to HS gear. */ ret = ufshcd_config_pwr_mode(hba, &hba->max_pwr_info.info, UFSHCD_PMC_POLICY_DONT_FORCE); diff --git a/include/ufs/unipro.h b/include/ufs/unipro.h index f849a2a101ae29..9c168703b10480 100644 --- a/include/ufs/unipro.h +++ b/include/ufs/unipro.h @@ -333,6 +333,11 @@ enum ufs_eom_eye_mask { #define DME_LocalTC0ReplayTimeOutVal 0xD042 #define DME_LocalAFC0ReqTimeOutVal 0xD043 +enum ufs_op_mode { + LS_MODE = 1, + HS_MODE = 2, +}; + /* PA power modes */ enum ufs_pa_pwr_mode { FAST_MODE = 1, From 2f3835771dff512750205aa5f5f61aec0f2b8cb7 Mon Sep 17 00:00:00 2001 From: Carlos Bilbao Date: Tue, 14 Apr 2026 21:07:28 -0700 Subject: [PATCH 013/972] scsi: target: iscsi: reject invalid size Extended CDB AHS If ecdb_ahdr->ahslength is zero, two bugs follow: kmalloc(be16_to_cpu(ecdb_ahdr->ahslength) + 15, ...) allocates 15 bytes, but the immediately following memcpy writes ISCSI_CDB_SIZE (16) bytes into it, a one-byte heap overflow. Also: memcpy(cdb + ISCSI_CDB_SIZE, ecdb_ahdr->ecdb, be16_to_cpu(ecdb_ahdr->ahslength) - 1); (u16)0 - 1 promotes to (int)-1 which converts to SIZE_MAX as size_t, causing a massive out-of-bounds write. Reject ahslength == 0 with ISCSI_REASON_PROTOCOL_ERROR before the kmalloc. Also reject ahslength values that exceed the actual AHS buffer advertised. Fixes: 8f1f7d297bce ("scsi: target: iscsi: Add support for extended CDB AHS") Signed-off-by: Carlos Bilbao Reviewed-by: Dmitry Bogdanov Link: https://patch.msgid.link/20260415040728.187680-1-carlos.bilbao@kernel.org Signed-off-by: Martin K. Petersen --- drivers/target/iscsi/iscsi_target.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index e80449f6ce159c..cb832fd523af18 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -995,6 +995,7 @@ int iscsit_setup_scsi_cmd(struct iscsit_conn *conn, struct iscsit_cmd *cmd, int data_direction, payload_length; struct iscsi_ecdb_ahdr *ecdb_ahdr; struct iscsi_scsi_req *hdr; + u16 ahslength, cdb_length; int iscsi_task_attr; unsigned char *cdb; int sam_task_attr; @@ -1108,14 +1109,27 @@ int iscsit_setup_scsi_cmd(struct iscsit_conn *conn, struct iscsit_cmd *cmd, ISCSI_REASON_CMD_NOT_SUPPORTED, buf); } - cdb = kmalloc(be16_to_cpu(ecdb_ahdr->ahslength) + 15, - GFP_KERNEL); + ahslength = be16_to_cpu(ecdb_ahdr->ahslength); + if (!ahslength) { + pr_err("Extended CDB AHS with zero length, protocol error.\n"); + return iscsit_add_reject_cmd(cmd, + ISCSI_REASON_PROTOCOL_ERROR, buf); + } + if (ahslength > (hdr->hlength * 4) - 3) { + pr_err("Extended CDB AHS length %u exceeds available PDU buffer.\n", + ahslength); + return iscsit_add_reject_cmd(cmd, + ISCSI_REASON_PROTOCOL_ERROR, buf); + } + + cdb_length = ahslength - 1 + ISCSI_CDB_SIZE; + + cdb = kmalloc(cdb_length, GFP_KERNEL); if (cdb == NULL) return iscsit_add_reject_cmd(cmd, ISCSI_REASON_BOOKMARK_NO_RESOURCES, buf); memcpy(cdb, hdr->cdb, ISCSI_CDB_SIZE); - memcpy(cdb + ISCSI_CDB_SIZE, ecdb_ahdr->ecdb, - be16_to_cpu(ecdb_ahdr->ahslength) - 1); + memcpy(cdb + ISCSI_CDB_SIZE, ecdb_ahdr->ecdb, cdb_length - ISCSI_CDB_SIZE); } data_direction = (hdr->flags & ISCSI_FLAG_CMD_WRITE) ? DMA_TO_DEVICE : From 846c76ecc02973b05ae909dd4248c11bfa277fc1 Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Tue, 21 Apr 2026 23:58:01 +0800 Subject: [PATCH 014/972] bpf: Reject TCP_NODELAY in TCP header option callbacks A BPF_SOCK_OPS program can enable BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG and then call bpf_setsockopt(TCP_NODELAY) from BPF_SOCK_OPS_HDR_OPT_LEN_CB or BPF_SOCK_OPS_WRITE_HDR_OPT_CB. In these callbacks, bpf_setsockopt(TCP_NODELAY) can reach __tcp_sock_set_nodelay(), which can call tcp_push_pending_frames(). >From BPF_SOCK_OPS_HDR_OPT_LEN_CB, tcp_push_pending_frames() can call tcp_current_mss(), which calls tcp_established_options() and re-enters bpf_skops_hdr_opt_len(). BPF_SOCK_OPS_HDR_OPT_LEN_CB -> bpf_setsockopt(TCP_NODELAY) -> tcp_push_pending_frames() -> tcp_current_mss() -> tcp_established_options() -> bpf_skops_hdr_opt_len() -> BPF_SOCK_OPS_HDR_OPT_LEN_CB >From BPF_SOCK_OPS_WRITE_HDR_OPT_CB, tcp_push_pending_frames() can call tcp_write_xmit(), which calls tcp_transmit_skb(). That path recomputes header option length through tcp_established_options() and bpf_skops_hdr_opt_len() before re-entering bpf_skops_write_hdr_opt(). BPF_SOCK_OPS_WRITE_HDR_OPT_CB -> bpf_setsockopt(TCP_NODELAY) -> tcp_push_pending_frames() -> tcp_write_xmit() -> tcp_transmit_skb() -> tcp_established_options() -> bpf_skops_hdr_opt_len() -> bpf_skops_write_hdr_opt() -> BPF_SOCK_OPS_WRITE_HDR_OPT_CB This leads to unbounded recursion and can overflow the kernel stack. Reject TCP_NODELAY with -EOPNOTSUPP in bpf_sock_ops_setsockopt() when bpf_setsockopt() is called from BPF_SOCK_OPS_HDR_OPT_LEN_CB or BPF_SOCK_OPS_WRITE_HDR_OPT_CB. Fixes: 7e41df5dbba2 ("bpf: Add a few optnames to bpf_setsockopt") Closes: https://lore.kernel.org/bpf/d1d523c9-6901-4454-a183-94462b8f3e4e@std.uestc.edu.cn/ Reported-by: Quan Sun <2022090917019@std.uestc.edu.cn> Reported-by: Yinhao Hu Reported-by: Kaiyan Mei Signed-off-by: KaFai Wan Signed-off-by: Martin KaFai Lau Reviewed-by: Jiayuan Chen Link: https://patch.msgid.link/20260421155804.135786-2-kafai.wan@linux.dev --- net/core/filter.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 5fa9189eb772b2..96849f4c1fbccd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5833,6 +5833,12 @@ BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, if (!is_locked_tcp_sock_ops(bpf_sock)) return -EOPNOTSUPP; + /* TCP_NODELAY triggers tcp_push_pending_frames() and re-enters these callbacks. */ + if ((bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB || + bpf_sock->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB) && + level == SOL_TCP && optname == TCP_NODELAY) + return -EOPNOTSUPP; + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } From 54377fcab51f6f1f8807827d3751be42279e1a6a Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Tue, 21 Apr 2026 23:58:02 +0800 Subject: [PATCH 015/972] bpf: Reject TCP_NODELAY in bpf-tcp-cc A BPF TCP congestion control program can call bpf_setsockopt() from its callbacks. In current kernels, if it calls bpf_setsockopt(TCP_NODELAY) from cwnd_event_tx_start(), the call can re-enter the TCP transmit path before the outer tcp_transmit_skb() has completed and advanced the send head. This can re-trigger CA_EVENT_TX_START and lead to unbounded recursion: tcp_transmit_skb() -> tcp_event_data_sent() -> tcp_ca_event(sk, CA_EVENT_TX_START) -> cwnd_event_tx_start() -> bpf_setsockopt(TCP_NODELAY) -> tcp_push_pending_frames() -> tcp_write_xmit() -> tcp_transmit_skb() This leads to unbounded recursion and can overflow the kernel stack. Reject TCP_NODELAY with -EOPNOTSUPP for bpf-tcp-cc by introducing a dedicated setsockopt proto for BPF_PROG_TYPE_STRUCT_OPS TCP congestion control programs. To keep it simple, all tcp-cc ops is rejected for TCP_NODELAY. Fixes: 7e41df5dbba2 ("bpf: Add a few optnames to bpf_setsockopt") Suggested-by: Martin KaFai Lau Signed-off-by: KaFai Wan Signed-off-by: Martin KaFai Lau Reviewed-by: Jiayuan Chen Link: https://patch.msgid.link/20260421155804.135786-3-kafai.wan@linux.dev --- include/linux/bpf.h | 1 + net/core/filter.c | 24 ++++++++++++++++++++++++ net/ipv4/bpf_tcp_ca.c | 2 +- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4b703c90ca94f..01e20396489287 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3725,6 +3725,7 @@ extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; +extern const struct bpf_func_proto bpf_sk_setsockopt_nodelay_proto; extern const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_find_vma_proto; diff --git a/net/core/filter.c b/net/core/filter.c index 96849f4c1fbccd..2914f5330310d0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5688,6 +5688,30 @@ const struct bpf_func_proto bpf_sk_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_sk_setsockopt_nodelay, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + /* + * TCP_NODELAY triggers tcp_push_pending_frames() and re-enters + * CA_EVENT_TX_START in bpf_tcp_cc. + */ + if (level == SOL_TCP && optname == TCP_NODELAY) + return -EOPNOTSUPP; + + return _bpf_setsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_setsockopt_nodelay_proto = { + .func = bpf_sk_setsockopt_nodelay, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 008edc7f668852..791e15063237c9 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -168,7 +168,7 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, */ if (prog_ops_moff(prog) != offsetof(struct tcp_congestion_ops, release)) - return &bpf_sk_setsockopt_proto; + return &bpf_sk_setsockopt_nodelay_proto; return NULL; case BPF_FUNC_getsockopt: /* Since get/setsockopt is usually expected to From 52b6b5334924d8f083a2abe8edeface9206e13ee Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Tue, 21 Apr 2026 23:58:03 +0800 Subject: [PATCH 016/972] selftests/bpf: Test TCP_NODELAY in TCP hdr opt callbacks Add a sockops selftest for the TCP_NODELAY restriction in BPF_SOCK_OPS_HDR_OPT_LEN_CB and BPF_SOCK_OPS_WRITE_HDR_OPT_CB. With BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG enabled, bpf_setsockopt(TCP_NODELAY) returns -EOPNOTSUPP from BPF_SOCK_OPS_HDR_OPT_LEN_CB and BPF_SOCK_OPS_WRITE_HDR_OPT_CB, avoiding unbounded recursion and kernel stack overflow. Other cases continue to work as before, including BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB. Signed-off-by: KaFai Wan Signed-off-by: Martin KaFai Lau Reviewed-by: Jiayuan Chen Link: https://patch.msgid.link/20260421155804.135786-4-kafai.wan@linux.dev --- .../selftests/bpf/prog_tests/tcp_hdr_options.c | 4 ++++ .../bpf/progs/test_misc_tcp_hdr_options.c | 15 ++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c index 56685fc03c7e9e..80e6315da2a515 100644 --- a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c @@ -507,6 +507,10 @@ static void misc(void) ASSERT_EQ(misc_skel->bss->nr_hwtstamp, 0, "nr_hwtstamp"); + ASSERT_TRUE(misc_skel->bss->nodelay_est_ok, "nodelay_est_ok"); + ASSERT_TRUE(misc_skel->bss->nodelay_hdr_len_reject, "nodelay_hdr_len_reject"); + ASSERT_TRUE(misc_skel->bss->nodelay_write_hdr_reject, "nodelay_write_hdr_reject"); + check_linum: ASSERT_FALSE(check_error_linum(&sk_fds), "check_error_linum"); sk_fds_close(&sk_fds); diff --git a/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c b/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c index d487153a839d71..ed5a0011b8639f 100644 --- a/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c +++ b/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c @@ -29,6 +29,10 @@ unsigned int nr_syn = 0; unsigned int nr_fin = 0; unsigned int nr_hwtstamp = 0; +bool nodelay_est_ok = false; +bool nodelay_hdr_len_reject = false; +bool nodelay_write_hdr_reject = false; + /* Check the header received from the active side */ static int __check_active_hdr_in(struct bpf_sock_ops *skops, bool check_syn) { @@ -300,7 +304,7 @@ static int handle_passive_estab(struct bpf_sock_ops *skops) SEC("sockops") int misc_estab(struct bpf_sock_ops *skops) { - int true_val = 1; + int true_val = 1, false_val = 0, ret; switch (skops->op) { case BPF_SOCK_OPS_TCP_LISTEN_CB: @@ -316,10 +320,19 @@ int misc_estab(struct bpf_sock_ops *skops) case BPF_SOCK_OPS_PARSE_HDR_OPT_CB: return handle_parse_hdr(skops); case BPF_SOCK_OPS_HDR_OPT_LEN_CB: + ret = bpf_setsockopt(skops, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val)); + if (ret == -EOPNOTSUPP) + nodelay_hdr_len_reject = true; return handle_hdr_opt_len(skops); case BPF_SOCK_OPS_WRITE_HDR_OPT_CB: + ret = bpf_setsockopt(skops, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val)); + if (ret == -EOPNOTSUPP) + nodelay_write_hdr_reject = true; return handle_write_hdr_opt(skops); case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: + ret = bpf_setsockopt(skops, SOL_TCP, TCP_NODELAY, &false_val, sizeof(false_val)); + if (!ret) + nodelay_est_ok = true; return handle_passive_estab(skops); } From 2c7e33f1fc2e75fcfb4aa5d840bcd2e8b53c1847 Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Tue, 21 Apr 2026 23:58:04 +0800 Subject: [PATCH 017/972] selftests/bpf: Verify bpf-tcp-cc rejects TCP_NODELAY Add a bpf_tcp_ca selftest for the TCP_NODELAY restriction in bpf-tcp-cc. Update bpf_cubic to exercise init() and cwnd_event_tx_start(), and check that both callbacks reject bpf_setsockopt(TCP_NODELAY) with -EOPNOTSUPP. Signed-off-by: KaFai Wan Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260421155804.135786-5-kafai.wan@linux.dev --- .../testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 4 ++++ tools/testing/selftests/bpf/progs/bpf_cubic.c | 14 ++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index f829b6f09bc9d3..fe30181e63367d 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -112,6 +112,10 @@ static void test_cubic(void) ASSERT_EQ(cubic_skel->bss->bpf_cubic_acked_called, 1, "pkts_acked called"); + ASSERT_TRUE(cubic_skel->bss->nodelay_init_reject, "init reject nodelay option"); + ASSERT_TRUE(cubic_skel->bss->nodelay_cwnd_event_tx_start_reject, + "cwnd_event_tx_start reject nodelay option"); + bpf_link__destroy(link); bpf_cubic__destroy(cubic_skel); } diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index ce18a4db813fab..ebd5a1e69f5606 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -16,6 +16,7 @@ #include "bpf_tracing_net.h" #include +#include char _license[] SEC("license") = "GPL"; @@ -170,10 +171,18 @@ static void bictcp_hystart_reset(struct sock *sk) ca->sample_cnt = 0; } +bool nodelay_init_reject = false; +bool nodelay_cwnd_event_tx_start_reject = false; + SEC("struct_ops") void BPF_PROG(bpf_cubic_init, struct sock *sk) { struct bpf_bictcp *ca = inet_csk_ca(sk); + int true_val = 1, ret; + + ret = bpf_setsockopt(sk, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val)); + if (ret == -EOPNOTSUPP) + nodelay_init_reject = true; bictcp_reset(ca); @@ -189,8 +198,13 @@ void BPF_PROG(bpf_cubic_cwnd_event_tx_start, struct sock *sk) { struct bpf_bictcp *ca = inet_csk_ca(sk); __u32 now = tcp_jiffies32; + int true_val = 1, ret; __s32 delta; + ret = bpf_setsockopt(sk, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val)); + if (ret == -EOPNOTSUPP) + nodelay_cwnd_event_tx_start_reject = true; + delta = now - tcp_sk(sk)->lsndtime; /* We were application limited (idle) for a while. From 4a1b534177395627579c1fb9e7f9100ee88955dd Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Tue, 10 Feb 2026 11:07:31 +0800 Subject: [PATCH 018/972] wifi: ath12k: prepare REO update element only for primary link Commit [1] introduces dp->reo_cmd_update_rx_queue_list for the purpose of tracking all pending REO queue flush commands. The helper ath12k_dp_prepare_reo_update_elem() allocates an element and populates it with REO queue information, then add it to the list. The element would be helpful during clean up stage to finally unmap/free the corresponding REO queue buffer. In MLO scenarios with more than one links, for non dp_primary_link_only chips like WCN7850, that helper is called for each link peer. This results in multiple elements added to the list but all of them pointing to the same REO queue buffer. Consequently the same buffer gets unmap/freed multiple times: BUG kmalloc-2k (Tainted: G B W O ): Object already free ----------------------------------------------------------------------------- Allocated in ath12k_wifi7_dp_rx_assign_reoq+0xce/0x280 [ath12k_wifi7] age=7436 cpu=10 pid=16130 __kmalloc_noprof ath12k_wifi7_dp_rx_assign_reoq ath12k_dp_rx_peer_tid_setup ath12k_dp_peer_setup ath12k_mac_station_add ath12k_mac_op_sta_state [...] Freed in ath12k_dp_rx_tid_cleanup.part.0+0x25/0x40 [ath12k] age=1 cpu=27 pid=16137 kfree ath12k_dp_rx_tid_cleanup.part.0 ath12k_dp_rx_reo_cmd_list_cleanup ath12k_dp_cmn_device_deinit ath12k_core_stop ath12k_core_hw_group_cleanup ath12k_pci_remove Fix this by allowing list addition for primary link only. Note dp_primary_link_only chips like QCN9274 are not affected by this change, because that's what they were doing in the first place. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00302-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1.115823.3 Fixes: 3bf2e57e7d6c ("wifi: ath12k: Add Retry Mechanism for REO RX Queue Update Failures") # [1] Closes: https://bugzilla.kernel.org/show_bug.cgi?id=221011 Signed-off-by: Baochen Qiang Reviewed-by: Vasanthakumar Thiagarajan Link: https://patch.msgid.link/20260210-ath12k-rxtid-double-free-v1-1-8b523fb2886d@oss.qualcomm.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/dp_rx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/wireless/ath/ath12k/dp_rx.c b/drivers/net/wireless/ath/ath12k/dp_rx.c index 250459facff367..25557dea582679 100644 --- a/drivers/net/wireless/ath/ath12k/dp_rx.c +++ b/drivers/net/wireless/ath/ath12k/dp_rx.c @@ -565,6 +565,9 @@ static int ath12k_dp_prepare_reo_update_elem(struct ath12k_dp *dp, lockdep_assert_held(&dp->dp_lock); + if (!peer->primary_link) + return 0; + elem = kzalloc_obj(*elem, GFP_ATOMIC); if (!elem) return -ENOMEM; From f3ba9e05cc7b65f41f58bb4808f6c3a8f7894bb1 Mon Sep 17 00:00:00 2001 From: Aaradhana Sahu Date: Fri, 10 Apr 2026 12:43:00 +0530 Subject: [PATCH 019/972] wifi: ath12k: fix OF node refcount imbalance in WSI graph traversal ath12k_core_get_wsi_info() traverses the WSI (Wired Serial Interface) device graph starting from dev->of_node. The current code uses dev->of_node directly as the local traversal pointer and calls of_node_put() on error. Since the driver does not own a reference to dev->of_node, dropping it during traversal results in the following OF refcount underflow: OF: ERROR: of_node_release() detected bad of_node_put() on /soc@0/wifi@c000000 CPU: 1 UID: 0 PID: 210 Comm: insmod Not tainted 6.19.0-rc4-next-20260109-00023-g797dd36dc178 #26 PREEMPT Hardware name: Qualcomm Technologies, Inc. IPQ5332 MI01.2 (DT) Call trace: show_stack+0x18/0x24 (C) dump_stack_lvl+0x60/0x80 dump_stack+0x18/0x24 of_node_release+0x164/0x1a0 kobject_put+0xb4/0x278 of_node_put+0x18/0x28 ath12k_core_init+0x29c/0x5d4 [ath12k] ath12k_ahb_probe+0x950/0xc14 [ath12k] platform_probe+0x5c/0xa4 really_probe+0xc0/0x3ec __driver_probe_device+0x80/0x170 driver_probe_device+0x3c/0x120 __driver_attach+0xc4/0x218 OF: ERROR: next of_node_put() on this node will result in a kobject warning 'refcount_t: underflow; use-after-free.' Fix this by explicitly acquiring a reference to the starting node using of_node_get() and attaching automatic cleanup via __free(device_node). Each discovered WSI node is stored in ag->wsi_node[] with its own of_node_get() reference. These references are later released in ath12k_core_free_wsi_info() during driver teardown. Also remove unnecessary memset() of wsi_node array since cleanup now explicitly sets pointers to NULL. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.6-01243-QCAHKSWPL_SILICONZ-1 Tested-on: IPQ5332 hw1.0 AHB WLAN.WBE.1.6-01275-QCAHKSWPL_SILICONZ-1 Fixes: 908c10c860e0 ("wifi: ath12k: parse multiple device information from Device Tree") Signed-off-by: Aaradhana Sahu Reviewed-by: Rameshkumar Sundaram Reviewed-by: Baochen Qiang Link: https://patch.msgid.link/20260410071300.2323603-1-aaradhana.sahu@oss.qualcomm.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/core.c | 77 ++++++++++++++++---------- 1 file changed, 48 insertions(+), 29 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.c b/drivers/net/wireless/ath/ath12k/core.c index 2519e2400d5891..980a12fb2c6e7e 100644 --- a/drivers/net/wireless/ath/ath12k/core.c +++ b/drivers/net/wireless/ath/ath12k/core.c @@ -1838,10 +1838,22 @@ static struct ath12k_hw_group *ath12k_core_hw_group_alloc(struct ath12k_base *ab return ag; } +static void ath12k_core_free_wsi_info(struct ath12k_hw_group *ag) +{ + int i; + + for (i = 0; i < ag->num_devices; i++) { + of_node_put(ag->wsi_node[i]); + ag->wsi_node[i] = NULL; + } + ag->num_devices = 0; +} + static void ath12k_core_hw_group_free(struct ath12k_hw_group *ag) { mutex_lock(&ath12k_hw_group_mutex); + ath12k_core_free_wsi_info(ag); list_del(&ag->list); kfree(ag); @@ -1867,52 +1879,59 @@ static struct ath12k_hw_group *ath12k_core_hw_group_find_by_dt(struct ath12k_bas static int ath12k_core_get_wsi_info(struct ath12k_hw_group *ag, struct ath12k_base *ab) { - struct device_node *wsi_dev = ab->dev->of_node, *next_wsi_dev; - struct device_node *tx_endpoint, *next_rx_endpoint; - int device_count = 0; - - next_wsi_dev = wsi_dev; + struct device_node *next_wsi_dev; + int device_count = 0, ret = 0; + struct device_node *wsi_dev; - if (!next_wsi_dev) + wsi_dev = of_node_get(ab->dev->of_node); + if (!wsi_dev) return -ENODEV; do { - ag->wsi_node[device_count] = next_wsi_dev; + if (device_count >= ATH12K_MAX_DEVICES) { + ath12k_warn(ab, "device count in DT %d is more than limit %d\n", + device_count, ATH12K_MAX_DEVICES); + ret = -EINVAL; + break; + } + + ag->wsi_node[device_count++] = of_node_get(wsi_dev); - tx_endpoint = of_graph_get_endpoint_by_regs(next_wsi_dev, 0, -1); + struct device_node *tx_endpoint __free(device_node) = + of_graph_get_endpoint_by_regs(wsi_dev, 0, -1); if (!tx_endpoint) { - of_node_put(next_wsi_dev); - return -ENODEV; + ret = -ENODEV; + break; } - next_rx_endpoint = of_graph_get_remote_endpoint(tx_endpoint); + struct device_node *next_rx_endpoint __free(device_node) = + of_graph_get_remote_endpoint(tx_endpoint); if (!next_rx_endpoint) { - of_node_put(next_wsi_dev); - of_node_put(tx_endpoint); - return -ENODEV; + ret = -ENODEV; + break; } - of_node_put(tx_endpoint); - of_node_put(next_wsi_dev); - next_wsi_dev = of_graph_get_port_parent(next_rx_endpoint); if (!next_wsi_dev) { - of_node_put(next_rx_endpoint); - return -ENODEV; + ret = -ENODEV; + break; } - of_node_put(next_rx_endpoint); + of_node_put(wsi_dev); + wsi_dev = next_wsi_dev; + } while (ab->dev->of_node != wsi_dev); - device_count++; - if (device_count > ATH12K_MAX_DEVICES) { - ath12k_warn(ab, "device count in DT %d is more than limit %d\n", - device_count, ATH12K_MAX_DEVICES); - of_node_put(next_wsi_dev); - return -EINVAL; + if (ret) { + while (--device_count >= 0) { + of_node_put(ag->wsi_node[device_count]); + ag->wsi_node[device_count] = NULL; } - } while (wsi_dev != next_wsi_dev); - of_node_put(next_wsi_dev); + of_node_put(wsi_dev); + return ret; + } + + of_node_put(wsi_dev); ag->num_devices = device_count; return 0; @@ -1983,9 +2002,9 @@ static struct ath12k_hw_group *ath12k_core_hw_group_assign(struct ath12k_base *a ath12k_core_get_wsi_index(ag, ab)) { ath12k_dbg(ab, ATH12K_DBG_BOOT, "unable to get wsi info from dt, grouping single device"); + ath12k_core_free_wsi_info(ag); ag->id = ATH12K_INVALID_GROUP_ID; ag->num_devices = 1; - memset(ag->wsi_node, 0, sizeof(ag->wsi_node)); wsi->index = 0; } From c4b6ad0e14f5df942eed5ebadaff84b468bd2496 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sat, 18 Apr 2026 22:37:00 +0300 Subject: [PATCH 020/972] wifi: ath10k: snoc: select POWER_SEQUENCING The commit afcf3ec615c9 ("wifi: ath10k: snoc: support powering on the device via pwrseq") made ath10k SNOC driver use devm_pwrseq_get(). Select the corresponding Kconfig symbol to make sure that API call is always available and doesn't return an error per se. Fixes: afcf3ec615c9 ("wifi: ath10k: snoc: support powering on the device via pwrseq") Reported-by: Luca Weiss Closes: https://lore.kernel.org/r/DHUHU7UIT487.139L3KIVRVREU@fairphone.com Signed-off-by: Dmitry Baryshkov Reviewed-by: Rameshkumar Sundaram Link: https://patch.msgid.link/20260418-ath10k-snoc-pwrseq-v1-1-832594ba3294@oss.qualcomm.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath10k/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/ath/ath10k/Kconfig b/drivers/net/wireless/ath/ath10k/Kconfig index 876aed76583318..efb9f022d8c662 100644 --- a/drivers/net/wireless/ath/ath10k/Kconfig +++ b/drivers/net/wireless/ath/ath10k/Kconfig @@ -46,6 +46,7 @@ config ATH10K_SNOC depends on ARCH_QCOM || COMPILE_TEST depends on QCOM_SMEM depends on QCOM_RPROC_COMMON || QCOM_RPROC_COMMON=n + select POWER_SEQUENCING select QCOM_SCM select QCOM_QMI_HELPERS help From 4498664e2d5888efabb96428196a926acdaa25ed Mon Sep 17 00:00:00 2001 From: Yu-Hsiang Tseng Date: Thu, 23 Apr 2026 02:08:14 +0800 Subject: [PATCH 021/972] wifi: ath12k: use lockdep_assert_in_rcu_read_lock() for RCU assertions Two functions in ath12k assert that the caller holds an RCU read lock: ath12k_mac_get_arvif() and ath12k_p2p_noa_update_vdev_iter(). Both use: WARN_ON(!rcu_read_lock_any_held()); On kernels using preemptible RCU (CONFIG_PREEMPT=y or CONFIG_PREEMPT_RT=y) without CONFIG_DEBUG_LOCK_ALLOC, this produces a false positive splat whenever these functions are invoked from paths that do hold the RCU read lock (e.g. firmware stats processing or mac80211 interface iteration). Root cause: - Without CONFIG_DEBUG_LOCK_ALLOC, rcu_read_lock_any_held() is a static inline that returns !preemptible() as a proxy for "in an RCU read section". - With preemptible RCU, rcu_read_lock() does not disable preemption. A task can therefore be preemptible while legitimately holding an RCU read lock, making the proxy unreliable. - Callers such as ath12k_wmi_tlv_rssi_chain_parse() (via guard(rcu)()) and ieee80211_iterate_active_interfaces_atomic() do hold the RCU read lock, so these warnings are incorrect. Typical splat seen on a WCN7850 station with periodic fw stats processing: WARNING: drivers/net/wireless/ath/ath12k/mac.c:791 at ath12k_mac_get_arvif+0x9e/0xd0 [ath12k] Tainted: G W O 6.19.13-rt #1 PREEMPT_RT Call Trace: ath12k_wmi_tlv_rssi_chain_parse+0x69/0x170 [ath12k] ath12k_wmi_tlv_iter+0x7f/0x120 [ath12k] ath12k_wmi_tlv_fw_stats_parse+0x342/0x6b0 [ath12k] ath12k_wmi_op_rx+0xe9e/0x3150 [ath12k] ath12k_htc_rx_completion_handler+0x3df/0x5b0 [ath12k] ath12k_ce_per_engine_service+0x325/0x3e0 [ath12k] ath12k_pci_ce_workqueue+0x20/0x40 [ath12k] Replace WARN_ON(!rcu_read_lock_any_held()) with lockdep_assert_in_rcu_read_lock(), which is gated on CONFIG_PROVE_RCU and therefore compiles out entirely when PROVE_RCU is disabled. PROVE_RCU kernels continue to get the full lockdep-based check, and the new helper precisely checks for rcu_read_lock() rather than any RCU variant, which better matches the callers' expectations. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00302-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1.115823.3 Fixes: 3dd2c68f206e ("wifi: ath12k: prepare vif data structure for MLO handling") Suggested-by: Baochen Qiang Suggested-by: Sebastian Andrzej Siewior Reviewed-by: Baochen Qiang Reviewed-by: Rameshkumar Sundaram Signed-off-by: Yu-Hsiang Tseng Reviewed-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20260422180814.1938317-1-asas1asas200@gmail.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/mac.c | 2 +- drivers/net/wireless/ath/ath12k/p2p.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index fbdfe6424fd7c2..df2334f3bad645 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -788,7 +788,7 @@ struct ath12k_link_vif *ath12k_mac_get_arvif(struct ath12k *ar, u32 vdev_id) /* To use the arvif returned, caller must have held rcu read lock. */ - WARN_ON(!rcu_read_lock_any_held()); + lockdep_assert_in_rcu_read_lock(); arvif_iter.vdev_id = vdev_id; arvif_iter.ar = ar; diff --git a/drivers/net/wireless/ath/ath12k/p2p.c b/drivers/net/wireless/ath/ath12k/p2p.c index 59589748f1a8c2..19ebcd1d8eb256 100644 --- a/drivers/net/wireless/ath/ath12k/p2p.c +++ b/drivers/net/wireless/ath/ath12k/p2p.c @@ -123,7 +123,7 @@ static void ath12k_p2p_noa_update_vdev_iter(void *data, u8 *mac, struct ath12k_p2p_noa_arg *arg = data; struct ath12k_link_vif *arvif; - WARN_ON(!rcu_read_lock_any_held()); + lockdep_assert_in_rcu_read_lock(); arvif = &ahvif->deflink; if (!arvif->is_created || arvif->ar != arg->ar || arvif->vdev_id != arg->vdev_id) return; From 375e4e33c18dfa05c5dfd5f3dfffeb29343dd4c7 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Tue, 21 Apr 2026 23:54:12 -0700 Subject: [PATCH 022/972] bpf: Fix NULL pointer dereference in bpf_sk_storage_clone and diag paths bpf_selem_unlink_nofail() sets SDATA(selem)->smap to NULL before removing the selem from the storage hlist. A concurrent RCU reader in bpf_sk_storage_clone() can observe the selem still on the list with smap already NULL, causing a NULL pointer dereference. general protection fault, probably for non-canonical address 0xdffffc000000000a: KASAN: null-ptr-deref in range [0x0000000000000050-0x0000000000000057] RIP: 0010:bpf_sk_storage_clone+0x1cd/0xaa0 net/core/bpf_sk_storage.c:174 Call Trace: sk_clone+0xfed/0x1980 net/core/sock.c:2591 inet_csk_clone_lock+0x30/0x760 net/ipv4/inet_connection_sock.c:1222 tcp_create_openreq_child+0x35/0x2680 net/ipv4/tcp_minisocks.c:571 tcp_v4_syn_recv_sock+0x123/0xf90 net/ipv4/tcp_ipv4.c:1729 tcp_check_req+0x8e1/0x2580 include/net/tcp.h:855 tcp_v4_rcv+0x1845/0x3b80 net/ipv4/tcp_ipv4.c:2347 Add a NULL check for smap in bpf_sk_storage_clone(). bpf_sk_storage_diag_put_all() has the same issue. Add a NULL check and pass the validated smap directly to diag_get(), which is refactored to take smap as a parameter instead of reading it internally. bpf_sk_storage_diag_put() uses diag->maps[i] which is always valid under its refcount, so diag->maps[i] is passed directly to diag_get(). Fixes: 5d800f87d0a5 ("bpf: Support lockless unlink when freeing map or local storage") Reported-by: Xiang Mei Acked-by: Amery Hung Signed-off-by: Weiming Shi Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260422065411.1007737-2-bestswngs@gmail.com --- net/core/bpf_sk_storage.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 14eb7812bda4a9..dc3e8fce8809e4 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -172,7 +172,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) struct bpf_map *map; smap = rcu_dereference(SDATA(selem)->smap); - if (!(smap->map.map_flags & BPF_F_CLONE)) + if (!smap || !(smap->map.map_flags & BPF_F_CLONE)) continue; /* Note that for lockless listeners adding new element @@ -531,10 +531,10 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) } EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc); -static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) +static int diag_get(struct bpf_local_storage_map *smap, + struct bpf_local_storage_data *sdata, struct sk_buff *skb) { struct nlattr *nla_stg, *nla_value; - struct bpf_local_storage_map *smap; /* It cannot exceed max nlattr's payload */ BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE); @@ -543,7 +543,6 @@ static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) if (!nla_stg) return -EMSGSIZE; - smap = rcu_dereference(sdata->smap); if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id)) goto errout; @@ -596,9 +595,11 @@ static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb, saved_len = skb->len; hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { smap = rcu_dereference(SDATA(selem)->smap); + if (!smap) + continue; diag_size += nla_value_size(smap->map.value_size); - if (nla_stgs && diag_get(SDATA(selem), skb)) + if (nla_stgs && diag_get(smap, SDATA(selem), skb)) /* Continue to learn diag_size */ err = -EMSGSIZE; } @@ -665,7 +666,7 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, diag_size += nla_value_size(diag->maps[i]->value_size); - if (nla_stgs && diag_get(sdata, skb)) + if (nla_stgs && diag_get((struct bpf_local_storage_map *)diag->maps[i], sdata, skb)) /* Continue to learn diag_size */ err = -EMSGSIZE; } From 6451d58a355642b612f2bf948ad39108c998ac2a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 20 Apr 2026 19:48:41 +0000 Subject: [PATCH 023/972] sockmap: Fix sk_psock_drop() race vs sock_map_{unhash,close,destroy}(). syzbot reported a splat in sock_map_destroy() [0], where psock was NULL even though sk->sk_prot still pointed to tcp_bpf_prots[][]. The stack trace shows how badly the path was excercised, see inet_release() calls tcp_close(), not sock_map_close() yet, but finally reaching sock_map_destroy(). The root cause is a lack of synchronisation. Even if sk_psock_get() fails to bump psock->refcnt, it does not guarantee that sk_psock_drop() has finished, and thus sk->sk_prot might not have been restored to the original one. Commit 4b4647add7d3 ("sock_map: avoid race between sock_map_close and sk_psock_put") attempted to address this, but it was insufficient for two reasons. It did not cover sock_map_unhash() and sock_map_destroy(), and it missed the corner case where sk_psock() is NULL. On non-x86 platforms, sk_psock_restore_proto(sk, psock) and rcu_assign_sk_user_data(sk, NULL) can be reordered because there is no address dependency between sk->sk_prot and sk->sk_user_data. sk_psock_get() returning NULL implies nothing about sk->sk_prot. Let's simply retry sk_psock_get() in the unlikely case. Note that we cannot avoid loop even if we added memory barrier in sk_psock_drop() and sock_map_psock_get_checked(). Also note that sock_map_destroy() cannot be called from softirq while sock_map_close() has also been running. It is because sock_map_destroy() requires SOCK_DEAD, so sock_map_destroy() cannot happen until sock_map_close() has finished the saved_close() (which is tcp_close()). [0]: WARNING: CPU: 1 PID: 8459 at net/core/sock_map.c:1667 sock_map_destroy+0x28b/0x2b0 net/core/sock_map.c:1667 Modules linked in: CPU: 1 UID: 0 PID: 8459 Comm: syz.0.1109 Not tainted syzkaller #0 PREEMPT_{RT,(full)} Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2025 RIP: 0010:sock_map_destroy+0x28b/0x2b0 net/core/sock_map.c:1667 Code: 8b 36 49 83 c6 38 4c 89 f0 48 c1 e8 03 42 80 3c 38 00 74 08 4c 89 f7 e8 93 62 22 f9 4d 8b 3e e9 79 ff ff ff e8 a6 2b c3 f8 90 <0f> 0b 90 eb 9c e8 9b 2b c3 f8 4c 89 e7 be 03 00 00 00 e8 0e 4e bc RSP: 0018:ffffc9000d067be8 EFLAGS: 00010293 RAX: ffffffff88fb30aa RBX: ffff888024832000 RCX: ffff888024283b80 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000000 R10: dffffc0000000000 R11: ffffed100862e946 R12: dffffc0000000000 R13: ffff888024832000 R14: ffffffff995b2208 R15: ffffffff88fb2e20 FS: 0000555579a7d500(0000) GS:ffff8881269c2000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00002000000048c0 CR3: 000000003713a000 CR4: 00000000003526f0 Call Trace: inet_csk_destroy_sock+0x166/0x3a0 net/ipv4/inet_connection_sock.c:1294 __tcp_close+0xcc1/0xfd0 net/ipv4/tcp.c:3262 tcp_close+0x28/0x110 net/ipv4/tcp.c:3274 inet_release+0x144/0x190 net/ipv4/af_inet.c:435 __sock_release net/socket.c:649 [inline] sock_close+0xc0/0x240 net/socket.c:1439 __fput+0x45b/0xa80 fs/file_table.c:468 task_work_run+0x1d4/0x260 kernel/task_work.c:227 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline] exit_to_user_mode_loop+0xec/0x110 kernel/entry/common.c:43 exit_to_user_mode_prepare include/linux/irq-entry-common.h:225 [inline] syscall_exit_to_user_mode_work include/linux/entry-common.h:175 [inline] syscall_exit_to_user_mode include/linux/entry-common.h:210 [inline] do_syscall_64+0x2bd/0x3b0 arch/x86/entry/syscall_64.c:100 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f265847ebe9 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffd158dfbd8 EFLAGS: 00000246 ORIG_RAX: 00000000000001b4 RAX: 0000000000000000 RBX: 000000000002ddb0 RCX: 00007f265847ebe9 RDX: 0000000000000000 RSI: 000000000000001e RDI: 0000000000000003 RBP: 00007f26586a7da0 R08: 0000000000000001 R09: 0000000e158dfecf R10: 0000001b30a20000 R11: 0000000000000246 R12: 00007f26586a5fac R13: 00007f26586a5fa0 R14: ffffffffffffffff R15: 00007ffd158dfcf0 Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Fixes: b05545e15e1f ("bpf: sockmap, fix transition through disconnect without close") Fixes: d8616ee2affc ("bpf, sockmap: Fix sk->sk_forward_alloc warn_on in sk_stream_kill_queues") Reported-by: syzbot+b0842d38af58376d1fdc@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/69cec5ef.050a0220.2dbe29.0009.GAE@google.com/ Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Reviewed-by: Jiayuan Chen Link: https://patch.msgid.link/20260420194846.1089595-1-kuniyu@google.com --- net/core/sock_map.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 02a68be3002a2e..99e3789492a09e 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1630,18 +1630,23 @@ void sock_map_unhash(struct sock *sk) void (*saved_unhash)(struct sock *sk); struct sk_psock *psock; +retry: rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_unhash = READ_ONCE(sk->sk_prot)->unhash; + if (unlikely(saved_unhash == sock_map_unhash)) + goto retry; } else { saved_unhash = psock->saved_unhash; sock_map_remove_links(sk, psock); rcu_read_unlock(); + + if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) + return; } - if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) - return; + if (saved_unhash) saved_unhash(sk); } @@ -1652,20 +1657,25 @@ void sock_map_destroy(struct sock *sk) void (*saved_destroy)(struct sock *sk); struct sk_psock *psock; +retry: rcu_read_lock(); psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_destroy = READ_ONCE(sk->sk_prot)->destroy; + if (unlikely(saved_destroy == sock_map_destroy)) + goto retry; } else { saved_destroy = psock->saved_destroy; sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock); sk_psock_put(sk, psock); + + if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) + return; } - if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) - return; + if (saved_destroy) saved_destroy(sk); } @@ -1676,32 +1686,33 @@ void sock_map_close(struct sock *sk, long timeout) void (*saved_close)(struct sock *sk, long timeout); struct sk_psock *psock; +retry: lock_sock(sk); rcu_read_lock(); - psock = sk_psock(sk); + psock = sk_psock_get(sk); if (likely(psock)) { saved_close = psock->saved_close; sock_map_remove_links(sk, psock); - psock = sk_psock_get(sk); - if (unlikely(!psock)) - goto no_psock; rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); + + /* Make sure we do not recurse. This is a bug. + * Leak the socket instead of crashing on a stack overflow. + */ + if (WARN_ON_ONCE(saved_close == sock_map_close)) + return; } else { saved_close = READ_ONCE(sk->sk_prot)->close; -no_psock: rcu_read_unlock(); release_sock(sk); + + if (unlikely(saved_close == sock_map_close)) + goto retry; } - /* Make sure we do not recurse. This is a bug. - * Leak the socket instead of crashing on a stack overflow. - */ - if (WARN_ON_ONCE(saved_close == sock_map_close)) - return; saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); From 1081de1accb2b224516cca7071122c59532d0b22 Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Thu, 23 Apr 2026 11:38:32 -0700 Subject: [PATCH 024/972] bpf: Fix NULL pointer dereference in bpf_skb_fib_lookup() When tot_len is not provided by the user, bpf_skb_fib_lookup() resolves the FIB result's output device via dev_get_by_index_rcu() to check skb forwardability and fill in mtu_result. The returned pointer is dereferenced without a NULL check. If the device is concurrently unregistered, dev_get_by_index_rcu() returns NULL and is_skb_forwardable() crashes at dev->flags: KASAN: null-ptr-deref in range [0x00000000000000b0-0x00000000000000b7] Call Trace: is_skb_forwardable (include/linux/netdevice.h:4365) bpf_skb_fib_lookup (net/core/filter.c:6446) bpf_prog_test_run_skb (net/bpf/test_run.c) __sys_bpf (kernel/bpf/syscall.c) Add the missing NULL check, returning -ENODEV to be consistent with how bpf_ipv4_fib_lookup() and bpf_ipv6_fib_lookup() handle the same condition. Fixes: 4f74fede40df ("bpf: Add mtu checking to FIB forwarding helper") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Signed-off-by: Martin KaFai Lau Acked-by: Paul Chaignon Link: https://patch.msgid.link/20260423183831.1325480-2-bestswngs@gmail.com --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/filter.c b/net/core/filter.c index 2914f5330310d0..bc96c18df4e03b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6473,6 +6473,8 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, * against MTU of FIB lookup resulting net_device */ dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; if (!is_skb_forwardable(dev, skb)) rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; From b5c111f4967ba4fdecdd318923ec7b081e9ef95f Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 23 Apr 2026 15:23:55 -0700 Subject: [PATCH 025/972] bpf: Fix sk_local_storage diag dumping uninitialized special fields Call check_and_init_map_value() after the copy_map_value() to zero out special field regions. diag_get() copies sk_local_storage map values into a netlink message using copy_map_value{_locked}(), which intentionally skip special fields. However, the destination buffer from nla_reserve_64bit() is not zeroed and the skipped regions contain uninitialized skb data can be sent to userspace. Fixes: 1ed4d92458a9 ("bpf: INET_DIAG support in bpf_sk_storage") Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260423222356.155387-1-ameryhung@gmail.com --- net/core/bpf_sk_storage.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index dc3e8fce8809e4..ecd659f79fd4a0 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -557,6 +557,7 @@ static int diag_get(struct bpf_local_storage_map *smap, sdata->data, true); else copy_map_value(&smap->map, nla_data(nla_value), sdata->data); + check_and_init_map_value(&smap->map, nla_data(nla_value)); nla_nest_end(skb, nla_stg); return 0; From 5f69165b7e4215f02247b0c64052c71b2f66d73a Mon Sep 17 00:00:00 2001 From: "Mukesh Kumar Chaurasiya (IBM)" Date: Sun, 26 Apr 2026 15:17:25 +0530 Subject: [PATCH 026/972] rust/drm: import ARef from sync crate ARef is defined in sync and is getting used from types causing the build to fail. Fix this by using ARef from sync module. Fixes: 80df573af9ef ("rust: drm: gem: shmem: Add DRM shmem helper abstraction") Signed-off-by: Mukesh Kumar Chaurasiya (IBM) Link: https://patch.msgid.link/20260426094725.2188668-2-mkchauras@gmail.com [ Add missing Fixes: tag. - Danilo ] Signed-off-by: Danilo Krummrich --- rust/kernel/drm/gem/shmem.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/rust/kernel/drm/gem/shmem.rs b/rust/kernel/drm/gem/shmem.rs index d025fb03519545..e1b648920d2f67 100644 --- a/rust/kernel/drm/gem/shmem.rs +++ b/rust/kernel/drm/gem/shmem.rs @@ -19,10 +19,8 @@ use crate::{ }, error::to_result, prelude::*, - types::{ - ARef, - Opaque, // - }, // + sync::aref::ARef, + types::Opaque, // }; use core::{ ops::{ From 15e8bae5d930c91b8739a87d75db0a6efca3cb32 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Thu, 23 Apr 2026 14:46:35 +0200 Subject: [PATCH 027/972] MAINTAINERS: nova: update mailing list The nouveau mailing list has some issues (e.g. with stripping Cc entries from replies when using notmuch + b4 based workflows). Besides that, having a separate mailing list for nova also helps to better distinguish nova from nouveau and makes it easier to track nova-specific discussions. Replace the nouveau mailing list with the new nova-gpu@lists.linux.dev mailing list for both nova-core and nova-drm, and remove the patchwork entries, since those are bound to the nouveau mailing list and not used by nova anyway. Link: https://lore.kernel.org/all/bc2517c2-6772-4cbd-8fd7-6dbdcdd13eab@nvidia.com/ Reviewed-by: Joel Fernandes Reviewed-by: John Hubbard Acked-by: Alexandre Courbot Link: https://patch.msgid.link/20260423124649.38793-1-dakr@kernel.org Signed-off-by: Danilo Krummrich --- MAINTAINERS | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2fb1c75afd1638..5c9272622033ef 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8193,10 +8193,9 @@ F: include/uapi/drm/nouveau_drm.h CORE DRIVER FOR NVIDIA GPUS [RUST] M: Danilo Krummrich M: Alexandre Courbot -L: nouveau@lists.freedesktop.org +L: nova-gpu@lists.linux.dev S: Supported W: https://rust-for-linux.com/nova-gpu-driver -Q: https://patchwork.freedesktop.org/project/nouveau/ B: https://gitlab.freedesktop.org/drm/nova/-/issues C: irc://irc.oftc.net/nouveau T: git https://gitlab.freedesktop.org/drm/rust/kernel.git drm-rust-next @@ -8205,10 +8204,9 @@ F: drivers/gpu/nova-core/ DRM DRIVER FOR NVIDIA GPUS [RUST] M: Danilo Krummrich -L: nouveau@lists.freedesktop.org +L: nova-gpu@lists.linux.dev S: Supported W: https://rust-for-linux.com/nova-gpu-driver -Q: https://patchwork.freedesktop.org/project/nouveau/ B: https://gitlab.freedesktop.org/drm/nova/-/issues C: irc://irc.oftc.net/nouveau T: git https://gitlab.freedesktop.org/drm/rust/kernel.git drm-rust-next From b0bf14546bcefa4ea49f5efcd7db2a99f0cabde9 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 3 Apr 2026 11:20:55 -0400 Subject: [PATCH 028/972] nfsd: fix GET_DIR_DELEGATION when VFS leases are disabled When leases are disabled on the server, running xfstest generic/309 leads to an error because GET_DIR_DELEGATION returns EINVAL. nfsd_get_dir_deleg() can fail in several ways: like memory allocation and unable to get a lease because either leases are disable or it's already held. Currently only the condition "already held" is translated to returning directory-delegation-is-unavailable error. However, other failure conditions are likely temporary and thus should result in the same kind of error. Fixes: 8b99f6a8c116 ("nfsd: wire up GET_DIR_DELEGATION handling") Cc: stable@vger.kernel.org Signed-off-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 85e94c30285a26..2797da8cc9501f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -2535,10 +2535,6 @@ nfsd4_get_dir_delegation(struct svc_rqst *rqstp, dd = nfsd_get_dir_deleg(cstate, gdd, nf); nfsd_file_put(nf); if (IS_ERR(dd)) { - int err = PTR_ERR(dd); - - if (err != -EAGAIN) - return nfserrno(err); gdd->gddrnf_status = GDD4_UNAVAIL; return nfs_ok; } From aa23c94cc433b145d1ce93820ecdfe16d8940e28 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 30 Mar 2026 12:08:21 +0100 Subject: [PATCH 029/972] media: venus: fix QCOM_MDT_LOADER dependency When build-testined with CONFIG_QCOM_MDT_LOADER=m and VIDEO_QCOM_VENUS=y, the kernel fails to link: x86_64-linux-ld: drivers/media/platform/qcom/venus/firmware.o: in function `venus_boot': firmware.c:(.text+0x1e3): undefined reference to `qcom_mdt_get_size' firmware.c:(.text+0x25a): undefined reference to `qcom_mdt_load' firmware.c:(.text+0x272): undefined reference to `qcom_mdt_load_no_init' The problem is the conditional 'select' statement. Change this to make the driver built-in here regardless of CONFIG_ARCH_QCOM, same as for the similar IRIS driver. Signed-off-by: Arnd Bergmann Reviewed-by: Konrad Dybcio Reviewed-by: Dikshita Agarwal Fixes: 0399b696f7f4 ("media: venus: fix compile-test build on non-qcom ARM platform") Cc: stable@vger.kernel.org Signed-off-by: Bryan O'Donoghue Signed-off-by: Hans Verkuil --- drivers/media/platform/qcom/venus/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/qcom/venus/Kconfig b/drivers/media/platform/qcom/venus/Kconfig index ffb731ecd48c90..63ee8c78dc6d75 100644 --- a/drivers/media/platform/qcom/venus/Kconfig +++ b/drivers/media/platform/qcom/venus/Kconfig @@ -4,7 +4,7 @@ config VIDEO_QCOM_VENUS depends on VIDEO_DEV && QCOM_SMEM depends on (ARCH_QCOM && ARM64 && IOMMU_API) || COMPILE_TEST select OF_DYNAMIC if ARCH_QCOM - select QCOM_MDT_LOADER if ARCH_QCOM + select QCOM_MDT_LOADER select QCOM_SCM select VIDEOBUF2_DMA_CONTIG select V4L2_MEM2MEM_DEV From a297c5165f91366cbc3490e630aabd1c0f70efb8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 5 Feb 2026 15:56:19 +0100 Subject: [PATCH 030/972] media: iris: fix QCOM_MDT_LOADER dependency When build-testined with CONFIG_QCOM_MDT_LOADER=m and VIDEO_QCOM_IRIS=y, the kernel fails to link: x86_64-linux-ld: drivers/media/platform/qcom/iris/iris_firmware.o: in function `iris_fw_load': iris_firmware.c:(.text+0xb0): undefined reference to `qcom_mdt_get_size' iris_firmware.c:(.text+0xfd): undefined reference to `qcom_mdt_load' The problem is the conditional 'select' statement. Change this to make the driver built-in here regardless of CONFIG_ARCH_QCOM. Signed-off-by: Arnd Bergmann Reviewed-by: Konrad Dybcio Reviewed-by: Dikshita Agarwal Reviewed-by: Bryan O'Donoghue Fixes: d19b163356b8 ("media: iris: implement video firmware load/unload") Cc: stable@vger.kernel.org Signed-off-by: Bryan O'Donoghue Signed-off-by: Hans Verkuil --- drivers/media/platform/qcom/iris/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/qcom/iris/Kconfig b/drivers/media/platform/qcom/iris/Kconfig index 3c803a05305a80..5498f48362d15c 100644 --- a/drivers/media/platform/qcom/iris/Kconfig +++ b/drivers/media/platform/qcom/iris/Kconfig @@ -3,7 +3,7 @@ config VIDEO_QCOM_IRIS depends on VIDEO_DEV depends on ARCH_QCOM || COMPILE_TEST select V4L2_MEM2MEM_DEV - select QCOM_MDT_LOADER if ARCH_QCOM + select QCOM_MDT_LOADER select QCOM_SCM select VIDEOBUF2_DMA_CONTIG help From f27cfdcfc916bb59297825805f4c3499f89f9e76 Mon Sep 17 00:00:00 2001 From: Dikshita Agarwal Date: Mon, 16 Feb 2026 12:37:42 +0530 Subject: [PATCH 031/972] media: iris: Fix use-after-free in iris_release_internal_buffers() The recent change in commit 1dabf00ee206 ("media: iris: gen1: Destroy internal buffers after FW releases") introduced a regression where session_release_buf() may free the buffer. The caller, iris_release_internal_buffers(), continued to access `buffer` after the call, leading to a potential use-after-free. Fix this by setting BUF_ATTR_PENDING_RELEASE before calling session_release_buf(), and reverting the flag if the call fails. This ensures no dereference occurs after potential freeing. Reported-by: Dan Carpenter Closes: https://lore.kernel.org/lkml/aYXvKAX3Pg3sL37P@stanley.mountain/#r Signed-off-by: Dikshita Agarwal Reviewed-by: Vikash Garodia Fixes: 1dabf00ee206 ("media: iris: gen1: Destroy internal buffers after FW releases") Cc: stable@vger.kernel.org Signed-off-by: Bryan O'Donoghue Signed-off-by: Hans Verkuil --- drivers/media/platform/qcom/iris/iris_buffer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/media/platform/qcom/iris/iris_buffer.c b/drivers/media/platform/qcom/iris/iris_buffer.c index 9151f43bc6b9c2..1d53c7414b754b 100644 --- a/drivers/media/platform/qcom/iris/iris_buffer.c +++ b/drivers/media/platform/qcom/iris/iris_buffer.c @@ -582,10 +582,12 @@ static int iris_release_internal_buffers(struct iris_inst *inst, continue; if (!(buffer->attr & BUF_ATTR_QUEUED)) continue; + buffer->attr |= BUF_ATTR_PENDING_RELEASE; ret = hfi_ops->session_release_buf(inst, buffer); - if (ret) + if (ret) { + buffer->attr &= ~BUF_ATTR_PENDING_RELEASE; return ret; - buffer->attr |= BUF_ATTR_PENDING_RELEASE; + } } return 0; From 4a49ae56b0e4268d48fd96babe0cc68596bc301a Mon Sep 17 00:00:00 2001 From: Thomas Fourier Date: Fri, 13 Feb 2026 10:13:27 +0100 Subject: [PATCH 032/972] media: iris: Fix dma_free_attrs() size in iris_hfi_queues_init() The core->iface_q_table_vaddr buffer is alloc'd with size queue_size but freed with sizeof(*q_tbl_hdr) which is different. Change the dma_free_attrs() size. Signed-off-by: Thomas Fourier Reviewed-by: Dikshita Agarwal Fixes: d7378f84e94e ("media: iris: introduce iris core state management with shared queues") Cc: stable@vger.kernel.org Signed-off-by: Bryan O'Donoghue Signed-off-by: Hans Verkuil --- drivers/media/platform/qcom/iris/iris_hfi_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/qcom/iris/iris_hfi_queue.c b/drivers/media/platform/qcom/iris/iris_hfi_queue.c index b3ed06297953b9..bf6db23b53e210 100644 --- a/drivers/media/platform/qcom/iris/iris_hfi_queue.c +++ b/drivers/media/platform/qcom/iris/iris_hfi_queue.c @@ -263,7 +263,7 @@ int iris_hfi_queues_init(struct iris_core *core) GFP_KERNEL, DMA_ATTR_WRITE_COMBINE); if (!core->sfr_vaddr) { dev_err(core->dev, "sfr alloc and map failed\n"); - dma_free_attrs(core->dev, sizeof(*q_tbl_hdr), core->iface_q_table_vaddr, + dma_free_attrs(core->dev, queue_size, core->iface_q_table_vaddr, core->iface_q_table_daddr, DMA_ATTR_WRITE_COMBINE); return -ENOMEM; } From 95a337f92f0a602d4f935315bfbc8bf07f475e65 Mon Sep 17 00:00:00 2001 From: Vikash Garodia Date: Fri, 13 Mar 2026 18:49:36 +0530 Subject: [PATCH 033/972] media: iris: switch to hardware mode after firmware boot Currently the driver switches the vcodec GDSC to hardware (HW) mode before firmware load and boot sequence. GDSC can be powered off, keeping in hw mode, thereby the vcodec registers programmed in TrustZone (TZ) carry default (reset) values. Move the transition to HW mode after firmware load and boot sequence. The bug was exposed with driver configuring different stream ids to different devices via iommu-map. With registers carrying reset values, VPU would not generate desired stream-id, thereby leading to SMMU fault. For vpu4, when GDSC is switched to HW mode, there is a need to perform the reset operation. Without reset, there are occasional issues of register corruption observed. Hence the vpu GDSC switch also involves the reset. Co-developed-by: Vishnu Reddy Signed-off-by: Vishnu Reddy Signed-off-by: Vikash Garodia Reviewed-by: Dikshita Agarwal Reviewed-by: Dmitry Baryshkov [bod: occassional => occasional] Fixes: dde659d37036 ("media: iris: Introduce vpu ops for vpu4 with necessary hooks") Cc: stable@vger.kernel.org Signed-off-by: Bryan O'Donoghue Signed-off-by: Hans Verkuil --- drivers/media/platform/qcom/iris/iris_core.c | 4 ++++ .../platform/qcom/iris/iris_hfi_common.c | 4 ++++ drivers/media/platform/qcom/iris/iris_vpu2.c | 1 + drivers/media/platform/qcom/iris/iris_vpu3x.c | 9 +++---- drivers/media/platform/qcom/iris/iris_vpu4x.c | 24 ++++++++++--------- .../platform/qcom/iris/iris_vpu_common.c | 16 ++++++++----- .../platform/qcom/iris/iris_vpu_common.h | 3 +++ 7 files changed, 38 insertions(+), 23 deletions(-) diff --git a/drivers/media/platform/qcom/iris/iris_core.c b/drivers/media/platform/qcom/iris/iris_core.c index 8406c48d635b6e..dbaac01eb15a0e 100644 --- a/drivers/media/platform/qcom/iris/iris_core.c +++ b/drivers/media/platform/qcom/iris/iris_core.c @@ -75,6 +75,10 @@ int iris_core_init(struct iris_core *core) if (ret) goto error_unload_fw; + ret = iris_vpu_switch_to_hwmode(core); + if (ret) + goto error_unload_fw; + ret = iris_hfi_core_init(core); if (ret) goto error_unload_fw; diff --git a/drivers/media/platform/qcom/iris/iris_hfi_common.c b/drivers/media/platform/qcom/iris/iris_hfi_common.c index 92112eb16c1104..621c66593d88d4 100644 --- a/drivers/media/platform/qcom/iris/iris_hfi_common.c +++ b/drivers/media/platform/qcom/iris/iris_hfi_common.c @@ -159,6 +159,10 @@ int iris_hfi_pm_resume(struct iris_core *core) if (ret) goto err_suspend_hw; + ret = iris_vpu_switch_to_hwmode(core); + if (ret) + goto err_suspend_hw; + ret = ops->sys_interframe_powercollapse(core); if (ret) goto err_suspend_hw; diff --git a/drivers/media/platform/qcom/iris/iris_vpu2.c b/drivers/media/platform/qcom/iris/iris_vpu2.c index 9c103a2e4e4eaf..01ef40f3895743 100644 --- a/drivers/media/platform/qcom/iris/iris_vpu2.c +++ b/drivers/media/platform/qcom/iris/iris_vpu2.c @@ -44,4 +44,5 @@ const struct vpu_ops iris_vpu2_ops = { .power_off_controller = iris_vpu_power_off_controller, .power_on_controller = iris_vpu_power_on_controller, .calc_freq = iris_vpu2_calc_freq, + .set_hwmode = iris_vpu_set_hwmode, }; diff --git a/drivers/media/platform/qcom/iris/iris_vpu3x.c b/drivers/media/platform/qcom/iris/iris_vpu3x.c index fe4423b951b1e9..3dad47be78b58f 100644 --- a/drivers/media/platform/qcom/iris/iris_vpu3x.c +++ b/drivers/media/platform/qcom/iris/iris_vpu3x.c @@ -234,14 +234,8 @@ static int iris_vpu35_power_on_hw(struct iris_core *core) if (ret) goto err_disable_hw_free_clk; - ret = dev_pm_genpd_set_hwmode(core->pmdomain_tbl->pd_devs[IRIS_HW_POWER_DOMAIN], true); - if (ret) - goto err_disable_hw_clk; - return 0; -err_disable_hw_clk: - iris_disable_unprepare_clock(core, IRIS_HW_CLK); err_disable_hw_free_clk: iris_disable_unprepare_clock(core, IRIS_HW_FREERUN_CLK); err_disable_axi_clk: @@ -266,6 +260,7 @@ const struct vpu_ops iris_vpu3_ops = { .power_off_controller = iris_vpu_power_off_controller, .power_on_controller = iris_vpu_power_on_controller, .calc_freq = iris_vpu3x_vpu4x_calculate_frequency, + .set_hwmode = iris_vpu_set_hwmode, }; const struct vpu_ops iris_vpu33_ops = { @@ -274,6 +269,7 @@ const struct vpu_ops iris_vpu33_ops = { .power_off_controller = iris_vpu33_power_off_controller, .power_on_controller = iris_vpu_power_on_controller, .calc_freq = iris_vpu3x_vpu4x_calculate_frequency, + .set_hwmode = iris_vpu_set_hwmode, }; const struct vpu_ops iris_vpu35_ops = { @@ -283,4 +279,5 @@ const struct vpu_ops iris_vpu35_ops = { .power_on_controller = iris_vpu35_vpu4x_power_on_controller, .program_bootup_registers = iris_vpu35_vpu4x_program_bootup_registers, .calc_freq = iris_vpu3x_vpu4x_calculate_frequency, + .set_hwmode = iris_vpu_set_hwmode, }; diff --git a/drivers/media/platform/qcom/iris/iris_vpu4x.c b/drivers/media/platform/qcom/iris/iris_vpu4x.c index a8db02ce5c5ec5..02e100a4045fce 100644 --- a/drivers/media/platform/qcom/iris/iris_vpu4x.c +++ b/drivers/media/platform/qcom/iris/iris_vpu4x.c @@ -252,21 +252,10 @@ static int iris_vpu4x_power_on_hardware(struct iris_core *core) ret = iris_vpu4x_power_on_apv(core); if (ret) goto disable_hw_clocks; - - iris_vpu4x_ahb_sync_reset_apv(core); } - iris_vpu4x_ahb_sync_reset_hardware(core); - - ret = iris_vpu4x_genpd_set_hwmode(core, true, efuse_value); - if (ret) - goto disable_apv_power_domain; - return 0; -disable_apv_power_domain: - if (!(efuse_value & DISABLE_VIDEO_APV_BIT)) - iris_vpu4x_power_off_apv(core); disable_hw_clocks: iris_vpu4x_disable_hardware_clocks(core, efuse_value); disable_vpp1_power_domain: @@ -359,6 +348,18 @@ static void iris_vpu4x_power_off_hardware(struct iris_core *core) iris_disable_power_domains(core, core->pmdomain_tbl->pd_devs[IRIS_HW_POWER_DOMAIN]); } +static int iris_vpu4x_set_hwmode(struct iris_core *core) +{ + u32 efuse_value = readl(core->reg_base + WRAPPER_EFUSE_MONITOR); + + if (!(efuse_value & DISABLE_VIDEO_APV_BIT)) + iris_vpu4x_ahb_sync_reset_apv(core); + + iris_vpu4x_ahb_sync_reset_hardware(core); + + return iris_vpu4x_genpd_set_hwmode(core, true, efuse_value); +} + const struct vpu_ops iris_vpu4x_ops = { .power_off_hw = iris_vpu4x_power_off_hardware, .power_on_hw = iris_vpu4x_power_on_hardware, @@ -366,4 +367,5 @@ const struct vpu_ops iris_vpu4x_ops = { .power_on_controller = iris_vpu35_vpu4x_power_on_controller, .program_bootup_registers = iris_vpu35_vpu4x_program_bootup_registers, .calc_freq = iris_vpu3x_vpu4x_calculate_frequency, + .set_hwmode = iris_vpu4x_set_hwmode, }; diff --git a/drivers/media/platform/qcom/iris/iris_vpu_common.c b/drivers/media/platform/qcom/iris/iris_vpu_common.c index 548e5f1727fdb7..69e6126dc4d95e 100644 --- a/drivers/media/platform/qcom/iris/iris_vpu_common.c +++ b/drivers/media/platform/qcom/iris/iris_vpu_common.c @@ -292,14 +292,8 @@ int iris_vpu_power_on_hw(struct iris_core *core) if (ret && ret != -ENOENT) goto err_disable_hw_clock; - ret = dev_pm_genpd_set_hwmode(core->pmdomain_tbl->pd_devs[IRIS_HW_POWER_DOMAIN], true); - if (ret) - goto err_disable_hw_ahb_clock; - return 0; -err_disable_hw_ahb_clock: - iris_disable_unprepare_clock(core, IRIS_HW_AHB_CLK); err_disable_hw_clock: iris_disable_unprepare_clock(core, IRIS_HW_CLK); err_disable_power: @@ -308,6 +302,16 @@ int iris_vpu_power_on_hw(struct iris_core *core) return ret; } +int iris_vpu_set_hwmode(struct iris_core *core) +{ + return dev_pm_genpd_set_hwmode(core->pmdomain_tbl->pd_devs[IRIS_HW_POWER_DOMAIN], true); +} + +int iris_vpu_switch_to_hwmode(struct iris_core *core) +{ + return core->iris_platform_data->vpu_ops->set_hwmode(core); +} + int iris_vpu35_vpu4x_power_off_controller(struct iris_core *core) { u32 clk_rst_tbl_size = core->iris_platform_data->clk_rst_tbl_size; diff --git a/drivers/media/platform/qcom/iris/iris_vpu_common.h b/drivers/media/platform/qcom/iris/iris_vpu_common.h index f6dffc613b8223..dee3b1349c5e86 100644 --- a/drivers/media/platform/qcom/iris/iris_vpu_common.h +++ b/drivers/media/platform/qcom/iris/iris_vpu_common.h @@ -21,6 +21,7 @@ struct vpu_ops { int (*power_on_controller)(struct iris_core *core); void (*program_bootup_registers)(struct iris_core *core); u64 (*calc_freq)(struct iris_inst *inst, size_t data_size); + int (*set_hwmode)(struct iris_core *core); }; int iris_vpu_boot_firmware(struct iris_core *core); @@ -30,6 +31,8 @@ int iris_vpu_watchdog(struct iris_core *core, u32 intr_status); int iris_vpu_prepare_pc(struct iris_core *core); int iris_vpu_power_on_controller(struct iris_core *core); int iris_vpu_power_on_hw(struct iris_core *core); +int iris_vpu_set_hwmode(struct iris_core *core); +int iris_vpu_switch_to_hwmode(struct iris_core *core); int iris_vpu_power_on(struct iris_core *core); int iris_vpu_power_off_controller(struct iris_core *core); void iris_vpu_power_off_hw(struct iris_core *core); From 3d9593ad1a58c5acc3e5fa2a48222bb7632e6812 Mon Sep 17 00:00:00 2001 From: Vishnu Reddy Date: Thu, 5 Mar 2026 18:58:31 +0530 Subject: [PATCH 034/972] media: iris: fix use-after-free of fmt_src during MBPF check During concurrency testing, multiple instances can run in parallel, and each instance uses its own inst->lock while the core->lock protects the list of active instances. The race happens because these locks cover different scopes, inst->lock protects only the internals of a single instance, while the Macro Blocks Per Frame (MBPF) checker walks the core list under core->lock and reads fields like fmt_src->width and fmt_src->height. At the same time, iris_close() may free fmt_src and fmt_dst under inst->lock while the instance is still present in the core list. This allows a situation where the MBPF checker, still iterating through the core list, reaches an instance whose fmt_src was already freed by another thread and ends up dereferencing a dangling pointer, resulting in a use-after-free. This happens because the MBPF checker assumes that any instance in the core list is fully valid, but the freeing of fmt_src and fmt_dst without removing the instance from the core list is not correct. The correct ordering is to defer freeing fmt_src and fmt_dst until after the instance has been removed from the core list and all teardown under the core lock has completed, ensuring that no dangling pointers are ever exposed during MBPF checks. Reviewed-by: Vikash Garodia Signed-off-by: Vishnu Reddy Reviewed-by: Dikshita Agarwal Fixes: 5ad964ad5656 ("media: iris: Initialize and deinitialize encoder instance structure") Cc: stable@vger.kernel.org Signed-off-by: Bryan O'Donoghue Signed-off-by: Hans Verkuil --- drivers/media/platform/qcom/iris/iris_vdec.c | 6 ------ drivers/media/platform/qcom/iris/iris_vdec.h | 1 - drivers/media/platform/qcom/iris/iris_venc.c | 6 ------ drivers/media/platform/qcom/iris/iris_venc.h | 1 - drivers/media/platform/qcom/iris/iris_vidc.c | 6 ++---- 5 files changed, 2 insertions(+), 18 deletions(-) diff --git a/drivers/media/platform/qcom/iris/iris_vdec.c b/drivers/media/platform/qcom/iris/iris_vdec.c index 719217399a304e..99d544e2af4f98 100644 --- a/drivers/media/platform/qcom/iris/iris_vdec.c +++ b/drivers/media/platform/qcom/iris/iris_vdec.c @@ -61,12 +61,6 @@ int iris_vdec_inst_init(struct iris_inst *inst) return iris_ctrls_init(inst); } -void iris_vdec_inst_deinit(struct iris_inst *inst) -{ - kfree(inst->fmt_dst); - kfree(inst->fmt_src); -} - static const struct iris_fmt iris_vdec_formats_cap[] = { [IRIS_FMT_NV12] = { .pixfmt = V4L2_PIX_FMT_NV12, diff --git a/drivers/media/platform/qcom/iris/iris_vdec.h b/drivers/media/platform/qcom/iris/iris_vdec.h index ec1ce55d1375fd..5123d2a340e15f 100644 --- a/drivers/media/platform/qcom/iris/iris_vdec.h +++ b/drivers/media/platform/qcom/iris/iris_vdec.h @@ -9,7 +9,6 @@ struct iris_inst; int iris_vdec_inst_init(struct iris_inst *inst); -void iris_vdec_inst_deinit(struct iris_inst *inst); int iris_vdec_enum_fmt(struct iris_inst *inst, struct v4l2_fmtdesc *f); int iris_vdec_try_fmt(struct iris_inst *inst, struct v4l2_format *f); int iris_vdec_s_fmt(struct iris_inst *inst, struct v4l2_format *f); diff --git a/drivers/media/platform/qcom/iris/iris_venc.c b/drivers/media/platform/qcom/iris/iris_venc.c index aa27b22704eb9e..4d886769d958b9 100644 --- a/drivers/media/platform/qcom/iris/iris_venc.c +++ b/drivers/media/platform/qcom/iris/iris_venc.c @@ -79,12 +79,6 @@ int iris_venc_inst_init(struct iris_inst *inst) return iris_ctrls_init(inst); } -void iris_venc_inst_deinit(struct iris_inst *inst) -{ - kfree(inst->fmt_dst); - kfree(inst->fmt_src); -} - static const struct iris_fmt iris_venc_formats_cap[] = { [IRIS_FMT_H264] = { .pixfmt = V4L2_PIX_FMT_H264, diff --git a/drivers/media/platform/qcom/iris/iris_venc.h b/drivers/media/platform/qcom/iris/iris_venc.h index c4db7433da5375..00c1716b2747c7 100644 --- a/drivers/media/platform/qcom/iris/iris_venc.h +++ b/drivers/media/platform/qcom/iris/iris_venc.h @@ -9,7 +9,6 @@ struct iris_inst; int iris_venc_inst_init(struct iris_inst *inst); -void iris_venc_inst_deinit(struct iris_inst *inst); int iris_venc_enum_fmt(struct iris_inst *inst, struct v4l2_fmtdesc *f); int iris_venc_try_fmt(struct iris_inst *inst, struct v4l2_format *f); int iris_venc_s_fmt(struct iris_inst *inst, struct v4l2_format *f); diff --git a/drivers/media/platform/qcom/iris/iris_vidc.c b/drivers/media/platform/qcom/iris/iris_vidc.c index bd38d84c9cc79d..5eb1786b07371d 100644 --- a/drivers/media/platform/qcom/iris/iris_vidc.c +++ b/drivers/media/platform/qcom/iris/iris_vidc.c @@ -289,10 +289,6 @@ int iris_close(struct file *filp) v4l2_m2m_ctx_release(inst->m2m_ctx); v4l2_m2m_release(inst->m2m_dev); mutex_lock(&inst->lock); - if (inst->domain == DECODER) - iris_vdec_inst_deinit(inst); - else if (inst->domain == ENCODER) - iris_venc_inst_deinit(inst); iris_session_close(inst); iris_inst_change_state(inst, IRIS_INST_DEINIT); iris_v4l2_fh_deinit(inst, filp); @@ -304,6 +300,8 @@ int iris_close(struct file *filp) mutex_unlock(&inst->lock); mutex_destroy(&inst->ctx_q_lock); mutex_destroy(&inst->lock); + kfree(inst->fmt_src); + kfree(inst->fmt_dst); kfree(inst); return 0; From 3e0b2053751657ed2924adfe3ff25b1450231e33 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 27 Mar 2026 22:19:55 +0200 Subject: [PATCH 035/972] media: qcom: iris: increase H265D_MAX_SLICE to fix H.265 decoding on SC7280 Follow the commit bfe1326573ff ("venus: Fix for H265 decoding failure.") and increase H265D_MAX_SLICE following firmware requirements on that platform. Otherwise decoding of the H.265 streams fails with the "insufficient scratch_1 buffer size" from the firmware. Signed-off-by: Dmitry Baryshkov Reviewed-by: Dikshita Agarwal Reviewed-by: Vikash Garodia Reviewed-by: Konrad Dybcio [bod: Fixed commit log withthe => with the] Fixes: e1f5d32608ec ("media: iris: Add internal buffer calculation for HEVC and VP9 decoders") Cc: stable@vger.kernel.org Signed-off-by: Bryan O'Donoghue Signed-off-by: Hans Verkuil --- drivers/media/platform/qcom/iris/iris_vpu_buffer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/qcom/iris/iris_vpu_buffer.h b/drivers/media/platform/qcom/iris/iris_vpu_buffer.h index 12640eb5ed8c45..8c0d6b7b5de85f 100644 --- a/drivers/media/platform/qcom/iris/iris_vpu_buffer.h +++ b/drivers/media/platform/qcom/iris/iris_vpu_buffer.h @@ -67,7 +67,7 @@ struct iris_inst; #define SIZE_DOLBY_RPU_METADATA (41 * 1024) #define H264_CABAC_HDR_RATIO_HD_TOT 1 #define H264_CABAC_RES_RATIO_HD_TOT 3 -#define H265D_MAX_SLICE 1200 +#define H265D_MAX_SLICE 3600 #define SIZE_H265D_HW_PIC_T SIZE_H264D_HW_PIC_T #define H265_CABAC_HDR_RATIO_HD_TOT 2 #define H265_CABAC_RES_RATIO_HD_TOT 2 From dd1b373941079cc102cc18bc68884e18245f5912 Mon Sep 17 00:00:00 2001 From: Wenmeng Liu Date: Fri, 13 Mar 2026 18:13:02 +0800 Subject: [PATCH 036/972] media: qcom: camss: Fix csid IRQ offset for sa8775p Fix BUF_DONE_IRQ_STATUS_RDI_OFFSET calculation for csid lite on sa8775p platform. The offset should be 0 for csid lite on sa8775p, Signed-off-by: Wenmeng Liu Reviewed-by: Bryan O'Donoghue Fixes: ed03e99de0fa ("media: qcom: camss: Add support for CSID 690") Cc: stable@vger.kernel.org Signed-off-by: Bryan O'Donoghue Signed-off-by: Hans Verkuil --- drivers/media/platform/qcom/camss/camss-csid-gen3.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/media/platform/qcom/camss/camss-csid-gen3.c b/drivers/media/platform/qcom/camss/camss-csid-gen3.c index 664245cf6eb0ca..bd059243790ede 100644 --- a/drivers/media/platform/qcom/camss/camss-csid-gen3.c +++ b/drivers/media/platform/qcom/camss/camss-csid-gen3.c @@ -48,9 +48,9 @@ #define IS_CSID_690(csid) ((csid->camss->res->version == CAMSS_8775P) \ || (csid->camss->res->version == CAMSS_8300)) #define CSID_BUF_DONE_IRQ_STATUS 0x8C -#define BUF_DONE_IRQ_STATUS_RDI_OFFSET (csid_is_lite(csid) ?\ - 1 : (IS_CSID_690(csid) ?\ - 13 : 14)) +#define BUF_DONE_IRQ_STATUS_RDI_OFFSET (csid_is_lite(csid) ? \ + ((IS_CSID_690(csid) ? 0 : 1)) : \ + ((IS_CSID_690(csid) ? 13 : 14))) #define CSID_BUF_DONE_IRQ_MASK 0x90 #define CSID_BUF_DONE_IRQ_CLEAR 0x94 #define CSID_BUF_DONE_IRQ_SET 0x98 From fe56c674118aa46da1a3e65aa22ca709ebd7d812 Mon Sep 17 00:00:00 2001 From: Wenmeng Liu Date: Fri, 13 Mar 2026 18:13:03 +0800 Subject: [PATCH 037/972] media: qcom: camss: Fix csid clock configuration for sa8775p Fix the mismatch between clock list and clock rate table for CSID lite instances. The current implementation has 5 clocks defined but only 2 are actually needed (vfe_lite_csid and vfe_lite_cphy_rx), while the clock rate table doesn't match this configuration. Update both clock list and rate table to maintain consistency: - Remove unused clocks: cpas_vfe_lite, vfe_lite_ahb, vfe_lite - Update clock rate table to match the remaining two clocks Signed-off-by: Wenmeng Liu Reviewed-by: Bryan O'Donoghue Fixes: ed03e99de0fa ("media: qcom: camss: Add support for CSID 690") Cc: stable@vger.kernel.org Signed-off-by: Bryan O'Donoghue Signed-off-by: Hans Verkuil --- drivers/media/platform/qcom/camss/camss.c | 40 +++++++++-------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/drivers/media/platform/qcom/camss/camss.c b/drivers/media/platform/qcom/camss/camss.c index 00b87fd9afbd89..cb013471898506 100644 --- a/drivers/media/platform/qcom/camss/camss.c +++ b/drivers/media/platform/qcom/camss/camss.c @@ -3598,12 +3598,10 @@ static const struct camss_subdev_resources csid_res_8775p[] = { /* CSID2 (lite) */ { .regulators = {}, - .clock = { "cpas_vfe_lite", "vfe_lite_ahb", - "vfe_lite_csid", "vfe_lite_cphy_rx", - "vfe_lite"}, + .clock = { "vfe_lite_csid", "vfe_lite_cphy_rx" }, .clock_rate = { - { 0, 0, 400000000, 400000000, 0}, - { 0, 0, 400000000, 480000000, 0} + { 400000000, 480000000 }, + { 400000000, 480000000 } }, .reg = { "csid_lite0" }, .interrupt = { "csid_lite0" }, @@ -3617,12 +3615,10 @@ static const struct camss_subdev_resources csid_res_8775p[] = { /* CSID3 (lite) */ { .regulators = {}, - .clock = { "cpas_vfe_lite", "vfe_lite_ahb", - "vfe_lite_csid", "vfe_lite_cphy_rx", - "vfe_lite"}, + .clock = { "vfe_lite_csid", "vfe_lite_cphy_rx" }, .clock_rate = { - { 0, 0, 400000000, 400000000, 0}, - { 0, 0, 400000000, 480000000, 0} + { 400000000, 480000000 }, + { 400000000, 480000000 } }, .reg = { "csid_lite1" }, .interrupt = { "csid_lite1" }, @@ -3636,12 +3632,10 @@ static const struct camss_subdev_resources csid_res_8775p[] = { /* CSID4 (lite) */ { .regulators = {}, - .clock = { "cpas_vfe_lite", "vfe_lite_ahb", - "vfe_lite_csid", "vfe_lite_cphy_rx", - "vfe_lite"}, + .clock = { "vfe_lite_csid", "vfe_lite_cphy_rx" }, .clock_rate = { - { 0, 0, 400000000, 400000000, 0}, - { 0, 0, 400000000, 480000000, 0} + { 400000000, 480000000 }, + { 400000000, 480000000 } }, .reg = { "csid_lite2" }, .interrupt = { "csid_lite2" }, @@ -3655,12 +3649,10 @@ static const struct camss_subdev_resources csid_res_8775p[] = { /* CSID5 (lite) */ { .regulators = {}, - .clock = { "cpas_vfe_lite", "vfe_lite_ahb", - "vfe_lite_csid", "vfe_lite_cphy_rx", - "vfe_lite"}, + .clock = { "vfe_lite_csid", "vfe_lite_cphy_rx" }, .clock_rate = { - { 0, 0, 400000000, 400000000, 0}, - { 0, 0, 400000000, 480000000, 0} + { 400000000, 480000000 }, + { 400000000, 480000000 } }, .reg = { "csid_lite3" }, .interrupt = { "csid_lite3" }, @@ -3674,12 +3666,10 @@ static const struct camss_subdev_resources csid_res_8775p[] = { /* CSID6 (lite) */ { .regulators = {}, - .clock = { "cpas_vfe_lite", "vfe_lite_ahb", - "vfe_lite_csid", "vfe_lite_cphy_rx", - "vfe_lite"}, + .clock = { "vfe_lite_csid", "vfe_lite_cphy_rx" }, .clock_rate = { - { 0, 0, 400000000, 400000000, 0}, - { 0, 0, 400000000, 480000000, 0} + { 400000000, 480000000 }, + { 400000000, 480000000 } }, .reg = { "csid_lite4" }, .interrupt = { "csid_lite4" }, From d31fac47b39f5e1ed85a587688ca70b793e421b4 Mon Sep 17 00:00:00 2001 From: Wenmeng Liu Date: Fri, 13 Mar 2026 18:13:04 +0800 Subject: [PATCH 038/972] media: qcom: camss: Add missing clocks for VFE lite on sa8775p Add missing required clocks (cpas_ahb and camnoc_axi) for VFE lite instances on sa8775p platform. These clocks are necessary for proper VFE lite operation: Reviewed-by: Bryan O'Donoghue Signed-off-by: Wenmeng Liu Fixes: e7b59e1d06fb ("media: qcom: camss: Add support for VFE 690") Cc: stable@vger.kernel.org Signed-off-by: Bryan O'Donoghue Signed-off-by: Hans Verkuil --- drivers/media/platform/qcom/camss/camss.c | 40 ++++++++++++++--------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/drivers/media/platform/qcom/camss/camss.c b/drivers/media/platform/qcom/camss/camss.c index cb013471898506..9335636d7c4dfc 100644 --- a/drivers/media/platform/qcom/camss/camss.c +++ b/drivers/media/platform/qcom/camss/camss.c @@ -3742,15 +3742,17 @@ static const struct camss_subdev_resources vfe_res_8775p[] = { /* VFE2 (lite) */ { .regulators = {}, - .clock = { "cpas_vfe_lite", "vfe_lite_ahb", + .clock = { "cpas_ahb", "cpas_vfe_lite", "vfe_lite_ahb", "vfe_lite_csid", "vfe_lite_cphy_rx", - "vfe_lite"}, + "vfe_lite", "camnoc_axi"}, .clock_rate = { - { 0, 0, 0, 0 }, + { 0 }, + { 0 }, { 300000000, 400000000, 400000000, 400000000 }, { 400000000, 400000000, 400000000, 400000000 }, { 400000000, 400000000, 400000000, 400000000 }, { 480000000, 600000000, 600000000, 600000000 }, + { 400000000 }, }, .reg = { "vfe_lite0" }, .interrupt = { "vfe_lite0" }, @@ -3765,15 +3767,17 @@ static const struct camss_subdev_resources vfe_res_8775p[] = { /* VFE3 (lite) */ { .regulators = {}, - .clock = { "cpas_vfe_lite", "vfe_lite_ahb", + .clock = { "cpas_ahb", "cpas_vfe_lite", "vfe_lite_ahb", "vfe_lite_csid", "vfe_lite_cphy_rx", - "vfe_lite"}, + "vfe_lite", "camnoc_axi"}, .clock_rate = { - { 0, 0, 0, 0 }, + { 0 }, + { 0 }, { 300000000, 400000000, 400000000, 400000000 }, { 400000000, 400000000, 400000000, 400000000 }, { 400000000, 400000000, 400000000, 400000000 }, { 480000000, 600000000, 600000000, 600000000 }, + { 400000000 }, }, .reg = { "vfe_lite1" }, .interrupt = { "vfe_lite1" }, @@ -3788,15 +3792,17 @@ static const struct camss_subdev_resources vfe_res_8775p[] = { /* VFE4 (lite) */ { .regulators = {}, - .clock = { "cpas_vfe_lite", "vfe_lite_ahb", + .clock = { "cpas_ahb", "cpas_vfe_lite", "vfe_lite_ahb", "vfe_lite_csid", "vfe_lite_cphy_rx", - "vfe_lite"}, + "vfe_lite", "camnoc_axi"}, .clock_rate = { - { 0, 0, 0, 0 }, + { 0 }, + { 0 }, { 300000000, 400000000, 400000000, 400000000 }, { 400000000, 400000000, 400000000, 400000000 }, { 400000000, 400000000, 400000000, 400000000 }, { 480000000, 600000000, 600000000, 600000000 }, + { 400000000 }, }, .reg = { "vfe_lite2" }, .interrupt = { "vfe_lite2" }, @@ -3811,15 +3817,17 @@ static const struct camss_subdev_resources vfe_res_8775p[] = { /* VFE5 (lite) */ { .regulators = {}, - .clock = { "cpas_vfe_lite", "vfe_lite_ahb", + .clock = { "cpas_ahb", "cpas_vfe_lite", "vfe_lite_ahb", "vfe_lite_csid", "vfe_lite_cphy_rx", - "vfe_lite"}, + "vfe_lite", "camnoc_axi"}, .clock_rate = { - { 0, 0, 0, 0 }, + { 0 }, + { 0 }, { 300000000, 400000000, 400000000, 400000000 }, { 400000000, 400000000, 400000000, 400000000 }, { 400000000, 400000000, 400000000, 400000000 }, { 480000000, 600000000, 600000000, 600000000 }, + { 400000000 }, }, .reg = { "vfe_lite3" }, .interrupt = { "vfe_lite3" }, @@ -3834,15 +3842,17 @@ static const struct camss_subdev_resources vfe_res_8775p[] = { /* VFE6 (lite) */ { .regulators = {}, - .clock = { "cpas_vfe_lite", "vfe_lite_ahb", + .clock = { "cpas_ahb", "cpas_vfe_lite", "vfe_lite_ahb", "vfe_lite_csid", "vfe_lite_cphy_rx", - "vfe_lite"}, + "vfe_lite", "camnoc_axi"}, .clock_rate = { - { 0, 0, 0, 0 }, + { 0 }, + { 0 }, { 300000000, 400000000, 400000000, 400000000 }, { 400000000, 400000000, 400000000, 400000000 }, { 400000000, 400000000, 400000000, 400000000 }, { 480000000, 600000000, 600000000, 600000000 }, + { 400000000 }, }, .reg = { "vfe_lite4" }, .interrupt = { "vfe_lite4" }, From 23c39cb598977f10909a2387c5e5f34afc1d6933 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Mar 2026 16:18:24 +0100 Subject: [PATCH 039/972] media: qcom: camss: avoid format string warning clang-22 warns about csiphy_match_clock_name() taking a variable format string that is not checked against the 'int index' argument: drivers/media/platform/qcom/camss/camss-csiphy.c:566:44: error: diagnostic behavior may be improved by adding the 'format(printf, 2, 3)' attribute to the declaration of 'csiphy_match_clock_name' [-Werror,-Wmissing-format-attribute] 561 | static bool csiphy_match_clock_name(const char *clock_name, const char *format, | __attribute__((format(printf, 2, 3))) 562 | int index) 563 | { 564 | char name[16]; /* csiphyXXX_timer\0 */ 565 | 566 | snprintf(name, sizeof(name), format, index); | ^ drivers/media/platform/qcom/camss/camss-csiphy.c:561:13: note: 'csiphy_match_clock_name' declared here 561 | static bool csiphy_match_clock_name(const char *clock_name, const char *format, | ^ Change the function to use a snprintf() style format string that allows this to be checked at the call site. Signed-off-by: Arnd Bergmann Reviewed-by: Bryan O'Donoghue Signed-off-by: Bryan O'Donoghue Signed-off-by: Hans Verkuil --- drivers/media/platform/qcom/camss/camss-csiphy.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/media/platform/qcom/camss/camss-csiphy.c b/drivers/media/platform/qcom/camss/camss-csiphy.c index 62623393f41449..78a1b568dbae6a 100644 --- a/drivers/media/platform/qcom/camss/camss-csiphy.c +++ b/drivers/media/platform/qcom/camss/camss-csiphy.c @@ -558,12 +558,16 @@ static int csiphy_init_formats(struct v4l2_subdev *sd, return csiphy_set_format(sd, fh ? fh->state : NULL, &format); } -static bool csiphy_match_clock_name(const char *clock_name, const char *format, - int index) +static bool __printf(2, 3) +csiphy_match_clock_name(const char *clock_name, const char *format, ...) { char name[16]; /* csiphyXXX_timer\0 */ + va_list args; + + va_start(args, format); + vsnprintf(name, sizeof(name), format, args); + va_end(args); - snprintf(name, sizeof(name), format, index); return !strcmp(clock_name, name); } From bfb4dc533d0abaca07013dd71e6b5c6f182232b3 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Fri, 10 Apr 2026 18:11:06 +0800 Subject: [PATCH 040/972] xfs: remove the meaningless XFS_ALLOC_FLAG_FREEING In xfs_refcount_finish_one(), there's no need to pass XFS_ALLOC_FLAG_FREEING to xfs_alloc_read_agf(). So remove it. Signed-off-by: Jinliang Zheng Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_refcount.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 40c7f0ff6cf3a7..0ec6ccd8b4dcf5 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1414,8 +1414,7 @@ xfs_refcount_finish_one( if (rcur == NULL) { struct xfs_perag *pag = to_perag(ri->ri_group); - error = xfs_alloc_read_agf(pag, tp, - XFS_ALLOC_FLAG_FREEING, &agbp); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) return error; From 00dd8d7ec5253c6273023a0fd6dc08683e0bdfef Mon Sep 17 00:00:00 2001 From: Yuto Ohnuki Date: Sat, 11 Apr 2026 15:24:13 +0100 Subject: [PATCH 041/972] xfs: zero entire directory data block header region at init xfs_dir3_data_init currently zeroes only the xfs_dir3_blk_hdr portion of the directory data block header, then manually initializes the bestfree entries in a loop. This leaves the pad field in xfs_dir3_data_hdr uninitialized and requires explicit zeroing of each bestfree slot. Zero the entire header region (geo->data_entry_offset bytes) unconditionally before setting individual fields. This covers all current and future header fields, all padding (implicit and explicit), and the bestfree array, so the manual zeroing loop for bestfree can be removed. Suggested-by: Dave Chinner Signed-off-by: Yuto Ohnuki Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_dir2_data.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 80ba94f51e5c7c..35ff119aa84b52 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -728,7 +728,6 @@ xfs_dir3_data_init( struct xfs_dir2_data_unused *dup; struct xfs_dir2_data_free *bf; int error; - int i; /* * Get the buffer set up for the block. @@ -741,13 +740,16 @@ xfs_dir3_data_init( xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF); /* - * Initialize the header. + * Initialize the whole directory header region to zero + * so that all padding, bestfree entries, and any + * future header fields are clean. */ hdr = bp->b_addr; + memset(hdr, 0, geo->data_entry_offset); + if (xfs_has_crc(mp)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; - memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->owner = cpu_to_be64(args->owner); @@ -759,10 +761,6 @@ xfs_dir3_data_init( bf = xfs_dir2_data_bestfree_p(mp, hdr); bf[0].offset = cpu_to_be16(geo->data_entry_offset); bf[0].length = cpu_to_be16(geo->blksize - geo->data_entry_offset); - for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { - bf[i].length = 0; - bf[i].offset = 0; - } /* * Set up an unused entry for the block's body. From 8fbb1877dfa5e26bda1baf8cc6abd3f805098486 Mon Sep 17 00:00:00 2001 From: Yuto Ohnuki Date: Sat, 11 Apr 2026 15:24:14 +0100 Subject: [PATCH 042/972] xfs: zero directory data block padding on write verification Old kernels did not zero the pad field in xfs_dir3_data_hdr when initializing directory data blocks, so existing filesystems may have non-zero padding on disk. Zero the pad field in xfs_dir3_data_write_verify alongside the existing LSN and checksum updates. The pad field is pure alignment padding with no runtime meaning, so zeroing it during write verification is safe and has no additional I/O cost. This lets filesystems gradually self-heal stale non-zero padding as directories are modified, without requiring an explicit repair pass. Suggested-by: Dave Chinner Signed-off-by: Yuto Ohnuki Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_dir2_data.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 35ff119aa84b52..aecbab61014c77 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -382,6 +382,7 @@ xfs_dir3_data_write_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + struct xfs_dir3_data_hdr *datahdr3 = bp->b_addr; xfs_failaddr_t fa; fa = xfs_dir3_data_verify(bp); @@ -396,6 +397,11 @@ xfs_dir3_data_write_verify( if (bip) hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + /* + * Zero padding that may be stale from old kernels. + */ + datahdr3->pad = 0; + xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF); } From 939919ccddfcc379bb82b1f90f732d9a5cb32cc8 Mon Sep 17 00:00:00 2001 From: Yuto Ohnuki Date: Sat, 11 Apr 2026 15:24:15 +0100 Subject: [PATCH 043/972] xfs: check directory data block header padding in scrub Add the missing scrub check for the pad field in directory data block headers. Old kernels may have written non-zero padding without issue, and the write path now self-heals stale padding on modification. Flag non-zero padding as an optimization opportunity (preen) rather than corruption. Add xchk_fblock_set_preen helper for reporting file fork block issues that could be optimized. The trace event xchk_fblock_preen already exists. Signed-off-by: Yuto Ohnuki Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/common.c | 11 +++++++++++ fs/xfs/scrub/common.h | 2 ++ fs/xfs/scrub/dir.c | 7 ++++++- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 20e63069088b39..3d40cb0b2496c4 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -251,6 +251,17 @@ xchk_ino_set_preen( trace_xchk_ino_preen(sc, ino, __return_address); } +/* Record a block indexed by a file fork that could be optimized. */ +void +xchk_fblock_set_preen( + struct xfs_scrub *sc, + int whichfork, + xfs_fileoff_t offset) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN; + trace_xchk_fblock_preen(sc, whichfork, offset, __return_address); +} + /* Record something being wrong with the filesystem primary superblock. */ void xchk_set_corrupt( diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index f2ecc68538f0c3..b494d747c0084a 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -25,6 +25,8 @@ bool xchk_fblock_xref_process_error(struct xfs_scrub *sc, void xchk_block_set_preen(struct xfs_scrub *sc, struct xfs_buf *bp); void xchk_ino_set_preen(struct xfs_scrub *sc, xfs_ino_t ino); +void xchk_fblock_set_preen(struct xfs_scrub *sc, + int whichfork, xfs_fileoff_t offset); void xchk_set_corrupt(struct xfs_scrub *sc); void xchk_block_set_corrupt(struct xfs_scrub *sc, diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index e09724cd372553..09715a4aa154bf 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -492,7 +492,12 @@ xchk_directory_data_bestfree( goto out; xchk_buffer_recheck(sc, bp); - /* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */ + if (xfs_has_crc(sc->mp)) { + struct xfs_dir3_data_hdr *hdr3 = bp->b_addr; + + if (hdr3->pad) + xchk_fblock_set_preen(sc, XFS_DATA_FORK, lblk); + } if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) goto out_buf; From 592975da8c3ca87b043077e6eafa37665eae7936 Mon Sep 17 00:00:00 2001 From: Wilfred Mallawa Date: Wed, 15 Apr 2026 09:45:14 +1000 Subject: [PATCH 044/972] xfs: fix memory leak on error in xfs_alloc_zone_info() Currently, the 0th index of the zi_used_bucket_bitmap array is not freed on error due to the pre-decrement then evaluate semantic of the while loop used in xfs_alloc_zone_info(). Fix it by allowing for the i == 0 case to be covered. Fixes: 080d01c41d44 ("xfs: implement zoned garbage collection") Cc: stable@vger.kernel.org # v6.15 Reviewed-by: Damien Le Moal Reviewed-by: Carlos Maiolino Signed-off-by: Wilfred Mallawa Reviewed-by: Hans Holmberg Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index a851b98143c0b4..c64f9ab743a60a 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -1217,7 +1217,7 @@ xfs_alloc_zone_info( return zi; out_free_bitmaps: - while (--i > 0) + while (--i >= 0) kvfree(zi->zi_used_bucket_bitmap[i]); kfree(zi); return NULL; From af47a4be6a90c8bfc874f9994ac9c15813b9718b Mon Sep 17 00:00:00 2001 From: Wilfred Mallawa Date: Fri, 17 Apr 2026 12:16:30 +1000 Subject: [PATCH 045/972] xfs: fix memory leak for data allocated by xfs_zone_gc_data_alloc() In xfs_zone_gc_mount(), on error, a struct xfs_zone_gc_data allocated with xfs_zone_gc_data_alloc() is freed with kfree(), however, this doesn't free the underlying folios or the rmap_irecs. Use xfs_zone_gc_data_free() to correctly free this memory. Fixes: 080d01c41d44 ("xfs: implement zoned garbage collection") Cc: stable@vger.kernel.org # v6.15 Signed-off-by: Wilfred Mallawa Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index fedcc47048aff5..c8a1d5c0332c52 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -1221,7 +1221,7 @@ xfs_zone_gc_mount( if (data->oz) xfs_open_zone_put(data->oz); out_free_gc_data: - kfree(data); + xfs_zone_gc_data_free(data); return error; } From fca20fcb76a20655daf18738f4a88c638a6bb64c Mon Sep 17 00:00:00 2001 From: Yuto Ohnuki Date: Fri, 10 Apr 2026 18:06:15 +0100 Subject: [PATCH 046/972] xfs: check da node block pad field during scrub The da node block header (xfs_da3_node_hdr) contains a __pad32 field that should always be zero. Add a check for this during directory and attribute btree scrubbing. Since old kernels may have written non-zero padding without issues, flag this as an optimization opportunity (preen) rather than corruption. Signed-off-by: Yuto Ohnuki Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/dabtree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 1a71d36898b1d7..c2d6ad59d03ef2 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -454,7 +454,12 @@ xchk_da_btree_block( } } - /* XXX: Check hdr3.pad32 once we know how to fix it. */ + if (xfs_has_crc(ip->i_mount)) { + struct xfs_da3_node_hdr *nodehdr3 = blk->bp->b_addr; + + if (nodehdr3->__pad32) + xchk_da_set_preen(ds, level); + } break; default: xchk_da_set_corrupt(ds, level); From 0cfe660559e857d7c00ab86c73e4510ce069086f Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Fri, 24 Apr 2026 15:39:00 -0400 Subject: [PATCH 047/972] KVM: s390: pci: Fix aisb calculation The current implementation of aisb calculation will erroneously index via an unsigned long * as well as multiply by 8B for every 64-bits in the offset; only one or the other is required. This throws off aisb calculations once the number of devices exceeds 64, and can result in out-of-bounds access as well as failure to indicate summary bits associated with those devices in guests. Fix this by converting to a physical address before applying the offset, as is already done in arch/s390/pci/pci_irq.c. Fixes: 3c5a1b6f0a18 ("KVM: s390: pci: provide routines for enabling/disabling interrupt forwarding") Signed-off-by: Matthew Rosato Reviewed-by: Niklas Schnelle Signed-off-by: Christian Borntraeger --- arch/s390/kvm/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c index eed45af1a92d48..5b075c38998e31 100644 --- a/arch/s390/kvm/pci.c +++ b/arch/s390/kvm/pci.c @@ -166,7 +166,7 @@ static int kvm_zpci_set_airq(struct zpci_dev *zdev) fib.fmt0.noi = airq_iv_end(zdev->aibv); fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector); fib.fmt0.aibvo = 0; - fib.fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib.fmt0.aisb = virt_to_phys(aift->sbv->vector) + (zdev->aisb / 64) * 8; fib.fmt0.aisbo = zdev->aisb & 63; fib.gd = zdev->gisa; @@ -308,7 +308,7 @@ static int kvm_s390_pci_aif_enable(struct zpci_dev *zdev, struct zpci_fib *fib, /* Update guest FIB for re-issue */ fib->fmt0.aisbo = zdev->aisb & 63; - fib->fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib->fmt0.aisb = virt_to_phys(aift->sbv->vector) + (zdev->aisb / 64) * 8; fib->fmt0.isc = gisc; /* Save some guest fib values in the host for later use */ From 87e63466c9fc30c3d95b8741c3df1f1ff01d7f23 Mon Sep 17 00:00:00 2001 From: Ravi Singh Date: Wed, 22 Apr 2026 07:39:59 +0000 Subject: [PATCH 048/972] xfs: flush delalloc blocks on ENOSPC in xfs_trans_alloc_icreate xfs_trans_alloc_icreate() can fail with ENOSPC when delalloc reservations have consumed most of the available block count (fdblocks). xfs_trans_alloc() already retries internally with xfs_blockgc_flush_all(), but that only trims post-EOF speculative preallocation and may not free enough space for the transaction reservation. Add a retry with xfs_flush_inodes() when xfs_trans_alloc() returns ENOSPC. This forces writeback of all dirty inodes via sync_inodes_sb(), converting delalloc reservations to real allocations and freeing the over-reserved portion back to fdblocks. This fixes all callers of xfs_trans_alloc_icreate() and removes the existing caller-level retry from xfs_create(), which is now handled centrally. Signed-off-by: Ravi Singh Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_inode.c | 6 ------ fs/xfs/xfs_trans.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index beaa26ec62da40..9978ac1422fc4f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -699,12 +699,6 @@ xfs_create( */ error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks, &tp); - if (error == -ENOSPC) { - /* flush outstanding delalloc blocks and retry */ - xfs_flush_inodes(mp); - error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, - resblks, &tp); - } if (error) goto out_parent; diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index bcc470f56e4662..148cc32449c1f8 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1199,10 +1199,21 @@ xfs_trans_alloc_icreate( { struct xfs_trans *tp; bool retried = false; + bool flushed = false; int error; retry: error = xfs_trans_alloc(mp, resv, dblocks, 0, 0, &tp); + if (error == -ENOSPC && !flushed) { + /* + * Flush all delalloc blocks to reclaim space from speculative + * preallocation. This is similar to the quota retry below + * but targets FS-wide ENOSPC. + */ + xfs_flush_inodes(mp); + flushed = true; + goto retry; + } if (error) return error; From 711a9c018ad252b2807f85d44e1267b595644f9b Mon Sep 17 00:00:00 2001 From: Rio Liu Date: Wed, 15 Apr 2026 16:57:13 +0000 Subject: [PATCH 049/972] wifi: mac80211: skip ieee80211_verify_sta_ht_mcs_support check in non-strict mode Some Xfinity XB8 firmware advertises >1 spatial stream MCS indexes in their basic HT-MCS set. On cards with lower spatial streams, the check would fail, and we'd be stuck with no HT when in fact work fine with its own supported rate. This change makes it so the check is only performed in strict mode. Fixes: 574faa0e936d ("wifi: mac80211: add HT and VHT basic set verification") Signed-off-by: Rio Liu Link: https://patch.msgid.link/99Mv9QEceyPrQhSP52MtAVmz0_kWJmzqotJjD9YW6LGLqk-AZloAueUyHCURilFkuqOh6Ecv8i2KKdSE1ujP3AnbU5QEouVisT1w_V3xdfc=@r26.me Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 160ae65a5c6458..298ebff6bbf84b 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -437,6 +437,15 @@ ieee80211_verify_sta_ht_mcs_support(struct ieee80211_sub_if_data *sdata, memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); + /* + * Some Xfinity XB8 firmware advertises >1 spatial stream MCS indexes in + * their basic HT-MCS set. On cards with lower spatial streams, the check + * would fail, and we'd be stuck with no HT when it in fact work fine with + * its own supported rate. So check it only in strict mode. + */ + if (!ieee80211_hw_check(&sdata->local->hw, STRICT)) + return true; + /* * P802.11REVme/D7.0 - 6.5.4.2.4 * ... From c623b63580880cc742255eaed3d79804c1b91143 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Thu, 16 Apr 2026 11:33:39 +0200 Subject: [PATCH 050/972] wifi: brcmfmac: Fix potential use-after-free issue when stopping watchdog task Watchdog task might end between send_sig() and kthread_stop() calls, what results in the use-after-free issue. Fix this by increasing watchdog task reference count before calling send_sig() and dropping it by switching to kthread_stop_put(). Cc: stable@vger.kernel.org Fixes: 373c83a801f1 ("brcmfmac: stop watchdog before detach and free everything") Fixes: a9ffda88be74 ("brcm80211: fmac: abstract bus_stop interface function pointer") Signed-off-by: Marek Szyprowski Acked-by: Arend van Spriel Link: https://patch.msgid.link/20260416093339.2066829-1-m.szyprowski@samsung.com Signed-off-by: Johannes Berg --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index 30f6fcb6863279..8fb595733b9c36 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -2476,8 +2476,9 @@ static void brcmf_sdio_bus_stop(struct device *dev) brcmf_dbg(TRACE, "Enter\n"); if (bus->watchdog_tsk) { + get_task_struct(bus->watchdog_tsk); send_sig(SIGTERM, bus->watchdog_tsk, 1); - kthread_stop(bus->watchdog_tsk); + kthread_stop_put(bus->watchdog_tsk); bus->watchdog_tsk = NULL; } @@ -4567,8 +4568,9 @@ void brcmf_sdio_remove(struct brcmf_sdio *bus) if (bus) { /* Stop watchdog task */ if (bus->watchdog_tsk) { + get_task_struct(bus->watchdog_tsk); send_sig(SIGTERM, bus->watchdog_tsk, 1); - kthread_stop(bus->watchdog_tsk); + kthread_stop_put(bus->watchdog_tsk); bus->watchdog_tsk = NULL; } From 1f4f78bf8549e6ac4f04fba4176854f3a6e0c332 Mon Sep 17 00:00:00 2001 From: Tristan Madani Date: Fri, 17 Apr 2026 11:11:44 +0000 Subject: [PATCH 051/972] wifi: b43: enforce bounds check on firmware key index in b43_rx() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The firmware-controlled key index in b43_rx() can exceed the dev->key[] array size (58 entries). The existing B43_WARN_ON is non-enforcing in production builds, allowing an out-of-bounds read. Make the B43_WARN_ON check enforcing by dropping the frame when the firmware returns an invalid key index. Suggested-by: Jonas Gorski Acked-by: Michael Büsch Fixes: e4d6b7951812 ("[B43]: add mac80211-based driver for modern BCM43xx devices") Cc: stable@vger.kernel.org Signed-off-by: Tristan Madani Link: https://patch.msgid.link/20260417111145.2694196-1-tristmd@gmail.com Signed-off-by: Johannes Berg --- drivers/net/wireless/broadcom/b43/xmit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/b43/xmit.c b/drivers/net/wireless/broadcom/b43/xmit.c index 7651b1bdb59266..f0b082596637ff 100644 --- a/drivers/net/wireless/broadcom/b43/xmit.c +++ b/drivers/net/wireless/broadcom/b43/xmit.c @@ -702,7 +702,8 @@ void b43_rx(struct b43_wldev *dev, struct sk_buff *skb, const void *_rxhdr) * key index, but the ucode passed it slightly different. */ keyidx = b43_kidx_to_raw(dev, keyidx); - B43_WARN_ON(keyidx >= ARRAY_SIZE(dev->key)); + if (B43_WARN_ON(keyidx >= ARRAY_SIZE(dev->key))) + goto drop; if (dev->key[keyidx].algorithm != B43_SEC_ALGO_NONE) { wlhdr_len = ieee80211_hdrlen(fctl); From a035766f970bde2d4298346a31a80685be5c0205 Mon Sep 17 00:00:00 2001 From: Tristan Madani Date: Fri, 17 Apr 2026 11:11:45 +0000 Subject: [PATCH 052/972] wifi: b43legacy: enforce bounds check on firmware key index in RX path Same fix as b43: the firmware-controlled key index in b43legacy_rx() can exceed dev->max_nr_keys. The existing B43legacy_WARN_ON is non-enforcing in production builds, allowing an out-of-bounds read of dev->key[]. Make the check enforcing by dropping the frame for invalid indices. Fixes: 75388acd0cd8 ("[B43LEGACY]: add mac80211-based driver for legacy BCM43xx devices") Cc: stable@vger.kernel.org Signed-off-by: Tristan Madani Link: https://patch.msgid.link/20260417111145.2694196-2-tristmd@gmail.com Signed-off-by: Johannes Berg --- drivers/net/wireless/broadcom/b43legacy/xmit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/b43legacy/xmit.c b/drivers/net/wireless/broadcom/b43legacy/xmit.c index efd63f4ce74f2b..ee199d4eaf039a 100644 --- a/drivers/net/wireless/broadcom/b43legacy/xmit.c +++ b/drivers/net/wireless/broadcom/b43legacy/xmit.c @@ -476,7 +476,8 @@ void b43legacy_rx(struct b43legacy_wldev *dev, * key index, but the ucode passed it slightly different. */ keyidx = b43legacy_kidx_to_raw(dev, keyidx); - B43legacy_WARN_ON(keyidx >= dev->max_nr_keys); + if (B43legacy_WARN_ON(keyidx >= dev->max_nr_keys)) + goto drop; if (dev->key[keyidx].algorithm != B43legacy_SEC_ALGO_NONE) { /* Remove PROTECTED flag to mark it as decrypted. */ From 3994b4afd521d60e47e012fe2ed7b606aaec370b Mon Sep 17 00:00:00 2001 From: Amir Mohammad Jahangirzad Date: Sat, 18 Apr 2026 04:12:47 +0330 Subject: [PATCH 053/972] wifi: libertas: fix integer underflow in process_cmdrequest() The existing validation only checks if recvlength exceeds LBS_CMD_BUFFER_SIZE, but doesn't check the lower bound. When a USB device sends a response shorter than MESSAGE_HEADER_LEN, the subtraction (recvlength - MESSAGE_HEADER_LEN) wraps to a huge value, causing memcpy to corrupt the heap. Add the same lower bound check that libertas_tf already has. Signed-off-by: Amir Mohammad Jahangirzad Link: https://patch.msgid.link/20260418004247.368944-1-a.jahangirzad@gmail.com Signed-off-by: Johannes Berg --- drivers/net/wireless/marvell/libertas/if_usb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/marvell/libertas/if_usb.c b/drivers/net/wireless/marvell/libertas/if_usb.c index 4fae0e33513661..a00d53350fa971 100644 --- a/drivers/net/wireless/marvell/libertas/if_usb.c +++ b/drivers/net/wireless/marvell/libertas/if_usb.c @@ -633,9 +633,10 @@ static inline void process_cmdrequest(int recvlength, uint8_t *recvbuff, unsigned long flags; u8 i; - if (recvlength > LBS_CMD_BUFFER_SIZE) { + if (recvlength < MESSAGE_HEADER_LEN || + recvlength > LBS_CMD_BUFFER_SIZE) { lbs_deb_usbd(&cardp->udev->dev, - "The receive buffer is too large\n"); + "The receive buffer is invalid: %d\n", recvlength); kfree_skb(skb); return; } From 381cd547bc6e35a610c5dfebe554d891eea40f03 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Tue, 21 Apr 2026 18:45:52 -0400 Subject: [PATCH 054/972] wifi: nl80211: require admin perm on SET_PMK / DEL_PMK NL80211_CMD_SET_PMK and NL80211_CMD_DEL_PMK manage the offloaded 4-way-handshake PMK state used by drivers advertising NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X. The only in-tree driver that wires up both ->set_pmk / ->del_pmk and advertises the feature today is brcmfmac, so the practical reach of this patch is narrow. Both ops were introduced without a .flags gate, so the generic netlink layer dispatches them to an unprivileged caller instead of rejecting with -EPERM at the permission check. Every other connection-state op in the adjacent block (CONNECT, ASSOCIATE, AUTHENTICATE, SET_KEY, ...) carries GENL_UNS_ADMIN_PERM; SET_PMK / DEL_PMK were introduced without the flag in 2017 and left unchanged by later refactors. Johannes checked the original Intel submission history and confirmed there is no admin check in any prior revision either, so this seems likely to be a simple oversight rather than an intentional carve-out. Require GENL_UNS_ADMIN_PERM so the genl layer performs the same capable(CAP_NET_ADMIN) check as its siblings. wpa_supplicant already needs CAP_NET_ADMIN for every other nl80211 op it issues, so supplicant operation is unaffected. The worst case the missing gate enables today is an unprivileged local process on a multi-user system invalidating the offloaded PMK state of another user's 4-way-handshake session, forcing a full EAP re-auth on the next reconnect. Verified in UML: an unprivileged probe (uid=1000) sees SET_MULTICAST_TO_UNICAST (sibling op with GENL_UNS_ADMIN_PERM) return -EPERM on both pre- and post-fix kernels, while SET_PMK / DEL_PMK return -ENODEV from nl80211_pre_doit()'s wdev lookup pre- fix (proving dispatch crossed the genl permission check) and -EPERM post-fix (rejected at the genl layer as intended). Suggested-by: Johannes Berg Fixes: 3a00df5707b6 ("cfg80211: support 4-way handshake offloading for 802.1X") Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Michael Bommarito Acked-by: Arend van Spriel Link: https://patch.msgid.link/20260421224552.4044147-1-michael.bommarito@gmail.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f334cdef895870..67088804dcc796 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -19828,6 +19828,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .cmd = NL80211_CMD_SET_PMK, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_pmk, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_CLEAR_SKB), }, @@ -19835,6 +19836,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .cmd = NL80211_CMD_DEL_PMK, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_pmk, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { From 9b55d5c1f5e481e391957f9096d798ca331c461b Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Tue, 21 Apr 2026 20:06:51 -0400 Subject: [PATCH 055/972] wifi: mac80211: check ieee80211_rx_data_set_link return in pubsta MLO path __ieee80211_rx_handle_packet() resolves the link via ieee80211_rx_data_set_link() on the pubsta->mlo path but ignores the helper's return value. Inside the helper, rx->link = rcu_dereference(rx->sdata->link[link_id]); can leave rx->link NULL if link_id references a slot already cleared by ieee80211_vif_set_links() during station-initiated ML reconfiguration (see mlme.c's ieee80211_ml_reconfiguration(), which invalidates sdata->link[] before the matching ieee80211_sta_remove_link() loop walks the link-sta hash). RX dispatch still resolves a link_sta from the hash and then drops into ieee80211_prepare_and_rx_handle(), which dereferences link->conf->addr. Every other user site of ieee80211_rx_data_set_link() checks the return and bails on failure; only this branch did not. Mirror the safe pattern. Fixes: e66b7920aa5a ("wifi: mac80211: fix initialization of rx->link and rx->link_sta") Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Michael Bommarito Link: https://patch.msgid.link/20260422000651.4184602-1-michael.bommarito@gmail.com Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3e5d1c47a5b067..5a92413a911f2d 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -5380,7 +5380,9 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, if (!link_sta) goto out; - ieee80211_rx_data_set_link(&rx, link_sta->link_id); + if (!ieee80211_rx_data_set_link(&rx, + link_sta->link_id)) + goto out; } if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) From 7a5b81e0c87a075afd572f659d8eb68c9c4cd2ba Mon Sep 17 00:00:00 2001 From: Catherine Date: Fri, 24 Apr 2026 21:14:36 +0800 Subject: [PATCH 056/972] wifi: mac80211: drop stray 'static' from fast-RX rx_result ieee80211_invoke_fast_rx() is documented as safe for parallel RX, but its per-invocation rx_result is declared static. Concurrent callers then share one instance and can overwrite each other's result between ieee80211_rx_mesh_data() and the switch on res. That can make a packet that was queued or consumed by ieee80211_rx_mesh_data() fall through into ieee80211_rx_8023(), or make a packet that should continue return as queued. Make res an automatic variable so each invocation keeps its own result. Fixes: 3468e1e0c639 ("wifi: mac80211: add mesh fast-rx support") Cc: stable@vger.kernel.org Signed-off-by: Catherine Link: https://patch.msgid.link/20260424131435.83212-2-enderaoelyther@gmail.com Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 5a92413a911f2d..d18e962126ce4d 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4971,7 +4971,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, struct sk_buff *skb = rx->skb; struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - static ieee80211_rx_result res; + ieee80211_rx_result res; int orig_len = skb->len; int hdrlen = ieee80211_hdrlen(hdr->frame_control); int snap_offs = hdrlen; From 58c0ac6125d89bf6ec65a521eaeb52a0e8e20a9f Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 20 Apr 2026 08:42:03 +0000 Subject: [PATCH 057/972] iommu/amd: Use maximum Event log buffer size when SNP is enabled on Family 0x19 Due to CVE-2023-20585, the Event log buffer must use the maximum supported size (512K) on Milan/Genoa (Family 0x19) systems when SNP is enabled, to mitigate a potential security vulnerability. All other systems continue to use the default Event log buffer size (8K). Apply the errata fix by making the following changes: * Introduce new global variable (amd_iommu_evtlog_size) to have event log buffer size. Adjust variable size for family 0x19. * Since 'iommu_snp_enable()' must be called after the core IOMMU subsystem is initialized, it cannot be moved to the early init stage. The SNP errata must also be applied after the 'iommu_snp_enable()' check. Therefore, 'alloc_event_buffer()' and 'iommu_enable_event_buffer()' are now called in the IOMMU_ENABLED state, after the errata is applied. * Adjust alloc_event_buffer() and iommu_enable_event_buffer() to handle all IOMMU instances. * Also rename EVT_* macros to make it more readable. Link: https://www.amd.com/en/resources/product-security/bulletin/amd-sb-3016.html Cc: Borislav Petkov Cc: Suravee Suthikulpanit Cc: Joerg Roedel Signed-off-by: Vasant Hegde Tested-by: Dheeraj Kumar Srivastava Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu.h | 2 + drivers/iommu/amd/amd_iommu_types.h | 10 ++- drivers/iommu/amd/init.c | 110 +++++++++++++++++++--------- drivers/iommu/amd/iommu.c | 2 +- 4 files changed, 86 insertions(+), 38 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 1342e764a5486d..f1c486dcf0f384 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -11,6 +11,8 @@ #include "amd_iommu_types.h" +extern int amd_iommu_evtlog_size; + irqreturn_t amd_iommu_int_thread(int irq, void *data); irqreturn_t amd_iommu_int_thread_evtlog(int irq, void *data); irqreturn_t amd_iommu_int_thread_pprlog(int irq, void *data); diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index c685d3771436a2..c3430c09bc5c97 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -141,7 +142,6 @@ #define MMIO_STATUS_GALOG_INT_MASK BIT(10) /* event logging constants */ -#define EVENT_ENTRY_SIZE 0x10 #define EVENT_TYPE_SHIFT 28 #define EVENT_TYPE_MASK 0xf #define EVENT_TYPE_ILL_DEV 0x1 @@ -259,8 +259,12 @@ #define MMIO_CMD_BUFFER_TAIL(x) FIELD_GET(MMIO_CMD_TAIL_MASK, (x)) /* constants for event buffer handling */ -#define EVT_BUFFER_SIZE 8192 /* 512 entries */ -#define EVT_LEN_MASK (0x9ULL << 56) +#define EVTLOG_ENTRY_SIZE 0x10 +#define EVTLOG_SIZE_SHIFT 56 +#define EVTLOG_SIZE_DEF SZ_8K /* 512 entries */ +#define EVTLOG_LEN_MASK_DEF (0x9ULL << EVTLOG_SIZE_SHIFT) +#define EVTLOG_SIZE_MAX SZ_512K /* 32K entries */ +#define EVTLOG_LEN_MASK_MAX (0xFULL << EVTLOG_SIZE_SHIFT) /* Constants for PPR Log handling */ #define PPR_LOG_ENTRIES 512 diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 56ad020df49497..d8dc5c6db29d69 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -132,6 +132,8 @@ struct ivhd_entry { u8 uid; } __attribute__((packed)); +int amd_iommu_evtlog_size = EVTLOG_SIZE_DEF; + /* * An AMD IOMMU memory definition structure. It defines things like exclusion * ranges for devices and regions that should be unity mapped. @@ -865,35 +867,47 @@ void *__init iommu_alloc_4k_pages(struct amd_iommu *iommu, gfp_t gfp, } /* allocates the memory where the IOMMU will log its events to */ -static int __init alloc_event_buffer(struct amd_iommu *iommu) +static int __init alloc_event_buffer(void) { - iommu->evt_buf = iommu_alloc_4k_pages(iommu, GFP_KERNEL, - EVT_BUFFER_SIZE); + struct amd_iommu *iommu; - return iommu->evt_buf ? 0 : -ENOMEM; + for_each_iommu(iommu) { + iommu->evt_buf = iommu_alloc_4k_pages(iommu, GFP_KERNEL, + amd_iommu_evtlog_size); + if (!iommu->evt_buf) + return -ENOMEM; + } + + return 0; } -static void iommu_enable_event_buffer(struct amd_iommu *iommu) +static void iommu_enable_event_buffer(void) { + struct amd_iommu *iommu; u64 entry; - BUG_ON(iommu->evt_buf == NULL); + for_each_iommu(iommu) { + BUG_ON(iommu->evt_buf == NULL); - if (!is_kdump_kernel()) { - /* - * Event buffer is re-used for kdump kernel and setting - * of MMIO register is not required. - */ - entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; - memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, - &entry, sizeof(entry)); - } + if (!is_kdump_kernel()) { + /* + * Event buffer is re-used for kdump kernel and setting + * of MMIO register is not required. + */ + entry = iommu_virt_to_phys(iommu->evt_buf); + entry |= (amd_iommu_evtlog_size == EVTLOG_SIZE_DEF) ? + EVTLOG_LEN_MASK_DEF : EVTLOG_LEN_MASK_MAX; + + memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, + &entry, sizeof(entry)); + } - /* set head and tail to zero manually */ - writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); - writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); + /* set head and tail to zero manually */ + writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); + writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); - iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); + iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); + } } /* @@ -984,15 +998,20 @@ static int __init alloc_cwwb_sem(struct amd_iommu *iommu) return 0; } -static int __init remap_event_buffer(struct amd_iommu *iommu) +static int __init remap_event_buffer(void) { + struct amd_iommu *iommu; u64 paddr; pr_info_once("Re-using event buffer from the previous kernel\n"); - paddr = readq(iommu->mmio_base + MMIO_EVT_BUF_OFFSET) & PM_ADDR_MASK; - iommu->evt_buf = iommu_memremap(paddr, EVT_BUFFER_SIZE); + for_each_iommu(iommu) { + paddr = readq(iommu->mmio_base + MMIO_EVT_BUF_OFFSET) & PM_ADDR_MASK; + iommu->evt_buf = iommu_memremap(paddr, amd_iommu_evtlog_size); + if (!iommu->evt_buf) + return -ENOMEM; + } - return iommu->evt_buf ? 0 : -ENOMEM; + return 0; } static int __init remap_command_buffer(struct amd_iommu *iommu) @@ -1044,10 +1063,6 @@ static int __init alloc_iommu_buffers(struct amd_iommu *iommu) ret = remap_command_buffer(iommu); if (ret) return ret; - - ret = remap_event_buffer(iommu); - if (ret) - return ret; } else { ret = alloc_cwwb_sem(iommu); if (ret) @@ -1056,10 +1071,6 @@ static int __init alloc_iommu_buffers(struct amd_iommu *iommu) ret = alloc_command_buffer(iommu); if (ret) return ret; - - ret = alloc_event_buffer(iommu); - if (ret) - return ret; } return 0; @@ -2893,7 +2904,6 @@ static void early_enable_iommu(struct amd_iommu *iommu) iommu_init_flags(iommu); iommu_set_device_table(iommu); iommu_enable_command_buffer(iommu); - iommu_enable_event_buffer(iommu); iommu_set_exclusion_range(iommu); iommu_enable_gt(iommu); iommu_enable_ga(iommu); @@ -2957,7 +2967,6 @@ static void early_enable_iommus(void) iommu_disable_event_buffer(iommu); iommu_disable_irtcachedis(iommu); iommu_enable_command_buffer(iommu); - iommu_enable_event_buffer(iommu); iommu_enable_ga(iommu); iommu_enable_xt(iommu); iommu_enable_irtcachedis(iommu); @@ -3070,6 +3079,7 @@ static void amd_iommu_resume(void *data) for_each_iommu(iommu) early_enable_iommu(iommu); + iommu_enable_event_buffer(); amd_iommu_enable_interrupts(); } @@ -3399,6 +3409,23 @@ static __init void iommu_snp_enable(void) #endif } +static void amd_iommu_apply_erratum_snp(void) +{ +#ifdef CONFIG_KVM_AMD_SEV + if (!amd_iommu_snp_en) + return; + + /* Errata fix for Family 0x19 */ + if (boot_cpu_data.x86 != 0x19) + return; + + /* Set event log buffer size to max */ + amd_iommu_evtlog_size = EVTLOG_SIZE_MAX; + pr_info("Applying erratum: Increase Event log size to 0x%x\n", + amd_iommu_evtlog_size); +#endif +} + /**************************************************************************** * * AMD IOMMU Initialization State Machine @@ -3435,6 +3462,21 @@ static int __init state_next(void) case IOMMU_ENABLED: register_syscore(&amd_iommu_syscore); iommu_snp_enable(); + + amd_iommu_apply_erratum_snp(); + + /* Allocate/enable event log buffer */ + if (is_kdump_kernel()) + ret = remap_event_buffer(); + else + ret = alloc_event_buffer(); + + if (ret) { + init_state = IOMMU_INIT_ERROR; + break; + } + iommu_enable_event_buffer(); + ret = amd_iommu_init_pci(); init_state = ret ? IOMMU_INIT_ERROR : IOMMU_PCI_INIT; break; @@ -4037,7 +4079,7 @@ int amd_iommu_snp_disable(void) return 0; for_each_iommu(iommu) { - ret = iommu_make_shared(iommu->evt_buf, EVT_BUFFER_SIZE); + ret = iommu_make_shared(iommu->evt_buf, amd_iommu_evtlog_size); if (ret) return ret; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 01171361f9bc22..157505d96fdd28 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1010,7 +1010,7 @@ static void iommu_poll_events(struct amd_iommu *iommu) iommu_print_event(iommu, iommu->evt_buf + head); /* Update head pointer of hardware ring-buffer */ - head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE; + head = (head + EVTLOG_ENTRY_SIZE) % amd_iommu_evtlog_size; writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); } From 1f44aab79bac31f459422dfb213e907bb386509c Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 20 Apr 2026 08:42:04 +0000 Subject: [PATCH 058/972] iommu/amd: Use maximum PPR log buffer size when SNP is enabled on Family 0x19 Due to CVE-2023-20585, the PPR log buffer must use the maximum supported size (512K) on Genoa (Family 0x19, model >= 0x10) systems when SNP is enabled, to mitigate a potential security vulnerability. Note that Family 0x19 models below 0x10 (Milan) do not support PPR when SNP is enabled. Hence the PPR log size increase is only applied for model >= 0x10. All other systems continue to use the default PPR log buffer size (8K). Apply the errata fix by making the following changes: - Introduce global new variable (amd_iommu_pprlog_size) to have PPR log buffer size. Adjust variable size for Genoa family. - Extend 'amd_iommu_apply_erratum_snp()' to also set the PPR log buffer size to maximum for Family 0x19 model >= 0x10 when SNP is enabled. - Rename PPR_* macros to make it more readable. Link: https://www.amd.com/en/resources/product-security/bulletin/amd-sb-3016.html Cc: Borislav Petkov Cc: Suravee Suthikulpanit Cc: Joerg Roedel Signed-off-by: Vasant Hegde Tested-by: Dheeraj Kumar Srivastava Signed-off-by: Joerg Roedel --- drivers/iommu/amd/amd_iommu.h | 1 + drivers/iommu/amd/amd_iommu_types.h | 11 ++++++----- drivers/iommu/amd/init.c | 13 ++++++++++++- drivers/iommu/amd/ppr.c | 8 +++++--- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index f1c486dcf0f384..834d8fabfba387 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -12,6 +12,7 @@ #include "amd_iommu_types.h" extern int amd_iommu_evtlog_size; +extern int amd_iommu_pprlog_size; irqreturn_t amd_iommu_int_thread(int irq, void *data); irqreturn_t amd_iommu_int_thread_evtlog(int irq, void *data); diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index c3430c09bc5c97..f9f71808789303 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -267,11 +267,12 @@ #define EVTLOG_LEN_MASK_MAX (0xFULL << EVTLOG_SIZE_SHIFT) /* Constants for PPR Log handling */ -#define PPR_LOG_ENTRIES 512 -#define PPR_LOG_SIZE_SHIFT 56 -#define PPR_LOG_SIZE_512 (0x9ULL << PPR_LOG_SIZE_SHIFT) -#define PPR_ENTRY_SIZE 16 -#define PPR_LOG_SIZE (PPR_ENTRY_SIZE * PPR_LOG_ENTRIES) +#define PPRLOG_ENTRY_SIZE 0x10 +#define PPRLOG_SIZE_SHIFT 56 +#define PPRLOG_SIZE_DEF SZ_8K /* 512 entries */ +#define PPRLOG_LEN_MASK_DEF (0x9ULL << PPRLOG_SIZE_SHIFT) +#define PPRLOG_SIZE_MAX SZ_512K /* 32K entries */ +#define PPRLOG_LEN_MASK_MAX (0xFULL << PPRLOG_SIZE_SHIFT) /* PAGE_SERVICE_REQUEST PPR Log Buffer Entry flags */ #define PPR_FLAG_EXEC 0x002 /* Execute permission requested */ diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index d8dc5c6db29d69..3bdb380d23e9a9 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -133,6 +133,7 @@ struct ivhd_entry { } __attribute__((packed)); int amd_iommu_evtlog_size = EVTLOG_SIZE_DEF; +int amd_iommu_pprlog_size = PPRLOG_SIZE_DEF; /* * An AMD IOMMU memory definition structure. It defines things like exclusion @@ -3423,6 +3424,16 @@ static void amd_iommu_apply_erratum_snp(void) amd_iommu_evtlog_size = EVTLOG_SIZE_MAX; pr_info("Applying erratum: Increase Event log size to 0x%x\n", amd_iommu_evtlog_size); + + /* + * Set PPR log buffer size to max. + * (Family 0x19, model < 0x10 doesn't support PPR when SNP is enabled). + */ + if (boot_cpu_data.x86_model >= 0x10) { + amd_iommu_pprlog_size = PPRLOG_SIZE_MAX; + pr_info("Applying erratum: Increase PPR log size to 0x%x\n", + amd_iommu_pprlog_size); + } #endif } @@ -4083,7 +4094,7 @@ int amd_iommu_snp_disable(void) if (ret) return ret; - ret = iommu_make_shared(iommu->ppr_log, PPR_LOG_SIZE); + ret = iommu_make_shared(iommu->ppr_log, amd_iommu_pprlog_size); if (ret) return ret; diff --git a/drivers/iommu/amd/ppr.c b/drivers/iommu/amd/ppr.c index e6767c057d01fa..1f8d2823bea42c 100644 --- a/drivers/iommu/amd/ppr.c +++ b/drivers/iommu/amd/ppr.c @@ -20,7 +20,7 @@ int __init amd_iommu_alloc_ppr_log(struct amd_iommu *iommu) { iommu->ppr_log = iommu_alloc_4k_pages(iommu, GFP_KERNEL | __GFP_ZERO, - PPR_LOG_SIZE); + amd_iommu_pprlog_size); return iommu->ppr_log ? 0 : -ENOMEM; } @@ -33,7 +33,9 @@ void amd_iommu_enable_ppr_log(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_PPR_EN); - entry = iommu_virt_to_phys(iommu->ppr_log) | PPR_LOG_SIZE_512; + entry = iommu_virt_to_phys(iommu->ppr_log); + entry |= (amd_iommu_pprlog_size == PPRLOG_SIZE_DEF) ? + PPRLOG_LEN_MASK_DEF : PPRLOG_LEN_MASK_MAX; memcpy_toio(iommu->mmio_base + MMIO_PPR_LOG_OFFSET, &entry, sizeof(entry)); @@ -201,7 +203,7 @@ void amd_iommu_poll_ppr_log(struct amd_iommu *iommu) raw[0] = raw[1] = 0UL; /* Update head pointer of hardware ring-buffer */ - head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE; + head = (head + PPRLOG_ENTRY_SIZE) % amd_iommu_pprlog_size; writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); /* Handle PPR entry */ From f9471dc1a7177f594acf8106cc4a6ab33bf0bcb6 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Fri, 24 Apr 2026 11:50:51 +0000 Subject: [PATCH 059/972] iommu/pages: Fix iommu_pages_flush_incoherent() for non-x86 The dma_sync_single_for_device() function expects a dma_addr_t, but iommu_pages_flush_incoherent() was incorrectly passing a virtual address. Since iommu_pages_start_incoherent() enforces a 1:1 mapping between DMA addresses and physical addresses (checked via WARN_ON), we can convert the virtual address to a physical address before passing it to the DMA API. This also matches the behaviour of the other non-x86 in iommu_pages_free_incoherent(), which uses virt_to_phys(virt); Fixes: 36ae67b13976 ("iommu/pages: Add support for incoherent IOMMU page table walkers") Signed-off-by: Mostafa Saleh Reviewed-by: Lu Baolu Reviewed-by: Pranjal Shrivastava Reviewed-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/iommu-pages.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h index ae9da4f571f614..e9e605b5fa3afc 100644 --- a/drivers/iommu/iommu-pages.h +++ b/drivers/iommu/iommu-pages.h @@ -137,7 +137,7 @@ static inline void iommu_pages_flush_incoherent(struct device *dma_dev, void *virt, size_t offset, size_t len) { - dma_sync_single_for_device(dma_dev, (uintptr_t)virt + offset, len, + dma_sync_single_for_device(dma_dev, virt_to_phys(virt) + offset, len, DMA_TO_DEVICE); } void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list, From 26735dfdd8930d9ef1fa92e590a9bf77726efdf6 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Fri, 17 Apr 2026 13:13:31 +0200 Subject: [PATCH 060/972] pmdomain: core: Fix detach procedure for virtual devices in genpd If a device is attached to a PM domain through genpd_dev_pm_attach_by_id(), genpd calls pm_runtime_enable() for the corresponding virtual device that it registers. While this avoids boilerplate code in drivers, there is no corresponding call to pm_runtime_disable() in genpd_dev_pm_detach(). This means these virtual devices are typically detached from its genpd, while runtime PM remains enabled for them, which is not how things are designed to work. In worst cases it may lead to critical errors, like a NULL pointer dereference bug in genpd_runtime_suspend(), which was recently reported. For another case, we may end up keeping an unnecessary vote for a performance state for the device. To fix these problems, let's add this missing call to pm_runtime_disable() in genpd_dev_pm_detach(). Reported-by: Geert Uytterhoeven Closes: https://lore.kernel.org/all/CAMuHMdWapT40hV3c+CSBqFOW05aWcV1a6v_NiJYgoYi0i9_PDQ@mail.gmail.com/ Fixes: 3c095f32a92b ("PM / Domains: Add support for multi PM domains per device to genpd") Cc: stable@vger.kernel.org Tested-by: Geert Uytterhoeven Signed-off-by: Ulf Hansson --- drivers/pmdomain/core.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c index 4d32fc676aaf53..71e930e80178e9 100644 --- a/drivers/pmdomain/core.c +++ b/drivers/pmdomain/core.c @@ -3089,6 +3089,7 @@ static const struct bus_type genpd_bus_type = { static void genpd_dev_pm_detach(struct device *dev, bool power_off) { struct generic_pm_domain *pd; + bool is_virt_dev; unsigned int i; int ret = 0; @@ -3098,6 +3099,13 @@ static void genpd_dev_pm_detach(struct device *dev, bool power_off) dev_dbg(dev, "removing from PM domain %s\n", pd->name); + /* Check if the device was created by genpd at attach. */ + is_virt_dev = dev->bus == &genpd_bus_type; + + /* Disable runtime PM if we enabled it at attach. */ + if (is_virt_dev) + pm_runtime_disable(dev); + /* Drop the default performance state */ if (dev_gpd_data(dev)->default_pstate) { dev_pm_genpd_set_performance_state(dev, 0); @@ -3123,7 +3131,7 @@ static void genpd_dev_pm_detach(struct device *dev, bool power_off) genpd_queue_power_off_work(pd); /* Unregister the device if it was created by genpd. */ - if (dev->bus == &genpd_bus_type) + if (is_virt_dev) device_unregister(dev); } From ec1fcddb3117d9452210e838fd37389ee61e10e8 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Wed, 8 Apr 2026 14:11:21 +0000 Subject: [PATCH 061/972] pmdomain: mediatek: fix use-after-free in scpsys_get_bus_protection_legacy() In scpsys_get_bus_protection_legacy(), of_find_node_with_property() returns a device node with its reference count incremented. The function then calls of_node_put(node) before checking whether syscon_regmap_lookup_by_phandle() returns an error. If an error occurs, dev_err_probe() dereferences the node pointer to print diagnostic information, but the node memory may have already been freed due to the earlier of_node_put(), leading to a use-after-free vulnerability. Fix this by moving the of_node_put() call after the error check, ensuring the node is still valid when accessed in the error path. Fixes: c29345fa5f66 ("pmdomain: mediatek: Refactor bus protection regmaps retrieval") Cc: stable@vger.kernel.org Signed-off-by: Wentao Liang Signed-off-by: Ulf Hansson --- drivers/pmdomain/mediatek/mtk-pm-domains.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/pmdomain/mediatek/mtk-pm-domains.c b/drivers/pmdomain/mediatek/mtk-pm-domains.c index d2b8d033295153..e1cfd42234734f 100644 --- a/drivers/pmdomain/mediatek/mtk-pm-domains.c +++ b/drivers/pmdomain/mediatek/mtk-pm-domains.c @@ -1015,6 +1015,7 @@ static int scpsys_get_bus_protection_legacy(struct device *dev, struct scpsys *s struct device_node *node, *smi_np; int num_regmaps = 0, i, j; struct regmap *regmap[3]; + int ret = 0; /* * Legacy code retrieves a maximum of three bus protection handles: @@ -1065,11 +1066,14 @@ static int scpsys_get_bus_protection_legacy(struct device *dev, struct scpsys *s if (node) { regmap[2] = syscon_regmap_lookup_by_phandle(node, "mediatek,infracfg-nao"); num_regmaps++; - of_node_put(node); - if (IS_ERR(regmap[2])) - return dev_err_probe(dev, PTR_ERR(regmap[2]), + if (IS_ERR(regmap[2])) { + ret = dev_err_probe(dev, PTR_ERR(regmap[2]), "%pOF: failed to get infracfg regmap\n", node); + of_node_put(node); + return ret; + } + of_node_put(node); } else { regmap[2] = NULL; } From 3f91484f6c13c434bd573ca6b6779c26adb0ddab Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Mon, 13 Apr 2026 21:49:12 +0300 Subject: [PATCH 062/972] USB: omap_udc: DMA: Don't enable burst 4 mode Commit 65111084c63d7 ("USB: more omap_udc updates (dma and omap1710)") added setting for DMA burst 4 mode. But I think this should be undone for two reasons: - It breaks DMA on 15xx boards - transfers just silently stall. - On newer OMAP1 boards, like Nokia 770 (omap1710), there is no measurable performance impact when testing TCP throughput with g_ether with large 15000 byte MTU size. It's also worth noting that when the original change was made, the OMAP_DMA_DATA_BURST_4 handling in arch/arm/plat-omap/dma.c was broken, and actually resulted in the same as the OMAP_DMA_DATA_BURST_DIS i.e. burst disabled. This was fixed not until a couple kernel releases later in an unrelated commit 1a8bfa1eb998a ("[ARM] 3142/1: OMAP 2/5: Update files common to omap1 and omap2"). So based on this it seems there was never really a very good reason to enable this burst mode in omap_udc, so remove it now to allow 15xx DMA to work again (it provides 2x throughput compared to PIO mode). Fixes: 65111084c63d ("[PATCH] USB: more omap_udc updates (dma and omap1710)") Cc: stable Signed-off-by: Aaro Koskinen Link: https://patch.msgid.link/ad06qHLclWHeSGnV@darkstar.musicnaut.iki.fi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/omap_udc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/usb/gadget/udc/omap_udc.c b/drivers/usb/gadget/udc/omap_udc.c index 91139ae668f480..f3ca79cece1bea 100644 --- a/drivers/usb/gadget/udc/omap_udc.c +++ b/drivers/usb/gadget/udc/omap_udc.c @@ -733,8 +733,6 @@ static void dma_channel_claim(struct omap_ep *ep, unsigned channel) if (status == 0) { omap_writew(reg, UDC_TXDMA_CFG); /* EMIFF or SDRC */ - omap_set_dma_src_burst_mode(ep->lch, - OMAP_DMA_DATA_BURST_4); omap_set_dma_src_data_pack(ep->lch, 1); /* TIPB */ omap_set_dma_dest_params(ep->lch, @@ -756,8 +754,6 @@ static void dma_channel_claim(struct omap_ep *ep, unsigned channel) UDC_DATA_DMA, 0, 0); /* EMIFF or SDRC */ - omap_set_dma_dest_burst_mode(ep->lch, - OMAP_DMA_DATA_BURST_4); omap_set_dma_dest_data_pack(ep->lch, 1); } } From 0b9fcab1b8608d429e5f239afb197de928d4de7d Mon Sep 17 00:00:00 2001 From: Felix Gu Date: Tue, 7 Apr 2026 21:21:22 +0800 Subject: [PATCH 063/972] usb: ulpi: fix memory leak on ulpi_register() error paths Commit 01af542392b5 ("usb: ulpi: fix double free in ulpi_register_interface() error path") removed kfree(ulpi) from ulpi_register_interface() to fix a double-free when device_register() fails. But when ulpi_of_register() or ulpi_read_id() fail before device_register() is called, the ulpi allocation is leaked. Add kfree(ulpi) on both error paths to properly clean up the allocation. Fixes: 01af542392b5 ("usb: ulpi: fix double free in ulpi_register_interface() error path") Cc: stable Signed-off-by: Felix Gu Reviewed-by: Heikki Krogerus Link: https://patch.msgid.link/20260407-ulpi-v1-1-f3fafe53f7b2@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/common/ulpi.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/usb/common/ulpi.c b/drivers/usb/common/ulpi.c index b34fb65813c45e..9b69148128e5bf 100644 --- a/drivers/usb/common/ulpi.c +++ b/drivers/usb/common/ulpi.c @@ -286,12 +286,15 @@ static int ulpi_register(struct device *dev, struct ulpi *ulpi) ACPI_COMPANION_SET(&ulpi->dev, ACPI_COMPANION(dev)); ret = ulpi_of_register(ulpi); - if (ret) + if (ret) { + kfree(ulpi); return ret; + } ret = ulpi_read_id(ulpi); if (ret) { of_node_put(ulpi->dev.of_node); + kfree(ulpi); return ret; } From 2909f0d4994fb4306bf116df5ccee797791fce2c Mon Sep 17 00:00:00 2001 From: Amit Sunil Dhamne Date: Tue, 14 Apr 2026 00:58:32 +0000 Subject: [PATCH 064/972] usb: typec: tcpm: reset internal port states on soft reset AMS Reset internal port states (such as vdm_sm_running and explicit_contract) on soft reset AMS as the port needs to negotiate a new contract. The consequence of leaving the states in as-is cond are as follows: * port is in SRC power role and an explicit contract is negotiated with the port partner (in sink role) * port partner sends a Soft Reset AMS while VDM State Machine is running * port accepts the Soft Reset request and the port advertises src caps * port partner sends a Request message but since the explicit_contract and vdm_sm_running are true from previous negotiation, the port ends up sending Soft Reset instead of Accept msg. Stub Log: [ 203.653942] AMS DISCOVER_IDENTITY start [ 203.653947] PD TX, header: 0x176f [ 203.655901] PD TX complete, status: 0 [ 203.657470] PD RX, header: 0x124f [1] [ 203.657477] Rx VDM cmd 0xff008081 type 2 cmd 1 len 1 [ 203.657482] AMS DISCOVER_IDENTITY finished [ 203.657484] cc:=4 [ 204.155698] PD RX, header: 0x144f [1] [ 204.155718] Rx VDM cmd 0xeeee8001 type 0 cmd 1 len 1 [ 204.155741] PD TX, header: 0x196f [ 204.157622] PD TX complete, status: 0 [ 204.160060] PD RX, header: 0x4d [1] [ 204.160066] state change SRC_READY -> SOFT_RESET [rev2 SOFT_RESET_AMS] [ 204.160076] PD TX, header: 0x163 [ 204.162486] PD TX complete, status: 0 [ 204.162832] AMS SOFT_RESET_AMS finished [ 204.162840] cc:=4 [ 204.162891] AMS POWER_NEGOTIATION start [ 204.162896] state change SOFT_RESET -> AMS_START [rev2 POWER_NEGOTIATION] [ 204.162908] state change AMS_START -> SRC_SEND_CAPABILITIES [rev2 POWER_NEGOTIATION] [ 204.162913] PD TX, header: 0x1361 [ 204.165529] PD TX complete, status: 0 [ 204.165571] pending state change SRC_SEND_CAPABILITIES -> SRC_SEND_CAPABILITIES_TIMEOUT @ 60 ms [rev2 POWER_NEGOTIATION] [ 204.166996] PD RX, header: 0x1242 [1] [ 204.167009] state change SRC_SEND_CAPABILITIES -> SRC_SOFT_RESET_WAIT_SNK_TX [rev2 POWER_NEGOTIATION] [ 204.167019] AMS POWER_NEGOTIATION finished [ 204.167020] cc:=4 [ 204.167083] AMS SOFT_RESET_AMS start [ 204.167086] state change SRC_SOFT_RESET_WAIT_SNK_TX -> SOFT_RESET_SEND [rev2 SOFT_RESET_AMS] [ 204.167092] PD TX, header: 0x16d [ 204.168824] PD TX complete, status: 0 [ 204.168854] pending state change SOFT_RESET_SEND -> HARD_RESET_SEND @ 60 ms [rev2 SOFT_RESET_AMS] [ 204.171876] PD RX, header: 0x43 [1] [ 204.171879] AMS SOFT_RESET_AMS finished This causes COMMON.PROC.PD.11.2 check failure for TEST.PD.VDM.SRC.2_Rev2Src test on the PD compliance tester. Signed-off-by: Amit Sunil Dhamne Fixes: 8d3a0578ad1a ("usb: typec: tcpm: Respond Wait if VDM state machine is running") Fixes: f0690a25a140 ("staging: typec: USB Type-C Port Manager (tcpm)") Cc: stable Reviewed-by: Badhri Jagan Sridharan Acked-by: Heikki Krogerus Link: https://patch.msgid.link/20260414-fix-soft-reset-v1-1-01d7cb9764e2@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index dfbb94ddc98a61..cd5560d6f19e19 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -5935,6 +5935,8 @@ static void run_state_machine(struct tcpm_port *port) /* remove existing capabilities */ tcpm_partner_source_caps_reset(port); tcpm_pd_send_control(port, PD_CTRL_ACCEPT, TCPC_TX_SOP); + port->vdm_sm_running = false; + port->explicit_contract = false; tcpm_ams_finish(port); if (port->pwr_role == TYPEC_SOURCE) { port->upcoming_state = SRC_SEND_CAPABILITIES; From f6ec9bb4acc7182b25a793ad094a764e1cb819a7 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Fri, 24 Apr 2026 15:40:09 +0800 Subject: [PATCH 065/972] usb: typec: tcpm: fix debug accessory mode detection for sink ports The port in debug accessory mode can be either a source or sink. The previous tcpm_port_is_debug() function only checked for source port. Commit 8db73e6a42b6 ("usb: typec: tcpm: allow sink (ufp) to toggle into accessory mode debug") changed the detection logic to support both roles, but left some logic in _tcpm_cc_change() unchanged, This causes the state machine to transition to an incorrect state when operating as a sink in debug accessory mode. Log as below: [ 978.637541] CC1: 0 -> 5, CC2: 0 -> 5 [state TOGGLING, polarity 0, connected] [ 978.637567] state change TOGGLING -> SRC_ATTACH_WAIT [rev1 NONE_AMS] [ 978.637596] pending state change SRC_ATTACH_WAIT -> DEBUG_ACC_ATTACHED @ 180 ms [rev1 NONE_AMS] [ 978.647098] CC1: 5 -> 0, CC2: 5 -> 5 [state SRC_ATTACH_WAIT, polarity 0, connected] [ 978.647115] state change SRC_ATTACH_WAIT -> SRC_ATTACH_WAIT [rev1 NONE_AMS] It should go to SNK_ATTACH_WAIT instead of SRC_ATTACH_WAIT state. To fix this, add tcpm_port_is_debug_source() and tcpm_port_is_debug_sink() helper to explicitly identify the power mode in debug accessory mode. Update the state transition logic in _tcpm_cc_change() to ensure the state machine transitions comply with Type-C specification. Also update the logic in run_state_machine() to keep consistency. Fixes: 8db73e6a42b6 ("usb: typec: tcpm: allow sink (ufp) to toggle into accessory mode debug") Cc: stable Signed-off-by: Xu Yang Acked-by: Heikki Krogerus Reviewed-by: Amit Sunil Dhamne Link: https://patch.msgid.link/20260424074009.2979266-1-xu.yang_2@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index cd5560d6f19e19..55fee96d3342ad 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -732,9 +732,14 @@ static const char * const pd_rev[] = { (tcpm_cc_is_source((port)->cc2) && \ !tcpm_cc_is_source((port)->cc1))) +#define tcpm_port_is_debug_source(port) \ + (tcpm_cc_is_source((port)->cc1) && tcpm_cc_is_source((port)->cc2)) + +#define tcpm_port_is_debug_sink(port) \ + (tcpm_cc_is_sink((port)->cc1) && tcpm_cc_is_sink((port)->cc2)) + #define tcpm_port_is_debug(port) \ - ((tcpm_cc_is_source((port)->cc1) && tcpm_cc_is_source((port)->cc2)) || \ - (tcpm_cc_is_sink((port)->cc1) && tcpm_cc_is_sink((port)->cc2))) + (tcpm_port_is_debug_source(port) || tcpm_port_is_debug_sink(port)) #define tcpm_port_is_audio(port) \ (tcpm_cc_is_audio((port)->cc1) && tcpm_cc_is_audio((port)->cc2)) @@ -5176,7 +5181,7 @@ static void run_state_machine(struct tcpm_port *port) tcpm_set_state(port, SNK_UNATTACHED, PD_T_DRP_SNK); break; case SRC_ATTACH_WAIT: - if (tcpm_port_is_debug(port)) + if (tcpm_port_is_debug_source(port)) tcpm_set_state(port, DEBUG_ACC_ATTACHED, port->timings.cc_debounce_time); else if (tcpm_port_is_audio(port)) @@ -5434,7 +5439,7 @@ static void run_state_machine(struct tcpm_port *port) tcpm_set_state(port, SRC_UNATTACHED, PD_T_DRP_SRC); break; case SNK_ATTACH_WAIT: - if (tcpm_port_is_debug(port)) + if (tcpm_port_is_debug_sink(port)) tcpm_set_state(port, DEBUG_ACC_ATTACHED, PD_T_CC_DEBOUNCE); else if (tcpm_port_is_audio(port)) @@ -5454,7 +5459,7 @@ static void run_state_machine(struct tcpm_port *port) if (tcpm_port_is_disconnected(port)) tcpm_set_state(port, SNK_UNATTACHED, PD_T_PD_DEBOUNCE); - else if (tcpm_port_is_debug(port)) + else if (tcpm_port_is_debug_sink(port)) tcpm_set_state(port, DEBUG_ACC_ATTACHED, PD_T_CC_DEBOUNCE); else if (tcpm_port_is_audio(port)) @@ -6362,10 +6367,10 @@ static void _tcpm_cc_change(struct tcpm_port *port, enum typec_cc_status cc1, switch (port->state) { case TOGGLING: - if (tcpm_port_is_debug(port) || tcpm_port_is_audio(port) || + if (tcpm_port_is_debug_source(port) || tcpm_port_is_audio(port) || tcpm_port_is_source(port)) tcpm_set_state(port, SRC_ATTACH_WAIT, 0); - else if (tcpm_port_is_sink(port)) + else if (tcpm_port_is_debug_sink(port) || tcpm_port_is_sink(port)) tcpm_set_state(port, SNK_ATTACH_WAIT, 0); break; case CHECK_CONTAMINANT: @@ -6373,9 +6378,11 @@ static void _tcpm_cc_change(struct tcpm_port *port, enum typec_cc_status cc1, break; case SRC_UNATTACHED: case ACC_UNATTACHED: - if (tcpm_port_is_debug(port) || tcpm_port_is_audio(port) || + if (tcpm_port_is_debug_source(port) || tcpm_port_is_audio(port) || tcpm_port_is_source(port)) tcpm_set_state(port, SRC_ATTACH_WAIT, 0); + else if (tcpm_port_is_debug_sink(port)) + tcpm_set_state(port, SNK_ATTACH_WAIT, 0); break; case SRC_ATTACH_WAIT: if (tcpm_port_is_disconnected(port) || @@ -6397,7 +6404,7 @@ static void _tcpm_cc_change(struct tcpm_port *port, enum typec_cc_status cc1, } break; case SNK_UNATTACHED: - if (tcpm_port_is_debug(port) || tcpm_port_is_audio(port) || + if (tcpm_port_is_debug_sink(port) || tcpm_port_is_audio(port) || tcpm_port_is_sink(port)) tcpm_set_state(port, SNK_ATTACH_WAIT, 0); break; From aad35f9c926ec220b0742af1ada45666ae667956 Mon Sep 17 00:00:00 2001 From: Selvarasu Ganesan Date: Fri, 17 Apr 2026 12:03:11 +0530 Subject: [PATCH 066/972] usb: dwc3: Move GUID programming after PHY initialization The Linux Version Code is currently written to the GUID register before PHY initialization. Certain PHY implementations (such as Synopsys eUSB PHY performing link_sw_reset) clear the GUID register to its default value during initialization, causing the kernel version information to be lost. Move the GUID register programming to occur after PHY initialization completes to ensure the Linux version information persists. Fixes: fa0ea13e9f1c ("usb: dwc3: core: write LINUX_VERSION_CODE to our GUID register") Cc: stable Reported-by: Pritam Manohar Sutar Signed-off-by: Selvarasu Ganesan Acked-by: Thinh Nguyen Link: https://patch.msgid.link/20260417063314.2359-1-selvarasu.g@samsung.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index 58899b1fa96d29..65213896de998a 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -1359,12 +1359,6 @@ int dwc3_core_init(struct dwc3 *dwc) hw_mode = DWC3_GHWPARAMS0_MODE(dwc->hwparams.hwparams0); - /* - * Write Linux Version Code to our GUID register so it's easy to figure - * out which kernel version a bug was found. - */ - dwc3_writel(dwc, DWC3_GUID, LINUX_VERSION_CODE); - ret = dwc3_phy_setup(dwc); if (ret) return ret; @@ -1398,6 +1392,12 @@ int dwc3_core_init(struct dwc3 *dwc) if (ret) goto err_exit_phy; + /* + * Write Linux Version Code to our GUID register so it's easy to figure + * out which kernel version a bug was found. + */ + dwc3_writel(dwc, DWC3_GUID, LINUX_VERSION_CODE); + dwc3_core_setup_global_control(dwc); dwc3_core_num_eps(dwc); From 7a400c6fe3617e31e690e3f7ca37bb335e0498f3 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 20 Apr 2026 18:11:03 +0200 Subject: [PATCH 067/972] usb: usblp: fix heap leak in IEEE 1284 device ID via short response usblp_ctrl_msg() collapses the usb_control_msg() return value to 0/-errno, discarding the actual number of bytes transferred. A broken printer can complete the GET_DEVICE_ID control transfer short and the driver has no way to know. usblp_cache_device_id_string() reads the 2-byte big-endian length prefix from the response and trusts it (clamped only to the buffer bounds). The buffer is kmalloc(1024) at probe time. A device that sends exactly two bytes (e.g. 0x03 0xFF, claiming a 1023-byte ID) leaves device_id_string[2..1022] holding stale kmalloc heap. That stale data is then exposed: - via the ieee1284_id sysfs attribute (sprintf("%s", buf+2), truncated at the first NUL in the stale heap), and - via the IOCNR_GET_DEVICE_ID ioctl, which copy_to_user()s the full claimed length regardless of NULs, up to 1021 bytes of uninitialized heap, with the leak size chosen by the device. Fix this up by just zapping the buffer with zeros before each request sent to the device. Cc: Pete Zaitcev Assisted-by: gkh_clanker_t1000 Cc: stable Link: https://patch.msgid.link/2026042002-unicorn-greedily-3c63@gregkh Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/usblp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c index 669b9e6879bfa5..e9b848622a3aa5 100644 --- a/drivers/usb/class/usblp.c +++ b/drivers/usb/class/usblp.c @@ -1377,6 +1377,7 @@ static int usblp_cache_device_id_string(struct usblp *usblp) { int err, length; + memset(usblp->device_id_string, 0, USBLP_DEVICE_ID_SIZE); err = usblp_get_id(usblp, 0, usblp->device_id_string, USBLP_DEVICE_ID_SIZE - 1); if (err < 0) { dev_dbg(&usblp->intf->dev, From b38e53cbfb9d84732e5984fbd73e128d592415c5 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 20 Apr 2026 18:11:04 +0200 Subject: [PATCH 068/972] usb: usblp: fix uninitialized heap leak via LPGETSTATUS ioctl Just like in a previous problem in this driver, usblp_ctrl_msg() will collapse the usb_control_msg() return value to 0/-errno, discarding the actual number of bytes transferred. Ideally that short command should be detected and error out, but many printers are known to send "incorrect" responses back so we can't just do that. statusbuf is kmalloc(8) at probe time and never filled before the first LPGETSTATUS ioctl. usblp_read_status() requests 1 byte. If a malicious printer responds with zero bytes, *statusbuf is one byte of stale kmalloc heap, sign-extended into the local int status, which the LPGETSTATUS path then copy_to_user()s directly to the ioctl caller. Fix this all by just zapping out the memory buffer when allocated at probe time. If a later call does a short read, the data will be identical to what the device sent it the last time, so there is no "leak" of information happening. Cc: Pete Zaitcev Assisted-by: gkh_clanker_t1000 Cc: stable Link: https://patch.msgid.link/2026042011-shredder-savage-48c6@gregkh Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/usblp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c index e9b848622a3aa5..746414763da5d6 100644 --- a/drivers/usb/class/usblp.c +++ b/drivers/usb/class/usblp.c @@ -1178,7 +1178,7 @@ static int usblp_probe(struct usb_interface *intf, } /* Allocate buffer for printer status */ - usblp->statusbuf = kmalloc(STATUS_BUF_SIZE, GFP_KERNEL); + usblp->statusbuf = kzalloc(STATUS_BUF_SIZE, GFP_KERNEL); if (!usblp->statusbuf) { retval = -ENOMEM; goto abort; From 74f192205c48333de054620a79d7ce9f4515fb0b Mon Sep 17 00:00:00 2001 From: Sarthak Sharma Date: Mon, 27 Apr 2026 16:54:47 +0530 Subject: [PATCH 069/972] selftests: kselftest: fix wrong test number in ksft_exit_skip ksft_exit_skip() increments ksft_xskip before printing the KTAP result. As a result, ksft_test_num() already includes the skipped test. Adding 1 to ksft_test_num() increments the printed test number again, producing an incorrect test number and wrong KTAP output. Drop the extra increment and print ksft_test_num() directly. Link: https://lore.kernel.org/r/20260427112447.147985-1-sarthak.sharma@arm.com Fixes: b85d387c9b09 ("kselftest: fix TAP output for skipped tests") Signed-off-by: Sarthak Sharma Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 6d809f08ab7b1b..60838b61a2da52 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -450,7 +450,7 @@ static inline __noreturn __printf(1, 2) void ksft_exit_skip(const char *msg, ... */ if (ksft_plan || ksft_test_num()) { ksft_cnt.ksft_xskip++; - printf("ok %u # SKIP ", 1 + ksft_test_num()); + printf("ok %u # SKIP ", ksft_test_num()); } else { printf("1..0 # SKIP "); } From 465b05bae5ac553c13315681c1490dc565337771 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 22 Apr 2026 14:32:33 +0200 Subject: [PATCH 070/972] selftests: harness: Restore order of test functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recent addition of explicit constructor orders for fixture tests broke the ordering of those relative to non-fixture tests and the reverse-constructor-order detection. Restore the ordering of the test functions relative to each other by using the same explicit test order for all test registrations and __constructor_order_first(). Rename the constant, as it is not specific to TEST_F() anymore. Link: https://lore.kernel.org/r/20260422-kselftests-harness-order-v2-1-93ea980ea3ac@linutronix.de Fixes: 6be268151426 ("selftests/harness: order TEST_F and XFAIL_ADD constructors") Signed-off-by: Thomas Weißschuh Reviewed-by: Kees Cook Signed-off-by: Shuah Khan --- tools/testing/selftests/kselftest_harness.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 75fb016cd190bb..cfdce9cd252e75 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -76,7 +76,7 @@ static inline void __kselftest_memset_safe(void *s, int c, size_t n) memset(s, c, n); } -#define KSELFTEST_PRIO_TEST_F 20000 +#define KSELFTEST_PRIO_TEST 20000 #define KSELFTEST_PRIO_XFAIL 20001 #define TEST_TIMEOUT_DEFAULT 30 @@ -194,7 +194,7 @@ static inline void __kselftest_memset_safe(void *s, int c, size_t n) .fixture = &_fixture_global, \ .termsig = _signal, \ .timeout = TEST_TIMEOUT_DEFAULT, }; \ - static void __attribute__((constructor)) _register_##test_name(void) \ + static void __attribute__((constructor(KSELFTEST_PRIO_TEST))) _register_##test_name(void) \ { \ __register_test(&_##test_name##_object); \ } \ @@ -238,7 +238,7 @@ static inline void __kselftest_memset_safe(void *s, int c, size_t n) FIXTURE_VARIANT(fixture_name); \ static struct __fixture_metadata _##fixture_name##_fixture_object = \ { .name = #fixture_name, }; \ - static void __attribute__((constructor)) \ + static void __attribute__((constructor(KSELFTEST_PRIO_TEST))) \ _register_##fixture_name##_data(void) \ { \ __register_fixture(&_##fixture_name##_fixture_object); \ @@ -364,7 +364,7 @@ static inline void __kselftest_memset_safe(void *s, int c, size_t n) _##fixture_name##_##variant_name##_object = \ { .name = #variant_name, \ .data = &_##fixture_name##_##variant_name##_variant}; \ - static void __attribute__((constructor)) \ + static void __attribute__((constructor(KSELFTEST_PRIO_TEST))) \ _register_##fixture_name##_##variant_name(void) \ { \ __register_fixture_variant(&_##fixture_name##_fixture_object, \ @@ -468,7 +468,7 @@ static inline void __kselftest_memset_safe(void *s, int c, size_t n) fixture_name##_teardown(_metadata, self, variant); \ } \ static struct __test_metadata *_##fixture_name##_##test_name##_object; \ - static void __attribute__((constructor(KSELFTEST_PRIO_TEST_F))) \ + static void __attribute__((constructor(KSELFTEST_PRIO_TEST))) \ _register_##fixture_name##_##test_name(void) \ { \ struct __test_metadata *object = mmap(NULL, sizeof(*object), \ @@ -1323,7 +1323,7 @@ static int test_harness_run(int argc, char **argv) return KSFT_FAIL; } -static void __attribute__((constructor)) __constructor_order_first(void) +static void __attribute__((constructor(KSELFTEST_PRIO_TEST))) __constructor_order_first(void) { __constructor_order_forward = true; } From cfcbfe5cb11650d53f7cafd7adfd556690b77114 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 21 Apr 2026 08:06:44 -0700 Subject: [PATCH 071/972] PCI: Don't fallback to bus reset after failed slot reset If a bus has hotplug slots that implement the slot's reset_slot callback, it is not safe to do the non-slot specific bus reset, so don't fallback to it. If a slot reset does fail, the subsequent bus reset will attempt a 2nd link reset on top of previous and fail to handle the hotplug events. Fixes: 8238cb69c01fe ("PCI: Make reset_subordinate hotplug safe") Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20260421150644.3543733-1-kbusch@meta.com --- drivers/pci/pci.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 8f7cfcc000901d..d34266651ad09f 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5607,13 +5607,14 @@ static int pci_try_reset_bus(struct pci_bus *bus) * reset for affected devices * * This function will first try to reset the slots on this bus if the method is - * available. If slot reset fails or is not available, this will fall back to a + * available. If slot reset is not available, this will fall back to a * secondary bus reset. */ static int pci_reset_bridge(struct pci_dev *bridge, bool restore) { struct pci_bus *bus = bridge->subordinate; struct pci_slot *slot; + int ret = 0; if (!bus) return -ENOTTY; @@ -5627,19 +5628,17 @@ static int pci_reset_bridge(struct pci_dev *bridge, bool restore) goto bus_reset; list_for_each_entry(slot, &bus->slots, list) { - int ret; - if (restore) ret = pci_try_reset_slot(slot); else ret = pci_slot_reset(slot, PCI_RESET_DO_RESET); if (ret) - goto bus_reset; + break; } mutex_unlock(&pci_slot_mutex); - return 0; + return ret; bus_reset: mutex_unlock(&pci_slot_mutex); From ada95e5e603bc6e353ee029f2ba7a7d9a42ad018 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Wed, 22 Apr 2026 17:06:46 +0300 Subject: [PATCH 072/972] tools/selftests: Use a sensible timeout value for iperf3 client The default timeout of cmd() is 5 seconds and Iperf3Runner requests the iperf3 client to run for 10 seconds, which clearly doesn't work since commit [1] enforced the timeout parameter. Use a value derived from duration as timeout (+5 seconds for startup/teardown/various other overhead). [1] commit f0bd19316663 ("selftests: net: fix timeout passed as positional argument to communicate()") Signed-off-by: Cosmin Ratiu Signed-off-by: Steffen Klassert --- tools/testing/selftests/drivers/net/lib/py/load.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/lib/py/load.py b/tools/testing/selftests/drivers/net/lib/py/load.py index f181fa2d38fca0..e24660e5c27f3b 100644 --- a/tools/testing/selftests/drivers/net/lib/py/load.py +++ b/tools/testing/selftests/drivers/net/lib/py/load.py @@ -48,7 +48,10 @@ def start_client(self, background=False, streams=1, duration=10, reverse=False): Starts the iperf3 client with the configured options. """ cmdline = self._build_client(streams, duration, reverse) - return cmd(cmdline, background=background, host=self.env.remote) + kwargs = {"background": background, "host": self.env.remote} + if not background: + kwargs["timeout"] = duration + 5 + return cmd(cmdline, **kwargs) def measure_bandwidth(self, reverse=False): """ From e64e03b478e2da7093564819e903932fca2ddfa1 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Wed, 22 Apr 2026 17:06:47 +0300 Subject: [PATCH 073/972] tools/selftests: Add a VXLAN+IPsec traffic test There are VXLAN tests and IPsec tests, but there is no test that combines the two protocols and exercises the tunnel-over-ipsec code paths. Fix that by adding a traffic test with VXLAN and IPsec using crypto offload. This is runnable on HW which supports ESP offload (so no nsim unfortunately). Traffic is done with iperf3 and the test validates that there are no packet drops and iperf3 can get to at least 100 Mbps (a very conservative value on today's crypto offload HW, as it can typically reach multi-Gbps rates). Ran right now, the test fails due to a recently exposed bug in xfrm, which will be fixed in the next patch: # ./tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py TAP version 13 1..4 # Check| At ./tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py, # line 161, in test_vxlan_ipsec_crypto_offload: # Check| ksft_eq(drops_after - drops_before, 0, # Check failed 189 != 0 TX drops during VXLAN+IPsec # Check| At ./tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py, # line 163, in test_vxlan_ipsec_crypto_offload: # Check| ksft_ge(bw_gbps, 0.1, # Check failed 0.0015058278404812596 < 0.1 Minimum 100Mbps over # VXLAN+IPsec not ok 1 ipsec_vxlan.test_vxlan_ipsec_crypto_offload.outer_v4_inner_v4 ... Signed-off-by: Cosmin Ratiu Signed-off-by: Steffen Klassert --- .../testing/selftests/drivers/net/hw/Makefile | 1 + tools/testing/selftests/drivers/net/hw/config | 5 + .../selftests/drivers/net/hw/ipsec_vxlan.py | 204 ++++++++++++++++++ 3 files changed, 210 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 85ca4d1ecf9ecb..3b6ff4708005d9 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -30,6 +30,7 @@ TEST_PROGS = \ gro_hw.py \ hw_stats_l3.sh \ hw_stats_l3_gre.sh \ + ipsec_vxlan.py \ iou-zcrx.py \ irq.py \ loopback.sh \ diff --git a/tools/testing/selftests/drivers/net/hw/config b/tools/testing/selftests/drivers/net/hw/config index dd50cb8a79110c..ae0168c2bbe6fa 100644 --- a/tools/testing/selftests/drivers/net/hw/config +++ b/tools/testing/selftests/drivers/net/hw/config @@ -12,5 +12,10 @@ CONFIG_NET_IPGRE=y CONFIG_NET_IPGRE_DEMUX=y CONFIG_NETKIT=y CONFIG_NET_SCH_INGRESS=y +CONFIG_INET6_ESP=y +CONFIG_INET6_ESP_OFFLOAD=y +CONFIG_INET_ESP=y +CONFIG_INET_ESP_OFFLOAD=y CONFIG_UDMABUF=y CONFIG_VXLAN=y +CONFIG_XFRM_USER=y diff --git a/tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py b/tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py new file mode 100755 index 00000000000000..0740a4d852407e --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +"""Traffic test for VXLAN + IPsec crypto-offload.""" + +import os + +from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_ge +from lib.py import ksft_variants, KsftNamedVariant, KsftSkipEx +from lib.py import CmdExitFailure, NetDrvEpEnv, cmd, defer, ethtool, ip +from lib.py import Iperf3Runner + +# Inner tunnel addresses - TEST-NET-2 (RFC 5737) / doc prefix (RFC 3849) +INNER_V4_LOCAL = "198.51.100.1" +INNER_V4_REMOTE = "198.51.100.2" +INNER_V6_LOCAL = "2001:db8:100::1" +INNER_V6_REMOTE = "2001:db8:100::2" + +# ESP parameters +SPI_OUT = "0x1000" +SPI_IN = "0x1001" +# 128-bit key + 32-bit salt = 20 bytes hex, 128-bit ICV +ESP_AEAD = "aead 'rfc4106(gcm(aes))' 0x" + "01" * 20 + " 128" + + +def xfrm(args, host=None): + """Runs 'ip xfrm' via shell to preserve parentheses in algo names.""" + cmd(f"ip xfrm {args}", shell=True, host=host) + + +def check_xfrm_offload_support(): + """Skips if iproute2 lacks xfrm offload support.""" + out = cmd("ip xfrm state help", fail=False) + if "offload" not in out.stdout + out.stderr: + raise KsftSkipEx("iproute2 too old, missing xfrm offload") + + +def check_esp_hw_offload(cfg): + """Skips if device lacks esp-hw-offload support.""" + check_xfrm_offload_support() + try: + feat = ethtool(f"-k {cfg.ifname}", json=True)[0] + except (CmdExitFailure, IndexError) as e: + raise KsftSkipEx(f"can't query features: {e}") from e + if not feat.get("esp-hw-offload", {}).get("active"): + raise KsftSkipEx("Device does not support esp-hw-offload") + + +def get_tx_drops(cfg): + """Returns TX dropped counter from the physical device.""" + stats = ip("-s -s link show dev " + cfg.ifname, json=True)[0] + return stats["stats64"]["tx"]["dropped"] + + +def setup_vxlan_ipsec(cfg, outer_ipver, inner_ipver): + """Sets up VXLAN tunnel with IPsec transport-mode crypto-offload.""" + vxlan_name = f"vx{os.getpid()}" + local_addr = cfg.addr_v[outer_ipver] + remote_addr = cfg.remote_addr_v[outer_ipver] + + if inner_ipver == "4": + inner_local = f"{INNER_V4_LOCAL}/24" + inner_remote = f"{INNER_V4_REMOTE}/24" + addr_extra = "" + else: + inner_local = f"{INNER_V6_LOCAL}/64" + inner_remote = f"{INNER_V6_REMOTE}/64" + addr_extra = " nodad" + + if outer_ipver == "6": + vxlan_opts = "udp6zerocsumtx udp6zerocsumrx" + else: + vxlan_opts = "noudpcsum" + + # VXLAN tunnel - local side + ip(f"link add {vxlan_name} type vxlan id 100 dstport 4789 {vxlan_opts} " + f"local {local_addr} remote {remote_addr} dev {cfg.ifname}") + defer(ip, f"link del {vxlan_name}") + ip(f"addr add {inner_local} dev {vxlan_name}{addr_extra}") + ip(f"link set {vxlan_name} up") + + # VXLAN tunnel - remote side + ip(f"link add {vxlan_name} type vxlan id 100 dstport 4789 {vxlan_opts} " + f"local {remote_addr} remote {local_addr} dev {cfg.remote_ifname}", + host=cfg.remote) + defer(ip, f"link del {vxlan_name}", host=cfg.remote) + ip(f"addr add {inner_remote} dev {vxlan_name}{addr_extra}", + host=cfg.remote) + ip(f"link set {vxlan_name} up", host=cfg.remote) + + # xfrm state - local outbound SA + xfrm(f"state add src {local_addr} dst {remote_addr} " + f"proto esp spi {SPI_OUT} " + f"{ESP_AEAD} " + f"mode transport offload crypto dev {cfg.ifname} dir out") + defer(xfrm, f"state del src {local_addr} dst {remote_addr} " + f"proto esp spi {SPI_OUT}") + + # xfrm state - local inbound SA + xfrm(f"state add src {remote_addr} dst {local_addr} " + f"proto esp spi {SPI_IN} " + f"{ESP_AEAD} " + f"mode transport offload crypto dev {cfg.ifname} dir in") + defer(xfrm, f"state del src {remote_addr} dst {local_addr} " + f"proto esp spi {SPI_IN}") + + # xfrm state - remote outbound SA (mirror, software crypto) + xfrm(f"state add src {remote_addr} dst {local_addr} " + f"proto esp spi {SPI_IN} " + f"{ESP_AEAD} " + f"mode transport", + host=cfg.remote) + defer(xfrm, f"state del src {remote_addr} dst {local_addr} " + f"proto esp spi {SPI_IN}", host=cfg.remote) + + # xfrm state - remote inbound SA (mirror, software crypto) + xfrm(f"state add src {local_addr} dst {remote_addr} " + f"proto esp spi {SPI_OUT} " + f"{ESP_AEAD} " + f"mode transport", + host=cfg.remote) + defer(xfrm, f"state del src {local_addr} dst {remote_addr} " + f"proto esp spi {SPI_OUT}", host=cfg.remote) + + # xfrm policy - local out + xfrm(f"policy add src {local_addr} dst {remote_addr} " + f"proto udp dport 4789 dir out " + f"tmpl src {local_addr} dst {remote_addr} proto esp mode transport") + defer(xfrm, f"policy del src {local_addr} dst {remote_addr} " + f"proto udp dport 4789 dir out") + + # xfrm policy - local in + xfrm(f"policy add src {remote_addr} dst {local_addr} " + f"proto udp dport 4789 dir in " + f"tmpl src {remote_addr} dst {local_addr} proto esp mode transport") + defer(xfrm, f"policy del src {remote_addr} dst {local_addr} " + f"proto udp dport 4789 dir in") + + # xfrm policy - remote out + xfrm(f"policy add src {remote_addr} dst {local_addr} " + f"proto udp dport 4789 dir out " + f"tmpl src {remote_addr} dst {local_addr} proto esp mode transport", + host=cfg.remote) + defer(xfrm, f"policy del src {remote_addr} dst {local_addr} " + f"proto udp dport 4789 dir out", host=cfg.remote) + + # xfrm policy - remote in + xfrm(f"policy add src {local_addr} dst {remote_addr} " + f"proto udp dport 4789 dir in " + f"tmpl src {local_addr} dst {remote_addr} proto esp mode transport", + host=cfg.remote) + defer(xfrm, f"policy del src {local_addr} dst {remote_addr} " + f"proto udp dport 4789 dir in", host=cfg.remote) + + +def _vxlan_ipsec_variants(): + """Generates outer/inner IP version variants.""" + for outer in ["4", "6"]: + for inner in ["4", "6"]: + yield KsftNamedVariant(f"outer_v{outer}_inner_v{inner}", outer, inner) + + +@ksft_variants(_vxlan_ipsec_variants()) +def test_vxlan_ipsec_crypto_offload(cfg, outer_ipver, inner_ipver): + """Tests VXLAN+IPsec crypto-offload has no TX drops.""" + cfg.require_ipver(outer_ipver) + check_esp_hw_offload(cfg) + + setup_vxlan_ipsec(cfg, outer_ipver, inner_ipver) + + if inner_ipver == "4": + inner_local = INNER_V4_LOCAL + inner_remote = INNER_V4_REMOTE + ping = "ping" + else: + inner_local = INNER_V6_LOCAL + inner_remote = INNER_V6_REMOTE + ping = "ping -6" + + cmd(f"{ping} -c 1 -W 2 {inner_remote}") + + drops_before = get_tx_drops(cfg) + + runner = Iperf3Runner(cfg, server_ip=inner_local, + client_ip=inner_remote) + bw_gbps = runner.measure_bandwidth(reverse=True) + + cfg.wait_hw_stats_settle() + drops_after = get_tx_drops(cfg) + + ksft_eq(drops_after - drops_before, 0, + comment="TX drops during VXLAN+IPsec") + ksft_ge(bw_gbps, 0.1, + comment="Minimum 100Mbps over VXLAN+IPsec") + + +def main(): + """Runs VXLAN+IPsec crypto-offload GSO selftest.""" + with NetDrvEpEnv(__file__, nsim_test=False) as cfg: + ksft_run([test_vxlan_ipsec_crypto_offload], args=(cfg,)) + ksft_exit() + + +if __name__ == "__main__": + main() From fa90a3145c0340c3f624206a81637c542254ea1d Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Wed, 22 Apr 2026 17:06:48 +0300 Subject: [PATCH 074/972] xfrm: Don't clobber inner headers when already set On VXLAN over IPsec egress, xfrm{4,6}_transport_output() blindly overwrite inner_transport_header (== the inner TCP header saved in VXLAN iptunnel_handle_offloads() -> skb_reset_inner_headers()) with the current transport_header (== the VXLAN outer UDP header set by udp_tunnel_xmit_skb()). This was a latent bug, harmless until commit [1] added a doff validation check in qdisc_pkt_len_segs_init() for encapsulated GSO packets. With the wrong inner_transport_header set by xfrm, qdisc_pkt_len_segs_init() interprets inner_transport_header as a TCP header, reads doff=0 from the upper byte of the VNI and drops the packet with DROP_REASON_SKB_BAD_GSO. Besides the use in GSO to determine the header size of segmented packets, inner_transport_header might be used by drivers to set up inner checksum offloading by pointing the HW to the inner transport header. A quick browse through available drivers shows that mlx5 uses skb->csum_start specifically for this scenario, while others either don't support VXLAN over IPsec crypto offload (ixgbe) or the HW is capable of parsing the packets itself (nfp, Chelsio). But in all cases, it is more correct to let the inner_transport_header point to the innermost header instead of overwriting it in xfrm. So fix this by guarding all four inner header save sites in xfrm_output.c (xfrm{4,6}_transport_output, xfrm{4,6}_tunnel_encap_add) with a check for skb->inner_protocol. When inner_protocol is set, a tunnel layer (VXLAN, Geneve, GRE, etc.) has already saved the correct inner header offsets and they must not be overwritten. When inner_protocol is zero, no prior tunnel encapsulation exists and xfrm must save the inner headers itself. The tunnel mode checks are only added for completion, since they aren't strictly required, as xfrm_output() forces software GSO in tunnel mode before encap. This makes the previously added test pass: # ./tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py TAP version 13 1..4 ok 1 ipsec_vxlan.test_vxlan_ipsec_crypto_offload.outer_v4_inner_v4 ok 2 ipsec_vxlan.test_vxlan_ipsec_crypto_offload.outer_v4_inner_v6 ok 3 ipsec_vxlan.test_vxlan_ipsec_crypto_offload.outer_v6_inner_v4 ok 4 ipsec_vxlan.test_vxlan_ipsec_crypto_offload.outer_v6_inner_v6 # Totals: pass:4 fail:0 xfail:0 xpass:0 skip:0 error:0 [1] commit 7fb4c1967011 ("net: pull headers in qdisc_pkt_len_segs_init()") Fixes: f1bd7d659ef0 ("xfrm: Add encapsulation header offsets while SKB is not encrypted") Signed-off-by: Cosmin Ratiu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_output.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index a9652b422f5121..cc35c2fcbbe095 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -66,7 +66,9 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) struct iphdr *iph = ip_hdr(skb); int ihl = iph->ihl * 4; - skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + if (!skb->inner_protocol) + skb_set_inner_transport_header(skb, + skb_transport_offset(skb)); skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + @@ -167,7 +169,9 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb) int hdr_len; iph = ipv6_hdr(skb); - skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + if (!skb->inner_protocol) + skb_set_inner_transport_header(skb, + skb_transport_offset(skb)); hdr_len = xfrm6_hdr_offset(x, skb, &prevhdr); if (hdr_len < 0) @@ -276,8 +280,10 @@ static int xfrm4_tunnel_encap_add(struct xfrm_state *x, struct sk_buff *skb) struct iphdr *top_iph; int flags; - skb_set_inner_network_header(skb, skb_network_offset(skb)); - skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + if (!skb->inner_protocol) { + skb_set_inner_network_header(skb, skb_network_offset(skb)); + skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + } skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + @@ -321,8 +327,10 @@ static int xfrm6_tunnel_encap_add(struct xfrm_state *x, struct sk_buff *skb) struct ipv6hdr *top_iph; int dsfield; - skb_set_inner_network_header(skb, skb_network_offset(skb)); - skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + if (!skb->inner_protocol) { + skb_set_inner_network_header(skb, skb_network_offset(skb)); + skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + } skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + From db57a1aa54ff68669781976e4edb045e09e2b65b Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Thu, 23 Apr 2026 02:38:46 +0900 Subject: [PATCH 075/972] wifi: rsi: fix kthread lifetime race between self-exit and external-stop RSI driver use both self-exit(kthread_complete_and_exit) and external-stop (kthread_stop) when killing a kthread. Generally, kthread_stop() is called first, and in this case, no particular issues occur. However, in rare instances where kthread_complete_and_exit() is called first and then kthread_stop() is called, a UAF occurs because the kthread object, which has already exited and been freed, is accessed again. Therefore, to prevent this with minimal modification, you must remove kthread_stop() and change the code to wait until the self-exit operation is completed. Cc: Reported-by: syzbot+5de83f57cd8531f55596@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/69e5d03b.a00a0220.1bd0ca.0064.GAE@google.com/ Fixes: 4c62764d0fc2 ("rsi: improve kernel thread handling to fix kernel panic") Signed-off-by: Jeongjun Park Link: https://patch.msgid.link/20260422173846.37640-1-aha310510@gmail.com Signed-off-by: Johannes Berg --- drivers/net/wireless/rsi/rsi_common.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/rsi/rsi_common.h b/drivers/net/wireless/rsi/rsi_common.h index 591602beeec689..3cdf9ded876d9c 100644 --- a/drivers/net/wireless/rsi/rsi_common.h +++ b/drivers/net/wireless/rsi/rsi_common.h @@ -70,12 +70,11 @@ static inline int rsi_create_kthread(struct rsi_common *common, return 0; } -static inline int rsi_kill_thread(struct rsi_thread *handle) +static inline void rsi_kill_thread(struct rsi_thread *handle) { atomic_inc(&handle->thread_done); rsi_set_event(&handle->event); - - return kthread_stop(handle->task); + wait_for_completion(&handle->completion); } void rsi_mac80211_detach(struct rsi_hw *hw); From a9e8765fd206388d5672db229784982bf559f097 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 22 Apr 2026 14:25:27 +0200 Subject: [PATCH 076/972] efivarfs: use QSTR() in efivarfs_alloc_dentry Use QSTR() and drop strlen() in efivarfs_alloc_dentry(). Signed-off-by: Thorsten Blum Signed-off-by: Ard Biesheuvel --- fs/efivarfs/super.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 1c5224cf183e6d..733c19571f1cfe 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -191,13 +191,10 @@ static const struct dentry_operations efivarfs_d_ops = { static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) { + struct qstr q = QSTR(name); struct dentry *d; - struct qstr q; int err; - q.name = name; - q.len = strlen(name); - err = efivarfs_d_hash(parent, &q); if (err) return ERR_PTR(err); From b336e40c62fbdc4b8a1f09a4ada31f4a90c69eb1 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Mon, 27 Apr 2026 17:56:30 +0200 Subject: [PATCH 077/972] efi: pstore: Drop efivar lock when efi_pstore_open() returns with an error If kzalloc fails, the function returns -ENOMEM without calling efivar_unlock(). Since open() returned an error, the calling site in pstore_get_backend_records() won't call the close() function, so the lock is never released. Thus drop the lock in case of errors here. Fixes: 859748255b434 ("efi: pstore: Omit efivars caching EFI varstore access layer") Assisted-by: Claude:claude-opus-4-6 Signed-off-by: Thomas Huth Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/efi-pstore.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c index a253b61449459e..a5db3534f0a632 100644 --- a/drivers/firmware/efi/efi-pstore.c +++ b/drivers/firmware/efi/efi-pstore.c @@ -60,8 +60,10 @@ static int efi_pstore_open(struct pstore_info *psi) return err; psi->data = kzalloc(record_size, GFP_KERNEL); - if (!psi->data) + if (!psi->data) { + efivar_unlock(); return -ENOMEM; + } return 0; } From f1fb23a0a0fcbdb66672da51d7d63a259f6396ca Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 27 Apr 2026 11:32:36 -0700 Subject: [PATCH 078/972] fbdev: ipu-v3: clean up kernel-doc warnings Correct all kernel-doc warnings: - fix a typedef kernel-doc comment - mark a list_head as private - use Returns: for function return values Warning: include/video/imx-ipu-image-convert.h:31 struct member 'list' not described in 'ipu_image_convert_run' Warning: include/video/imx-ipu-image-convert.h:40 function parameter 'ipu_image_convert_cb_t' not described in 'void' Warning: include/video/imx-ipu-image-convert.h:40 expecting prototype for ipu_image_convert_cb_t(). Prototype was for void() instead Warning: include/video/imx-ipu-image-convert.h:66 No description found for return value of 'ipu_image_convert_verify' Warning: include/video/imx-ipu-image-convert.h:90 No description found for return value of 'ipu_image_convert_prepare' Warning: include/video/imx-ipu-image-convert.h:125 No description found for return value of 'ipu_image_convert_queue' Warning: include/video/imx-ipu-image-convert.h:163 No description found for return value of 'ipu_image_convert' Signed-off-by: Randy Dunlap Reviewed-by: Philipp Zabel Signed-off-by: Helge Deller --- include/video/imx-ipu-image-convert.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/include/video/imx-ipu-image-convert.h b/include/video/imx-ipu-image-convert.h index 003b3927ede5cd..6b77968a6a150f 100644 --- a/include/video/imx-ipu-image-convert.h +++ b/include/video/imx-ipu-image-convert.h @@ -27,12 +27,13 @@ struct ipu_image_convert_run { int status; + /* private: */ /* internal to image converter, callers don't touch */ struct list_head list; }; /** - * ipu_image_convert_cb_t - conversion callback function prototype + * typedef ipu_image_convert_cb_t - conversion callback function prototype * * @run: the completed conversion run pointer * @ctx: a private context pointer for the callback @@ -60,7 +61,7 @@ void ipu_image_convert_adjust(struct ipu_image *in, struct ipu_image *out, * @out: output image format * @rot_mode: rotation mode * - * Returns 0 if the formats and rotation mode meet IPU restrictions, + * Returns: 0 if the formats and rotation mode meet IPU restrictions, * -EINVAL otherwise. */ int ipu_image_convert_verify(struct ipu_image *in, struct ipu_image *out, @@ -77,11 +78,11 @@ int ipu_image_convert_verify(struct ipu_image *in, struct ipu_image *out, * @complete: run completion callback * @complete_context: a context pointer for the completion callback * - * Returns an opaque conversion context pointer on success, error pointer + * In V4L2, drivers should call ipu_image_convert_prepare() at streamon. + * + * Returns: an opaque conversion context pointer on success, error pointer * on failure. The input/output formats and rotation mode must already meet * IPU retrictions. - * - * In V4L2, drivers should call ipu_image_convert_prepare() at streamon. */ struct ipu_image_convert_ctx * ipu_image_convert_prepare(struct ipu_soc *ipu, enum ipu_ic_task ic_task, @@ -122,6 +123,8 @@ void ipu_image_convert_unprepare(struct ipu_image_convert_ctx *ctx); * In V4L2, drivers should call ipu_image_convert_queue() while * streaming to queue the conversion of a received input buffer. * For example mem2mem devices this would be called in .device_run. + * + * Returns: 0 on success or -errno on error. */ int ipu_image_convert_queue(struct ipu_image_convert_run *run); @@ -155,6 +158,9 @@ void ipu_image_convert_abort(struct ipu_image_convert_ctx *ctx); * On successful return the caller can queue more run requests if needed, using * the prepared context in run->ctx. The caller is responsible for unpreparing * the context when no more conversion requests are needed. + * + * Returns: pointer to the created &struct ipu_image_convert_run that has + * been queued on success; an ERR_PTR(errno) on error. */ struct ipu_image_convert_run * ipu_image_convert(struct ipu_soc *ipu, enum ipu_ic_task ic_task, From 0b996ae54d876b41c52dd7cfc512eb008a47d781 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Tue, 28 Apr 2026 11:17:25 +0800 Subject: [PATCH 079/972] fbdev: defio: Remove duplicate include of linux/module.h Remove duplicate inclusion of linux/module.h in fb_defio.c to clean up redundant code. Signed-off-by: Chen Ni Signed-off-by: Helge Deller --- drivers/video/fbdev/core/fb_defio.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index a12dd25ab6979f..fd00b86e1ae602 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include From d237f719b2726c0e6d62bfa1543f53b624471929 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 28 Apr 2026 10:28:43 +0200 Subject: [PATCH 080/972] lib/fonts: Fix bit position when rotating by 180 degrees MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the horizontal bit position when rotating a glyph by 180°. The original code in rotate_ud() rounded the value in width up to a multiple of 8, aka the bit pitch, and calculated the rotated pixel from that value. The new code stores the glyph's pitch in bit_pitch, but fails to update the rotated pixel's output accordingly. Simply replacing the variable does this. The bug can be reproduced by setting a font with an unaligned width, such as sun12x22, like this: setfont sun12x22 echo 2 > /sys/class/graphics/fbcon/rotate Without the fix, the font looks distorted. Fixes: a30e9e6b018f ("lib/fonts: Refactor glyph-rotation helpers") Closes: https://lore.gitlab.freedesktop.org/drm-ai-reviews/review-patch7-20260407092555.58816-8-tzimmermann@suse.de/ Cc: dri-devel@lists.freedesktop.org Cc: linux-fbdev@vger.kernel.org Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- lib/fonts/font_rotate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/fonts/font_rotate.c b/lib/fonts/font_rotate.c index 065e0fc0667baf..275406008823b6 100644 --- a/lib/fonts/font_rotate.c +++ b/lib/fonts/font_rotate.c @@ -106,7 +106,7 @@ static void __font_glyph_rotate_180(const unsigned char *glyph, for (y = 0; y < height; y++) { for (x = 0; x < width; x++) { if (font_glyph_test_bit(glyph, x, y, bit_pitch)) { - font_glyph_set_bit(out, width - (1 + x + shift), height - (1 + y), + font_glyph_set_bit(out, bit_pitch - 1 - x - shift, height - 1 - y, bit_pitch); } } From a6715d7ec472a476db17787697a4abda62962284 Mon Sep 17 00:00:00 2001 From: Evangelos Petrongonas Date: Fri, 10 Apr 2026 01:16:05 +0000 Subject: [PATCH 081/972] kho: skip KHO for crash kernel kho_fill_kimage() unconditionally populates the kimage with KHO metadata for every kexec image type. When the image is a crash kernel, this can be problematic as the crash kernel can run in a small reserved region and the KHO scratch areas can sit outside it. The crash kernel then faults during kho_memory_init() when it tries phys_to_virt() on the KHO FDT address: Unable to handle kernel paging request at virtual address xxxxxxxx ... fdt_offset_ptr+... fdt_check_node_offset_+... fdt_first_property_offset+... fdt_get_property_namelen_+... fdt_getprop+... kho_memory_init+... mm_core_init+... start_kernel+... kho_locate_mem_hole() already skips KHO logic for KEXEC_TYPE_CRASH images, but kho_fill_kimage() was missing the same guard. As kho_fill_kimage() is the single point that populates image->kho.fdt and image->kho.scratch, fixing it here is sufficient for both arm64 and x86 as the FDT and boot_params path are bailing out when these fields are unset. Fixes: d7255959b69a ("kho: allow kexec load before KHO finalization") Signed-off-by: Evangelos Petrongonas Reviewed-by: Mike Rapoport (Microsoft) Link: https://patch.msgid.link/20260410011609.1103-1-epetron@amazon.de Signed-off-by: Mike Rapoport (Microsoft) --- kernel/liveupdate/kexec_handover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 94762de1fe5f03..4fde8325c49fb9 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1702,7 +1702,7 @@ int kho_fill_kimage(struct kimage *image) int err = 0; struct kexec_buf scratch; - if (!kho_enable) + if (!kho_enable || image->type == KEXEC_TYPE_CRASH) return 0; image->kho.fdt = virt_to_phys(kho_out.fdt); From 0fb1daf0b78d0e23b63b6b65de56d4a3fd83bc14 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Wed, 15 Apr 2026 06:23:00 +0100 Subject: [PATCH 082/972] mm/memfd_luo: report error when restoring a folio fails mid-loop memfd_luo_retrieve_folios() initialises err to -EIO, but the per-iteration calls to mem_cgroup_charge(), shmem_add_to_page_cache() and shmem_inode_acct_blocks() reuse and overwrite err. Once any iteration completes successfully, err becomes zero. If a later iteration's kho_restore_folio() returns NULL, the failure path jumps to put_folios without resetting err, so the function returns 0. The caller memfd_luo_retrieve() then takes the success path, sets args->file and reports the restore as successful, leaving userspace with a partially populated memfd and no indication that anything went wrong. Set err to -EIO in the kho_restore_folio() failure branch so the error is propagated to the caller. Signed-off-by: David Carlier Reviewed-by: Pratyush Yadav Fixes: b3749f174d68 ("mm: memfd_luo: allow preserving memfd") Link: https://patch.msgid.link/20260415052300.362539-1-devnexen@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- mm/memfd_luo.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c index b02b503c750df0..35d1247281e047 100644 --- a/mm/memfd_luo.c +++ b/mm/memfd_luo.c @@ -427,6 +427,7 @@ static int memfd_luo_retrieve_folios(struct file *file, if (!folio) { pr_err("Unable to restore folio at physical address: %llx\n", phys); + err = -EIO; goto put_folios; } index = pfolio->index; From 76b48a70b16b4036814964b039cde413e0164416 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 6 Feb 2026 00:08:36 -0500 Subject: [PATCH 083/972] IB/hfi1: Fix potential use-after-free in PIO and SDMA map teardown The current teardown logic for dd->pio_map and dd->sdma_map frees the structures while they might still be accessed by RCU readers. Although the pointer is nulled under a spinlock, the memory is reclaimed before waiting for the grace period to end. This patch fixes the sequence by: 1. Extracting the pointer under the lock. 2. Clearing the RCU-protected pointer. 3. Waiting for readers to finish with synchronize_rcu(). 4. Finally freeing the memory. Fixes: 7724105686e7 ("IB/hfi1: add driver files") Link: https://patch.msgid.link/r/20260206050836.5890-1-lirongqing@baidu.com Signed-off-by: Li RongQing Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/pio.c | 5 ++++- drivers/infiniband/hw/hfi1/sdma.c | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 51afaac88c7259..9121d83bf88afe 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -1942,13 +1942,16 @@ int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_scontexts) void free_pio_map(struct hfi1_devdata *dd) { + struct pio_vl_map *map; + /* Free PIO map if allocated */ if (rcu_access_pointer(dd->pio_map)) { spin_lock_irq(&dd->pio_map_lock); - pio_map_free(rcu_access_pointer(dd->pio_map)); + map = rcu_access_pointer(dd->pio_map); RCU_INIT_POINTER(dd->pio_map, NULL); spin_unlock_irq(&dd->pio_map_lock); synchronize_rcu(); + pio_map_free(map); } kfree(dd->kernel_send_context); dd->kernel_send_context = NULL; diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index e5f442938177eb..cfd9dd0f7e8176 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -1255,6 +1255,7 @@ void sdma_clean(struct hfi1_devdata *dd, size_t num_engines) { size_t i; struct sdma_engine *sde; + struct sdma_vl_map *map; if (dd->sdma_pad_dma) { dma_free_coherent(&dd->pcidev->dev, SDMA_PAD, @@ -1291,10 +1292,11 @@ void sdma_clean(struct hfi1_devdata *dd, size_t num_engines) } if (rcu_access_pointer(dd->sdma_map)) { spin_lock_irq(&dd->sde_map_lock); - sdma_map_free(rcu_access_pointer(dd->sdma_map)); + map = rcu_access_pointer(dd->sdma_map); RCU_INIT_POINTER(dd->sdma_map, NULL); spin_unlock_irq(&dd->sde_map_lock); synchronize_rcu(); + sdma_map_free(map); } kfree(dd->per_sdma); dd->per_sdma = NULL; From 4c6f86d85d03cdb33addce86aa69aa795ca6c47a Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Tue, 14 Apr 2026 07:15:55 -0400 Subject: [PATCH 084/972] RDMA/rxe: Reject unknown opcodes before ICRC processing Even after applying commit 7244491dab34 ("RDMA/rxe: Validate pad and ICRC before payload_size() in rxe_rcv"), a single unauthenticated UDP packet can still trigger panic. That patch handled payload_size() underflow only for valid opcodes with short packets, not for packets carrying an unknown opcode. The unknown-opcode OOB read described below predates that commit and reaches back to the initial Soft RoCE driver. The check added there reads pkt->paylen < header_size(pkt) + bth_pad(pkt) + RXE_ICRC_SIZE where header_size(pkt) expands to rxe_opcode[pkt->opcode].length. The rxe_opcode[] array has 256 entries but is only populated for defined IB opcodes; any other entry (for example opcode 0xff) is zero-initialized, so length == 0 and the check degenerates to pkt->paylen < 0 + bth_pad(pkt) + RXE_ICRC_SIZE which does not constrain pkt->paylen enough. rxe_icrc_hdr() then computes rxe_opcode[pkt->opcode].length - RXE_BTH_BYTES which underflows when length == 0 and passes a huge value to rxe_crc32(), causing an out-of-bounds read of the skb payload. Reproduced on v7.0-rc7 with that fix applied, QEMU/KVM with CONFIG_RDMA_RXE=y and CONFIG_KASAN=y, after rdma link add rxe0 type rxe netdev eth0 A single 48-byte UDP packet to port 4791 with BTH opcode=0xff and QPN=IB_MULTICAST_QPN triggers: BUG: KASAN: slab-out-of-bounds in crc32_le+0x115/0x170 Read of size 1 at addr ... The buggy address is located 0 bytes to the right of allocated 704-byte region Call Trace: crc32_le+0x115/0x170 rxe_icrc_hdr.isra.0+0x226/0x300 rxe_icrc_check+0x13f/0x3a0 rxe_rcv+0x6e1/0x16e0 rxe_udp_encap_recv+0x20a/0x320 udp_queue_rcv_one_skb+0x7ed/0x12c0 Subsequent packets with the same shape fault on unmapped memory and panic the kernel. The trigger requires only module load and "rdma link add"; no QP, no connection, and no authentication. Fix this by rejecting packets whose opcode has no rxe_opcode[] entry, detected via the zero mask or zero length, before any length arithmetic runs. Cc: stable@vger.kernel.org Fixes: 8700e3e7c485 ("Soft RoCE driver") Link: https://patch.msgid.link/r/20260414111555.3386793-1-michael.bommarito@gmail.com Assisted-by: Claude:claude-opus-4-6 Signed-off-by: Michael Bommarito Reviewed-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_recv.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index f79214738c2b86..2d5e701ff961af 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -330,6 +330,17 @@ void rxe_rcv(struct sk_buff *skb) pkt->qp = NULL; pkt->mask |= rxe_opcode[pkt->opcode].mask; + /* + * Unknown opcodes have a zero-initialized rxe_opcode[] entry, so + * both mask and length are 0. Reject them before any length math: + * rxe_icrc_hdr() would otherwise compute length - RXE_BTH_BYTES + * and pass the underflowed value to rxe_crc32(), producing an + * out-of-bounds read. + */ + if (unlikely(!rxe_opcode[pkt->opcode].mask || + !rxe_opcode[pkt->opcode].length)) + goto drop; + if (unlikely(pkt->paylen < header_size(pkt) + bth_pad(pkt) + RXE_ICRC_SIZE)) goto drop; From 1114c87aa6f195cf07da55a27b2122ae26557b26 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Sat, 18 Apr 2026 12:21:41 -0400 Subject: [PATCH 085/972] RDMA/rxe: Reject non-8-byte ATOMIC_WRITE payloads atomic_write_reply() at drivers/infiniband/sw/rxe/rxe_resp.c unconditionally dereferences 8 bytes at payload_addr(pkt): value = *(u64 *)payload_addr(pkt); check_rkey() previously accepted an ATOMIC_WRITE request with pktlen == resid == 0 because the length validation only compared pktlen against resid. A remote initiator that sets the RETH length to 0 therefore reaches atomic_write_reply() with a zero-byte logical payload, and the responder reads sizeof(u64) bytes from past the logical end of the packet into skb->head tailroom, then writes those 8 bytes into the attacker's MR via rxe_mr_do_atomic_write(). That is a remote disclosure of 4 bytes of kernel tailroom per probe (the other 4 bytes are the packet's own trailing ICRC). IBA oA19-28 defines ATOMIC_WRITE as exactly 8 bytes. Anything else is protocol-invalid. Hoist a strict length check into check_rkey() so the responder never reaches the unchecked dereference, and keep the existing WRITE-family length logic for the normal RDMA WRITE path. Reproduced on mainline with an unmodified rxe driver: a sustained zero-length ATOMIC_WRITE probe repeatedly leaks adjacent skb head-buffer bytes into the attacker's MR, including recognisable kernel strings and partial kernel-direct-map pointer words. With this patch applied the responder rejects the PDU and the MR stays all-zero. Cc: stable@vger.kernel.org Fixes: 034e285f8b99 ("RDMA/rxe: Make responder support atomic write on RC service") Link: https://patch.msgid.link/r/20260418162141.3610201-1-michael.bommarito@gmail.com Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Michael Bommarito Reviewed-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 9faf8c09aa8e42..9cb2f6fbf2dd6a 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -540,7 +540,19 @@ static enum resp_states check_rkey(struct rxe_qp *qp, } skip_check_range: - if (pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) { + if (pkt->mask & RXE_ATOMIC_WRITE_MASK) { + /* IBA oA19-28: ATOMIC_WRITE payload is exactly 8 bytes. + * Reject any other length before the responder reads + * sizeof(u64) bytes from payload_addr(pkt); a shorter + * payload would read past the logical end of the packet + * into skb->head tailroom. + */ + if (resid != sizeof(u64) || pktlen != sizeof(u64) || + bth_pad(pkt)) { + state = RESPST_ERR_LENGTH; + goto err; + } + } else if (pkt->mask & RXE_WRITE_MASK) { if (resid > mtu) { if (pktlen != mtu || bth_pad(pkt)) { state = RESPST_ERR_LENGTH; From c488df06bd552bb8b6e14fa0cfd5ad986c6e9525 Mon Sep 17 00:00:00 2001 From: Junrui Luo Date: Fri, 24 Apr 2026 13:51:02 +0800 Subject: [PATCH 086/972] RDMA/mlx5: Fix error path fall-through in mlx5_ib_dev_res_srq_init() mlx5_ib_dev_res_srq_init() allocates two SRQs, s0 and s1. When ib_create_srq() fails for s1, the error branch destroys s0 but falls through and unconditionally assigns the freed s0 and the ERR_PTR s1 to devr->s0 and devr->s1. This leads to several problems: the lock-free fast path checks "if (devr->s1) return 0;" and treats the ERR_PTR as already initialised; users in mlx5_ib_create_qp() dereference the freed SRQ or ERR_PTR via to_msrq(devr->s0)->msrq.srqn; and mlx5_ib_dev_res_cleanup() dereferences the ERR_PTR and double-frees s0 on teardown. Fix by adding the same `goto unlock` in the s1 failure path. Cc: stable@vger.kernel.org Fixes: 5895e70f2e6e ("IB/mlx5: Allocate resources just before first QP/SRQ is created") Link: https://patch.msgid.link/r/SYBPR01MB7881E1E0970268BD69C0BA75AF2B2@SYBPR01MB7881.ausprd01.prod.outlook.com Reported-by: Yuhao Jiang Signed-off-by: Junrui Luo Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 109661c2ac12b0..8115ae869ef209 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3392,6 +3392,7 @@ int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) "Couldn't create SRQ 1 for res init, err=%pe\n", s1); ib_destroy_srq(s0); + goto unlock; } devr->s0 = s0; From 8135b89a376ab2c20db3921c9c38cf83994b1f1c Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 23 Apr 2026 09:10:52 +0200 Subject: [PATCH 087/972] Revert "parisc: led: fix reference leak on failed device registration" This reverts commit 707610bcccbd0327530938e33f3f33211a640a4e. platform_device_register() is going to be fixed instead. Signed-off-by: Helge Deller --- drivers/parisc/led.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c index b299fcc48b0874..016c9d5a60a8a8 100644 --- a/drivers/parisc/led.c +++ b/drivers/parisc/led.c @@ -543,10 +543,8 @@ static void __init register_led_regions(void) static int __init startup_leds(void) { - if (platform_device_register(&platform_leds)) { - pr_info("LED: failed to register LEDs\n"); - platform_device_put(&platform_leds); - } + if (platform_device_register(&platform_leds)) + printk(KERN_INFO "LED: failed to register LEDs\n"); register_led_regions(); return 0; } From b8425ceefee5df2b8490fa70a07e4e402c752492 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 24 Apr 2026 12:32:11 +0200 Subject: [PATCH 088/972] parisc: drivers: switch to dynamic root device Driver core expects devices to be dynamically allocated and will, for example, complain loudly if a device that lacks a release function is ever freed. Use root_device_register() to allocate and register the root device instead of open coding using a static device. While at it, drop the redundant additional reference taken at init. Signed-off-by: Johan Hovold Signed-off-by: Helge Deller --- arch/parisc/kernel/drivers.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c index bc47bbe3026e72..b52ad704ec8a8c 100644 --- a/arch/parisc/kernel/drivers.c +++ b/arch/parisc/kernel/drivers.c @@ -41,9 +41,7 @@ const struct dma_map_ops *hppa_dma_ops __ro_after_init; EXPORT_SYMBOL(hppa_dma_ops); -static struct device root = { - .init_name = "parisc", -}; +static struct device *root; static inline int check_dev(struct device *dev) { @@ -89,7 +87,7 @@ static int for_each_padev(int (*fn)(struct device *, void *), void * data) .obj = data, .fn = fn, }; - return device_for_each_child(&root, &recurse_data, descend_children); + return device_for_each_child(root, &recurse_data, descend_children); } /** @@ -290,7 +288,7 @@ const struct parisc_device * find_pa_parent_type(const struct parisc_device *padev, int type) { const struct device *dev = &padev->dev; - while (dev != &root) { + while (dev != root) { struct parisc_device *candidate = to_parisc_device(dev); if (candidate->id.hw_type == type) return candidate; @@ -319,7 +317,7 @@ static void get_node_path(struct device *dev, struct hardware_path *path) dev = dev->parent; } - while (dev != &root) { + while (dev != root) { if (dev_is_pci(dev)) { unsigned int devfn = to_pci_dev(dev)->devfn; path->bc[i--] = PCI_SLOT(devfn) | (PCI_FUNC(devfn)<< 5); @@ -482,7 +480,7 @@ static struct parisc_device * __init alloc_tree_node( static struct parisc_device *create_parisc_device(struct hardware_path *modpath) { int i; - struct device *parent = &root; + struct device *parent = root; for (i = 0; i < 6; i++) { if (modpath->bc[i] == -1) continue; @@ -755,7 +753,7 @@ parse_tree_node(struct device *parent, int index, struct hardware_path *modpath) struct device *hwpath_to_device(struct hardware_path *modpath) { int i; - struct device *parent = &root; + struct device *parent = root; for (i = 0; i < 6; i++) { if (modpath->bc[i] == -1) continue; @@ -880,7 +878,7 @@ void __init walk_central_bus(void) { walk_native_bus(CENTRAL_BUS_ADDR, CENTRAL_BUS_ADDR + (MAX_NATIVE_DEVICES * NATIVE_DEVICE_OFFSET), - &root); + root); } static __init void print_parisc_device(struct parisc_device *dev) @@ -907,9 +905,10 @@ void __init init_parisc_bus(void) { if (bus_register(&parisc_bus_type)) panic("Could not register PA-RISC bus type\n"); - if (device_register(&root)) + + root = root_device_register("parisc"); + if (IS_ERR(root)) panic("Could not register PA-RISC root device\n"); - get_device(&root); } static __init void qemu_header(void) From 0de4cb473aed57ee4ba7e0551ad27bddc19fc519 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 28 Apr 2026 08:10:43 -0700 Subject: [PATCH 089/972] workqueue: fix devm_alloc_workqueue() va_list misuse devm_alloc_workqueue() built a va_list and passed it as a single positional argument to the variadic alloc_workqueue() macro: va_start(args, max_active); wq = alloc_workqueue(fmt, flags, max_active, args); va_end(args); C does not allow forwarding a va_list through a ... parameter. alloc_workqueue() expands to alloc_workqueue_noprof(), which runs its own va_start() over its ... params, so the inner vsnprintf(wq->name, sizeof(wq->name), fmt, args) in __alloc_workqueue() received the outer va_list object as the first variadic slot rather than the caller's actual format arguments. Add a new static helper alloc_workqueue_va() that wraps __alloc_workqueue() and runs wq_init_lockdep() on success, and fold both alloc_workqueue_noprof() and devm_alloc_workqueue_noprof() onto it as suggested by Tejun. The wq_init_lockdep() step is required on the devm path too, otherwise __flush_workqueue()'s on-stack COMPLETION_INITIALIZER_ONSTACK_MAP would NULL-deref wq->lockdep_map. No caller changes are required. devm_alloc_ordered_workqueue() is a macro forwarding to devm_alloc_workqueue() and inherits the fix. Two in-tree callers actively trigger the broken path on every probe: drivers/power/supply/mt6370-charger.c:889 drivers/power/supply/max77705_charger.c:649 both of which use devm_alloc_ordered_workqueue(dev, "%s", 0, dev_name(dev)). A standalone reproducer module is available at[1]. Link: https://github.com/leitao/debug/blob/main/workqueue/valist/wq_va_test.c [1] Fixes: 1dfc9d60a69e ("workqueue: devres: Add device-managed allocate workqueue") Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 6 ++++-- kernel/workqueue.c | 28 +++++++++++++++++++--------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index ab6cb70ca1a52c..6177624539b3b3 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -534,8 +534,10 @@ alloc_workqueue_noprof(const char *fmt, unsigned int flags, int max_active, ...) * Pointer to the allocated workqueue on success, %NULL on failure. */ __printf(2, 5) struct workqueue_struct * -devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, - int max_active, ...); +devm_alloc_workqueue_noprof(struct device *dev, const char *fmt, + unsigned int flags, int max_active, ...); +#define devm_alloc_workqueue(...) \ + alloc_hooks(devm_alloc_workqueue_noprof(__VA_ARGS__)) #ifdef CONFIG_LOCKDEP /** diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5f747f241a5f1e..24d0265191d464 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5906,6 +5906,20 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt, return NULL; } +static struct workqueue_struct *alloc_workqueue_va(const char *fmt, + unsigned int flags, + int max_active, + va_list args) +{ + struct workqueue_struct *wq; + + wq = __alloc_workqueue(fmt, flags, max_active, args); + if (wq) + wq_init_lockdep(wq); + + return wq; +} + __printf(1, 4) struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, unsigned int flags, @@ -5915,12 +5929,8 @@ struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, va_list args; va_start(args, max_active); - wq = __alloc_workqueue(fmt, flags, max_active, args); + wq = alloc_workqueue_va(fmt, flags, max_active, args); va_end(args); - if (!wq) - return NULL; - - wq_init_lockdep(wq); return wq; } @@ -5932,15 +5942,15 @@ static void devm_workqueue_release(void *res) } __printf(2, 5) struct workqueue_struct * -devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, - int max_active, ...) +devm_alloc_workqueue_noprof(struct device *dev, const char *fmt, + unsigned int flags, int max_active, ...) { struct workqueue_struct *wq; va_list args; int ret; va_start(args, max_active); - wq = alloc_workqueue(fmt, flags, max_active, args); + wq = alloc_workqueue_va(fmt, flags, max_active, args); va_end(args); if (!wq) return NULL; @@ -5951,7 +5961,7 @@ devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, return wq; } -EXPORT_SYMBOL_GPL(devm_alloc_workqueue); +EXPORT_SYMBOL_GPL(devm_alloc_workqueue_noprof); #ifdef CONFIG_LOCKDEP __printf(1, 5) From 278dd0487907112de8e34e1a97ac6145a8081523 Mon Sep 17 00:00:00 2001 From: Rosalie Wanders Date: Fri, 10 Apr 2026 21:53:54 +0200 Subject: [PATCH 090/972] HID: sony: fix incorrect force-feedback check in sony_suspend() This commit fixes the incorrect force-feedback check in sony_suspend(), without this the check will always be true due to checking a constant define that is never 0. Signed-off-by: Rosalie Wanders Signed-off-by: Jiri Kosina --- drivers/hid/hid-sony.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index b5e724676c1dea..23db406092ef15 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -2456,11 +2456,10 @@ static void sony_remove(struct hid_device *hdev) static int sony_suspend(struct hid_device *hdev, pm_message_t message) { #ifdef CONFIG_SONY_FF + struct sony_sc *sc = hid_get_drvdata(hdev); /* On suspend stop any running force-feedback events */ - if (SONY_FF_SUPPORT) { - struct sony_sc *sc = hid_get_drvdata(hdev); - + if (sc->quirks & SONY_FF_SUPPORT) { sc->left = sc->right = 0; sony_send_output_report(sc); } From 80c4bbb2b38513e9c3d84805fa61a0ee16d79c45 Mon Sep 17 00:00:00 2001 From: Michael Zaidman Date: Sat, 11 Apr 2026 09:24:37 +0300 Subject: [PATCH 091/972] HID: ft260: validate i2c input report length MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add two checks to ft260_raw_event() to prevent out-of-bounds reads from malicious or malfunctioning devices: First, reject reports shorter than the 2-byte header (report ID + length fields). Without this, even accessing xfer->length on a 1-byte report is an OOB read. Second, validate xfer->length against the actual data capacity of the received HID report. Each I2C data report ID (0xD0 through 0xDE) defines a different report size in the HID descriptor, so the available payload varies per report. A corrupted length field could cause memcpy to read beyond the report buffer. Reported-by: Sebastián Josué Alba Vives Signed-off-by: Michael Zaidman Signed-off-by: Jiri Kosina --- drivers/hid/hid-ft260.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-ft260.c b/drivers/hid/hid-ft260.c index 333341e80b0ec6..70e2eedb465af1 100644 --- a/drivers/hid/hid-ft260.c +++ b/drivers/hid/hid-ft260.c @@ -1068,10 +1068,22 @@ static int ft260_raw_event(struct hid_device *hdev, struct hid_report *report, struct ft260_device *dev = hid_get_drvdata(hdev); struct ft260_i2c_input_report *xfer = (void *)data; + if (size < offsetof(struct ft260_i2c_input_report, data)) { + hid_err(hdev, "short report %d\n", size); + return -1; + } + if (xfer->report >= FT260_I2C_REPORT_MIN && xfer->report <= FT260_I2C_REPORT_MAX) { - ft260_dbg("i2c resp: rep %#02x len %d\n", xfer->report, - xfer->length); + ft260_dbg("i2c resp: rep %#02x len %d size %d\n", + xfer->report, xfer->length, size); + + if (xfer->length > size - + offsetof(struct ft260_i2c_input_report, data)) { + hid_err(hdev, "report %#02x: length %d exceeds HID report size\n", + xfer->report, xfer->length); + return -1; + } if ((dev->read_buf == NULL) || (xfer->length > dev->read_len - dev->read_idx)) { From 0f2b8466fb744a8b3313a9c1e2008f8cd53b2db7 Mon Sep 17 00:00:00 2001 From: Rosalie Wanders Date: Sat, 11 Apr 2026 17:32:48 +0200 Subject: [PATCH 092/972] HID: sony: remove unneeded WARN_ON() in sony_leds_init() This commit removes the unneeded WARN_ON() macro usage in sony_leds_init(), this is unneeded because the sony_leds_init() function call is already gated behind a SONY_LED_SUPPORT check in sony_input_configured() Signed-off-by: Rosalie Wanders Signed-off-by: Jiri Kosina --- drivers/hid/hid-sony.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index 23db406092ef15..6a860b9ef677a2 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -1640,9 +1640,6 @@ static int sony_leds_init(struct sony_sc *sc) u8 max_brightness[MAX_LEDS] = { [0 ... (MAX_LEDS - 1)] = 1 }; u8 use_hw_blink[MAX_LEDS] = { 0 }; - if (WARN_ON(!(sc->quirks & SONY_LED_SUPPORT))) - return -EINVAL; - if (sc->quirks & BUZZ_CONTROLLER) { sc->led_count = 4; use_color_names = 0; From a4170b63eda999d20ad6dc39ddc3ce5c1ac619e6 Mon Sep 17 00:00:00 2001 From: Rosalie Wanders Date: Sun, 12 Apr 2026 03:08:06 +0200 Subject: [PATCH 093/972] HID: sony: add missing size validation for SMK-Link remotes This commit adds the missing size validation for SMK-Link remotes in sony_raw_event(), this prevents a malicious device from allowing hid-sony to read out of bounds of the provided buffer. I do not own these devices so the size check only forces that the buffer is large enough for nsg_mrxu_parse_report(). Signed-off-by: Rosalie Wanders Signed-off-by: Jiri Kosina --- drivers/hid/hid-sony.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index 6a860b9ef677a2..13fe7a3e57d736 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -1169,10 +1169,9 @@ static int sony_raw_event(struct hid_device *hdev, struct hid_report *report, sixaxis_parse_report(sc, rd, size); } else if ((sc->quirks & MOTION_CONTROLLER_BT) && rd[0] == 0x01 && size == 49) { sixaxis_parse_report(sc, rd, size); - } else if ((sc->quirks & NAVIGATION_CONTROLLER) && rd[0] == 0x01 && - size == 49) { + } else if ((sc->quirks & NAVIGATION_CONTROLLER) && rd[0] == 0x01 && size == 49) { sixaxis_parse_report(sc, rd, size); - } else if ((sc->quirks & NSG_MRXU_REMOTE) && rd[0] == 0x02) { + } else if ((sc->quirks & NSG_MRXU_REMOTE) && rd[0] == 0x02 && size >= 12) { nsg_mrxu_parse_report(sc, rd, size); return 1; } else if ((sc->quirks & RB4_GUITAR_PS4_USB) && rd[0] == 0x01 && size == 64) { From 12bd440b66ed8968afffc46928233967b5b79b98 Mon Sep 17 00:00:00 2001 From: Rosalie Wanders Date: Sun, 12 Apr 2026 03:12:03 +0200 Subject: [PATCH 094/972] HID: sony: add missing size validation for Rock Band 3 Pro instruments This commit adds the missing size validation for Rock Band 3 PS3 Pro instruments in sony_raw_event(), this prevents a malicious device from allowing hid-sony to read out of bounds of the provided buffer. Signed-off-by: Rosalie Wanders Signed-off-by: Jiri Kosina --- drivers/hid/hid-sony.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c index 13fe7a3e57d736..315343415e8f1f 100644 --- a/drivers/hid/hid-sony.c +++ b/drivers/hid/hid-sony.c @@ -1188,7 +1188,7 @@ static int sony_raw_event(struct hid_device *hdev, struct hid_report *report, /* Rock Band 3 PS3 Pro instruments set rd[24] to 0xE0 when they're * sending full reports, and 0x02 when only sending navigation. */ - if ((sc->quirks & RB3_PRO_INSTRUMENT) && rd[24] == 0x02) { + if ((sc->quirks & RB3_PRO_INSTRUMENT) && size >= 25 && rd[24] == 0x02) { /* Only attempt to enable full report every 8 seconds */ if (time_after(jiffies, sc->rb3_pro_poke_jiffies)) { sc->rb3_pro_poke_jiffies = jiffies + secs_to_jiffies(8); From 55ce1858848132ed074fe907f00b5ce1ccab0ce1 Mon Sep 17 00:00:00 2001 From: Damien Dejean Date: Tue, 14 Apr 2026 13:38:58 +0000 Subject: [PATCH 095/972] HID: elan: Add support for ELAN SB974D touchpad MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Elan SB974D touchpad uses ELAN_MT_I2C format to send HID reports. Add an entry to match for the device and parse its vendor specific format. Signed-off-by: Damien Dejean Signed-off-by: Kornel Dulęba Signed-off-by: Jiri Kosina --- drivers/hid/hid-elan.c | 1 + drivers/hid/hid-ids.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/hid/hid-elan.c b/drivers/hid/hid-elan.c index 76d93fc48f6a28..0190ad567ce4d4 100644 --- a/drivers/hid/hid-elan.c +++ b/drivers/hid/hid-elan.c @@ -513,6 +513,7 @@ static const struct hid_device_id elan_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ELAN, USB_DEVICE_ID_HP_X2_10_COVER), .driver_data = ELAN_HAS_LED }, { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, USB_DEVICE_ID_TOSHIBA_CLICK_L9W) }, + { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, USB_DEVICE_ID_SB974D) }, { } }; MODULE_DEVICE_TABLE(hid, elan_devices); diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 0cf63742315bf8..8cfec7dced6645 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -455,6 +455,7 @@ #define USB_DEVICE_ID_EDIFIER_QR30 0xa101 /* EDIFIER Hal0 2.0 SE */ #define USB_VENDOR_ID_ELAN 0x04f3 +#define USB_DEVICE_ID_SB974D 0x0400 #define USB_DEVICE_ID_TOSHIBA_CLICK_L9W 0x0401 #define USB_DEVICE_ID_HP_X2 0x074d #define USB_DEVICE_ID_HP_X2_10_COVER 0x0755 From 3524900cc571bd922a1a6b6a0eb0c2705cdb3559 Mon Sep 17 00:00:00 2001 From: Matthew Schwartz Date: Mon, 20 Apr 2026 11:15:22 -0700 Subject: [PATCH 096/972] HID: hid-lenovo-go-s: restore OS_TYPE after resume from s2idle The controller MCU does not persist OS_TYPE across power cycles. During s2idle resume, the USB device may be power-cycled, causing the OS_TYPE setting to revert to the default Windows value. Add a reset_resume callback so that this is correctly restored after resume. Fixes: a23f3497bf208c59ad ("HID: hid-lenovo-go-s: Add Lenovo Legion Go S Series HID Driver") Reviewed-by: Derek J. Clark Signed-off-by: Matthew Schwartz Signed-off-by: Jiri Kosina --- drivers/hid/hid-lenovo-go-s.c | 44 +++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/drivers/hid/hid-lenovo-go-s.c b/drivers/hid/hid-lenovo-go-s.c index 01c7bdd4fbe044..ff1782a751915f 100644 --- a/drivers/hid/hid-lenovo-go-s.c +++ b/drivers/hid/hid-lenovo-go-s.c @@ -1369,6 +1369,14 @@ static void cfg_setup(struct work_struct *work) "Failed to retrieve IMU Manufacturer: %i\n", ret); return; } + + ret = mcu_property_out(drvdata.hdev, GET_GAMEPAD_CFG, FEATURE_OS_MODE, + NULL, 0); + if (ret) { + dev_err(&drvdata.hdev->dev, + "Failed to retrieve OS Mode: %i\n", ret); + return; + } } static int hid_gos_cfg_probe(struct hid_device *hdev, @@ -1427,6 +1435,27 @@ static void hid_gos_cfg_remove(struct hid_device *hdev) hid_set_drvdata(hdev, NULL); } +static int hid_gos_cfg_reset_resume(struct hid_device *hdev) +{ + u8 os_mode = drvdata.os_mode; + int ret; + + ret = mcu_property_out(drvdata.hdev, SET_GAMEPAD_CFG, + FEATURE_OS_MODE, &os_mode, 1); + if (ret < 0) + return ret; + + ret = mcu_property_out(drvdata.hdev, GET_GAMEPAD_CFG, + FEATURE_OS_MODE, NULL, 0); + if (ret < 0) + return ret; + + if (drvdata.os_mode != os_mode) + return -ENODEV; + + return 0; +} + static int hid_gos_probe(struct hid_device *hdev, const struct hid_device_id *id) { @@ -1481,6 +1510,20 @@ static void hid_gos_remove(struct hid_device *hdev) } } +static int hid_gos_reset_resume(struct hid_device *hdev) +{ + int ep = get_endpoint_address(hdev); + + switch (ep) { + case GO_S_CFG_INTF_IN: + return hid_gos_cfg_reset_resume(hdev); + default: + break; + } + + return 0; +} + static const struct hid_device_id hid_gos_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_QHE, USB_DEVICE_ID_LENOVO_LEGION_GO_S_XINPUT) }, @@ -1496,6 +1539,7 @@ static struct hid_driver hid_lenovo_go_s = { .probe = hid_gos_probe, .remove = hid_gos_remove, .raw_event = hid_gos_raw_event, + .reset_resume = hid_gos_reset_resume, }; module_hid_driver(hid_lenovo_go_s); From ae4ac077332ea3341a0f4c0973556c6b7ac5b7a1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 23 Apr 2026 10:10:02 +0300 Subject: [PATCH 097/972] HID: intel-thc-hid: Intel-quickspi: Fix some error codes If we have a partial read that is supposed to be treated as failure but in this code we forgot to set the error code. Return -EINVAL. Fixes: 9d8d51735a3a ("HID: intel-thc-hid: intel-quickspi: Add HIDSPI protocol implementation") Signed-off-by: Dan Carpenter Reviewed-by: Even Xu Reviewed-by: Mark Pearson Signed-off-by: Jiri Kosina --- drivers/hid/intel-thc-hid/intel-quickspi/quickspi-protocol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-protocol.c b/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-protocol.c index 16f780bc879b12..cb19057f1191ba 100644 --- a/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-protocol.c +++ b/drivers/hid/intel-thc-hid/intel-quickspi/quickspi-protocol.c @@ -94,7 +94,7 @@ static int quickspi_get_device_descriptor(struct quickspi_device *qsdev) dev_err_once(qsdev->dev, "Read DEVICE_DESCRIPTOR failed, ret = %d\n", ret); dev_err_once(qsdev->dev, "DEVICE_DESCRIPTOR expected len = %u, actual read = %u\n", input_len, read_len); - return ret; + return ret ?: -EINVAL; } input_rep_type = ((struct input_report_body_header *)read_buf)->input_report_type; @@ -318,7 +318,7 @@ int reset_tic(struct quickspi_device *qsdev) dev_err_once(qsdev->dev, "Read RESET_RESPONSE body failed, ret = %d\n", ret); dev_err_once(qsdev->dev, "RESET_RESPONSE body expected len = %u, actual = %u\n", read_len, actual_read_len); - return ret; + return ret ?: -EINVAL; } input_rep_type = FIELD_GET(HIDSPI_IN_REP_BDY_HDR_REP_TYPE, reset_response); From 487359284509a6745e14b8c0518768bc277809b0 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 28 Apr 2026 10:33:16 +0200 Subject: [PATCH 098/972] HID: uclogic: Fix regression of input name assignment The previous fix for adding the devm_kasprintf() return check in the commit bd07f751208b ("HID: uclogic: Add NULL check in uclogic_input_configured()") changed the condition of hi->input->name assignment, and it resulted in missing the proper input device name when no custom suffix is defined. Restore the conditional to the original content to address the regression. Fixes: bd07f751208b ("HID: uclogic: Add NULL check in uclogic_input_configured()") Signed-off-by: Takashi Iwai Signed-off-by: Jiri Kosina --- drivers/hid/hid-uclogic-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/hid/hid-uclogic-core.c b/drivers/hid/hid-uclogic-core.c index bd7f93e96e4e48..b73f09d26688ab 100644 --- a/drivers/hid/hid-uclogic-core.c +++ b/drivers/hid/hid-uclogic-core.c @@ -184,7 +184,9 @@ static int uclogic_input_configured(struct hid_device *hdev, suffix = "System Control"; break; } - } else { + } + + if (suffix) { hi->input->name = devm_kasprintf(&hdev->dev, GFP_KERNEL, "%s %s", hdev->name, suffix); if (!hi->input->name) From c4cca236968683eb0d59abfb12d5c7e4d8514227 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Mon, 20 Apr 2026 12:39:51 -0500 Subject: [PATCH 099/972] ipmi: Add limits to event and receive message requests The driver would just fetch events and receive messages until the BMC said it was done. To avoid issues with BMCs that never say they are done, add a limit of 10 fetches at a time. In addition, an si interface has an attn state it can return from the hardware which is supposed to cause a flag fetch to see if the driver needs to fetch events or message or a few other things. If the attn bit gets stuck, it's a similar problem. So allow messages in between flag fetches so the driver itself doesn't get stuck. This is a more general fix than the previous fix for the specific bad BMC, but should fix the more general issue of a BMC that won't stop saying it has data. This has been there from the beginning of the driver. It's not a bug per-se, but it is accounting for bugs in BMCs. Reported-by: Matt Fleming Closes: https://lore.kernel.org/lkml/20260415115930.3428942-1-matt@readmodwrite.com/ Fixes: <1da177e4c3f4> ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_si_intf.c | 54 +++++++++++++++++++++++++------- drivers/char/ipmi/ipmi_ssif.c | 23 ++++++++++++-- 2 files changed, 64 insertions(+), 13 deletions(-) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 08c208cc64c56b..7c3c463e08da25 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -168,6 +168,10 @@ struct smi_info { OEM2_DATA_AVAIL) unsigned char msg_flags; + /* When requesting events and messages, don't do it forever. */ + unsigned int num_requests_in_a_row; + bool last_was_flag_fetch; + /* Does the BMC have an event buffer? */ bool has_event_buffer; @@ -410,7 +414,10 @@ static void start_getting_msg_queue(struct smi_info *smi_info) start_new_msg(smi_info, smi_info->curr_msg->data, smi_info->curr_msg->data_size); - smi_info->si_state = SI_GETTING_MESSAGES; + if (smi_info->si_state != SI_GETTING_MESSAGES) { + smi_info->num_requests_in_a_row = 0; + smi_info->si_state = SI_GETTING_MESSAGES; + } } static void start_getting_events(struct smi_info *smi_info) @@ -421,7 +428,10 @@ static void start_getting_events(struct smi_info *smi_info) start_new_msg(smi_info, smi_info->curr_msg->data, smi_info->curr_msg->data_size); - smi_info->si_state = SI_GETTING_EVENTS; + if (smi_info->si_state != SI_GETTING_EVENTS) { + smi_info->num_requests_in_a_row = 0; + smi_info->si_state = SI_GETTING_EVENTS; + } } /* @@ -595,6 +605,7 @@ static void handle_transaction_done(struct smi_info *smi_info) smi_info->si_state = SI_NORMAL; } else { smi_info->msg_flags = msg[3]; + smi_info->last_was_flag_fetch = true; handle_flags(smi_info); } break; @@ -646,6 +657,11 @@ static void handle_transaction_done(struct smi_info *smi_info) } else { smi_inc_stat(smi_info, events); + smi_info->num_requests_in_a_row++; + if (smi_info->num_requests_in_a_row > 10) + /* Stop if we do this too many times. */ + smi_info->msg_flags &= ~EVENT_MSG_BUFFER_FULL; + /* * Do this before we deliver the message * because delivering the message releases the @@ -684,6 +700,11 @@ static void handle_transaction_done(struct smi_info *smi_info) } else { smi_inc_stat(smi_info, incoming_messages); + smi_info->num_requests_in_a_row++; + if (smi_info->num_requests_in_a_row > 10) + /* Stop if we do this too many times. */ + smi_info->msg_flags &= ~RECEIVE_MSG_AVAIL; + /* * Do this before we deliver the message * because delivering the message releases the @@ -825,6 +846,26 @@ static enum si_sm_result smi_event_handler(struct smi_info *smi_info, goto out; } + /* + * If we are currently idle, or if the last thing that was + * done was a flag fetch and there is a message pending, try + * to start the next message. + * + * We do the waiting message check to avoid a stuck flag + * completely wedging the driver. Let a message through + * in between flag operations if that happens. + */ + if (si_sm_result == SI_SM_IDLE || + (si_sm_result == SI_SM_ATTN && smi_info->waiting_msg && + smi_info->last_was_flag_fetch)) { + smi_info->last_was_flag_fetch = false; + smi_inc_stat(smi_info, idles); + + si_sm_result = start_next_msg(smi_info); + if (si_sm_result != SI_SM_IDLE) + goto restart; + } + /* * We prefer handling attn over new messages. But don't do * this if there is not yet an upper layer to handle anything. @@ -852,15 +893,6 @@ static enum si_sm_result smi_event_handler(struct smi_info *smi_info, } } - /* If we are currently idle, try to start the next message. */ - if (si_sm_result == SI_SM_IDLE) { - smi_inc_stat(smi_info, idles); - - si_sm_result = start_next_msg(smi_info); - if (si_sm_result != SI_SM_IDLE) - goto restart; - } - if ((si_sm_result == SI_SM_IDLE) && (atomic_read(&smi_info->req_events))) { /* diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c index b49500a1bd3637..f3798f4e6a6374 100644 --- a/drivers/char/ipmi/ipmi_ssif.c +++ b/drivers/char/ipmi/ipmi_ssif.c @@ -225,6 +225,9 @@ struct ssif_info { bool has_event_buffer; bool supports_alert; + /* When requesting events and messages, don't do it forever. */ + unsigned int num_requests_in_a_row; + /* * Used to tell what we should do with alerts. If we are * waiting on a response, read the data immediately. @@ -413,7 +416,10 @@ static void start_event_fetch(struct ssif_info *ssif_info, unsigned long *flags) } ssif_info->curr_msg = msg; - ssif_info->ssif_state = SSIF_GETTING_EVENTS; + if (ssif_info->ssif_state != SSIF_GETTING_EVENTS) { + ssif_info->num_requests_in_a_row = 0; + ssif_info->ssif_state = SSIF_GETTING_EVENTS; + } ipmi_ssif_unlock_cond(ssif_info, flags); msg->data[0] = (IPMI_NETFN_APP_REQUEST << 2); @@ -436,7 +442,10 @@ static void start_recv_msg_fetch(struct ssif_info *ssif_info, } ssif_info->curr_msg = msg; - ssif_info->ssif_state = SSIF_GETTING_MESSAGES; + if (ssif_info->ssif_state != SSIF_GETTING_MESSAGES) { + ssif_info->num_requests_in_a_row = 0; + ssif_info->ssif_state = SSIF_GETTING_MESSAGES; + } ipmi_ssif_unlock_cond(ssif_info, flags); msg->data[0] = (IPMI_NETFN_APP_REQUEST << 2); @@ -843,6 +852,11 @@ static void msg_done_handler(struct ssif_info *ssif_info, int result, ssif_info->msg_flags &= ~EVENT_MSG_BUFFER_FULL; handle_flags(ssif_info, flags); } else { + ssif_info->num_requests_in_a_row++; + if (ssif_info->num_requests_in_a_row > 10) + /* Stop if we do this too many times. */ + ssif_info->msg_flags &= ~EVENT_MSG_BUFFER_FULL; + handle_flags(ssif_info, flags); ssif_inc_stat(ssif_info, events); deliver_recv_msg(ssif_info, msg); @@ -876,6 +890,11 @@ static void msg_done_handler(struct ssif_info *ssif_info, int result, ssif_info->msg_flags &= ~RECEIVE_MSG_AVAIL; handle_flags(ssif_info, flags); } else { + ssif_info->num_requests_in_a_row++; + if (ssif_info->num_requests_in_a_row > 10) + /* Stop if we do this too many times. */ + ssif_info->msg_flags &= ~RECEIVE_MSG_AVAIL; + ssif_inc_stat(ssif_info, incoming_messages); handle_flags(ssif_info, flags); deliver_recv_msg(ssif_info, msg); From 09dd798270ff582d7309f285d4aaf5dbebae01cb Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Mon, 20 Apr 2026 13:02:18 -0500 Subject: [PATCH 100/972] ipmi:si: Return state to normal if message allocation fails There were places where nothing would get started if a message allocation failed, so the driver needs to return to normal state. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_si_intf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 7c3c463e08da25..9a9d12be9bf743 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -497,15 +497,19 @@ static void handle_flags(struct smi_info *smi_info) } else if (smi_info->msg_flags & RECEIVE_MSG_AVAIL) { /* Messages available. */ smi_info->curr_msg = alloc_msg_handle_irq(smi_info); - if (!smi_info->curr_msg) + if (!smi_info->curr_msg) { + smi_info->si_state = SI_NORMAL; return; + } start_getting_msg_queue(smi_info); } else if (smi_info->msg_flags & EVENT_MSG_BUFFER_FULL) { /* Events available. */ smi_info->curr_msg = alloc_msg_handle_irq(smi_info); - if (!smi_info->curr_msg) + if (!smi_info->curr_msg) { + smi_info->si_state = SI_NORMAL; return; + } start_getting_events(smi_info); } else if (smi_info->msg_flags & OEM_DATA_AVAIL && From a8aebe93a4938c0ca1941eeaae821738f869be3d Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Tue, 21 Apr 2026 06:50:22 -0500 Subject: [PATCH 101/972] ipmi:ssif: NULL thread on error Cleanup code was checking the thread for NULL, but it was possibly a PTR_ERR() in one spot. Spotted with static analysis. Link: https://sourceforge.net/p/openipmi/mailman/message/59324676/ Fixes: 75c486cb1bca ("ipmi:ssif: Clean up kthread on errors") Cc: # 91eb7ec72612: ipmi:ssif: Remove unnecessary indention Cc: stable@vger.kernel.org Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_ssif.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c index f3798f4e6a6374..f419b46bf00207 100644 --- a/drivers/char/ipmi/ipmi_ssif.c +++ b/drivers/char/ipmi/ipmi_ssif.c @@ -1905,6 +1905,7 @@ static int ssif_probe(struct i2c_client *client) "kssif%4.4x", thread_num); if (IS_ERR(ssif_info->thread)) { rv = PTR_ERR(ssif_info->thread); + ssif_info->thread = NULL; dev_notice(&ssif_info->client->dev, "Could not start kernel thread: error %d\n", rv); From 3e75021f615ceee8562e6455c335936b39929ffb Mon Sep 17 00:00:00 2001 From: Troy Mitchell Date: Fri, 24 Apr 2026 16:20:32 +0800 Subject: [PATCH 102/972] clk: spacemit: k3: mark top_dclk as CLK_IS_CRITICAL top_dclk is the DDR bus clock. If it is gated by clk_disable_unused, all memory-mapped bus transactions cease to function, causing DMA engines to hang and general system instability. Mark it CLK_IS_CRITICAL so the CCF never gates it during the unused clock sweep. Fixes: e371a77255b8 ("clk: spacemit: k3: add the clock tree") Reviewed-by: Brian Masney Signed-off-by: Troy Mitchell Signed-off-by: Stephen Boyd --- drivers/clk/spacemit/ccu-k3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/spacemit/ccu-k3.c b/drivers/clk/spacemit/ccu-k3.c index e98afd59f05c64..bb8b75bdbdb30d 100644 --- a/drivers/clk/spacemit/ccu-k3.c +++ b/drivers/clk/spacemit/ccu-k3.c @@ -846,7 +846,7 @@ static const struct clk_parent_data top_parents[] = { CCU_PARENT_HW(pll6_d3), }; CCU_MUX_DIV_GATE_FC_DEFINE(top_dclk, top_parents, APMU_TOP_DCLK_CTRL, 5, 3, - BIT(8), 2, 3, BIT(1), 0); + BIT(8), 2, 3, BIT(1), CLK_IS_CRITICAL); static const struct clk_parent_data ucie_parents[] = { CCU_PARENT_HW(pll1_d8_307p2), From 79a1886be1564a009cd2a003bada15ed6153f819 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Wed, 25 Feb 2026 17:55:05 +0100 Subject: [PATCH 103/972] clk: eyeq: use the auxiliary device creation helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The auxiliary device creation of this driver is simple enough to use the available auxiliary device creation helper. Use it and remove some boilerplate code. Tested-by: Théo Lebrun # On Mobileye EyeQ5 Signed-off-by: Jerome Brunet Reviewed-by: Luca Ceresoli Signed-off-by: Théo Lebrun Signed-off-by: Stephen Boyd --- drivers/clk/clk-eyeq.c | 57 +++++++++--------------------------------- 1 file changed, 12 insertions(+), 45 deletions(-) diff --git a/drivers/clk/clk-eyeq.c b/drivers/clk/clk-eyeq.c index c1dccedf8d5b1f..ae1aaf274336dd 100644 --- a/drivers/clk/clk-eyeq.c +++ b/drivers/clk/clk-eyeq.c @@ -321,38 +321,18 @@ static void eqc_probe_init_fixed_factors(struct device *dev, } } -static void eqc_auxdev_release(struct device *dev) -{ - struct auxiliary_device *adev = to_auxiliary_dev(dev); - - kfree(adev); -} - -static int eqc_auxdev_create(struct device *dev, void __iomem *base, - const char *name, u32 id) +static void eqc_auxdev_create_optional(struct device *dev, void __iomem *base, + const char *name) { struct auxiliary_device *adev; - int ret; - - adev = kzalloc_obj(*adev); - if (!adev) - return -ENOMEM; - - adev->name = name; - adev->dev.parent = dev; - adev->dev.platform_data = (void __force *)base; - adev->dev.release = eqc_auxdev_release; - adev->id = id; - ret = auxiliary_device_init(adev); - if (ret) - return ret; - - ret = auxiliary_device_add(adev); - if (ret) - auxiliary_device_uninit(adev); - - return ret; + if (name) { + adev = devm_auxiliary_device_create(dev, name, + (void __force *)base); + if (!adev) + dev_warn(dev, "failed creating auxiliary device %s.%s\n", + KBUILD_MODNAME, name); + } } static int eqc_probe(struct platform_device *pdev) @@ -364,7 +344,6 @@ static int eqc_probe(struct platform_device *pdev) unsigned int i, clk_count; struct resource *res; void __iomem *base; - int ret; data = device_get_match_data(dev); if (!data) @@ -378,21 +357,9 @@ static int eqc_probe(struct platform_device *pdev) if (!base) return -ENOMEM; - /* Init optional reset auxiliary device. */ - if (data->reset_auxdev_name) { - ret = eqc_auxdev_create(dev, base, data->reset_auxdev_name, 0); - if (ret) - dev_warn(dev, "failed creating auxiliary device %s.%s: %d\n", - KBUILD_MODNAME, data->reset_auxdev_name, ret); - } - - /* Init optional pinctrl auxiliary device. */ - if (data->pinctrl_auxdev_name) { - ret = eqc_auxdev_create(dev, base, data->pinctrl_auxdev_name, 0); - if (ret) - dev_warn(dev, "failed creating auxiliary device %s.%s: %d\n", - KBUILD_MODNAME, data->pinctrl_auxdev_name, ret); - } + /* Init optional auxiliary devices. */ + eqc_auxdev_create_optional(dev, base, data->reset_auxdev_name); + eqc_auxdev_create_optional(dev, base, data->pinctrl_auxdev_name); if (data->pll_count + data->div_count + data->fixed_factor_count == 0) return 0; /* Zero clocks, we are done. */ From a25ab518f355e1f0dcbea24ee26418dfcd6944b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Wed, 25 Feb 2026 17:55:06 +0100 Subject: [PATCH 104/972] clk: eyeq: add EyeQ5 children auxiliary device for generic PHYs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Grow our clk-eyeq family; it knows how to spawn reset provider and pin controller children. Expand with a generic PHY driver on EyeQ5. Reviewed-by: Luca Ceresoli Signed-off-by: Théo Lebrun Signed-off-by: Stephen Boyd --- drivers/clk/clk-eyeq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/clk/clk-eyeq.c b/drivers/clk/clk-eyeq.c index ae1aaf274336dd..d9303c2c7aa55e 100644 --- a/drivers/clk/clk-eyeq.c +++ b/drivers/clk/clk-eyeq.c @@ -110,6 +110,7 @@ struct eqc_match_data { const char *reset_auxdev_name; const char *pinctrl_auxdev_name; + const char *eth_phy_auxdev_name; unsigned int early_clk_count; }; @@ -360,6 +361,7 @@ static int eqc_probe(struct platform_device *pdev) /* Init optional auxiliary devices. */ eqc_auxdev_create_optional(dev, base, data->reset_auxdev_name); eqc_auxdev_create_optional(dev, base, data->pinctrl_auxdev_name); + eqc_auxdev_create_optional(dev, base, data->eth_phy_auxdev_name); if (data->pll_count + data->div_count + data->fixed_factor_count == 0) return 0; /* Zero clocks, we are done. */ @@ -520,6 +522,7 @@ static const struct eqc_match_data eqc_eyeq5_match_data = { .reset_auxdev_name = "reset", .pinctrl_auxdev_name = "pinctrl", + .eth_phy_auxdev_name = "phy", .early_clk_count = ARRAY_SIZE(eqc_eyeq5_early_plls) + ARRAY_SIZE(eqc_eyeq5_early_fixed_factors), From 4ac170432cf74b753cf59bcd0d449dced48585da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Wed, 25 Feb 2026 17:55:07 +0100 Subject: [PATCH 105/972] reset: eyeq: drop device_set_of_node_from_dev() done by parent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Our parent driver (clk-eyeq) now does the device_set_of_node_from_dev(dev, dev->parent) call through the newly introduced devm_auxiliary_device_create() helper. Doing it again in the reset-eyeq probe would be redundant. Drop both the WARN_ON() and the device_set_of_node_from_dev() call. Also fix the following comment that talks about "our newfound OF node". Signed-off-by: Jerome Brunet Reviewed-by: Philipp Zabel Acked-by: Philipp Zabel Signed-off-by: Théo Lebrun Signed-off-by: Stephen Boyd --- drivers/reset/reset-eyeq.c | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/drivers/reset/reset-eyeq.c b/drivers/reset/reset-eyeq.c index 791b7283111e2d..1a385798389709 100644 --- a/drivers/reset/reset-eyeq.c +++ b/drivers/reset/reset-eyeq.c @@ -422,13 +422,6 @@ static int eqr_of_xlate_twocells(struct reset_controller_dev *rcdev, return eqr_of_xlate_internal(rcdev, reset_spec->args[0], reset_spec->args[1]); } -static void eqr_of_node_put(void *_dev) -{ - struct device *dev = _dev; - - of_node_put(dev->of_node); -} - static int eqr_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { @@ -439,21 +432,8 @@ static int eqr_probe(struct auxiliary_device *adev, int ret; /* - * We are an auxiliary device of clk-eyeq. We do not have an OF node by - * default; let's reuse our parent's OF node. - */ - WARN_ON(dev->of_node); - device_set_of_node_from_dev(dev, dev->parent); - if (!dev->of_node) - return -ENODEV; - - ret = devm_add_action_or_reset(dev, eqr_of_node_put, dev); - if (ret) - return ret; - - /* - * Using our newfound OF node, we can get match data. We cannot use - * device_get_match_data() because it does not match reused OF nodes. + * Get match data. We cannot use device_get_match_data() because it does + * not accept reused OF nodes; see device_set_of_node_from_dev(). */ match = of_match_node(dev->driver->of_match_table, dev->of_node); if (!match || !match->data) From 69056753231f483cc1e40db52228aac42fd4c93d Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Wed, 15 Apr 2026 16:30:49 -0400 Subject: [PATCH 106/972] MAINTAINERS: add myself as a reviewer for the clk subsystem I've reviewed a lot clk patches for parts of the subsystem that typically doesn't get much review. Add myself as a reviewer so that I don't miss anything. Link: https://lore.kernel.org/linux-clk/?q=f%3Abmasney%40redhat.com Signed-off-by: Brian Masney Signed-off-by: Stephen Boyd --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2fb1c75afd1638..384ce5ee732367 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6358,6 +6358,7 @@ F: include/uapi/linux/comedi.h COMMON CLK FRAMEWORK M: Michael Turquette M: Stephen Boyd +R: Brian Masney L: linux-clk@vger.kernel.org S: Maintained Q: http://patchwork.kernel.org/project/linux-clk/list/ From de019f203b0d472c98ead4081ad4f05d92c9b826 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 7 Apr 2026 11:50:27 +0200 Subject: [PATCH 107/972] clk: rk808: fix OF node reference imbalance The driver reuses the OF node of the parent multi-function device but fails to take another reference to balance the one dropped by the platform bus code when unbinding the MFD and deregistering the child devices. Fix this by using the intended helper for reusing OF nodes. Fixes: 2dc51ca822e4 ("clk: RK808: Reduce 'struct rk808' usage") Cc: stable@vger.kernel.org # 6.5 Cc: Sebastian Reichel Signed-off-by: Johan Hovold Reviewed-by: Sebastian Reichel Reviewed-by: Brian Masney Reviewed-by: Heiko Stuebner Signed-off-by: Stephen Boyd --- drivers/clk/clk-rk808.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/clk-rk808.c b/drivers/clk/clk-rk808.c index f7412b137e5ef4..5a75b5c9155519 100644 --- a/drivers/clk/clk-rk808.c +++ b/drivers/clk/clk-rk808.c @@ -153,7 +153,7 @@ static int rk808_clkout_probe(struct platform_device *pdev) struct rk808_clkout *rk808_clkout; int ret; - dev->of_node = pdev->dev.parent->of_node; + device_set_of_node_from_dev(dev, dev->parent); rk808_clkout = devm_kzalloc(dev, sizeof(*rk808_clkout), GFP_KERNEL); From 883a32793c86091ea37bb84f88cc697d019e7a5d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 28 Apr 2026 12:38:47 +0200 Subject: [PATCH 108/972] efi/libstub: Move efi_relocate_kernel() into its only remaining user LoongArch is the only arch that still uses efi_relocate_kernel(), so before making changes to it that LoongArch needs, turn it into a private function. Move efi_low_alloc_above() into mem.c while at it, and drop the relocate.c source file altogether. Tested-by: WANG Rui Reviewed-by: Thomas Huth Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/Makefile | 2 +- drivers/firmware/efi/libstub/efistub.h | 7 - drivers/firmware/efi/libstub/loongarch-stub.c | 79 +++++++++ drivers/firmware/efi/libstub/mem.c | 82 +++++++++ drivers/firmware/efi/libstub/relocate.c | 166 ------------------ 5 files changed, 162 insertions(+), 174 deletions(-) delete mode 100644 drivers/firmware/efi/libstub/relocate.c diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 983a438e35f3d3..cfedb3025c2638 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -66,7 +66,7 @@ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ lib-y := efi-stub-helper.o gop.o secureboot.o tpm.o \ file.o mem.o random.o randomalloc.o pci.o \ skip_spaces.o lib-cmdline.o lib-ctype.o \ - alignedmem.o relocate.o printk.o vsprintf.o + alignedmem.o printk.o vsprintf.o # include the stub's libfdt dependencies from lib/ when needed libfdt-deps := fdt_rw.c fdt_ro.c fdt_wip.c fdt.c \ diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 979a21818cc1c6..fd91fc15ec810b 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -1104,13 +1104,6 @@ efi_status_t efi_allocate_pages_aligned(unsigned long size, unsigned long *addr, efi_status_t efi_low_alloc_above(unsigned long size, unsigned long align, unsigned long *addr, unsigned long min); -efi_status_t efi_relocate_kernel(unsigned long *image_addr, - unsigned long image_size, - unsigned long alloc_size, - unsigned long preferred_addr, - unsigned long alignment, - unsigned long min_addr); - efi_status_t efi_parse_options(char const *cmdline); void efi_parse_option_graphics(char *option); diff --git a/drivers/firmware/efi/libstub/loongarch-stub.c b/drivers/firmware/efi/libstub/loongarch-stub.c index 736b6aae323d35..8c538a5243d923 100644 --- a/drivers/firmware/efi/libstub/loongarch-stub.c +++ b/drivers/firmware/efi/libstub/loongarch-stub.c @@ -14,6 +14,85 @@ extern int kernel_asize; extern int kernel_fsize; extern int kernel_entry; +/** + * efi_relocate_kernel() - copy memory area + * @image_addr: pointer to address of memory area to copy + * @image_size: size of memory area to copy + * @alloc_size: minimum size of memory to allocate, must be greater or + * equal to image_size + * @preferred_addr: preferred target address + * @alignment: minimum alignment of the allocated memory area. It + * should be a power of two. + * @min_addr: minimum target address + * + * Copy a memory area to a newly allocated memory area aligned according + * to @alignment but at least EFI_ALLOC_ALIGN. If the preferred address + * is not available, the allocated address will not be below @min_addr. + * On exit, @image_addr is updated to the target copy address that was used. + * + * This function is used to copy the Linux kernel verbatim. It does not apply + * any relocation changes. + * + * Return: status code + */ +static +efi_status_t efi_relocate_kernel(unsigned long *image_addr, + unsigned long image_size, + unsigned long alloc_size, + unsigned long preferred_addr, + unsigned long alignment, + unsigned long min_addr) +{ + unsigned long cur_image_addr; + unsigned long new_addr = 0; + efi_status_t status; + unsigned long nr_pages; + efi_physical_addr_t efi_addr = preferred_addr; + + if (!image_addr || !image_size || !alloc_size) + return EFI_INVALID_PARAMETER; + if (alloc_size < image_size) + return EFI_INVALID_PARAMETER; + + cur_image_addr = *image_addr; + + /* + * The EFI firmware loader could have placed the kernel image + * anywhere in memory, but the kernel has restrictions on the + * max physical address it can run at. Some architectures + * also have a preferred address, so first try to relocate + * to the preferred address. If that fails, allocate as low + * as possible while respecting the required alignment. + */ + nr_pages = round_up(alloc_size, EFI_ALLOC_ALIGN) / EFI_PAGE_SIZE; + status = efi_bs_call(allocate_pages, EFI_ALLOCATE_ADDRESS, + EFI_LOADER_DATA, nr_pages, &efi_addr); + new_addr = efi_addr; + /* + * If preferred address allocation failed allocate as low as + * possible. + */ + if (status != EFI_SUCCESS) { + status = efi_low_alloc_above(alloc_size, alignment, &new_addr, + min_addr); + } + if (status != EFI_SUCCESS) { + efi_err("Failed to allocate usable memory for kernel.\n"); + return status; + } + + /* + * We know source/dest won't overlap since both memory ranges + * have been allocated by UEFI, so we can safely use memcpy. + */ + memcpy((void *)new_addr, (void *)cur_image_addr, image_size); + + /* Return the new address of the relocated image. */ + *image_addr = new_addr; + + return status; +} + efi_status_t handle_kernel_image(unsigned long *image_addr, unsigned long *image_size, unsigned long *reserve_addr, diff --git a/drivers/firmware/efi/libstub/mem.c b/drivers/firmware/efi/libstub/mem.c index 9c82259eea8166..59f3f83de50c2d 100644 --- a/drivers/firmware/efi/libstub/mem.c +++ b/drivers/firmware/efi/libstub/mem.c @@ -124,3 +124,85 @@ void efi_free(unsigned long size, unsigned long addr) nr_pages = round_up(size, EFI_ALLOC_ALIGN) / EFI_PAGE_SIZE; efi_bs_call(free_pages, addr, nr_pages); } + +/** + * efi_low_alloc_above() - allocate pages at or above given address + * @size: size of the memory area to allocate + * @align: minimum alignment of the allocated memory area. It should + * a power of two. + * @addr: on exit the address of the allocated memory + * @min: minimum address to used for the memory allocation + * + * Allocate at the lowest possible address that is not below @min as + * EFI_LOADER_DATA. The allocated pages are aligned according to @align but at + * least EFI_ALLOC_ALIGN. The first allocated page will not below the address + * given by @min. + * + * Return: status code + */ +efi_status_t efi_low_alloc_above(unsigned long size, unsigned long align, + unsigned long *addr, unsigned long min) +{ + struct efi_boot_memmap *map __free(efi_pool) = NULL; + efi_status_t status; + unsigned long nr_pages; + int i; + + status = efi_get_memory_map(&map, false); + if (status != EFI_SUCCESS) + return status; + + /* + * Enforce minimum alignment that EFI or Linux requires when + * requesting a specific address. We are doing page-based (or + * larger) allocations, and both the address and size must meet + * alignment constraints. + */ + if (align < EFI_ALLOC_ALIGN) + align = EFI_ALLOC_ALIGN; + + size = round_up(size, EFI_ALLOC_ALIGN); + nr_pages = size / EFI_PAGE_SIZE; + for (i = 0; i < map->map_size / map->desc_size; i++) { + efi_memory_desc_t *desc; + unsigned long m = (unsigned long)map->map; + u64 start, end; + + desc = efi_memdesc_ptr(m, map->desc_size, i); + + if (desc->type != EFI_CONVENTIONAL_MEMORY) + continue; + + if (desc->attribute & EFI_MEMORY_HOT_PLUGGABLE) + continue; + + if (efi_soft_reserve_enabled() && + (desc->attribute & EFI_MEMORY_SP)) + continue; + + if (desc->num_pages < nr_pages) + continue; + + start = desc->phys_addr; + end = start + desc->num_pages * EFI_PAGE_SIZE; + + if (start < min) + start = min; + + start = round_up(start, align); + if ((start + size) > end) + continue; + + status = efi_bs_call(allocate_pages, EFI_ALLOCATE_ADDRESS, + EFI_LOADER_DATA, nr_pages, &start); + if (status == EFI_SUCCESS) { + *addr = start; + break; + } + } + + if (i == map->map_size / map->desc_size) + return EFI_NOT_FOUND; + + return EFI_SUCCESS; +} diff --git a/drivers/firmware/efi/libstub/relocate.c b/drivers/firmware/efi/libstub/relocate.c deleted file mode 100644 index d4264bfb6dc174..00000000000000 --- a/drivers/firmware/efi/libstub/relocate.c +++ /dev/null @@ -1,166 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include - -#include "efistub.h" - -/** - * efi_low_alloc_above() - allocate pages at or above given address - * @size: size of the memory area to allocate - * @align: minimum alignment of the allocated memory area. It should - * a power of two. - * @addr: on exit the address of the allocated memory - * @min: minimum address to used for the memory allocation - * - * Allocate at the lowest possible address that is not below @min as - * EFI_LOADER_DATA. The allocated pages are aligned according to @align but at - * least EFI_ALLOC_ALIGN. The first allocated page will not below the address - * given by @min. - * - * Return: status code - */ -efi_status_t efi_low_alloc_above(unsigned long size, unsigned long align, - unsigned long *addr, unsigned long min) -{ - struct efi_boot_memmap *map __free(efi_pool) = NULL; - efi_status_t status; - unsigned long nr_pages; - int i; - - status = efi_get_memory_map(&map, false); - if (status != EFI_SUCCESS) - return status; - - /* - * Enforce minimum alignment that EFI or Linux requires when - * requesting a specific address. We are doing page-based (or - * larger) allocations, and both the address and size must meet - * alignment constraints. - */ - if (align < EFI_ALLOC_ALIGN) - align = EFI_ALLOC_ALIGN; - - size = round_up(size, EFI_ALLOC_ALIGN); - nr_pages = size / EFI_PAGE_SIZE; - for (i = 0; i < map->map_size / map->desc_size; i++) { - efi_memory_desc_t *desc; - unsigned long m = (unsigned long)map->map; - u64 start, end; - - desc = efi_memdesc_ptr(m, map->desc_size, i); - - if (desc->type != EFI_CONVENTIONAL_MEMORY) - continue; - - if (desc->attribute & EFI_MEMORY_HOT_PLUGGABLE) - continue; - - if (efi_soft_reserve_enabled() && - (desc->attribute & EFI_MEMORY_SP)) - continue; - - if (desc->num_pages < nr_pages) - continue; - - start = desc->phys_addr; - end = start + desc->num_pages * EFI_PAGE_SIZE; - - if (start < min) - start = min; - - start = round_up(start, align); - if ((start + size) > end) - continue; - - status = efi_bs_call(allocate_pages, EFI_ALLOCATE_ADDRESS, - EFI_LOADER_DATA, nr_pages, &start); - if (status == EFI_SUCCESS) { - *addr = start; - break; - } - } - - if (i == map->map_size / map->desc_size) - return EFI_NOT_FOUND; - - return EFI_SUCCESS; -} - -/** - * efi_relocate_kernel() - copy memory area - * @image_addr: pointer to address of memory area to copy - * @image_size: size of memory area to copy - * @alloc_size: minimum size of memory to allocate, must be greater or - * equal to image_size - * @preferred_addr: preferred target address - * @alignment: minimum alignment of the allocated memory area. It - * should be a power of two. - * @min_addr: minimum target address - * - * Copy a memory area to a newly allocated memory area aligned according - * to @alignment but at least EFI_ALLOC_ALIGN. If the preferred address - * is not available, the allocated address will not be below @min_addr. - * On exit, @image_addr is updated to the target copy address that was used. - * - * This function is used to copy the Linux kernel verbatim. It does not apply - * any relocation changes. - * - * Return: status code - */ -efi_status_t efi_relocate_kernel(unsigned long *image_addr, - unsigned long image_size, - unsigned long alloc_size, - unsigned long preferred_addr, - unsigned long alignment, - unsigned long min_addr) -{ - unsigned long cur_image_addr; - unsigned long new_addr = 0; - efi_status_t status; - unsigned long nr_pages; - efi_physical_addr_t efi_addr = preferred_addr; - - if (!image_addr || !image_size || !alloc_size) - return EFI_INVALID_PARAMETER; - if (alloc_size < image_size) - return EFI_INVALID_PARAMETER; - - cur_image_addr = *image_addr; - - /* - * The EFI firmware loader could have placed the kernel image - * anywhere in memory, but the kernel has restrictions on the - * max physical address it can run at. Some architectures - * also have a preferred address, so first try to relocate - * to the preferred address. If that fails, allocate as low - * as possible while respecting the required alignment. - */ - nr_pages = round_up(alloc_size, EFI_ALLOC_ALIGN) / EFI_PAGE_SIZE; - status = efi_bs_call(allocate_pages, EFI_ALLOCATE_ADDRESS, - EFI_LOADER_DATA, nr_pages, &efi_addr); - new_addr = efi_addr; - /* - * If preferred address allocation failed allocate as low as - * possible. - */ - if (status != EFI_SUCCESS) { - status = efi_low_alloc_above(alloc_size, alignment, &new_addr, - min_addr); - } - if (status != EFI_SUCCESS) { - efi_err("Failed to allocate usable memory for kernel.\n"); - return status; - } - - /* - * We know source/dest won't overlap since both memory ranges - * have been allocated by UEFI, so we can safely use memcpy. - */ - memcpy((void *)new_addr, (void *)cur_image_addr, image_size); - - /* Return the new address of the relocated image. */ - *image_addr = new_addr; - - return status; -} From ad6f4f3ea72f866176f9dd6031c8778da088c686 Mon Sep 17 00:00:00 2001 From: WANG Rui Date: Mon, 27 Apr 2026 16:47:20 +0800 Subject: [PATCH 109/972] efi/loongarch: Implement efi_cache_sync_image() Provide a LoongArch implementation of efi_cache_sync_image() to ensure instruction cache coherency after the kernel image is relocated. Signed-off-by: WANG Rui Reviewed-by: Huacai Chen Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/loongarch.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/firmware/efi/libstub/loongarch.c b/drivers/firmware/efi/libstub/loongarch.c index 9825f5218137f2..f7938d5c196aad 100644 --- a/drivers/firmware/efi/libstub/loongarch.c +++ b/drivers/firmware/efi/libstub/loongarch.c @@ -18,6 +18,11 @@ efi_status_t check_platform_features(void) return EFI_SUCCESS; } +void efi_cache_sync_image(unsigned long image_base, unsigned long alloc_size) +{ + asm volatile ("ibar 0" ::: "memory"); +} + struct exit_boot_struct { efi_memory_desc_t *runtime_map; int runtime_entry_count; From cda92ac47c024d84f6b8294e462d6272039a10ac Mon Sep 17 00:00:00 2001 From: WANG Rui Date: Mon, 27 Apr 2026 16:47:21 +0800 Subject: [PATCH 110/972] efi/libstub: Synchronize instruction cache after kernel relocation The relocated kernel image is copied to its new location using memcpy(). On architectures with separate instruction and data caches, the copied instructions may remain stale in the instruction cache, leading to the execution of outdated contents. Call efi_cache_sync_image() after the relocation copy to ensure the instruction cache is synchronized with the updated memory contents before control is transferred to the relocated kernel. Signed-off-by: WANG Rui Reviewed-by: Huacai Chen Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/loongarch-stub.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/firmware/efi/libstub/loongarch-stub.c b/drivers/firmware/efi/libstub/loongarch-stub.c index 8c538a5243d923..c87ac70251076c 100644 --- a/drivers/firmware/efi/libstub/loongarch-stub.c +++ b/drivers/firmware/efi/libstub/loongarch-stub.c @@ -86,6 +86,7 @@ efi_status_t efi_relocate_kernel(unsigned long *image_addr, * have been allocated by UEFI, so we can safely use memcpy. */ memcpy((void *)new_addr, (void *)cur_image_addr, image_size); + efi_cache_sync_image(new_addr, image_size); /* Return the new address of the relocated image. */ *image_addr = new_addr; From 28465227c80fe417b4013c432be1f3737cb9f9a3 Mon Sep 17 00:00:00 2001 From: Ruijie Li Date: Wed, 29 Apr 2026 00:41:43 +0800 Subject: [PATCH 111/972] xfrm: provide message size for XFRM_MSG_MAPPING The compat 64=>32 translation path handles XFRM_MSG_MAPPING, but xfrm_msg_min[] does not provide the native payload size for this message type. Add the missing XFRM_MSG_MAPPING entry so compat translation can size and translate mapping notifications correctly. Fixes: 5461fc0c8d9f ("xfrm/compat: Add 64=>32-bit messages translator") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Signed-off-by: Ruijie Li Signed-off-by: Ren Wei Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index d56450f6166912..38a90e5ee3d935 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3323,6 +3323,7 @@ const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32), + [XFRM_MSG_MAPPING - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_mapping), [XFRM_MSG_SETDEFAULT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default), [XFRM_MSG_GETDEFAULT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default), }; From 14acf9652e5690de3c7486c6db5fb8dafd0a32a3 Mon Sep 17 00:00:00 2001 From: Michal Kosiorek Date: Wed, 29 Apr 2026 10:54:51 +0200 Subject: [PATCH 112/972] xfrm: defensively unhash xfrm_state lists in __xfrm_state_delete KASAN reproduces a slab-use-after-free in __xfrm_state_delete()'s hlist_del_rcu calls under syzkaller load on linux-6.12.y stable (reproduced on 6.12.47, also reachable via the same code path on torvalds/master and on the ipsec tree). Nine unique signatures cluster in the xfrm_state lifecycle, the load-bearing one being: BUG: KASAN: slab-use-after-free in __hlist_del include/linux/list.h:990 [inline] BUG: KASAN: slab-use-after-free in hlist_del_rcu include/linux/rculist.h:516 [inline] BUG: KASAN: slab-use-after-free in __xfrm_state_delete net/xfrm/xfrm_state.c Write of size 8 at addr ffff8881198bcb70 by task kworker/u8:9/435 Workqueue: netns cleanup_net Call Trace: __hlist_del / hlist_del_rcu __xfrm_state_delete xfrm_state_delete xfrm_state_flush xfrm_state_fini ops_exit_list cleanup_net The other observed signatures hit the same slab object from __xfrm_state_lookup, xfrm_alloc_spi, __xfrm_state_insert and an OOB write variant of __xfrm_state_delete, all on the byseq/byspi hash chains. __xfrm_state_delete() guards its byseq and byspi unhashes with value-based predicates: if (x->km.seq) hlist_del_rcu(&x->byseq); if (x->id.spi) hlist_del_rcu(&x->byspi); while everywhere else in the file (e.g. state_cache, state_cache_input) the safer hlist_unhashed() check is used. xfrm_alloc_spi() sets x->id.spi = newspi inside xfrm_state_lock and then immediately inserts into byspi, but a path that observes x->id.spi != 0 outside of xfrm_state_lock can still skip-or-hit the byspi unhash inconsistently with whether x is actually on the list. The same holds for x->km.seq versus byseq, and the bydst/bysrc unhashes have no predicate at all, so a second __xfrm_state_delete() on the same object writes through LIST_POISON pprev. The defensive change here: - Use hlist_del_init_rcu() instead of hlist_del_rcu() on bydst, bysrc, byseq and byspi so a second deletion is a no-op rather than a write through LIST_POISON pprev. The byseq/byspi nodes are already initialised in xfrm_state_alloc(). - Test hlist_unhashed() rather than the value predicate for byseq/byspi, so the unhash decision tracks list state rather than mutable scalar fields. Empirical verification: applied this patch on top of v6.12.47, rebuilt, and re-ran the same syzkaller harness for 1h16m on a previously-crashy configuration that produced ~100 hits each of slab-use-after-free Read in xfrm_alloc_spi / Read in __xfrm_state_lookup / Write in __xfrm_state_delete. After the patch, 7.1M execs across 32 VMs at ~1550 exec/sec produced zero xfrm_state UAF/OOB hits. /proc/slabinfo confirms the xfrm_state slab is actively allocated and freed during the run (~143 KiB resident), so the fuzzer is still exercising those code paths -- they just no longer crash. Reproduction: - Linux 6.12.47 x86_64 + KASAN_GENERIC + KASAN_INLINE + KCOV - syzkaller @ 746545b8b1e4c3a128db8652b340d3df90ce61db - 32 QEMU/KVM VMs x 2 vCPU on AWS c5.metal bare metal - 9 unique signatures collected in ~9h, all within xfrm_state lifecycle Fixes: fe9f1d8779cb ("xfrm: add state hashtable keyed by seq") Fixes: 7b4dc3600e48 ("[XFRM]: Do not add a state whose SPI is zero to the SPI hash.") Reported-by: Michal Kosiorek Tested-by: Michal Kosiorek Cc: stable@vger.kernel.org Signed-off-by: Michal Kosiorek Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 1748d374abcab3..686014d394298c 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -818,17 +818,17 @@ int __xfrm_state_delete(struct xfrm_state *x) spin_lock(&net->xfrm.xfrm_state_lock); list_del(&x->km.all); - hlist_del_rcu(&x->bydst); - hlist_del_rcu(&x->bysrc); - if (x->km.seq) - hlist_del_rcu(&x->byseq); + hlist_del_init_rcu(&x->bydst); + hlist_del_init_rcu(&x->bysrc); + if (!hlist_unhashed(&x->byseq)) + hlist_del_init_rcu(&x->byseq); if (!hlist_unhashed(&x->state_cache)) hlist_del_rcu(&x->state_cache); if (!hlist_unhashed(&x->state_cache_input)) hlist_del_rcu(&x->state_cache_input); - if (x->id.spi) - hlist_del_rcu(&x->byspi); + if (!hlist_unhashed(&x->byspi)) + hlist_del_init_rcu(&x->byspi); net->xfrm.state_num--; xfrm_nat_keepalive_state_updated(x); spin_unlock(&net->xfrm.xfrm_state_lock); From 38694f4639c45599161860e828dc4ac77abf8cea Mon Sep 17 00:00:00 2001 From: Edward Srouji Date: Mon, 27 Apr 2026 14:02:32 +0300 Subject: [PATCH 113/972] RDMA/mlx5: Fix UAF in SRQ destroy due to race with create A race condition exists between mlx5_cmd_destroy_srq() and mlx5_cmd_create_srq() that can lead to a use-after-free (UAF) [1]. After destroy_srq_split() releases the SRQ to firmware, the SRQN can be immediately reallocated for a new SRQ being created concurrently. If the create path stores the new SRQ in the xarray before the destroy path erases it, the destroy will incorrectly delete the new SRQ's entry. Later accesses then hit freed memory. Fix by replacing the unconditional xa_erase_irq() with xa_cmpxchg_irq() that only erases the entry if it hasn't already been replaced (still contains XA_ZERO_ENTRY), preserving any newly created SRQ. [1] RIP: 0010:mlx5_cmd_destroy_srq+0xd8/0x110 [mlx5_ib] Code: 89 e1 ba 06 04 00 00 4c 89 f6 48 89 ef e8 80 19 70 e1 c6 83 a0 0f 00 00 00 fb 5b 44 89 e8 5d 41 5c 41 5d 41 5e c3 cc cc cc cc <0f> 0b 48 89 c2 83 e2 03 48 83 fa 02 75 08 48 3d 05 c0 ff ff 77 08 RSP: 0018:ff110001037b7d08 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ff1100010bb9c000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ff110001037b7c90 RBP: ff1100010bb9cfa0 R08: 0000000000000000 R09: 0000000000000000 R10: ff110001037b7da0 R11: ff11000104f29580 R12: ff1100010e2ac090 R13: 000000000000000d R14: 0000000000000001 R15: ff11000105336300 FS: 00007fa24787c740(0000) GS:ff1100046eb8d000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa247984e90 CR3: 0000000109d59005 CR4: 0000000000373eb0 Call Trace: mlx5_ib_destroy_srq+0x25/0xa0 [mlx5_ib] ib_destroy_srq_user+0x21/0x90 [ib_core] uverbs_free_srq+0x1b/0x50 [ib_uverbs] destroy_hw_idr_uobject+0x1e/0x50 [ib_uverbs] uverbs_destroy_uobject+0x35/0x180 [ib_uverbs] __uverbs_cleanup_ufile+0xdd/0x140 [ib_uverbs] uverbs_destroy_ufile_hw+0x38/0xf0 [ib_uverbs] ib_uverbs_close+0x17/0xa0 [ib_uverbs] __fput+0xe0/0x2a0 __x64_sys_close+0x3a/0x80 do_syscall_64+0x55/0xac0 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7fa247984ea4 Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 80 3d a5 51 0e 00 00 74 13 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 3c c3 0f 1f 00 55 48 89 e5 48 83 ec 10 89 7d RSP: 002b:00007ffecfa79498 EFLAGS: 00000202 ORIG_RAX: 0000000000000003 RAX: ffffffffffffffda RBX: 0000200000000080 RCX: 00007fa247984ea4 RDX: 0000000000000040 RSI: 0000200000000200 RDI: 0000000000000003 RBP: 00007ffecfa794e0 R08: 00007ffecfa794e0 R09: 00007ffecfa794e0 R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000001 R13: 0000000000000000 R14: 0000200000000000 R15: 0000200000000009 ---[ end trace 0000000000000000 ]--- Fixes: fd89099d635e ("RDMA/mlx5: Issue FW command to destroy SRQ on reentry") Link: https://patch.msgid.link/r/20260427-security-bug-fixes-v3-1-4621fa52de0e@nvidia.com Signed-off-by: Edward Srouji Reviewed-by: Michael Guralnik Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/srq_cmd.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/srq_cmd.c b/drivers/infiniband/hw/mlx5/srq_cmd.c index 8b338539659933..c1a088120915c5 100644 --- a/drivers/infiniband/hw/mlx5/srq_cmd.c +++ b/drivers/infiniband/hw/mlx5/srq_cmd.c @@ -683,7 +683,14 @@ int mlx5_cmd_destroy_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq) xa_cmpxchg_irq(&table->array, srq->srqn, XA_ZERO_ENTRY, srq, 0); return err; } - xa_erase_irq(&table->array, srq->srqn); + + /* + * A race can occur where a concurrent create gets the same srqn + * (after hardware released it) and overwrites XA_ZERO_ENTRY with + * its new SRQ before we reach here. In that case, we must not erase + * the entry as it now belongs to the new SRQ. + */ + xa_cmpxchg_irq(&table->array, srq->srqn, XA_ZERO_ENTRY, NULL, 0); mlx5_core_res_put(&srq->common); wait_for_completion(&srq->common.free); From 9bee81cc5e8811c8bbe67fbf5214a7998457324b Mon Sep 17 00:00:00 2001 From: Edward Srouji Date: Mon, 27 Apr 2026 14:02:33 +0300 Subject: [PATCH 114/972] RDMA/mlx5: Fix UAF in DCT destroy due to race with create A potential race condition exists between mlx5_core_destroy_dct() and mlx5_core_create_dct() that can lead to a use-after-free. After _mlx5_core_destroy_dct() releases the DCT to firmware, the DCTN can be immediately reallocated for a new DCT being created concurrently. If the create path stores the new DCT in the xarray before the destroy path erases it, the destroy will incorrectly delete the new DCT's entry. Later accesses then hit freed memory. Fix by replacing the unconditional xa_erase_irq() with xa_cmpxchg_irq() that only erases the entry if it hasn't already been replaced (still contains XA_ZERO_ENTRY), preserving any newly created DCT. Fixes: afff24899846 ("RDMA/mlx5: Handle DCT QP logic separately from low level QP interface") Link: https://patch.msgid.link/r/20260427-security-bug-fixes-v3-2-4621fa52de0e@nvidia.com Signed-off-by: Edward Srouji Reviewed-by: Michael Guralnik Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qpc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index 146d03ae40bd9f..a7a4f9420271a2 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -314,7 +314,14 @@ int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, xa_cmpxchg_irq(&table->dct_xa, dct->mqp.qpn, XA_ZERO_ENTRY, dct, 0); return err; } - xa_erase_irq(&table->dct_xa, dct->mqp.qpn); + + /* + * A race can occur where a concurrent create gets the same dctn + * (after hardware released it) and overwrites XA_ZERO_ENTRY with + * its new DCT before we reach here. In that case, we must not erase + * the entry as it now belongs to the new DCT. + */ + xa_cmpxchg_irq(&table->dct_xa, dct->mqp.qpn, XA_ZERO_ENTRY, NULL, 0); return 0; } From 610771c62e2ac5bca851fc5a6f8af1cdd83f189a Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Mon, 27 Apr 2026 14:02:34 +0300 Subject: [PATCH 115/972] IB/core: Fix IPv6 netlink message size in ib_nl_ip_send_msg() When resolving an RDMA-CM IPv6 address, ib_nl_ip_send_msg() sends a netlink request to the userspace daemon to perform IP-to-GID resolution in certain cases. The function allocates the netlink message buffer using nla_total_size(sizeof(size)), which passes 8 bytes (the size of size_t) instead of 16 bytes (the size of an IPv6 address). This results in an 8-byte under-allocation. This is currently masked by nlmsg_new() over-allocation of the skb in its internal logic. However, the code remains incorrect. Fix the issue by supplying the proper IPv6 address length to nla_total_size(). Fixes: ae43f8286730 ("IB/core: Add IP to GID netlink offload") Link: https://patch.msgid.link/r/20260427-security-bug-fixes-v3-3-4621fa52de0e@nvidia.com Signed-off-by: Maher Sanalla Reviewed-by: Patrisious Haddad Signed-off-by: Edward Srouji Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index a40a765f030728..27992c38ad9020 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -149,7 +149,7 @@ static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr, attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV6; } - len = nla_total_size(sizeof(size)); + len = nla_total_size(size); len += NLMSG_ALIGN(sizeof(*header)); skb = nlmsg_new(len, GFP_KERNEL); From 1f3b337af2231b1e83c9052f771b201f5cbb9997 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Mon, 27 Apr 2026 14:02:35 +0300 Subject: [PATCH 116/972] RDMA/core: Fix rereg_mr use-after-free race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a driver creates a new MR during rereg_user_mr, a race window exists between rdma_alloc_commit_uobject() for the new MR and the point where the code reads that MR to populate the response keys. A concurrent rereg_mr or destroy_mr could destroy the MR in this window and cause UAF in the first thread. Racing flow between two rereg_mr calls: CPU0 CPU1 ---- ---- rereg_user_mr(mr_handle) uobj_get_write(mr_handle) -> mr0 mr1 = driver→rereg() rdma_alloc_commit_uobject(mr1) // mr1 replaced mr0 and is unlocked uobj_put_destroy(mr0) rereg_user_mr(mr_handle) uobj_get_write(mr_handle) -> mr1 mr2 = driver→rereg() rdma_alloc_commit_uobject(mr2) // mr2 replaced mr1 and is unlocked uobj_put_destroy(mr1) // Destroys mr1! resp.lkey = mr1->lkey; // UAF - mr1 was freed! resp.rkey = mr1->rkey; // UAF - mr1 was freed! Fix by storing lkey/rkey in local variables before the new MR is unlocked and using the local variables to set the user response. Fixes: 6e0954b11c05 ("RDMA/uverbs: Allow drivers to create a new HW object during rereg_mr") Link: https://patch.msgid.link/r/20260427-security-bug-fixes-v3-4-4621fa52de0e@nvidia.com Signed-off-by: Michael Guralnik Reviewed-by: Maher Sanalla Signed-off-by: Edward Srouji Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index a768436ba46805..91a62d2ade4dd0 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -778,6 +778,7 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs) struct ib_pd *orig_pd; struct ib_pd *new_pd; struct ib_mr *new_mr; + u32 lkey, rkey; ret = uverbs_request(attrs, &cmd, sizeof(cmd)); if (ret) @@ -846,6 +847,8 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs) new_mr->uobject = uobj; atomic_inc(&new_pd->usecnt); new_uobj->object = new_mr; + lkey = new_mr->lkey; + rkey = new_mr->rkey; rdma_restrack_new(&new_mr->res, RDMA_RESTRACK_MR); rdma_restrack_set_name(&new_mr->res, NULL); @@ -871,11 +874,13 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs) mr->iova = cmd.hca_va; mr->length = cmd.length; } + lkey = mr->lkey; + rkey = mr->rkey; } memset(&resp, 0, sizeof(resp)); - resp.lkey = mr->lkey; - resp.rkey = mr->rkey; + resp.lkey = lkey; + resp.rkey = rkey; ret = uverbs_response(attrs, &resp, sizeof(resp)); From 6009cca96fcb01182cede725ad61e9e3810f3932 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Mon, 27 Apr 2026 14:02:36 +0300 Subject: [PATCH 117/972] RDMA/mlx5: Fix null-ptr-deref in Raw Packet QP creation Raw Packet QPs are unique in that they support separate send and receive queues, using 2 different user-provided buffers. They can also be created with one of the queues having size 0, allowing a send-only or receive-only QP. The Raw Packet RQ umem is created in the common user QP creation path, which allows zero-length queues. Add a later validation of the RQ umem in Raw Packet QP creation path when an RQ was requested. This prevents possible null-ptr dereference crashes, as seen in the below trace: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000006: 0000 [#1] SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000030-0x0000000000000037] CPU: 6 UID: 0 PID: 3539 Comm: raw_packet_umem Not tainted 6.19.0-rc1+ #166 NONE Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 RIP: 0010:__mlx5_umem_find_best_quantized_pgoff+0x37/0x280 [mlx5_ib] Code: ff df 41 57 49 89 ff 41 56 41 55 41 89 d5 41 54 4d 89 cc 4c 8d 4f 30 55 4c 89 ca 48 89 f5 53 48 c1 ea 03 48 89 cb 48 83 ec 18 <80> 3c 02 00 44 89 04 24 0f 85 01 02 00 00 48 ba 00 00 00 00 00 fc RSP: 0018:ff1100013966f4e0 EFLAGS: 00010282 RAX: dffffc0000000000 RBX: 00000000ffffffc0 RCX: 00000000ffffffc0 RDX: 0000000000000006 RSI: 00000ffffffff000 RDI: 0000000000000000 RBP: 00000ffffffff000 R08: 0000000000000040 R09: 0000000000000030 R10: 0000000000000000 R11: 0000000000000000 R12: ff1100013966f648 R13: 0000000000000005 R14: ff1100013966f980 R15: 0000000000000000 FS: 00007fae6c82f740(0000) GS:ff11000898ba1000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000200000000000 CR3: 000000010f96c005 CR4: 0000000000373eb0 Call Trace: create_qp+0x747d/0xc740 [mlx5_ib] ? is_module_address+0x18/0x110 ? _create_user_qp.constprop.0+0x18e0/0x18e0 [mlx5_ib] ? __module_address+0x49/0x210 ? is_module_address+0x68/0x110 ? static_obj+0x67/0x90 ? lockdep_init_map_type+0x58/0x200 mlx5_ib_create_qp+0xc85/0x2620 [mlx5_ib] ? find_held_lock+0x2b/0x80 ? create_qp+0xc740/0xc740 [mlx5_ib] ? lock_release+0xcb/0x260 ? lockdep_init_map_type+0x58/0x200 ? __init_swait_queue_head+0xcb/0x150 create_qp.part.0+0x558/0x7c0 [ib_core] ib_create_qp_user+0xa0/0x4f0 [ib_core] ? rdma_lookup_get_uobject+0x1e4/0x400 [ib_uverbs] create_qp+0xe4f/0x1d10 [ib_uverbs] ? ib_uverbs_rereg_mr+0xd40/0xd40 [ib_uverbs] ? ib_uverbs_cq_event_handler+0x120/0x120 [ib_uverbs] ? __might_fault+0x81/0x100 ? lock_release+0xcb/0x260 ? _copy_from_user+0x3e/0x90 ib_uverbs_create_qp+0x10a/0x150 [ib_uverbs] ? ib_uverbs_ex_create_qp+0xe0/0xe0 [ib_uverbs] ? __might_fault+0x81/0x100 ? lock_release+0xcb/0x260 ib_uverbs_write+0x7e5/0xc90 [ib_uverbs] ? uverbs_devnode+0xc0/0xc0 [ib_uverbs] ? lock_acquire+0xfa/0x2b0 ? find_held_lock+0x2b/0x80 ? finish_task_switch.isra.0+0x189/0x6c0 vfs_write+0x1c0/0xf70 ? lockdep_hardirqs_on_prepare+0xde/0x170 ? kernel_write+0x5a0/0x5a0 ? __switch_to+0x527/0xe60 ? __schedule+0x10a3/0x3950 ? io_schedule_timeout+0x110/0x110 ksys_write+0x170/0x1c0 ? __x64_sys_read+0xb0/0xb0 ? trace_hardirqs_off.part.0+0x4e/0xe0 do_syscall_64+0x70/0x1360 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7fae6ca3118d Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 5b cc 0c 00 f7 d8 64 89 01 48 RSP: 002b:00007ffe678ca308 EFLAGS: 00000213 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 00007ffe678ca448 RCX: 00007fae6ca3118d RDX: 0000000000000070 RSI: 0000200000000280 RDI: 0000000000000003 RBP: 00007ffe678ca320 R08: 00000000ffffffff R09: 00007fae6c8ec5b8 R10: 0000000000000064 R11: 0000000000000213 R12: 0000000000000001 R13: 0000000000000000 R14: 00007fae6cb71000 R15: 0000000000404df0 Modules linked in: mlx5_ib mlx5_fwctl mlx5_core bonding ip6_gre ip6_tunnel tunnel6 ip_gre gre rdma_ucm ib_uverbs rdma_cm iw_cm ib_ipoib ib_cm ib_umad ib_core rpcsec_gss_krb5 auth_rpcgss oid_registry overlay nfnetlink zram zsmalloc fuse scsi_transport_iscsi [last unloaded: mlx5_core] ---[ end trace 0000000000000000 ]--- RIP: 0010:__mlx5_umem_find_best_quantized_pgoff+0x37/0x280 [mlx5_ib] Fixes: 0fb2ed66a14c ("IB/mlx5: Add create and destroy functionality for Raw Packet QP") Link: https://patch.msgid.link/r/20260427-security-bug-fixes-v3-5-4621fa52de0e@nvidia.com Signed-off-by: Michael Guralnik Reviewed-by: Maher Sanalla Signed-off-by: Edward Srouji Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 8f50e7342a7694..1611a704c1b332 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1603,6 +1603,11 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, } if (qp->rq.wqe_cnt) { + if (!rq->base.ubuffer.umem) { + err = -EINVAL; + goto err_destroy_sq; + } + rq->base.container_mibqp = qp; if (qp->flags & IB_QP_CREATE_CVLAN_STRIPPING) From 20e81c64c905bd765e69ef07920d2b1130dc79b6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 29 Apr 2026 09:44:16 -1000 Subject: [PATCH 118/972] workqueue: Annotate alloc_workqueue_va() with __printf(1, 0) alloc_workqueue_va() forwards its va_list to __alloc_workqueue() which ultimately feeds vsnprintf(). __alloc_workqueue() already carries __printf(1, 0); the new wrapper needs the same annotation so format string checking propagates through the forwarding. Fixes: 0de4cb473aed ("workqueue: fix devm_alloc_workqueue() va_list misuse") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202604300347.2LgXyteh-lkp@intel.com/ Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 24d0265191d464..3d2e3b2ec5283f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5906,6 +5906,7 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt, return NULL; } +__printf(1, 0) static struct workqueue_struct *alloc_workqueue_va(const char *fmt, unsigned int flags, int max_active, From 1049970d7583194eedc30e45a3c898b2cb1c30ba Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 27 Apr 2026 14:34:45 +0200 Subject: [PATCH 119/972] netfilter: replace skb_try_make_writable() by skb_ensure_writable() skb_try_make_writable() only works on clones and uncloned packets might have their network header in paged fragments. nft_fwd needs to work for the ingress and egress hooks, but the egress hook where skb->data points to the mac header, use skb_network_offset() to include the mac header. The flowtable is fine since it already uses the transport offset. Fixes: d32de98ea70f ("netfilter: nft_fwd_netdev: allow to forward packets via neighbour layer") Fixes: 7d2086871762 ("netfilter: nf_flow_table: move ipv4 offload hook code to nf_flow_table") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 4 ++-- net/netfilter/nft_fwd_netdev.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index fd56d663cb5b02..dbd7644fdbebeb 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -524,7 +524,7 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, return 0; } - if (skb_try_make_writable(skb, thoff + ctx->hdrsize)) + if (skb_ensure_writable(skb, thoff + ctx->hdrsize)) return -1; flow_offload_refresh(flow_table, flow, false); @@ -1037,7 +1037,7 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, return 0; } - if (skb_try_make_writable(skb, thoff + ctx->hdrsize)) + if (skb_ensure_writable(skb, thoff + ctx->hdrsize)) return -1; flow_offload_refresh(flow_table, flow, false); diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 4bce36c3a6a070..2cc809303ce81d 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -100,6 +100,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, int oif = regs->data[priv->sreg_dev]; unsigned int verdict = NF_STOLEN; struct sk_buff *skb = pkt->skb; + int nhoff = skb_network_offset(skb); struct net_device *dev; int neigh_table; @@ -111,7 +112,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, verdict = NFT_BREAK; goto out; } - if (skb_try_make_writable(skb, sizeof(*iph))) { + if (skb_ensure_writable(skb, nhoff + sizeof(*iph))) { verdict = NF_DROP; goto out; } @@ -132,7 +133,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, verdict = NFT_BREAK; goto out; } - if (skb_try_make_writable(skb, sizeof(*ip6h))) { + if (skb_ensure_writable(skb, nhoff + sizeof(*ip6h))) { verdict = NF_DROP; goto out; } From 0a0b35f0bf10b4c2be607465f5c9c12c8681305b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 27 Apr 2026 14:34:48 +0200 Subject: [PATCH 120/972] netfilter: nft_fwd_netdev: add device and headroom validate with neigh forwarding The ttl field has been decremented already and evaluation of this rule would proceed, just drop this packet instead if there is no destination device to forwards this packet. This is exactly what nf_dup already does in this case. Moreover, check for headroom and call skb_expand_head() like in the IP output path to ensure there is sufficient headroom when forwarding this via neigh_xmit(). Fixes: d32de98ea70f ("netfilter: nft_fwd_netdev: allow to forward packets via neighbour layer") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_fwd_netdev.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 2cc809303ce81d..605b1d42abce34 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -102,6 +102,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, struct sk_buff *skb = pkt->skb; int nhoff = skb_network_offset(skb); struct net_device *dev; + unsigned int hh_len; int neigh_table; switch (priv->nfproto) { @@ -153,8 +154,19 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, } dev = dev_get_by_index_rcu(nft_net(pkt), oif); - if (dev == NULL) - return; + if (dev == NULL) { + verdict = NF_DROP; + goto out; + } + + hh_len = LL_RESERVED_SPACE(dev); + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + skb = skb_expand_head(skb, hh_len); + if (!skb) { + verdict = NF_STOLEN; + goto out; + } + } skb->dev = dev; skb_clear_tstamp(skb); From 1d47b55b36d2ec73fe6901212c8b28a593c3b27c Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Mon, 27 Apr 2026 14:34:50 +0200 Subject: [PATCH 121/972] netfilter: nft_fwd_netdev: use recursion counter in neigh egress path nft_fwd_neigh can be used in egress chains (NF_NETDEV_EGRESS). When the forwarding rule targets the same device or two devices forward to each other, neigh_xmit() triggers dev_queue_xmit() which re-enters nf_hook_egress(), causing infinite recursion and stack overflow. Move the nf_get_nf_dup_skb_recursion() accessor and NF_RECURSION_LIMIT to the shared header nf_dup_netdev.h as a static inline, so that nft_fwd_netdev can use the recursion counter directly without exported function call overhead. Guard neigh_xmit() with the same recursion limit already used in nf_do_netdev_egress(). [ Updated to cache the nf_get_nf_dup_skb_recursion pointer. --pablo ] Fixes: f87b9464d152 ("netfilter: nft_fwd_netdev: Support egress hook") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_dup_netdev.h | 13 +++++++++++++ net/netfilter/nf_dup_netdev.c | 16 ---------------- net/netfilter/nft_fwd_netdev.c | 8 ++++++++ 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/include/net/netfilter/nf_dup_netdev.h b/include/net/netfilter/nf_dup_netdev.h index b175d271aec953..609bcf422a9b31 100644 --- a/include/net/netfilter/nf_dup_netdev.h +++ b/include/net/netfilter/nf_dup_netdev.h @@ -3,10 +3,23 @@ #define _NF_DUP_NETDEV_H_ #include +#include +#include void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif); void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif); +#define NF_RECURSION_LIMIT 2 + +static inline u8 *nf_get_nf_dup_skb_recursion(void) +{ +#ifndef CONFIG_PREEMPT_RT + return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion); +#else + return ¤t->net_xmit.nf_dup_skb_recursion; +#endif +} + struct nft_offload_ctx; struct nft_flow_rule; diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index e348fb90b8dc3b..3b0a70e154cd8f 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -13,22 +13,6 @@ #include #include -#define NF_RECURSION_LIMIT 2 - -#ifndef CONFIG_PREEMPT_RT -static u8 *nf_get_nf_dup_skb_recursion(void) -{ - return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion); -} -#else - -static u8 *nf_get_nf_dup_skb_recursion(void) -{ - return ¤t->net_xmit.nf_dup_skb_recursion; -} - -#endif - static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, enum nf_dev_hooks hook) { diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 605b1d42abce34..b9e88d7cf3081a 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -95,6 +95,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { + u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion(); struct nft_fwd_neigh *priv = nft_expr_priv(expr); void *addr = ®s->data[priv->sreg_addr]; int oif = regs->data[priv->sreg_dev]; @@ -153,6 +154,11 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, goto out; } + if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT) { + verdict = NF_DROP; + goto out; + } + dev = dev_get_by_index_rcu(nft_net(pkt), oif); if (dev == NULL) { verdict = NF_DROP; @@ -170,7 +176,9 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, skb->dev = dev; skb_clear_tstamp(skb); + (*nf_dup_skb_recursion)++; neigh_xmit(neigh_table, dev, addr, skb); + (*nf_dup_skb_recursion)--; out: regs->verdict.code = verdict; } From f2abc305aa93f5b12d5c929d7a9c1cf7d7fee8af Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 29 Apr 2026 20:38:17 -0600 Subject: [PATCH 122/972] riscv: Define __riscv_copy_{,vec_}{words,bytes}_unaligned() using SYM_TYPED_FUNC_START After commit 67bdd7b01387 ("riscv: Split out measure_cycles() for reuse") and commit c03ad15f7cf6 ("riscv: Reuse measure_cycles() in check_vector_unaligned_access()"), there are CFI failure when booting kernels with CONFIG_CFI=y: CFI failure at measure_cycles+0x38/0xe0 (target: __riscv_copy_words_unaligned+0x0/0x50; expected type: ...) CFI failure at measure_cycles+0x38/0xe0 (target: __riscv_copy_vec_words_unaligned+0x0/0x24; expected type: ...) The __riscv_copy_*_unaligned() functions are now called indirectly but they are not defined with SYM_TYPED_FUNC_START, which is required for assembly functions called indirectly from C to pass CFI checking. Switch to SYM_TYPED_FUNC_START to clear up the CFI failures. Fixes: 67bdd7b01387 ("riscv: Split out measure_cycles() for reuse") Fixes: c03ad15f7cf6 ("riscv: Reuse measure_cycles() in check_vector_unaligned_access()") Signed-off-by: Nathan Chancellor Reviewed-by: Sami Tolvanen Reviewed-by: Nam Cao Link: https://patch.msgid.link/20260406-measure_cycles-cfi-failure-v1-1-03e0234ae02f@kernel.org Signed-off-by: Paul Walmsley --- arch/riscv/kernel/copy-unaligned.S | 5 +++-- arch/riscv/kernel/vec-copy-unaligned.S | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/riscv/kernel/copy-unaligned.S b/arch/riscv/kernel/copy-unaligned.S index 2b3d9398c113fb..90f3549621f725 100644 --- a/arch/riscv/kernel/copy-unaligned.S +++ b/arch/riscv/kernel/copy-unaligned.S @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2023 Rivos Inc. */ +#include #include #include @@ -9,7 +10,7 @@ /* void __riscv_copy_words_unaligned(void *, const void *, size_t) */ /* Performs a memcpy without aligning buffers, using word loads and stores. */ /* Note: The size is truncated to a multiple of 8 * SZREG */ -SYM_FUNC_START(__riscv_copy_words_unaligned) +SYM_TYPED_FUNC_START(__riscv_copy_words_unaligned) andi a4, a2, ~((8*SZREG)-1) beqz a4, 2f add a3, a1, a4 @@ -41,7 +42,7 @@ SYM_FUNC_END(__riscv_copy_words_unaligned) /* void __riscv_copy_bytes_unaligned(void *, const void *, size_t) */ /* Performs a memcpy without aligning buffers, using only byte accesses. */ /* Note: The size is truncated to a multiple of 8 */ -SYM_FUNC_START(__riscv_copy_bytes_unaligned) +SYM_TYPED_FUNC_START(__riscv_copy_bytes_unaligned) andi a4, a2, ~(8-1) beqz a4, 2f add a3, a1, a4 diff --git a/arch/riscv/kernel/vec-copy-unaligned.S b/arch/riscv/kernel/vec-copy-unaligned.S index 7ce4de6f6e694a..361039f7b9441c 100644 --- a/arch/riscv/kernel/vec-copy-unaligned.S +++ b/arch/riscv/kernel/vec-copy-unaligned.S @@ -2,6 +2,7 @@ /* Copyright (C) 2024 Rivos Inc. */ #include +#include #include #include @@ -16,7 +17,7 @@ /* void __riscv_copy_vec_words_unaligned(void *, const void *, size_t) */ /* Performs a memcpy without aligning buffers, using word loads and stores. */ /* Note: The size is truncated to a multiple of WORD_EEW */ -SYM_FUNC_START(__riscv_copy_vec_words_unaligned) +SYM_TYPED_FUNC_START(__riscv_copy_vec_words_unaligned) andi a4, a2, ~(WORD_EEW-1) beqz a4, 2f add a3, a1, a4 @@ -38,7 +39,7 @@ SYM_FUNC_END(__riscv_copy_vec_words_unaligned) /* void __riscv_copy_vec_bytes_unaligned(void *, const void *, size_t) */ /* Performs a memcpy without aligning buffers, using only byte accesses. */ /* Note: The size is truncated to a multiple of 8 */ -SYM_FUNC_START(__riscv_copy_vec_bytes_unaligned) +SYM_TYPED_FUNC_START(__riscv_copy_vec_bytes_unaligned) andi a4, a2, ~(8-1) beqz a4, 2f add a3, a1, a4 From 6813985ca456d1f5677ad9554f55805cbf27e16f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 28 Apr 2026 17:35:18 +0200 Subject: [PATCH 123/972] netfilter: x_tables: add .check_hooks to matches and targets Add a new .check_hooks interface for checking if the match/target is used from the validate hook according to its configuration. Move existing conditional hook check based on the match/target configuration from .checkentry to .check_hooks for the following matches/targets: - addrtype - devgroup - physdev - policy - set - TCPMSS - SET This is a preparation patch to fix nft_compat, not functional changes are intended. Based on patch from Florian Westphal. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 8 +++ net/netfilter/x_tables.c | 79 +++++++++++++++++++++++++++--- net/netfilter/xt_TCPMSS.c | 33 +++++++------ net/netfilter/xt_addrtype.c | 25 +++++++--- net/netfilter/xt_devgroup.c | 18 +++++-- net/netfilter/xt_physdev.c | 20 ++++++-- net/netfilter/xt_policy.c | 24 +++++++-- net/netfilter/xt_set.c | 39 +++++++++------ 8 files changed, 187 insertions(+), 59 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 77c778d84d4cba..a81b46af5118db 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -146,6 +146,9 @@ struct xt_match { /* Called when user tries to insert an entry of this type. */ int (*checkentry)(const struct xt_mtchk_param *); + /* Called to validate hooks based on the match configuration. */ + int (*check_hooks)(const struct xt_mtchk_param *); + /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_mtdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT @@ -187,6 +190,9 @@ struct xt_target { /* Should return 0 on success or an error code otherwise (-Exxxx). */ int (*checkentry)(const struct xt_tgchk_param *); + /* Called to validate hooks based on the target configuration. */ + int (*check_hooks)(const struct xt_tgchk_param *); + /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_tgdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT @@ -279,8 +285,10 @@ bool xt_find_jump_offset(const unsigned int *offsets, int xt_check_proc_name(const char *name, unsigned int size); +int xt_check_hooks_match(struct xt_mtchk_param *par); int xt_check_match(struct xt_mtchk_param *, unsigned int size, u16 proto, bool inv_proto); +int xt_check_hooks_target(struct xt_tgchk_param *par); int xt_check_target(struct xt_tgchk_param *, unsigned int size, u16 proto, bool inv_proto); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 9f837fb5ceb47d..2c67c2e6b13288 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -477,11 +477,9 @@ int xt_check_proc_name(const char *name, unsigned int size) } EXPORT_SYMBOL(xt_check_proc_name); -int xt_check_match(struct xt_mtchk_param *par, - unsigned int size, u16 proto, bool inv_proto) +static int xt_check_match_common(struct xt_mtchk_param *par, + unsigned int size, u16 proto, bool inv_proto) { - int ret; - if (XT_ALIGN(par->match->matchsize) != size && par->match->matchsize != -1) { /* @@ -530,6 +528,14 @@ int xt_check_match(struct xt_mtchk_param *par, par->match->proto); return -EINVAL; } + + return 0; +} + +static int xt_checkentry_match(struct xt_mtchk_param *par) +{ + int ret; + if (par->match->checkentry != NULL) { ret = par->match->checkentry(par); if (ret < 0) @@ -538,8 +544,34 @@ int xt_check_match(struct xt_mtchk_param *par, /* Flag up potential errors. */ return -EIO; } + + return 0; +} + +int xt_check_hooks_match(struct xt_mtchk_param *par) +{ + if (par->match->check_hooks != NULL) + return par->match->check_hooks(par); + return 0; } +EXPORT_SYMBOL_GPL(xt_check_hooks_match); + +int xt_check_match(struct xt_mtchk_param *par, + unsigned int size, u16 proto, bool inv_proto) +{ + int ret; + + ret = xt_check_match_common(par, size, proto, inv_proto); + if (ret < 0) + return ret; + + ret = xt_check_hooks_match(par); + if (ret < 0) + return ret; + + return xt_checkentry_match(par); +} EXPORT_SYMBOL_GPL(xt_check_match); /** xt_check_entry_match - check that matches end before start of target @@ -1012,11 +1044,9 @@ bool xt_find_jump_offset(const unsigned int *offsets, } EXPORT_SYMBOL(xt_find_jump_offset); -int xt_check_target(struct xt_tgchk_param *par, - unsigned int size, u16 proto, bool inv_proto) +static int xt_check_target_common(struct xt_tgchk_param *par, + unsigned int size, u16 proto, bool inv_proto) { - int ret; - if (XT_ALIGN(par->target->targetsize) != size) { pr_err_ratelimited("%s_tables: %s.%u target: invalid size %u (kernel) != (user) %u\n", xt_prefix[par->family], par->target->name, @@ -1061,6 +1091,23 @@ int xt_check_target(struct xt_tgchk_param *par, par->target->proto); return -EINVAL; } + + return 0; +} + +int xt_check_hooks_target(struct xt_tgchk_param *par) +{ + if (par->target->check_hooks != NULL) + return par->target->check_hooks(par); + + return 0; +} +EXPORT_SYMBOL_GPL(xt_check_hooks_target); + +static int xt_checkentry_target(struct xt_tgchk_param *par) +{ + int ret; + if (par->target->checkentry != NULL) { ret = par->target->checkentry(par); if (ret < 0) @@ -1071,6 +1118,22 @@ int xt_check_target(struct xt_tgchk_param *par, } return 0; } + +int xt_check_target(struct xt_tgchk_param *par, + unsigned int size, u16 proto, bool inv_proto) +{ + int ret; + + ret = xt_check_target_common(par, size, proto, inv_proto); + if (ret < 0) + return ret; + + ret = xt_check_hooks_target(par); + if (ret < 0) + return ret; + + return xt_checkentry_target(par); +} EXPORT_SYMBOL_GPL(xt_check_target); /** diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 116a885adb3cd6..80e1634bc51f85 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -247,6 +247,21 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) } #endif +static int tcpmss_tg4_check_hooks(const struct xt_tgchk_param *par) +{ + const struct xt_tcpmss_info *info = par->targinfo; + + if (info->mss == XT_TCPMSS_CLAMP_PMTU && + (par->hook_mask & ~((1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING))) != 0) { + pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); + return -EINVAL; + } + + return 0; +} + /* Must specify -p tcp --syn */ static inline bool find_syn_match(const struct xt_entry_match *m) { @@ -262,17 +277,9 @@ static inline bool find_syn_match(const struct xt_entry_match *m) static int tcpmss_tg4_check(const struct xt_tgchk_param *par) { - const struct xt_tcpmss_info *info = par->targinfo; const struct ipt_entry *e = par->entryinfo; const struct xt_entry_match *ematch; - if (info->mss == XT_TCPMSS_CLAMP_PMTU && - (par->hook_mask & ~((1 << NF_INET_FORWARD) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_POST_ROUTING))) != 0) { - pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); - return -EINVAL; - } if (par->nft_compat) return 0; @@ -286,17 +293,9 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par) #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static int tcpmss_tg6_check(const struct xt_tgchk_param *par) { - const struct xt_tcpmss_info *info = par->targinfo; const struct ip6t_entry *e = par->entryinfo; const struct xt_entry_match *ematch; - if (info->mss == XT_TCPMSS_CLAMP_PMTU && - (par->hook_mask & ~((1 << NF_INET_FORWARD) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_POST_ROUTING))) != 0) { - pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); - return -EINVAL; - } if (par->nft_compat) return 0; @@ -312,6 +311,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = { { .family = NFPROTO_IPV4, .name = "TCPMSS", + .check_hooks = tcpmss_tg4_check_hooks, .checkentry = tcpmss_tg4_check, .target = tcpmss_tg4, .targetsize = sizeof(struct xt_tcpmss_info), @@ -322,6 +322,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = { { .family = NFPROTO_IPV6, .name = "TCPMSS", + .check_hooks = tcpmss_tg4_check_hooks, .checkentry = tcpmss_tg6_check, .target = tcpmss_tg6, .targetsize = sizeof(struct xt_tcpmss_info), diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index a7708894310716..913dbe3aa5e29e 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -153,14 +153,10 @@ addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) return ret; } -static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) +static int addrtype_mt_check_hooks(const struct xt_mtchk_param *par) { - const char *errmsg = "both incoming and outgoing interface limitation cannot be selected"; struct xt_addrtype_info_v1 *info = par->matchinfo; - - if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN && - info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) - goto err; + const char *errmsg; if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) && @@ -176,6 +172,21 @@ static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) goto err; } + return 0; +err: + pr_info_ratelimited("%s\n", errmsg); + return -EINVAL; +} + +static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) +{ + const char *errmsg = "both incoming and outgoing interface limitation cannot be selected"; + struct xt_addrtype_info_v1 *info = par->matchinfo; + + if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN && + info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) + goto err; + #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) if (par->family == NFPROTO_IPV6) { if ((info->source | info->dest) & XT_ADDRTYPE_BLACKHOLE) { @@ -211,6 +222,7 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = { .family = NFPROTO_IPV4, .revision = 1, .match = addrtype_mt_v1, + .check_hooks = addrtype_mt_check_hooks, .checkentry = addrtype_mt_checkentry_v1, .matchsize = sizeof(struct xt_addrtype_info_v1), .me = THIS_MODULE @@ -221,6 +233,7 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = { .family = NFPROTO_IPV6, .revision = 1, .match = addrtype_mt_v1, + .check_hooks = addrtype_mt_check_hooks, .checkentry = addrtype_mt_checkentry_v1, .matchsize = sizeof(struct xt_addrtype_info_v1), .me = THIS_MODULE diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c index 9520dd00070b22..6d1a44ab5eeeb6 100644 --- a/net/netfilter/xt_devgroup.c +++ b/net/netfilter/xt_devgroup.c @@ -33,14 +33,10 @@ static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) return true; } -static int devgroup_mt_checkentry(const struct xt_mtchk_param *par) +static int devgroup_mt_check_hooks(const struct xt_mtchk_param *par) { const struct xt_devgroup_info *info = par->matchinfo; - if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC | - XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST)) - return -EINVAL; - if (info->flags & XT_DEVGROUP_MATCH_SRC && par->hook_mask & ~((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | @@ -56,9 +52,21 @@ static int devgroup_mt_checkentry(const struct xt_mtchk_param *par) return 0; } +static int devgroup_mt_checkentry(const struct xt_mtchk_param *par) +{ + const struct xt_devgroup_info *info = par->matchinfo; + + if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC | + XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST)) + return -EINVAL; + + return 0; +} + static struct xt_match devgroup_mt_reg __read_mostly = { .name = "devgroup", .match = devgroup_mt, + .check_hooks = devgroup_mt_check_hooks, .checkentry = devgroup_mt_checkentry, .matchsize = sizeof(struct xt_devgroup_info), .family = NFPROTO_UNSPEC, diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index d2b0b52434fa90..dd98f758176c2e 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -91,14 +91,10 @@ physdev_mt(const struct sk_buff *skb, struct xt_action_param *par) return (!!ret ^ !(info->invert & XT_PHYSDEV_OP_OUT)); } -static int physdev_mt_check(const struct xt_mtchk_param *par) +static int physdev_mt_check_hooks(const struct xt_mtchk_param *par) { const struct xt_physdev_info *info = par->matchinfo; - static bool brnf_probed __read_mostly; - if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || - info->bitmask & ~XT_PHYSDEV_OP_MASK) - return -EINVAL; if (info->bitmask & (XT_PHYSDEV_OP_OUT | XT_PHYSDEV_OP_ISOUT) && (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) || info->invert & XT_PHYSDEV_OP_BRIDGED) && @@ -107,6 +103,18 @@ static int physdev_mt_check(const struct xt_mtchk_param *par) return -EINVAL; } + return 0; +} + +static int physdev_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_physdev_info *info = par->matchinfo; + static bool brnf_probed __read_mostly; + + if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || + info->bitmask & ~XT_PHYSDEV_OP_MASK) + return -EINVAL; + #define X(memb) strnlen(info->memb, sizeof(info->memb)) >= sizeof(info->memb) if (info->bitmask & XT_PHYSDEV_OP_IN) { if (info->physindev[0] == '\0') @@ -141,6 +149,7 @@ static struct xt_match physdev_mt_reg[] __read_mostly = { { .name = "physdev", .family = NFPROTO_IPV4, + .check_hooks = physdev_mt_check_hooks, .checkentry = physdev_mt_check, .match = physdev_mt, .matchsize = sizeof(struct xt_physdev_info), @@ -149,6 +158,7 @@ static struct xt_match physdev_mt_reg[] __read_mostly = { { .name = "physdev", .family = NFPROTO_IPV6, + .check_hooks = physdev_mt_check_hooks, .checkentry = physdev_mt_check, .match = physdev_mt, .matchsize = sizeof(struct xt_physdev_info), diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index b5fa65558318f5..ff54e3a8581e1f 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -126,13 +126,10 @@ policy_mt(const struct sk_buff *skb, struct xt_action_param *par) return ret; } -static int policy_mt_check(const struct xt_mtchk_param *par) +static int policy_mt_check_hooks(const struct xt_mtchk_param *par) { const struct xt_policy_info *info = par->matchinfo; - const char *errmsg = "neither incoming nor outgoing policy selected"; - - if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) - goto err; + const char *errmsg; if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) { @@ -144,6 +141,21 @@ static int policy_mt_check(const struct xt_mtchk_param *par) errmsg = "input policy not valid in POSTROUTING and OUTPUT"; goto err; } + + return 0; +err: + pr_info_ratelimited("%s\n", errmsg); + return -EINVAL; +} + +static int policy_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_policy_info *info = par->matchinfo; + const char *errmsg = "neither incoming nor outgoing policy selected"; + + if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) + goto err; + if (info->len > XT_POLICY_MAX_ELEM) { errmsg = "too many policy elements"; goto err; @@ -158,6 +170,7 @@ static struct xt_match policy_mt_reg[] __read_mostly = { { .name = "policy", .family = NFPROTO_IPV4, + .check_hooks = policy_mt_check_hooks, .checkentry = policy_mt_check, .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), @@ -166,6 +179,7 @@ static struct xt_match policy_mt_reg[] __read_mostly = { { .name = "policy", .family = NFPROTO_IPV6, + .check_hooks = policy_mt_check_hooks, .checkentry = policy_mt_check, .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 731bc2cafae4bc..4ae04bba93581c 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -430,6 +430,29 @@ set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } +static int +set_target_v3_check_hooks(const struct xt_tgchk_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + + if (info->map_set.index != IPSET_INVALID_ID) { + if (strncmp(par->table, "mangle", 7)) { + pr_info_ratelimited("--map-set only usable from mangle table\n"); + return -EINVAL; + } + if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | + (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && + (par->hook_mask & ~(1 << NF_INET_FORWARD | + 1 << NF_INET_LOCAL_OUT | + 1 << NF_INET_POST_ROUTING))) { + pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); + return -EINVAL; + } + } + + return 0; +} + static int set_target_v3_checkentry(const struct xt_tgchk_param *par) { @@ -459,20 +482,6 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) } if (info->map_set.index != IPSET_INVALID_ID) { - if (strncmp(par->table, "mangle", 7)) { - pr_info_ratelimited("--map-set only usable from mangle table\n"); - ret = -EINVAL; - goto cleanup_del; - } - if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | - (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && - (par->hook_mask & ~(1 << NF_INET_FORWARD | - 1 << NF_INET_LOCAL_OUT | - 1 << NF_INET_POST_ROUTING))) { - pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); - ret = -EINVAL; - goto cleanup_del; - } index = ip_set_nfnl_get_byindex(par->net, info->map_set.index); if (index == IPSET_INVALID_ID) { @@ -672,6 +681,7 @@ static struct xt_target set_targets[] __read_mostly = { .family = NFPROTO_IPV4, .target = set_target_v3, .targetsize = sizeof(struct xt_set_info_target_v3), + .check_hooks = set_target_v3_check_hooks, .checkentry = set_target_v3_checkentry, .destroy = set_target_v3_destroy, .me = THIS_MODULE @@ -682,6 +692,7 @@ static struct xt_target set_targets[] __read_mostly = { .family = NFPROTO_IPV6, .target = set_target_v3, .targetsize = sizeof(struct xt_set_info_target_v3), + .check_hooks = set_target_v3_check_hooks, .checkentry = set_target_v3_checkentry, .destroy = set_target_v3_destroy, .me = THIS_MODULE From 2f768d638d977eff824f64dcc9639e3fea32da8f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 28 Apr 2026 19:04:07 +0200 Subject: [PATCH 124/972] netfilter: nft_compat: run xt_check_hooks_{match,target}() from .validate Several matches and one target check that the hook is correct from checkentry(), however, the basechain is only available from nft_table_validate(). This patch uses xt_check_hooks_{match,target}() from the nft_compat expression .validate path. This patch sets the table in the nft_ctx struct in nft_table_validate() which is required by this patch. Based on patch from Florian Westphal. Fixes: 0ca743a55991 ("netfilter: nf_tables: add compatibility layer for x_tables") Reported-by: Xiang Mei Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 1 + net/netfilter/nft_compat.c | 45 +++++++++++++++++++++++++++-------- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d20ce5c36d3187..38e33c66c61838 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4205,6 +4205,7 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) struct nft_chain *chain; struct nft_ctx ctx = { .net = net, + .table = (struct nft_table *)table, .family = table->family, }; int err = 0; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index decc725a33c251..0caa9304d2d030 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -261,10 +261,10 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return ret; } - nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); - nft_compat_wait_for_destructors(ctx->net); + nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); + ret = xt_check_target(&par, size, proto, inv); if (ret < 0) { if (ret == -ENOENT) { @@ -353,8 +353,6 @@ static int nft_target_dump(struct sk_buff *skb, static int nft_target_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { - struct xt_target *target = expr->ops->data; - unsigned int hook_mask = 0; int ret; if (ctx->family != NFPROTO_IPV4 && @@ -377,11 +375,21 @@ static int nft_target_validate(const struct nft_ctx *ctx, const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); const struct nf_hook_ops *ops = &basechain->ops; + unsigned int hook_mask = 1 << ops->hooknum; + struct xt_target *target = expr->ops->data; + void *info = nft_expr_priv(expr); + struct xt_tgchk_param par; + union nft_entry e = {}; - hook_mask = 1 << ops->hooknum; if (target->hooks && !(hook_mask & target->hooks)) return -EINVAL; + nft_target_set_tgchk_param(&par, ctx, target, info, &e, 0, false); + + ret = xt_check_hooks_target(&par); + if (ret < 0) + return ret; + ret = nft_compat_chain_validate_dependency(ctx, target->table); if (ret < 0) return ret; @@ -515,10 +523,10 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return ret; } - nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv); - nft_compat_wait_for_destructors(ctx->net); + nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv); + return xt_check_match(&par, size, proto, inv); } @@ -614,8 +622,6 @@ static int nft_match_large_dump(struct sk_buff *skb, static int nft_match_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { - struct xt_match *match = expr->ops->data; - unsigned int hook_mask = 0; int ret; if (ctx->family != NFPROTO_IPV4 && @@ -638,11 +644,30 @@ static int nft_match_validate(const struct nft_ctx *ctx, const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); const struct nf_hook_ops *ops = &basechain->ops; + unsigned int hook_mask = 1 << ops->hooknum; + struct xt_match *match = expr->ops->data; + size_t size = XT_ALIGN(match->matchsize); + struct xt_mtchk_param par; + union nft_entry e = {}; + void *info; - hook_mask = 1 << ops->hooknum; if (match->hooks && !(hook_mask & match->hooks)) return -EINVAL; + if (NFT_EXPR_SIZE(size) > NFT_MATCH_LARGE_THRESH) { + struct nft_xt_match_priv *priv = nft_expr_priv(expr); + + info = priv->info; + } else { + info = nft_expr_priv(expr); + } + + nft_match_set_mtchk_param(&par, ctx, match, info, &e, 0, false); + + ret = xt_check_hooks_match(&par); + if (ret < 0) + return ret; + ret = nft_compat_chain_validate_dependency(ctx, match->table); if (ret < 0) return ret; From 8bedb6c46945752a688d9b0cf2021e0e68b1876c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 28 Apr 2026 19:37:57 +0200 Subject: [PATCH 125/972] netfilter: xt_CT: fix usersize for v1 and v2 revision While resurrecting the conntrack-tool test cases I found following bug: In: iptables -I OUTPUT -t raw -p 13 -j CT --timeout test-generic Out: [0:0] -A OUTPUT -p 13 -j CT --timeout test Data after first four bytes of the timeout policy name is never copied to userspace because its treated as kernel-only. Fixes: ec2318904965 ("xtables: extend matches and targets with .usersize") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_CT.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 498f5871c84a0e..d2aeacf94230f8 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -354,7 +354,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .family = NFPROTO_IPV4, .revision = 1, .targetsize = sizeof(struct xt_ct_target_info_v1), - .usersize = offsetof(struct xt_ct_target_info, ct), + .usersize = offsetof(struct xt_ct_target_info_v1, ct), .checkentry = xt_ct_tg_check_v1, .destroy = xt_ct_tg_destroy_v1, .target = xt_ct_target_v1, @@ -366,7 +366,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .family = NFPROTO_IPV4, .revision = 2, .targetsize = sizeof(struct xt_ct_target_info_v1), - .usersize = offsetof(struct xt_ct_target_info, ct), + .usersize = offsetof(struct xt_ct_target_info_v1, ct), .checkentry = xt_ct_tg_check_v2, .destroy = xt_ct_tg_destroy_v1, .target = xt_ct_target_v1, @@ -398,7 +398,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .family = NFPROTO_IPV6, .revision = 1, .targetsize = sizeof(struct xt_ct_target_info_v1), - .usersize = offsetof(struct xt_ct_target_info, ct), + .usersize = offsetof(struct xt_ct_target_info_v1, ct), .checkentry = xt_ct_tg_check_v1, .destroy = xt_ct_tg_destroy_v1, .target = xt_ct_target_v1, @@ -410,7 +410,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .family = NFPROTO_IPV6, .revision = 2, .targetsize = sizeof(struct xt_ct_target_info_v1), - .usersize = offsetof(struct xt_ct_target_info, ct), + .usersize = offsetof(struct xt_ct_target_info_v1, ct), .checkentry = xt_ct_tg_check_v2, .destroy = xt_ct_tg_destroy_v1, .target = xt_ct_target_v1, From 63bac027860308d1344f761cb47aabb3b30973fd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 29 Apr 2026 08:21:35 +0200 Subject: [PATCH 126/972] netfilter: nf_tables: fix netdev hook allocation memleak with dormant tables sashiko says: could the related code in __nf_tables_abort() leak the struct nft_hook objects when the table is dormant? In __nf_tables_abort(), when rolling back a NEWCHAIN transaction that updates hooks, the code conditionally unregisters and frees the hooks only if the table is not dormant [..] if (!(table->flags & NFT_TABLE_F_DORMANT)) { nft_netdev_unregister_hooks(net, &nft_trans_chain_hooks(trans), true); } ... nft_trans_destroy(trans); Unfortunately netdev family mixes hook registration and allocation. Push table struct down and only check for the flag to unregister. Fixes: 216e7bf7402c ("netfilter: nf_tables: skip netdev hook unregistration if table is dormant") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 38e33c66c61838..87387adbca655f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -407,6 +407,7 @@ static void nft_netdev_unregister_trans_hook(struct net *net, } static void nft_netdev_unregister_hooks(struct net *net, + const struct nft_table *table, struct list_head *hook_list, bool release_netdev) { @@ -414,8 +415,10 @@ static void nft_netdev_unregister_hooks(struct net *net, struct nf_hook_ops *ops; list_for_each_entry_safe(hook, next, hook_list, list) { - list_for_each_entry(ops, &hook->ops_list, list) - nf_unregister_net_hook(net, ops); + if (!(table->flags & NFT_TABLE_F_DORMANT)) { + list_for_each_entry(ops, &hook->ops_list, list) + nf_unregister_net_hook(net, ops); + } if (release_netdev) nft_netdev_hook_unlink_free_rcu(hook); } @@ -452,20 +455,25 @@ static void __nf_tables_unregister_hook(struct net *net, struct nft_base_chain *basechain; const struct nf_hook_ops *ops; - if (table->flags & NFT_TABLE_F_DORMANT || - !nft_is_base_chain(chain)) + if (!nft_is_base_chain(chain)) return; basechain = nft_base_chain(chain); ops = &basechain->ops; + /* must also be called for dormant tables */ + if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) { + nft_netdev_unregister_hooks(net, table, &basechain->hook_list, + release_netdev); + return; + } + + if (table->flags & NFT_TABLE_F_DORMANT) + return; + if (basechain->type->ops_unregister) return basechain->type->ops_unregister(net, ops); - if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) - nft_netdev_unregister_hooks(net, &basechain->hook_list, - release_netdev); - else - nf_unregister_net_hook(net, &basechain->ops); + nf_unregister_net_hook(net, &basechain->ops); } static void nf_tables_unregister_hook(struct net *net, @@ -11282,11 +11290,9 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) { - if (!(table->flags & NFT_TABLE_F_DORMANT)) { - nft_netdev_unregister_hooks(net, - &nft_trans_chain_hooks(trans), - true); - } + nft_netdev_unregister_hooks(net, table, + &nft_trans_chain_hooks(trans), + true); free_percpu(nft_trans_chain_stats(trans)); kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); From e6a650acbd991bba279f2580853aed9a8d166e6f Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 29 Apr 2026 21:18:25 +0200 Subject: [PATCH 127/972] parisc: Fix build failure for 32-bit kernel with PA2.0 instruction set The CONFIG_PA11 option can not be used as a reliable check if we build a 32-bit kernel which needs the 32-bit VDSO. Instead depend on CONFIG_64BIT and CONFIG_COMPAT only. Reported-by: Christoph Biedl Tested-by: Christoph Biedl Signed-off-by: Helge Deller --- arch/parisc/Makefile | 16 +++++++++++----- arch/parisc/kernel/Makefile | 7 +++++-- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/parisc/Makefile b/arch/parisc/Makefile index edab2a94835255..4391783521bd9a 100644 --- a/arch/parisc/Makefile +++ b/arch/parisc/Makefile @@ -174,15 +174,21 @@ ifeq ($(KBUILD_EXTMOD),) # this hack. prepare: vdso_prepare vdso_prepare: prepare0 - $(if $(CONFIG_64BIT),$(Q)$(MAKE) \ - $(build)=arch/parisc/kernel/vdso64 include/generated/vdso64-offsets.h) - $(if $(CONFIG_PA11)$(CONFIG_COMPAT),$(Q)$(MAKE) \ +ifdef CONFIG_64BIT + $(Q)$(MAKE) $(build)=arch/parisc/kernel/vdso64 include/generated/vdso64-offsets.h + $(if $(CONFIG_COMPAT),$(Q)$(MAKE) \ $(build)=arch/parisc/kernel/vdso32 include/generated/vdso32-offsets.h) +else + $(Q)$(MAKE) $(build)=arch/parisc/kernel/vdso32 include/generated/vdso32-offsets.h +endif endif -vdso-install-$(CONFIG_PA11) += arch/parisc/kernel/vdso32/vdso32.so +ifdef CONFIG_64BIT +vdso-install-y += arch/parisc/kernel/vdso64/vdso64.so vdso-install-$(CONFIG_COMPAT) += arch/parisc/kernel/vdso32/vdso32.so -vdso-install-$(CONFIG_64BIT) += arch/parisc/kernel/vdso64/vdso64.so +else +vdso-install-y += arch/parisc/kernel/vdso32/vdso32.so +endif install: KBUILD_IMAGE := vmlinux zinstall: KBUILD_IMAGE := vmlinuz diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile index 2f3441769ac543..49f937c2abbeb9 100644 --- a/arch/parisc/kernel/Makefile +++ b/arch/parisc/kernel/Makefile @@ -46,6 +46,9 @@ obj-$(CONFIG_KEXEC_FILE) += kexec_file.o # vdso obj-y += vdso.o -obj-$(CONFIG_64BIT) += vdso64/ -obj-$(CONFIG_PA11) += vdso32/ +ifdef CONFIG_64BIT +obj-y += vdso64/ obj-$(CONFIG_COMPAT) += vdso32/ +else +obj-y += vdso32/ +endif From 100201d349edd226ca3470c894c92dccc67ee7a8 Mon Sep 17 00:00:00 2001 From: Fabio Porcedda Date: Mon, 27 Apr 2026 11:17:46 +0200 Subject: [PATCH 128/972] USB: serial: option: add Telit Cinterion LE910Cx compositions Add the following Telit Cinterion LE910Cx compositions: 0x1251: RNDIS + tty (AT/NMEA) + tty (AT) + tty (AT) + tty (SAP) T: Bus=01 Lev=01 Prnt=21 Port=06 Cnt=01 Dev#=108 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1251 Rev=03.18 S: Manufacturer=Android S: Product=LE910C1-EU S: SerialNumber=0123456789ABCDEF C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=02 Prot=ff Driver=rndis_host E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8a(I) Atr=03(Int.) MxPS= 10 Ivl=32ms 0x1253: ECM + tty (AT/NMEA) + tty (AT) + tty (AT) + tty (SAP) T: Bus=01 Lev=01 Prnt=21 Port=06 Cnt=01 Dev#=121 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1253 Rev=03.18 S: Manufacturer=Android S: Product=LE910C1-EU S: SerialNumber=0123456789ABCDEF C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=06 Prot=00 Driver=cdc_ether E: Ad=82(I) Atr=03(Int.) MxPS= 16 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=cdc_ether E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=89(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8a(I) Atr=03(Int.) MxPS= 10 Ivl=32ms 0x1254: tty (AT) + tty (AT) T: Bus=01 Lev=01 Prnt=21 Port=06 Cnt=01 Dev#=122 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1254 Rev=03.18 S: Manufacturer=Android S: Product=LE910C1-EU S: SerialNumber=0123456789ABCDEF C: #Ifs= 2 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms 0x1255: tty (AT/NMEA) + tty (AT) + tty (AT) + tty (SAP) T: Bus=01 Lev=01 Prnt=21 Port=06 Cnt=01 Dev#=123 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1bc7 ProdID=1255 Rev=03.18 S: Manufacturer=Android S: Product=LE910C1-EU S: SerialNumber=0123456789ABCDEF C: #Ifs= 4 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=86(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=88(I) Atr=03(Int.) MxPS= 10 Ivl=32ms Cc: stable@vger.kernel.org Signed-off-by: Fabio Porcedda Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index c71461893d20c2..42e4cecd28aca6 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1513,7 +1513,11 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1231, 0xff), /* Telit LE910Cx (RNDIS) */ .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE_AND_INTERFACE_INFO(TELIT_VENDOR_ID, 0x1250, 0xff, 0x00, 0x00) }, /* Telit LE910Cx (rmnet) */ + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1251, 0xff) }, /* Telit LE910Cx (RNDIS) */ { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1252, 0xff) }, /* Telit LE910Cx (MBIM) */ + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1253, 0xff) }, /* Telit LE910Cx (ECM) */ + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1254, 0xff) }, /* Telit LE910Cx */ + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1255, 0xff) }, /* Telit LE910Cx */ { USB_DEVICE(TELIT_VENDOR_ID, 0x1260), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE(TELIT_VENDOR_ID, 0x1261), From 7dd57d7a6350770dfc283287125c409e995200e0 Mon Sep 17 00:00:00 2001 From: Karol Wachowski Date: Thu, 30 Apr 2026 11:56:44 +0200 Subject: [PATCH 129/972] accel/ivpu: Disallow re-exporting imported GEM objects Prevent re-exporting of imported GEM buffers by adding a custom prime_handle_to_fd callback that checks if the object is imported and returns -EOPNOTSUPP if so. Re-exporting imported GEM buffers causes loss of buffer flags settings, leading to incorrect device access and data corruption. Reported-by: Yametsu Fixes: 57557964b582 ("accel/ivpu: Add support for userptr buffer objects") Reviewed-by: Andrzej Kacprowski Signed-off-by: Karol Wachowski Cc: # v6.19+ --- drivers/accel/ivpu/ivpu_drv.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index 2801378e3e1927..3b7b008bccfec7 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -537,6 +537,26 @@ static const struct file_operations ivpu_fops = { #endif }; +static int ivpu_gem_prime_handle_to_fd(struct drm_device *dev, struct drm_file *file_priv, + u32 handle, u32 flags, int *prime_fd) +{ + struct drm_gem_object *obj; + + obj = drm_gem_object_lookup(file_priv, handle); + if (!obj) + return -ENOENT; + + if (drm_gem_is_imported(obj)) { + /* Do not allow re-exporting */ + drm_gem_object_put(obj); + return -EOPNOTSUPP; + } + + drm_gem_object_put(obj); + + return drm_gem_prime_handle_to_fd(dev, file_priv, handle, flags, prime_fd); +} + static const struct drm_driver driver = { .driver_features = DRIVER_GEM | DRIVER_COMPUTE_ACCEL, @@ -545,6 +565,7 @@ static const struct drm_driver driver = { .gem_create_object = ivpu_gem_create_object, .gem_prime_import = ivpu_gem_prime_import, + .prime_handle_to_fd = ivpu_gem_prime_handle_to_fd, .ioctls = ivpu_drm_ioctls, .num_ioctls = ARRAY_SIZE(ivpu_drm_ioctls), From d3b7a868f1aeb6a43de880a3709ef3a0341f2c2a Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 29 Apr 2026 08:20:56 -0500 Subject: [PATCH 130/972] platform/wmi: Fix unchecked min_size in wmidev_invoke_method() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After calling wmidev_evaluate_method(), if the ACPI core does not return an out object, then wmidev_invoke_method() bypasses the min_size check and returns 0. Add a check for min_size if there is not an out object. Fixes: 1aeded2f55f0 ("platform/wmi: Extend wmidev_query_block() to reject undersized data") Closes: https://sashiko.dev/#/patchset/20260406203237.2970-1-W_Armin%40gmx.de Signed-off-by: Kurt Borja Reviewed-by: Armin Wolf Link: https://patch.msgid.link/20260429-invoke-fix-v1-1-ce938eb80cd3@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/wmi/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/platform/wmi/core.c b/drivers/platform/wmi/core.c index 7aa40dab6145ab..5a2ffcbab6af25 100644 --- a/drivers/platform/wmi/core.c +++ b/drivers/platform/wmi/core.c @@ -411,6 +411,9 @@ int wmidev_invoke_method(struct wmi_device *wdev, u8 instance, u32 method_id, obj = aout.pointer; if (!obj) { + if (min_size != 0) + return -ENOMSG; + out->length = 0; out->data = ZERO_SIZE_PTR; From c2d4b76458c9ebf81eae2916970320f1f61ad002 Mon Sep 17 00:00:00 2001 From: Krishna Chomal Date: Wed, 29 Apr 2026 23:39:53 +0530 Subject: [PATCH 131/972] platform/x86: hp-wmi: silence unknown board warning for 8D41 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HP Omen Max 16-ah0xxx, DMI board ID 8D41 is currently marked with victus_s_thermal_params in the victus_s_thermal_profile_boards[] list. This disables thermal profile readback and adds a dmesg warning during driver init for "unknown board". After testing we know that (similar to another HP Omen Max 16 device, board ID 8D87), the embedded controller on this board does not expose thermal profile which means we have to intentionally disable EC readback. Changing its driver_data to omen_v1_no_ec_thermal_params is sufficient to silence the warning. Tested-by: Benjamin Y Signed-off-by: Krishna Chomal Link: https://patch.msgid.link/20260429180953.129885-1-krishna.chomal108@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/hp/hp-wmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/hp/hp-wmi.c b/drivers/platform/x86/hp/hp-wmi.c index d1cc6e7d176ca9..24c151289dd3dd 100644 --- a/drivers/platform/x86/hp/hp-wmi.c +++ b/drivers/platform/x86/hp/hp-wmi.c @@ -243,7 +243,7 @@ static const struct dmi_system_id victus_s_thermal_profile_boards[] __initconst }, { .matches = { DMI_MATCH(DMI_BOARD_NAME, "8D41") }, - .driver_data = (void *)&victus_s_thermal_params, + .driver_data = (void *)&omen_v1_no_ec_thermal_params, }, { .matches = { DMI_MATCH(DMI_BOARD_NAME, "8D87") }, From 863810d4985ad214f70c1623f24384ccc850f2a2 Mon Sep 17 00:00:00 2001 From: Yufei CHENG Date: Mon, 27 Apr 2026 00:50:34 +0800 Subject: [PATCH 132/972] platform/x86: lenovo: wmi-other: Fix uninitialized variable in lwmi_om_hwmon_write() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the flag relax_fan_constraint is set, local variable 'raw' is never assigned, and lwmi_om_hwmon_write() will pass uninitialized value to lwmi_om_fan_get_set() resulting in undefined behavior. This flag allows user to bypass minimum fan RPM divisor rounding, but assignment to 'raw' only happens in the non-relaxed path. Fix by defaulting 'raw' to user provided 'val' in the else branch. Fixes: 51ed34282f63 ("platform/x86: lenovo-wmi-other: Add HWMON for fan reporting/tuning") Reviewed-by: Rong Zhang Signed-off-by: Yufei CHENG Link: https://patch.msgid.link/20260426165034.9073-1-cd345al@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/wmi-other.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/lenovo/wmi-other.c b/drivers/platform/x86/lenovo/wmi-other.c index 6040f45aa2b0d6..6c2febe1a595ca 100644 --- a/drivers/platform/x86/lenovo/wmi-other.c +++ b/drivers/platform/x86/lenovo/wmi-other.c @@ -349,6 +349,8 @@ static int lwmi_om_hwmon_write(struct device *dev, enum hwmon_sensor_types type, */ if (!relax_fan_constraint) raw = val / LWMI_FAN_DIV * LWMI_FAN_DIV; + else + raw = val; err = lwmi_om_fan_get_set(priv, channel, &raw, true); if (err) From 92a8b5e2eff6920bf815cd6a80b088ec3fdf01a3 Mon Sep 17 00:00:00 2001 From: Yuriy Padlyak Date: Thu, 30 Apr 2026 01:09:03 +0300 Subject: [PATCH 133/972] ALSA: hda/realtek: Fix speaker silence after S3 resume on Xiaomi Mi Laptop Pro 15 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Xiaomi Mi Laptop Pro 15 (TM1905, subsystem 1d72:1905) ships with the Realtek ALC256 codec on Intel Comet Lake PCH-LP. After S3 resume the codec sets coefficient register 0x10 to 0x0220 instead of 0x0020 — bit 9 is erroneously set, which silences the internal speaker. Bluetooth and HDMI audio are unaffected because they use different paths. This is the same mechanism fixed for Clevo NJ51CU by commit edca7cc4b0ac ("ALSA: hda/realtek: Fix quirk for Clevo NJ51CU"), but the existing ALC256_FIXUP_MIC_NO_PRESENCE_AND_RESUME also reconfigures pin 0x19 as a front mic, which is wrong for this Xiaomi where pin 0x19 default is 0x411111f0 (disabled). Add a minimal fixup that only clears the stuck coef bit, and add the Xiaomi SSID to the quirk table. Verified by reading coef 0x10 with hda-verb after resume (returns 0x0220), writing 0x0020, and confirming the internal speaker resumes output. With this fixup applied the bit is cleared on every codec init, including post-resume. Signed-off-by: Yuriy Padlyak Cc: Tested-by: Yuriy Padlyak Link: https://patch.msgid.link/20260429220903.14918-1-yuriypadlyak@gmail.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index a9cd03bb73c410..9a5747c2e359b6 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -3390,6 +3390,19 @@ static void alc256_fixup_mic_no_presence_and_resume(struct hda_codec *codec, } } +static void alc256_fixup_xiaomi_pro15_resume(struct hda_codec *codec, + const struct hda_fixup *fix, + int action) +{ + /* + * On the Xiaomi Mi Laptop Pro 15 (TM1905, SSID 1d72:1905) the ALC256 + * codec sets coefficient 0x10 bit 9 to 1 after S3 resume, silencing + * the internal speaker. Bluetooth and HDMI audio are unaffected. + * Clear the bit so the speaker keeps working across suspend cycles. + */ + alc_update_coef_idx(codec, 0x10, 1<<9, 0); +} + static void alc256_decrease_headphone_amp_val(struct hda_codec *codec, const struct hda_fixup *fix, int action) { @@ -4052,6 +4065,7 @@ enum { ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE, ALC233_FIXUP_NO_AUDIO_JACK, ALC256_FIXUP_MIC_NO_PRESENCE_AND_RESUME, + ALC256_FIXUP_XIAOMI_PRO15_RESUME, ALC285_FIXUP_LEGION_Y9000X_SPEAKERS, ALC285_FIXUP_LEGION_Y9000X_AUTOMUTE, ALC287_FIXUP_LEGION_16ACHG6, @@ -6240,6 +6254,10 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC269_FIXUP_HEADSET_MODE_NO_HP_MIC }, + [ALC256_FIXUP_XIAOMI_PRO15_RESUME] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc256_fixup_xiaomi_pro15_resume, + }, [ALC287_FIXUP_LEGION_16ACHG6] = { .type = HDA_FIXUP_FUNC, .v.func = alc287_fixup_legion_16achg6_speakers, @@ -7774,6 +7792,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1d72, 0x1602, "RedmiBook", ALC255_FIXUP_XIAOMI_HEADSET_MIC), SND_PCI_QUIRK(0x1d72, 0x1701, "XiaomiNotebook Pro", ALC298_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1d72, 0x1901, "RedmiBook 14", ALC256_FIXUP_ASUS_HEADSET_MIC), + SND_PCI_QUIRK(0x1d72, 0x1905, "Xiaomi Mi Laptop Pro 15", ALC256_FIXUP_XIAOMI_PRO15_RESUME), SND_PCI_QUIRK(0x1d72, 0x1945, "Redmi G", ALC256_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1d72, 0x1947, "RedmiBook Air", ALC255_FIXUP_XIAOMI_HEADSET_MIC), SND_PCI_QUIRK(0x1e39, 0xca14, "MEDION NM14LNL", ALC233_FIXUP_MEDION_MTL_SPK), From 0bf00859d7a5ab685901c36f29df063b825cfaaa Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 28 Apr 2026 12:25:46 +0200 Subject: [PATCH 134/972] netfilter: nf_socket: skip socket lookup for non-first fragments Both nft_socket and xt_socket relies on L4 headers to perform socket lookup in the slow path. For fragmented packets, while the IP protocol remains constant across all fragments, only the first fragment contains the actual L4 header. As the expression/match could be attached to a chain with a priority lower than -400, it could bypass defragmentation. Add a check for fragmentation in the lookup functions directly so the problem is handled for both nft_socket and xt_socket at the same time. In addition, future users of the functions would not need to care about this. Fixes: 902d6a4c2a4f ("netfilter: nf_defrag: Skip defrag if NOTRACK is set") Fixes: 554ced0a6e29 ("netfilter: nf_tables: add support for native socket matching") Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_socket_ipv4.c | 3 +++ net/ipv6/netfilter/nf_socket_ipv6.c | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index 5080fa5fbf6a02..f9c6755f5ec576 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -94,6 +94,9 @@ struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb, #endif int doff = 0; + if (ntohs(iph->frag_off) & IP_OFFSET) + return NULL; + if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) { struct tcphdr _hdr; struct udphdr *hp; diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index ced8bd44828ef7..893f2aeb471145 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -100,6 +100,7 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, const struct in6_addr *daddr = NULL, *saddr = NULL; struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var; struct sk_buff *data_skb = NULL; + unsigned short fragoff = 0; int doff = 0; int thoff = 0, tproto; #if IS_ENABLED(CONFIG_NF_CONNTRACK) @@ -107,8 +108,8 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, struct nf_conn const *ct; #endif - tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); - if (tproto < 0) { + tproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL); + if (tproto < 0 || fragoff) { pr_debug("unable to find transport header in IPv6 packet, dropping\n"); return NULL; } From 009d203e56dbe8db2589455b9e3644955f30313a Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 28 Apr 2026 12:25:47 +0200 Subject: [PATCH 135/972] netfilter: nf_tables: skip L4 header parsing for non-first fragments The tproxy, osf and exthdr (SCTP) expressions rely on the presence of transport layer headers to perform socket lookups, fingerprint matching, or chunk extraction. For fragmented packets, while the IP protocol remains constant across all fragments, only the first fragment contains the actual L4 header. The expressions could be attached to a chain with a priority lower than -400, bypassing defragmentation. Or could be used in stateless environments where defragmentation is not happening at all. This could result in garbage data being used for the matching. Add a check for pkt->fragoff so only unfragmented packets or the first fragment is processed. Fixes: 133dc203d77d ("netfilter: nft_exthdr: Support SCTP chunks") Fixes: 4ed8eb6570a4 ("netfilter: nf_tables: Add native tproxy support") Fixes: b96af92d6eaf ("netfilter: nf_tables: implement Passive OS fingerprint module in nft_osf") Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_core.c | 2 +- net/netfilter/nft_exthdr.c | 2 +- net/netfilter/nft_osf.c | 2 +- net/netfilter/nft_tproxy.c | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 5ddd5b6e135f1c..8ab186f86dd43a 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -153,7 +153,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, if (priv->base == NFT_PAYLOAD_NETWORK_HEADER) ptr = skb_network_header(skb) + pkt->nhoff; else { - if (!(pkt->flags & NFT_PKTINFO_L4PROTO)) + if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff) return false; ptr = skb->data + nft_thoff(pkt); } diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 0407d6f708ae9e..e6a07c0df2079f 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -376,7 +376,7 @@ static void nft_exthdr_sctp_eval(const struct nft_expr *expr, const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; - if (pkt->tprot != IPPROTO_SCTP) + if (pkt->tprot != IPPROTO_SCTP || pkt->fragoff) goto err; do { diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index c02d5cb52143bc..45fe56da50442b 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -33,7 +33,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, return; } - if (pkt->tprot != IPPROTO_TCP) { + if (pkt->tprot != IPPROTO_TCP || pkt->fragoff) { regs->verdict.code = NFT_BREAK; return; } diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index f2101af8c867f4..89be443734f66f 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -30,8 +30,8 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr, __be16 tport = 0; struct sock *sk; - if (pkt->tprot != IPPROTO_TCP && - pkt->tprot != IPPROTO_UDP) { + if ((pkt->tprot != IPPROTO_TCP && + pkt->tprot != IPPROTO_UDP) || pkt->fragoff) { regs->verdict.code = NFT_BREAK; return; } @@ -97,8 +97,8 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr, memset(&taddr, 0, sizeof(taddr)); - if (pkt->tprot != IPPROTO_TCP && - pkt->tprot != IPPROTO_UDP) { + if ((pkt->tprot != IPPROTO_TCP && + pkt->tprot != IPPROTO_UDP) || pkt->fragoff) { regs->verdict.code = NFT_BREAK; return; } From 952e121c96137c73bd3e59bb20a93ef659376947 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 28 Apr 2026 12:25:48 +0200 Subject: [PATCH 136/972] netfilter: xtables: fix L4 header parsing for non-first fragments Multiple targets and matches relies on L4 header to operate. For fragmented packets, every fragment carries the transport protocol identifier, but only the first fragment contains the L4 header. As the 'raw' table can be configured to run at priority -450 (before defragmentation at -400), the target/match can be reached before reassembly. In this case, non-first fragments have their payload incorrectly parsed as a TCP/UDP header. This would be of course a misconfiguration scenario. In most of the cases this just lead to a unreliable behavior for fragmented traffic. Add a fragment check to ensure target/match only evaluates unfragmented packets or the first fragment in the stream. Fixes: 902d6a4c2a4f ("netfilter: nf_defrag: Skip defrag if NOTRACK is set") Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_TPROXY.c | 11 +++++++++-- net/netfilter/xt_ecn.c | 4 ++++ net/netfilter/xt_hashlimit.c | 4 +++- net/netfilter/xt_osf.c | 3 +++ net/netfilter/xt_tcpmss.c | 4 ++++ 5 files changed, 23 insertions(+), 3 deletions(-) diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index e4bea1d346cf92..5f60e7298a1ea9 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -86,6 +86,9 @@ tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tproxy_target_info *tgi = par->targinfo; + if (par->fragoff) + return NF_DROP; + return tproxy_tg4(xt_net(par), skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value); } @@ -95,6 +98,9 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + if (par->fragoff) + return NF_DROP; + return tproxy_tg4(xt_net(par), skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value); } @@ -106,6 +112,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + unsigned short fragoff = 0; struct udphdr _hdr, *hp; struct sock *sk; const struct in6_addr *laddr; @@ -113,8 +120,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) int thoff = 0; int tproto; - tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); - if (tproto < 0) + tproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL); + if (tproto < 0 || fragoff) return NF_DROP; hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); diff --git a/net/netfilter/xt_ecn.c b/net/netfilter/xt_ecn.c index b96e8203ac5498..a8503f5d26bf41 100644 --- a/net/netfilter/xt_ecn.c +++ b/net/netfilter/xt_ecn.c @@ -30,6 +30,10 @@ static bool match_tcp(const struct sk_buff *skb, struct xt_action_param *par) struct tcphdr _tcph; const struct tcphdr *th; + /* this is fine for IPv6 as ecn_mt_check6() enforces -p tcp */ + if (par->fragoff) + return false; + /* In practice, TCP match does this, so can't fail. But let's * be good citizens. */ diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 3bd127bfc11494..2704b4b60d1e04 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -658,6 +658,8 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, if (!(hinfo->cfg.mode & (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT))) return 0; + if (ntohs(ip_hdr(skb)->frag_off) & IP_OFFSET) + return -1; nexthdr = ip_hdr(skb)->protocol; break; #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) @@ -681,7 +683,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, return 0; nexthdr = ipv6_hdr(skb)->nexthdr; protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); - if ((int)protoff < 0) + if ((int)protoff < 0 || ntohs(frag_off) & IP6_OFFSET) return -1; break; } diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index dc9485854002aa..e8807caede68e6 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -27,6 +27,9 @@ static bool xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) { + if (p->fragoff) + return false; + return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p), xt_out(p), p->matchinfo, xt_net(p), nf_osf_fingers); } diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c index 0d32d4841cb325..b9da8269161d8f 100644 --- a/net/netfilter/xt_tcpmss.c +++ b/net/netfilter/xt_tcpmss.c @@ -32,6 +32,10 @@ tcpmss_mt(const struct sk_buff *skb, struct xt_action_param *par) u8 _opt[15 * 4 - sizeof(_tcph)]; unsigned int i, optlen; + /* this is fine for IPv6 as xt_tcpmss enforces -p tcp */ + if (par->fragoff) + return false; + /* If we don't have the whole header, drop packet. */ th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); if (th == NULL) From ef4f741e8627512cb8c82f59a1fc7aacd854aadf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 30 Apr 2026 16:49:48 +0200 Subject: [PATCH 137/972] netfilter: flowtable: ensure sufficient headroom in xmit path Check for headroom and call skb_expand_head() like in the IP output path to ensure there is sufficient headroom for the mac header when forwarding this packet as suggested by sashiko. Fixes: b5964aac51e0 ("netfilter: flowtable: consolidate xmit path") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index dbd7644fdbebeb..8d5fb7e940a173 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -471,8 +471,17 @@ struct nf_flow_xmit { static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, struct nf_flow_xmit *xmit) { - skb->dev = xmit->outdev; - dev_hard_header(skb, skb->dev, ntohs(skb->protocol), + struct net_device *dev = xmit->outdev; + unsigned int hh_len = LL_RESERVED_SPACE(dev); + + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + skb = skb_expand_head(skb, hh_len); + if (!skb) + return NF_STOLEN; + } + + skb->dev = dev; + dev_hard_header(skb, dev, ntohs(skb->protocol), xmit->dest, xmit->source, skb->len); dev_queue_xmit(skb); From d6cc7c99bf1f73eda7d565d224d791d16239bb41 Mon Sep 17 00:00:00 2001 From: Sanman Pradhan Date: Thu, 16 Apr 2026 21:59:30 +0000 Subject: [PATCH 138/972] hwmon: (ltc2992) Clamp threshold writes to hardware range ltc2992_set_voltage(), ltc2992_set_current(), and ltc2992_set_power() do not validate the user-supplied value before converting it to a register value. This can result in: 1. Negative input values wrapping to large positive register values. For power, the negative long is implicitly cast to u64 in mul_u64_u32_div(), producing an incorrect value. For voltage and current, the negative converted value wraps when passed to ltc2992_write_reg() as a u32. 2. Intermediate arithmetic exceeding the range representable in u64 on 64-bit platforms. In ltc2992_set_voltage(), (u64)val * 1000 can exceed U64_MAX when val is a large positive long. In ltc2992_set_current(), (u64)val * r_sense_uohm can overflow similarly. In ltc2992_set_power(), the computed value may not fit in u64. 3. Register values exceeding the hardware field width. Voltage and current threshold registers are 12-bit (stored left-justified in 16 bits), and power threshold registers are 24-bit. Without clamping, bits above the field width are truncated in ltc2992_write_reg(). Fix by clamping negative values to zero, clamping positive values to the rounded hardware-representable maximum (the value returned by the read path for a full-scale register) to prevent intermediate overflow, and clamping the converted register value to the hardware field width before writing. The existing conversion formula and rounding behavior are preserved. In the power write path, cancel the factor of 1000 from both the numerator (r_sense_uohm * 1000) and the denominator (VADC_UV_LSB * IADC_NANOV_LSB) to also eliminate a u32 overflow of r_sense_uohm * 1000 when r_sense_uohm exceeds about 4.29 ohms. Fixes: b0bd407e94b03 ("hwmon: (ltc2992) Add support") Cc: stable@vger.kernel.org Signed-off-by: Sanman Pradhan Link: https://lore.kernel.org/r/20260416215904.101969-2-sanman.pradhan@hpe.com Signed-off-by: Guenter Roeck --- drivers/hwmon/ltc2992.c | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/drivers/hwmon/ltc2992.c b/drivers/hwmon/ltc2992.c index 1fcd320d616197..1069736196763d 100644 --- a/drivers/hwmon/ltc2992.c +++ b/drivers/hwmon/ltc2992.c @@ -431,10 +431,16 @@ static int ltc2992_get_voltage(struct ltc2992_state *st, u32 reg, u32 scale, lon static int ltc2992_set_voltage(struct ltc2992_state *st, u32 reg, u32 scale, long val) { - val = DIV_ROUND_CLOSEST(val * 1000, scale); - val = val << 4; + u32 reg_val; + long vmax; + + vmax = DIV_ROUND_CLOSEST_ULL(0xFFFULL * scale, 1000); + val = max(val, 0L); + val = min(val, vmax); + reg_val = min(DIV_ROUND_CLOSEST_ULL((u64)val * 1000, scale), + 0xFFFULL) << 4; - return ltc2992_write_reg(st, reg, 2, val); + return ltc2992_write_reg(st, reg, 2, reg_val); } static int ltc2992_read_gpio_alarm(struct ltc2992_state *st, int nr_gpio, u32 attr, long *val) @@ -559,9 +565,15 @@ static int ltc2992_get_current(struct ltc2992_state *st, u32 reg, u32 channel, l static int ltc2992_set_current(struct ltc2992_state *st, u32 reg, u32 channel, long val) { u32 reg_val; + long cmax; - reg_val = DIV_ROUND_CLOSEST(val * st->r_sense_uohm[channel], LTC2992_IADC_NANOV_LSB); - reg_val = reg_val << 4; + cmax = DIV_ROUND_CLOSEST_ULL(0xFFFULL * LTC2992_IADC_NANOV_LSB, + st->r_sense_uohm[channel]); + val = max(val, 0L); + val = min(val, cmax); + reg_val = min(DIV_ROUND_CLOSEST_ULL((u64)val * st->r_sense_uohm[channel], + LTC2992_IADC_NANOV_LSB), + 0xFFFULL) << 4; return ltc2992_write_reg(st, reg, 2, reg_val); } @@ -634,9 +646,18 @@ static int ltc2992_get_power(struct ltc2992_state *st, u32 reg, u32 channel, lon static int ltc2992_set_power(struct ltc2992_state *st, u32 reg, u32 channel, long val) { u32 reg_val; - - reg_val = mul_u64_u32_div(val, st->r_sense_uohm[channel] * 1000, - LTC2992_VADC_UV_LSB * LTC2992_IADC_NANOV_LSB); + u64 pmax, uval; + + uval = max(val, 0L); + pmax = mul_u64_u32_div(0xFFFFFFULL, + LTC2992_VADC_UV_LSB / 1000 * + LTC2992_IADC_NANOV_LSB, + st->r_sense_uohm[channel]); + uval = min(uval, pmax); + reg_val = min(mul_u64_u32_div(uval, st->r_sense_uohm[channel], + LTC2992_VADC_UV_LSB / 1000 * + LTC2992_IADC_NANOV_LSB), + 0xFFFFFFULL); return ltc2992_write_reg(st, reg, 3, reg_val); } From 2da0c1fd01dbd6b22844e8676585153dfc660cbe Mon Sep 17 00:00:00 2001 From: Sanman Pradhan Date: Thu, 16 Apr 2026 21:59:40 +0000 Subject: [PATCH 139/972] hwmon: (ltc2992) Fix u32 overflow in power read path ltc2992_get_power() computes the divisor for mul_u64_u32_div() as r_sense_uohm * 1000. This multiplication overflows u32 when r_sense_uohm exceeds about 4.29 ohms (4294967 micro-ohms), producing a truncated divisor and an incorrect power reading. Cancel the factor of 1000 from both the numerator (VADC_UV_LSB * IADC_NANOV_LSB = 312500000) and the divisor (r_sense_uohm * 1000), giving (VADC_UV_LSB / 1000) * IADC_NANOV_LSB = 312500 as the numerator and plain r_sense_uohm as the divisor. The cancellation is exact because LTC2992_VADC_UV_LSB (25000) is divisible by 1000. This is the read-path counterpart of the write-path fix applied in the preceding patch. Fixes: b0bd407e94b03 ("hwmon: (ltc2992) Add support") Cc: stable@vger.kernel.org Signed-off-by: Sanman Pradhan Link: https://lore.kernel.org/r/20260416215904.101969-3-sanman.pradhan@hpe.com Signed-off-by: Guenter Roeck --- drivers/hwmon/ltc2992.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/ltc2992.c b/drivers/hwmon/ltc2992.c index 1069736196763d..2617c4538af91d 100644 --- a/drivers/hwmon/ltc2992.c +++ b/drivers/hwmon/ltc2992.c @@ -637,8 +637,10 @@ static int ltc2992_get_power(struct ltc2992_state *st, u32 reg, u32 channel, lon if (reg_val < 0) return reg_val; - *val = mul_u64_u32_div(reg_val, LTC2992_VADC_UV_LSB * LTC2992_IADC_NANOV_LSB, - st->r_sense_uohm[channel] * 1000); + *val = mul_u64_u32_div(reg_val, + LTC2992_VADC_UV_LSB / 1000 * + LTC2992_IADC_NANOV_LSB, + st->r_sense_uohm[channel]); return 0; } From 5ed26ffe57ffca054b7a141ff0c1a07bf3a80f6c Mon Sep 17 00:00:00 2001 From: Ninad Naik Date: Sat, 18 Apr 2026 00:44:11 +0530 Subject: [PATCH 140/972] Documentation: hwmon: fix link to ideapad-laptop.c file The ideapad-laptop.c file now exists inside drivers/platform/x86/lenovo/ directory. Updating the GitHub link to the correct path. Signed-off-by: Ninad Naik Link: https://lore.kernel.org/r/20260417191411.713958-1-ninadnaik07@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/yogafan.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/hwmon/yogafan.rst b/Documentation/hwmon/yogafan.rst index c553a381f772ee..68761947a1a838 100644 --- a/Documentation/hwmon/yogafan.rst +++ b/Documentation/hwmon/yogafan.rst @@ -135,4 +135,4 @@ References 4. **Lenovo IdeaPad Laptop Driver:** Reference for DMI-based hardware feature gating in Lenovo laptops. - https://github.com/torvalds/linux/blob/master/drivers/platform/x86/ideapad-laptop.c + https://github.com/torvalds/linux/blob/master/drivers/platform/x86/lenovo/ideapad-laptop.c From 1a1414c675ee1b637bbe3840241555a49c61b123 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Sat, 25 Apr 2026 20:03:19 -0400 Subject: [PATCH 141/972] hwmon: Remove stale CONFIG_SENSORS_SBRMI Makefile reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kconfiglint reports: X001: CONFIG_SENSORS_SBRMI referenced in Makefile but not defined in any Kconfig The SB-RMI hardware monitoring driver was originally introduced in commit 5a0f50d110b3 ("hwmon: Add support for SB-RMI power module") with both a Kconfig entry (CONFIG_SENSORS_SBRMI) and a Makefile line (obj-$(CONFIG_SENSORS_SBRMI) += sbrmi.o) in drivers/hwmon/. Commit e156586764050 ("hwmon/misc: amd-sbi: Move core sbrmi from hwmon to misc") moved the driver to drivers/misc/amd-sbi/ to support additional functionality beyond hardware monitoring. That commit correctly removed the Kconfig entry from drivers/hwmon/Kconfig, moved the source file drivers/hwmon/sbrmi.c to drivers/misc/amd-sbi/sbrmi.c, and created new Kconfig/Makefile entries in drivers/misc/amd-sbi/ with a renamed symbol (CONFIG_AMD_SBRMI_I2C). However, the Makefile line in drivers/hwmon/Makefile was not removed in that commit. The orphaned line references a CONFIG symbol that no longer exists and a source file that is no longer present, so it has no effect on the build — but it is dead code that should be cleaned up. Remove the stale Makefile reference. Assisted-by: Claude:claude-opus-4-6 kconfiglint Signed-off-by: Sasha Levin Link: https://lore.kernel.org/r/20260426000319.55908-1-sashal@kernel.org Signed-off-by: Guenter Roeck --- drivers/hwmon/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile index 4788996aa1374d..982ee2c6f9deb6 100644 --- a/drivers/hwmon/Makefile +++ b/drivers/hwmon/Makefile @@ -201,7 +201,6 @@ obj-$(CONFIG_SENSORS_PWM_FAN) += pwm-fan.o obj-$(CONFIG_SENSORS_QNAP_MCU_HWMON) += qnap-mcu-hwmon.o obj-$(CONFIG_SENSORS_RASPBERRYPI_HWMON) += raspberrypi-hwmon.o obj-$(CONFIG_SENSORS_SBTSI) += sbtsi_temp.o -obj-$(CONFIG_SENSORS_SBRMI) += sbrmi.o obj-$(CONFIG_SENSORS_SCH56XX_COMMON)+= sch56xx-common.o obj-$(CONFIG_SENSORS_SCH5627) += sch5627.o obj-$(CONFIG_SENSORS_SCH5636) += sch5636.o From 174606451fbb17db506ebaacdd5e203e57773d5f Mon Sep 17 00:00:00 2001 From: Myeonghun Pak Date: Fri, 24 Apr 2026 22:50:51 +0900 Subject: [PATCH 142/972] hwmon: (corsair-psu) Close HID device on probe errors corsairpsu_probe() opens the HID device before sending the device init and firmware-info commands. If either command fails, the error path jumps directly to fail_and_stop and skips hid_hw_close(). Use the existing fail_and_close label for those post-open failures so the open count and low-level close callback are balanced before hid_hw_stop(). Fixes: d115b51e0e56 ("hwmon: add Corsair PSU HID controller driver") Cc: stable@vger.kernel.org Signed-off-by: Myeonghun Pak Reviewed-by: Wilken Gottwalt Link: https://lore.kernel.org/r/20260424135107.13720-1-mhun512@gmail.com Signed-off-by: Guenter Roeck --- drivers/hwmon/corsair-psu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/corsair-psu.c b/drivers/hwmon/corsair-psu.c index dddbd2463f8da7..76f3e1da68d09e 100644 --- a/drivers/hwmon/corsair-psu.c +++ b/drivers/hwmon/corsair-psu.c @@ -796,13 +796,13 @@ static int corsairpsu_probe(struct hid_device *hdev, const struct hid_device_id ret = corsairpsu_init(priv); if (ret < 0) { dev_err(&hdev->dev, "unable to initialize device (%d)\n", ret); - goto fail_and_stop; + goto fail_and_close; } ret = corsairpsu_fwinfo(priv); if (ret < 0) { dev_err(&hdev->dev, "unable to query firmware (%d)\n", ret); - goto fail_and_stop; + goto fail_and_close; } corsairpsu_get_criticals(priv); From d272b8d2dd132de8579e3f79a77bc6ae58214a93 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Thu, 30 Apr 2026 12:53:50 +0800 Subject: [PATCH 143/972] riscv: cpufeature: Drop this_hwcap clear in T-Head vector workaround The variable this_hwcap is initialized to 0 for each loop, it is not necessary to do the bit clearance since this_hwcap is still 0 at this point, clearing the source_isa is enough here. Signed-off-by: Hui Wang Link: https://patch.msgid.link/20260430045350.22213-1-hui.wang@canonical.com Signed-off-by: Paul Walmsley --- arch/riscv/kernel/cpufeature.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 1734f9a4c2fd70..3dc4c0d31550fe 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -896,10 +896,8 @@ static void __init riscv_fill_hwcap_from_isa_string(unsigned long *isa2hwcap) * CPU cores with the ratified spec will contain non-zero * marchid. */ - if (acpi_disabled && boot_vendorid == THEAD_VENDOR_ID && boot_archid == 0x0) { - this_hwcap &= ~isa2hwcap[RISCV_ISA_EXT_v]; + if (acpi_disabled && boot_vendorid == THEAD_VENDOR_ID && boot_archid == 0x0) clear_bit(RISCV_ISA_EXT_v, source_isa); - } riscv_resolve_isa(source_isa, isainfo->isa, &this_hwcap, isa2hwcap); From 18ed60e33e6c77d62409c1343dec1c61bae3d2e7 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 29 Apr 2026 16:21:41 +0800 Subject: [PATCH 144/972] net: mctp: test: use a zeroed struct sockaddr_mctp Invalid sockaddr padding will cause bind() to fail; ensure we have a zeroed address in the testcase. Fixes: 0d8647bc74cb ("net: mctp: don't require a route for null-EID ingress") Signed-off-by: Jeremy Kerr Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260429-dev-mctp-test-fixes-v1-1-1127b7425809@codeconstruct.com.au Signed-off-by: Jakub Kicinski --- net/mctp/test/route-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c index e1033643fab0e2..e4b230ef609969 100644 --- a/net/mctp/test/route-test.c +++ b/net/mctp/test/route-test.c @@ -920,9 +920,9 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test) static void mctp_test_route_input_null_eid(struct kunit *test) { struct mctp_hdr hdr = RX_HDR(1, 10, 0, FL_S | FL_E | FL_TO); + struct sockaddr_mctp addr = { 0 }; struct sk_buff *skb_pkt, *skb_sk; struct mctp_test_dev *dev; - struct sockaddr_mctp addr; struct socket *sock; u8 type = 0; int rc; From 76872971064133474d9b891da05db8f7586fcc11 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 29 Apr 2026 16:21:42 +0800 Subject: [PATCH 145/972] net: mctp: test: Use dev_direct_xmit for TX to our test device In our test cases, we typically feed a packet sequence into the routing code, then inspect the device's TXed skbs to assert specific behaviours. Using dev_queue_xmit() for our TX path introduces a fair bit of complexity between the test packet sequence and the test device's ndo_start_xmit callback; which may mean that the skbs have not hit the device at the point we're inspecting the TXed skb list. Use dev_direct_xmit instead, as we want a direct a path as possible here, and the test dev does not need any queueing, scheduling or flow control. Fixes: 6ab578739a4c ("net: mctp: test: move TX packetqueue from dst to dev") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202604281320.525eee17-lkp@intel.com Signed-off-by: Jeremy Kerr Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260429-dev-mctp-test-fixes-v1-2-1127b7425809@codeconstruct.com.au Signed-off-by: Jakub Kicinski --- net/mctp/test/utils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mctp/test/utils.c b/net/mctp/test/utils.c index c3987d5ade7adc..6eef8d485c25e2 100644 --- a/net/mctp/test/utils.c +++ b/net/mctp/test/utils.c @@ -116,7 +116,7 @@ void mctp_test_destroy_dev(struct mctp_test_dev *dev) static int mctp_test_dst_output(struct mctp_dst *dst, struct sk_buff *skb) { skb->dev = dst->dev->dev; - dev_queue_xmit(skb); + dev_direct_xmit(skb, 0); return 0; } From ba6b328588f7cb7cf4aca33bae565c2914d74786 Mon Sep 17 00:00:00 2001 From: David Gow Date: Sat, 25 Apr 2026 11:41:23 +0800 Subject: [PATCH 146/972] rust: arch: um: Fix building 32-bit UML with GCC 32-bit UML builds can be configured either by setting CONFIG_64BIT=n or with SUBARCH=i386. Both work with Rust-for-Linux when clang is the compiler, but when SUBARCH=i386, we don't set a bindgen target correctly if gcc is the compiler. Add the appropriate bindgen target configuration for i386, as is done in Makefile.clang. [ For reference, the errors look like: BINDGEN rust/bindings/bindings_generated.rs error: unsupported option '-mno-sse' for target '' ... error: unknown target triple 'unknown' panicked at .../bindgen-0.72.1/ir/context.rs:562:15: libclang error; possible causes include: ... - Miguel ] Fixes: ab0f4cedc355 ("arch: um: rust: Add i386 support for Rust") Signed-off-by: David Gow Link: https://patch.msgid.link/20260425034125.53866-1-david@davidgow.net [ Added space in title. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rust/Makefile b/rust/Makefile index b361bfedfdf07a..b9e9f512cec314 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -403,6 +403,8 @@ BINDGEN_TARGET_x86 := x86_64-linux-gnu BINDGEN_TARGET_arm64 := aarch64-linux-gnu BINDGEN_TARGET_arm := arm-linux-gnueabi BINDGEN_TARGET_loongarch := loongarch64-linux-gnusf +# This is only for i386 UM builds, which need the 32-bit target not -m32 +BINDGEN_TARGET_i386 := i386-linux-gnu BINDGEN_TARGET_um := $(BINDGEN_TARGET_$(SUBARCH)) BINDGEN_TARGET := $(BINDGEN_TARGET_$(SRCARCH)) From 83ac2870310b694775ab7e8f0244fdd94fc21926 Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Mon, 27 Apr 2026 16:43:00 +0100 Subject: [PATCH 147/972] rust: pin-init: internal: move alignment check to `make_field_check` Instead of having the reference creation serving dual-purpose as both for let bindings and alignment check, detangle them so that the alignment check is done explicitly in `make_field_check`. This is more robust against refactors that may change the way let bindings are created. Cc: stable@vger.kernel.org Reviewed-by: Alice Ryhl Signed-off-by: Gary Guo Link: https://patch.msgid.link/20260427-pin-init-fix-v3-1-496a699674dd@garyguo.net [ Reworded for typo. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/pin-init/internal/src/init.rs | 78 ++++++++++++++---------------- 1 file changed, 37 insertions(+), 41 deletions(-) diff --git a/rust/pin-init/internal/src/init.rs b/rust/pin-init/internal/src/init.rs index daa3f1c6466eff..0a6600e8156cdb 100644 --- a/rust/pin-init/internal/src/init.rs +++ b/rust/pin-init/internal/src/init.rs @@ -249,10 +249,6 @@ fn init_fields( }); // Again span for better diagnostics let write = quote_spanned!(ident.span()=> ::core::ptr::write); - // NOTE: the field accessor ensures that the initialized field is properly aligned. - // Unaligned fields will cause the compiler to emit E0793. We do not support - // unaligned fields since `Init::__init` requires an aligned pointer; the call to - // `ptr::write` below has the same requirement. let accessor = if pinned { let project_ident = format_ident!("__project_{ident}"); quote! { @@ -367,49 +363,49 @@ fn init_fields( } } -/// Generate the check for ensuring that every field has been initialized. +/// Generate the check for ensuring that every field has been initialized and aligned. fn make_field_check( fields: &Punctuated, init_kind: InitKind, path: &Path, ) -> TokenStream { - let field_attrs = fields + let field_attrs: Vec<_> = fields .iter() - .filter_map(|f| f.kind.ident().map(|_| &f.attrs)); - let field_name = fields.iter().filter_map(|f| f.kind.ident()); - match init_kind { - InitKind::Normal => quote! { - // We use unreachable code to ensure that all fields have been mentioned exactly once, - // this struct initializer will still be type-checked and complain with a very natural - // error message if a field is forgotten/mentioned more than once. - #[allow(unreachable_code, clippy::diverging_sub_expression)] - // SAFETY: this code is never executed. - let _ = || unsafe { - ::core::ptr::write(slot, #path { - #( - #(#field_attrs)* - #field_name: ::core::panic!(), - )* - }) - }; - }, - InitKind::Zeroing => quote! { - // We use unreachable code to ensure that all fields have been mentioned at most once. - // Since the user specified `..Zeroable::zeroed()` at the end, all missing fields will - // be zeroed. This struct initializer will still be type-checked and complain with a - // very natural error message if a field is mentioned more than once, or doesn't exist. - #[allow(unreachable_code, clippy::diverging_sub_expression, unused_assignments)] - // SAFETY: this code is never executed. - let _ = || unsafe { - ::core::ptr::write(slot, #path { - #( - #(#field_attrs)* - #field_name: ::core::panic!(), - )* - ..::core::mem::zeroed() - }) - }; - }, + .filter_map(|f| f.kind.ident().map(|_| &f.attrs)) + .collect(); + let field_name: Vec<_> = fields.iter().filter_map(|f| f.kind.ident()).collect(); + let zeroing_trailer = match init_kind { + InitKind::Normal => None, + InitKind::Zeroing => Some(quote! { + ..::core::mem::zeroed() + }), + }; + quote! { + #[allow(unreachable_code, clippy::diverging_sub_expression)] + // We use unreachable code to perform field checks. They're still checked by the compiler. + // SAFETY: this code is never executed. + let _ = || unsafe { + // Create references to ensure that the initialized field is properly aligned. + // Unaligned fields will cause the compiler to emit E0793. We do not support + // unaligned fields since `Init::__init` requires an aligned pointer; the call to + // `ptr::write` for value-initialization case has the same requirement. + #( + #(#field_attrs)* + let _ = &(*slot).#field_name; + )* + + // If the zeroing trailer is not present, this checks that all fields have been + // mentioned exactly once. If the zeroing trailer is present, all missing fields will be + // zeroed, so this checks that all fields have been mentioned at most once. The use of + // struct initializer will still generate very natural error messages for any misuse. + ::core::ptr::write(slot, #path { + #( + #(#field_attrs)* + #field_name: ::core::panic!(), + )* + #zeroing_trailer + }) + }; } } From 68bf102226cf2199dc609b67c1e847cad4de4b57 Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Mon, 27 Apr 2026 16:43:01 +0100 Subject: [PATCH 148/972] rust: pin-init: fix incorrect accessor reference lifetime When a field has been initialized, `init!`/`pin_init!` create a reference or pinned reference to the field so it can be accessed later during the initialization of other fields. However, the reference it created is incorrectly `&'static` rather than just the scope of the initializer. This means that you can do init!(Foo { a: 1, _: { let b: &'static u32 = a; } }) which is unsound. This is caused by `&mut (*#slot).#ident`, which actually allows arbitrary lifetime, so this is effectively `'static`. Somewhat ironically, the safety justification of creating the accessor is.. "SAFETY: TODO". Fix it by adding `let_binding` method on `DropGuard` to shorten lifetime. This results in exactly what we want for these accessors. The safety and invariant comments of `DropGuard` have been reworked; instead of reasoning about what caller can do with the guard, express it in a way that the ownership is transferred to the guard and `forget` takes it back, so the unsafe operations within the `DropGuard` can be more easily justified. Fixes: 42415d163e5d ("rust: pin-init: add references to previously initialized fields") Cc: stable@vger.kernel.org Signed-off-by: Gary Guo Link: https://patch.msgid.link/20260427-pin-init-fix-v3-2-496a699674dd@garyguo.net [ Reworded for missing word. - Miguel ] Signed-off-by: Miguel Ojeda --- rust/pin-init/internal/src/init.rs | 106 +++++++++++++---------------- rust/pin-init/src/__internal.rs | 28 +++++--- 2 files changed, 66 insertions(+), 68 deletions(-) diff --git a/rust/pin-init/internal/src/init.rs b/rust/pin-init/internal/src/init.rs index 0a6600e8156cdb..487ee0013fafee 100644 --- a/rust/pin-init/internal/src/init.rs +++ b/rust/pin-init/internal/src/init.rs @@ -249,18 +249,6 @@ fn init_fields( }); // Again span for better diagnostics let write = quote_spanned!(ident.span()=> ::core::ptr::write); - let accessor = if pinned { - let project_ident = format_ident!("__project_{ident}"); - quote! { - // SAFETY: TODO - unsafe { #data.#project_ident(&mut (*#slot).#ident) } - } - } else { - quote! { - // SAFETY: TODO - unsafe { &mut (*#slot).#ident } - } - }; quote! { #(#attrs)* { @@ -268,51 +256,31 @@ fn init_fields( // SAFETY: TODO unsafe { #write(&raw mut (*#slot).#ident, #value_ident) }; } - #(#cfgs)* - #[allow(unused_variables)] - let #ident = #accessor; } } InitializerKind::Init { ident, value, .. } => { // Again span for better diagnostics let init = format_ident!("init", span = value.span()); - // NOTE: the field accessor ensures that the initialized field is properly aligned. - // Unaligned fields will cause the compiler to emit E0793. We do not support - // unaligned fields since `Init::__init` requires an aligned pointer; the call to - // `ptr::write` below has the same requirement. - let (value_init, accessor) = if pinned { - let project_ident = format_ident!("__project_{ident}"); - ( - quote! { - // SAFETY: - // - `slot` is valid, because we are inside of an initializer closure, we - // return when an error/panic occurs. - // - We also use `#data` to require the correct trait (`Init` or `PinInit`) - // for `#ident`. - unsafe { #data.#ident(&raw mut (*#slot).#ident, #init)? }; - }, - quote! { - // SAFETY: TODO - unsafe { #data.#project_ident(&mut (*#slot).#ident) } - }, - ) + let value_init = if pinned { + quote! { + // SAFETY: + // - `slot` is valid, because we are inside of an initializer closure, we + // return when an error/panic occurs. + // - We also use `#data` to require the correct trait (`Init` or `PinInit`) + // for `#ident`. + unsafe { #data.#ident(&raw mut (*#slot).#ident, #init)? }; + } } else { - ( - quote! { - // SAFETY: `slot` is valid, because we are inside of an initializer - // closure, we return when an error/panic occurs. - unsafe { - ::pin_init::Init::__init( - #init, - &raw mut (*#slot).#ident, - )? - }; - }, - quote! { - // SAFETY: TODO - unsafe { &mut (*#slot).#ident } - }, - ) + quote! { + // SAFETY: `slot` is valid, because we are inside of an initializer + // closure, we return when an error/panic occurs. + unsafe { + ::pin_init::Init::__init( + #init, + &raw mut (*#slot).#ident, + )? + }; + } }; quote! { #(#attrs)* @@ -320,9 +288,6 @@ fn init_fields( let #init = #value; #value_init } - #(#cfgs)* - #[allow(unused_variables)] - let #ident = #accessor; } } InitializerKind::Code { block: value, .. } => quote! { @@ -335,18 +300,41 @@ fn init_fields( if let Some(ident) = kind.ident() { // `mixed_site` ensures that the guard is not accessible to the user-controlled code. let guard = format_ident!("__{ident}_guard", span = Span::mixed_site()); + + // NOTE: The reference is derived from the guard so that it only lives as long as the + // guard does and cannot escape the scope. If it's created via `&mut (*#slot).#ident` + // like the unaligned field guard, it will become effectively `'static`. + let accessor = if pinned { + let project_ident = format_ident!("__project_{ident}"); + quote! { + // SAFETY: the initialization is pinned. + unsafe { #data.#project_ident(#guard.let_binding()) } + } + } else { + quote! { + #guard.let_binding() + } + }; + res.extend(quote! { #(#cfgs)* - // Create the drop guard: + // Create the drop guard. // - // We rely on macro hygiene to make it impossible for users to access this local - // variable. - // SAFETY: We forget the guard later when initialization has succeeded. - let #guard = unsafe { + // SAFETY: + // - `&raw mut (*slot).#ident` is valid. + // - `make_field_check` checks that `&raw mut (*slot).#ident` is properly aligned. + // - `(*slot).#ident` has been initialized above. + // - We only need the ownership to the pointee back when initialization has + // succeeded, where we `forget` the guard. + let mut #guard = unsafe { ::pin_init::__internal::DropGuard::new( &raw mut (*slot).#ident ) }; + + #(#cfgs)* + #[allow(unused_variables)] + let #ident = #accessor; }); guards.push(guard); guard_attrs.push(cfgs); diff --git a/rust/pin-init/src/__internal.rs b/rust/pin-init/src/__internal.rs index 90adbdc1893bbf..5720a621aed74b 100644 --- a/rust/pin-init/src/__internal.rs +++ b/rust/pin-init/src/__internal.rs @@ -238,32 +238,42 @@ fn stack_init_reuse() { /// When a value of this type is dropped, it drops a `T`. /// /// Can be forgotten to prevent the drop. +/// +/// # Invariants +/// +/// - `ptr` is valid and properly aligned. +/// - `*ptr` is initialized and owned by this guard. pub struct DropGuard { ptr: *mut T, } impl DropGuard { - /// Creates a new [`DropGuard`]. It will [`ptr::drop_in_place`] `ptr` when it gets dropped. + /// Creates a drop guard and transfer the ownership of the pointer content. /// - /// # Safety + /// The ownership is only relinguished if the guard is forgotten via [`core::mem::forget`]. /// - /// `ptr` must be a valid pointer. + /// # Safety /// - /// It is the callers responsibility that `self` will only get dropped if the pointee of `ptr`: - /// - has not been dropped, - /// - is not accessible by any other means, - /// - will not be dropped by any other means. + /// - `ptr` is valid and properly aligned. + /// - `*ptr` is initialized, and the ownership is transferred to this guard. #[inline] pub unsafe fn new(ptr: *mut T) -> Self { + // INVARIANT: By safety requirement. Self { ptr } } + + /// Create a let binding for accessor use. + #[inline] + pub fn let_binding(&mut self) -> &mut T { + // SAFETY: Per type invariant. + unsafe { &mut *self.ptr } + } } impl Drop for DropGuard { #[inline] fn drop(&mut self) { - // SAFETY: A `DropGuard` can only be constructed using the unsafe `new` function - // ensuring that this operation is safe. + // SAFETY: `self.ptr` is valid, properly aligned and `*self.ptr` is owned by this guard. unsafe { ptr::drop_in_place(self.ptr) } } } From 838d852da8503372f3a1779bfbd1ccb93153ab4e Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sun, 26 Apr 2026 16:42:00 +0200 Subject: [PATCH 149/972] rust: allow `clippy::collapsible_match` globally The `clippy::collapsible_match` lint [1] can make code harder to read in certain cases [2], e.g. CLIPPY P rust/libmacros.so - due to command line change warning: this `if` can be collapsed into the outer `match` --> rust/pin-init/internal/src/helpers.rs:91:17 | 91 | / if nesting == 1 { 92 | | impl_generics.push(tt.clone()); 93 | | impl_generics.push(tt); 94 | | skip_until_comma = false; 95 | | } | |_________________^ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#collapsible_match = note: `-W clippy::collapsible-match` implied by `-W clippy::all` = help: to override `-W clippy::all` add `#[allow(clippy::collapsible_match)]` help: collapse nested if block | 90 ~ TokenTree::Punct(p) if skip_until_comma && p.as_char() == ',' 91 ~ && nesting == 1 => { 92 | impl_generics.push(tt.clone()); 93 | impl_generics.push(tt); 94 | skip_until_comma = false; 95 ~ } | The lint does not have much upside -- when the suggestion may be a good one, it would still read fine when nested anyway. And it is the kind of lint that may easily bias people to just apply the suggestion instead of allowing it. [ In addition, as Gary points out [3], the suggestion is also wrong [4] and in the process of being fixed [5], possibly for Rust 1.97.0: Link: https://lore.kernel.org/rust-for-linux/DI3YV94TH9I3.1SOHW51552497@garyguo.net/ [3] Link: https://github.com/rust-lang/rust-clippy/issues/16875 [4] Link: https://github.com/rust-lang/rust-clippy/pull/16878 [5] - Miguel ] Thus just let developers decide on their own. Cc: stable@vger.kernel.org # Needed in 6.12.y and later (Rust is pinned in older LTSs). Link: https://rust-lang.github.io/rust-clippy/master/index.html#collapsible_match [1] Link: https://lore.kernel.org/rust-for-linux/CANiq72nWYJna_hdFxjQCQZK6yJBrr1Mb86iKavivV0U0BgufeA@mail.gmail.com/ [2] Reviewed-by: Gary Guo Link: https://patch.msgid.link/20260426144201.227108-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index e27c91ea56fcf8..621d84aa4700d8 100644 --- a/Makefile +++ b/Makefile @@ -486,6 +486,7 @@ export rust_common_flags := --edition=2021 \ -Wclippy::as_ptr_cast_mut \ -Wclippy::as_underscore \ -Wclippy::cast_lossless \ + -Aclippy::collapsible_match \ -Wclippy::ignored_unit_patterns \ -Aclippy::incompatible_msrv \ -Wclippy::mut_mut \ From 2adc8664018c1cc595c7c0c98474a33c7fe32a85 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sun, 26 Apr 2026 16:42:01 +0200 Subject: [PATCH 150/972] rust: allow `clippy::collapsible_if` globally Similar to `clippy::collapsible_match` (globally allowed in the previous commit), the `clippy::collapsible_if` lint [1] can make code harder to read in certain cases. Thus just let developers decide on their own. In addition, remove the existing `expect` we had. Cc: stable@vger.kernel.org # Needed in 6.12.y and later (Rust is pinned in older LTSs). Suggested-by: Gary Guo Link: https://lore.kernel.org/rust-for-linux/DGROP5CHU1QZ.1OKJRAUZXE9WC@garyguo.net/ Link: https://rust-lang.github.io/rust-clippy/master/index.html#collapsible_if [1] Reviewed-by: Gary Guo Link: https://patch.msgid.link/20260426144201.227108-2-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- Makefile | 1 + drivers/android/binder/range_alloc/array.rs | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 621d84aa4700d8..28f4ae4524413a 100644 --- a/Makefile +++ b/Makefile @@ -486,6 +486,7 @@ export rust_common_flags := --edition=2021 \ -Wclippy::as_ptr_cast_mut \ -Wclippy::as_underscore \ -Wclippy::cast_lossless \ + -Aclippy::collapsible_if \ -Aclippy::collapsible_match \ -Wclippy::ignored_unit_patterns \ -Aclippy::incompatible_msrv \ diff --git a/drivers/android/binder/range_alloc/array.rs b/drivers/android/binder/range_alloc/array.rs index ada1d1b4302e53..081d19b09d4bb4 100644 --- a/drivers/android/binder/range_alloc/array.rs +++ b/drivers/android/binder/range_alloc/array.rs @@ -204,7 +204,6 @@ impl ArrayRangeAllocator { // caller will mark them as unused, which means that they can be freed if the system comes // under memory pressure. let mut freed_range = FreedRange::interior_pages(offset, size); - #[expect(clippy::collapsible_if)] // reads better like this if offset % PAGE_SIZE != 0 { if i == 0 || self.ranges[i - 1].endpoint() <= (offset & PAGE_MASK) { freed_range.start_page_idx -= 1; From dff205550e714f1cb65f27409586e2612905fa88 Mon Sep 17 00:00:00 2001 From: Gui-Dong Han Date: Thu, 16 Apr 2026 21:57:03 +0800 Subject: [PATCH 151/972] hwmon: (lm63) Add locking to avoid TOCTOU The functions show_fan(), show_pwm1(), show_temp11(), temp2_crit_hyst_show(), and show_lut_temp_hyst() access shared cached data without holding the update lock. This can cause TOCTOU races if the cached values change between the checks and the later calculations. Those cached values are updated in lm63_update_device(). In the general case, the affected functions combine multiple cached values without locking and can therefore observe a mixed old/new snapshot. In addition, show_fan() reads data->fan[nr] locklessly while lm63_update_device() updates data->fan[0] in two steps, which can expose an intermediate torn value and potentially trigger a divide-by-zero error. This means that converting the macro to a function is not sufficient to fix show_fan(). Hold the update lock across the whole read and calculation sequence so that the values remain stable. Check the other functions in the driver as well. Keep them unchanged because they either do not access shared cached values multiple times or already do so under lock. Link: https://lore.kernel.org/linux-hwmon/CALbr=LYJ_ehtp53HXEVkSpYoub+XYSTU8Rg=o1xxMJ8=5z8B-g@mail.gmail.com/ Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Fixes: e872c91e726e ("hwmon: (lm63) Add support for unsigned upper temperature limits") Fixes: d216f6809eb6 ("hwmon: (lm63) Expose automatic fan speed control lookup table") Signed-off-by: Gui-Dong Han Link: https://lore.kernel.org/r/20260416135703.53262-1-hanguidong02@gmail.com [groeck: Use lm63_update_device() to get driver data in temp2_crit_hyst_store] Signed-off-by: Guenter Roeck --- drivers/hwmon/lm63.c | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/drivers/hwmon/lm63.c b/drivers/hwmon/lm63.c index 035176a98ce9c6..30500b4d222129 100644 --- a/drivers/hwmon/lm63.c +++ b/drivers/hwmon/lm63.c @@ -333,7 +333,13 @@ static ssize_t show_fan(struct device *dev, struct device_attribute *devattr, { struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); struct lm63_data *data = lm63_update_device(dev); - return sprintf(buf, "%d\n", FAN_FROM_REG(data->fan[attr->index])); + int fan; + + mutex_lock(&data->update_lock); + fan = FAN_FROM_REG(data->fan[attr->index]); + mutex_unlock(&data->update_lock); + + return sprintf(buf, "%d\n", fan); } static ssize_t set_fan(struct device *dev, struct device_attribute *dummy, @@ -366,12 +372,14 @@ static ssize_t show_pwm1(struct device *dev, struct device_attribute *devattr, int nr = attr->index; int pwm; + mutex_lock(&data->update_lock); if (data->pwm_highres) pwm = data->pwm1[nr]; else pwm = data->pwm1[nr] >= 2 * data->pwm1_freq ? 255 : (data->pwm1[nr] * 255 + data->pwm1_freq) / (2 * data->pwm1_freq); + mutex_unlock(&data->update_lock); return sprintf(buf, "%d\n", pwm); } @@ -529,6 +537,7 @@ static ssize_t show_temp11(struct device *dev, struct device_attribute *devattr, int nr = attr->index; int temp; + mutex_lock(&data->update_lock); if (!nr) { /* * Use unsigned temperature unless its value is zero. @@ -544,7 +553,10 @@ static ssize_t show_temp11(struct device *dev, struct device_attribute *devattr, else temp = TEMP11_FROM_REG(data->temp11[nr]); } - return sprintf(buf, "%d\n", temp + data->temp2_offset); + temp += data->temp2_offset; + mutex_unlock(&data->update_lock); + + return sprintf(buf, "%d\n", temp); } static ssize_t set_temp11(struct device *dev, struct device_attribute *devattr, @@ -592,9 +604,14 @@ static ssize_t temp2_crit_hyst_show(struct device *dev, struct device_attribute *dummy, char *buf) { struct lm63_data *data = lm63_update_device(dev); - return sprintf(buf, "%d\n", temp8_from_reg(data, 2) - + data->temp2_offset - - TEMP8_FROM_REG(data->temp2_crit_hyst)); + int temp; + + mutex_lock(&data->update_lock); + temp = temp8_from_reg(data, 2) + data->temp2_offset + - TEMP8_FROM_REG(data->temp2_crit_hyst); + mutex_unlock(&data->update_lock); + + return sprintf(buf, "%d\n", temp); } static ssize_t show_lut_temp_hyst(struct device *dev, @@ -602,10 +619,14 @@ static ssize_t show_lut_temp_hyst(struct device *dev, { struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); struct lm63_data *data = lm63_update_device(dev); + int temp; - return sprintf(buf, "%d\n", lut_temp_from_reg(data, attr->index) - + data->temp2_offset - - TEMP8_FROM_REG(data->lut_temp_hyst)); + mutex_lock(&data->update_lock); + temp = lut_temp_from_reg(data, attr->index) + data->temp2_offset + - TEMP8_FROM_REG(data->lut_temp_hyst); + mutex_unlock(&data->update_lock); + + return sprintf(buf, "%d\n", temp); } /* @@ -616,7 +637,7 @@ static ssize_t temp2_crit_hyst_store(struct device *dev, struct device_attribute *dummy, const char *buf, size_t count) { - struct lm63_data *data = dev_get_drvdata(dev); + struct lm63_data *data = lm63_update_device(dev); struct i2c_client *client = data->client; long val; int err; From c9e3878ae2f57fd6786279cf5d9dc6e6e1b52f5a Mon Sep 17 00:00:00 2001 From: Timur Tabi Date: Thu, 30 Apr 2026 17:38:29 -0500 Subject: [PATCH 152/972] Revert "drm/nouveau/gsp: add support for GA100" This reverts commit 20e0c197802c545db220157fafd567a10f2b7672. Despite claiming to add GA100 support, that commit actually has quite a few problems. It falsely claims that there is no VBIOS. GA100 does have a VBIOS, but it has no display engine, so it cannot use the PRAMIN method the read VBIOS and must fall back to using PROM. For whatever reason, the VBIOS on GA100 has an "Init-from-ROM" (IFR) header where the PCI Expansion ROM would normally be found. So to find that ROM, Nouveau needs to parse the IFR header. The commit also falsely claimed that there is no graphics (GR) engine. So rather than try to fix that commit, just revert it and start over from scratch. Signed-off-by: Timur Tabi Link: https://patch.msgid.link/20260430223838.2530778-2-ttabi@nvidia.com Signed-off-by: Danilo Krummrich --- .../gpu/drm/nouveau/nvkm/engine/device/base.c | 11 +++++++++-- .../gpu/drm/nouveau/nvkm/subdev/gsp/ga100.c | 4 ++++ .../gpu/drm/nouveau/nvkm/subdev/gsp/tu102.c | 18 +++++------------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index 72848ed80df73f..b101e14f841e03 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2513,6 +2513,7 @@ static const struct nvkm_device_chip nv170_chipset = { .name = "GA100", .bar = { 0x00000001, tu102_bar_new }, + .bios = { 0x00000001, nvkm_bios_new }, .devinit = { 0x00000001, ga100_devinit_new }, .fault = { 0x00000001, tu102_fault_new }, .fb = { 0x00000001, ga100_fb_new }, @@ -2529,7 +2530,6 @@ nv170_chipset = { .vfn = { 0x00000001, ga100_vfn_new }, .ce = { 0x000003ff, ga100_ce_new }, .fifo = { 0x00000001, ga100_fifo_new }, - .sec2 = { 0x00000001, tu102_sec2_new }, }; static const struct nvkm_device_chip @@ -3341,7 +3341,6 @@ nvkm_device_ctor(const struct nvkm_device_func *func, case 0x166: device->chip = &nv166_chipset; break; case 0x167: device->chip = &nv167_chipset; break; case 0x168: device->chip = &nv168_chipset; break; - case 0x170: device->chip = &nv170_chipset; break; case 0x172: device->chip = &nv172_chipset; break; case 0x173: device->chip = &nv173_chipset; break; case 0x174: device->chip = &nv174_chipset; break; @@ -3361,6 +3360,14 @@ nvkm_device_ctor(const struct nvkm_device_func *func, case 0x1b6: device->chip = &nv1b6_chipset; break; case 0x1b7: device->chip = &nv1b7_chipset; break; default: + if (nvkm_boolopt(device->cfgopt, "NvEnableUnsupportedChipsets", false)) { + switch (device->chipset) { + case 0x170: device->chip = &nv170_chipset; break; + default: + break; + } + } + if (!device->chip) { nvdev_error(device, "unknown chipset (%08x)\n", boot0); ret = -ENODEV; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/ga100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/ga100.c index fdd820eeef815e..27a13aeccd3cb3 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/ga100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/ga100.c @@ -41,11 +41,15 @@ ga100_gsp_flcn = { static const struct nvkm_gsp_func ga100_gsp = { .flcn = &ga100_gsp_flcn, + .fwsec = &tu102_gsp_fwsec, .sig_section = ".fwsignature_ga100", .booter.ctor = tu102_gsp_booter_ctor, + .fwsec_sb.ctor = tu102_gsp_fwsec_sb_ctor, + .fwsec_sb.dtor = tu102_gsp_fwsec_sb_dtor, + .dtor = r535_gsp_dtor, .oneinit = tu102_gsp_oneinit, .init = tu102_gsp_init, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/tu102.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/tu102.c index dd82c76b8b9a5b..19cb269e7a2659 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/tu102.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/tu102.c @@ -318,13 +318,8 @@ tu102_gsp_oneinit(struct nvkm_gsp *gsp) if (ret) return ret; - /* - * Calculate FB layout. FRTS is a memory region created by the FWSEC-FRTS firmware. - * FWSEC comes from VBIOS. So on systems with no VBIOS (e.g. GA100), the FRTS does - * not exist. Therefore, use the existence of VBIOS to determine whether to reserve - * an FRTS region. - */ - gsp->fb.wpr2.frts.size = device->bios ? 0x100000 : 0; + /* Calculate FB layout. */ + gsp->fb.wpr2.frts.size = 0x100000; gsp->fb.wpr2.frts.addr = ALIGN_DOWN(gsp->fb.bios.addr, 0x20000) - gsp->fb.wpr2.frts.size; gsp->fb.wpr2.boot.size = gsp->boot.fw.size; @@ -348,12 +343,9 @@ tu102_gsp_oneinit(struct nvkm_gsp *gsp) if (ret) return ret; - /* Only boot FWSEC-FRTS if it actually exists */ - if (gsp->fb.wpr2.frts.size) { - ret = nvkm_gsp_fwsec_frts(gsp); - if (WARN_ON(ret)) - return ret; - } + ret = nvkm_gsp_fwsec_frts(gsp); + if (WARN_ON(ret)) + return ret; /* Reset GSP into RISC-V mode. */ ret = gsp->func->reset(gsp); From a177ae30f78688f75ef9c6277a152c5d6979b10e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 30 Apr 2026 16:49:51 +0200 Subject: [PATCH 153/972] netfilter: flowtable: fix inline vlan encapsulation in xmit path Several issues in the inline vlan support: - The layer 2 encapsulation representation in the tuple takes encap[0] as the outer header and encap[1] as the inner header as seen from the ingress path. Reverse the encap loop to push first the inner then the outer vlan header. - Postpone pushing the layer 2 header once destination device is known. This allows to calculate the needed hearoom via LL_RESERVED_SPACE to accommodate the layer 2 headers. - Add and use nf_flow_vlan_push() as suggested by Eric Woudstra, this is a simplified version of skb_vlan_push() for egress path only. Fixes: c653d5a78f34 ("netfilter: flowtable: inline vlan encapsulation in xmit path") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 110 ++++++++++++++++++++----------- 1 file changed, 73 insertions(+), 37 deletions(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 8d5fb7e940a173..0ce3c209050ce3 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -462,32 +462,6 @@ static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx, nf_flow_ip_tunnel_pop(ctx, skb); } -struct nf_flow_xmit { - const void *dest; - const void *source; - struct net_device *outdev; -}; - -static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, - struct nf_flow_xmit *xmit) -{ - struct net_device *dev = xmit->outdev; - unsigned int hh_len = LL_RESERVED_SPACE(dev); - - if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { - skb = skb_expand_head(skb, hh_len); - if (!skb) - return NF_STOLEN; - } - - skb->dev = dev; - dev_hard_header(skb, dev, ntohs(skb->protocol), - xmit->dest, xmit->source, skb->len); - dev_queue_xmit(skb); - - return NF_STOLEN; -} - static struct flow_offload_tuple_rhash * nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx, struct nf_flowtable *flow_table, struct sk_buff *skb) @@ -553,6 +527,32 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, return 1; } +/* Similar to skb_vlan_push. */ +static int nf_flow_vlan_push(struct sk_buff *skb, __be16 proto, u16 id, + u32 needed_headroom) +{ + if (skb_vlan_tag_present(skb)) { + struct vlan_hdr *vhdr; + + if (skb_cow_head(skb, needed_headroom + VLAN_HLEN)) + return -1; + + __skb_push(skb, VLAN_HLEN); + if (skb_mac_header_was_set(skb)) + skb->mac_header -= VLAN_HLEN; + + vhdr = (struct vlan_hdr *)skb->data; + skb->network_header -= VLAN_HLEN; + vhdr->h_vlan_TCI = htons(skb_vlan_tag_get(skb)); + vhdr->h_vlan_encapsulated_proto = skb->protocol; + skb->protocol = skb->vlan_proto; + skb_postpush_rcsum(skb, skb->data, VLAN_HLEN); + } + __vlan_hwaccel_put_tag(skb, proto, id); + + return 0; +} + static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id) { int data_len = skb->len + sizeof(__be16); @@ -739,17 +739,19 @@ static int nf_flow_tunnel_v6_push(struct net *net, struct sk_buff *skb, } static int nf_flow_encap_push(struct sk_buff *skb, - struct flow_offload_tuple *tuple) + struct flow_offload_tuple *tuple, + struct net_device *outdev) { + u32 needed_headroom = LL_RESERVED_SPACE(outdev); int i; - for (i = 0; i < tuple->encap_num; i++) { + for (i = tuple->encap_num - 1; i >= 0; i--) { switch (tuple->encap[i].proto) { case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): - skb_reset_mac_header(skb); - if (skb_vlan_push(skb, tuple->encap[i].proto, - tuple->encap[i].id) < 0) + if (nf_flow_vlan_push(skb, tuple->encap[i].proto, + tuple->encap[i].id, + needed_headroom) < 0) return -1; break; case htons(ETH_P_PPP_SES): @@ -762,6 +764,44 @@ static int nf_flow_encap_push(struct sk_buff *skb, return 0; } +struct nf_flow_xmit { + const void *dest; + const void *source; + struct net_device *outdev; + struct flow_offload_tuple *tuple; +}; + +static void __nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, + struct nf_flow_xmit *xmit) +{ + struct net_device *dev = xmit->outdev; + unsigned int hh_len = LL_RESERVED_SPACE(dev); + + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + skb = skb_expand_head(skb, hh_len); + if (!skb) + return; + } + + skb->dev = dev; + dev_hard_header(skb, dev, ntohs(skb->protocol), + xmit->dest, xmit->source, skb->len); + dev_queue_xmit(skb); +} + +static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, + struct nf_flow_xmit *xmit) +{ + if (xmit->tuple->encap_num) { + if (nf_flow_encap_push(skb, xmit->tuple, xmit->outdev) < 0) + return NF_DROP; + } + + __nf_flow_queue_xmit(net, skb, xmit); + + return NF_STOLEN; +} + unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -806,9 +846,6 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (nf_flow_tunnel_v4_push(state->net, skb, other_tuple, &ip_daddr) < 0) return NF_DROP; - if (nf_flow_encap_push(skb, other_tuple) < 0) - return NF_DROP; - switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = dst_rtable(tuplehash->tuple.dst_cache); @@ -838,6 +875,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, WARN_ON_ONCE(1); return NF_DROP; } + xmit.tuple = other_tuple; return nf_flow_queue_xmit(state->net, skb, &xmit); } @@ -1128,9 +1166,6 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, &ip6_daddr, encap_limit) < 0) return NF_DROP; - if (nf_flow_encap_push(skb, other_tuple) < 0) - return NF_DROP; - switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = dst_rt6_info(tuplehash->tuple.dst_cache); @@ -1160,6 +1195,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, WARN_ON_ONCE(1); return NF_DROP; } + xmit.tuple = other_tuple; return nf_flow_queue_xmit(state->net, skb, &xmit); } From 69c54f80f4a7072b51b5b5939185ca5e572be982 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 30 Apr 2026 16:49:53 +0200 Subject: [PATCH 154/972] netfilter: flowtable: fix inline pppoe encapsulation in xmit path Address two issues in the inline pppoe encapsulation: - Add needs_gso_segment flag to segment PPPoE packets in software given that there is no GSO support for this. - Use FLOW_OFFLOAD_XMIT_DIRECT since neighbour cache is not available in point-to-point device, use the hardware address that is obtained via flowtable path discovery (ie. fill_forward_path). Fixes: 18d27bed0880 ("netfilter: flowtable: inline pppoe encapsulation in xmit path") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 4 ++- net/netfilter/nf_flow_table_core.c | 1 + net/netfilter/nf_flow_table_ip.c | 42 +++++++++++++++++++++++++-- net/netfilter/nf_flow_table_path.c | 7 ++++- 4 files changed, 49 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index b09c11c048d519..7b23b245a5a86a 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -148,9 +148,10 @@ struct flow_offload_tuple { /* All members above are keys for lookups, see flow_offload_hash(). */ struct { } __hash; - u8 dir:2, + u16 dir:2, xmit_type:3, encap_num:2, + needs_gso_segment:1, tun_num:2, in_vlan_ingress:2; u16 mtu; @@ -232,6 +233,7 @@ struct nf_flow_route { u32 hw_ifindex; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; + u8 needs_gso_segment:1; } out; enum flow_offload_xmit_type xmit_type; } tuple[FLOW_OFFLOAD_DIR_MAX]; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 2c4140e6f53c51..785d8c244a7715 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -122,6 +122,7 @@ static int flow_offload_fill_route(struct flow_offload *flow, flow_tuple->tun = route->tuple[dir].in.tun; flow_tuple->encap_num = route->tuple[dir].in.num_encaps; + flow_tuple->needs_gso_segment = route->tuple[dir].out.needs_gso_segment; flow_tuple->tun_num = route->tuple[dir].in.num_tuns; switch (route->tuple[dir].xmit_type) { diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 0ce3c209050ce3..2eba64eb393a2e 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -553,7 +553,8 @@ static int nf_flow_vlan_push(struct sk_buff *skb, __be16 proto, u16 id, return 0; } -static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id) +static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id, + u32 needed_headroom) { int data_len = skb->len + sizeof(__be16); struct ppp_hdr { @@ -562,7 +563,7 @@ static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id) } *ph; __be16 proto; - if (skb_cow_head(skb, PPPOE_SES_HLEN)) + if (skb_cow_head(skb, needed_headroom + PPPOE_SES_HLEN)) return -1; switch (skb->protocol) { @@ -755,7 +756,8 @@ static int nf_flow_encap_push(struct sk_buff *skb, return -1; break; case htons(ETH_P_PPP_SES): - if (nf_flow_pppoe_push(skb, tuple->encap[i].id) < 0) + if (nf_flow_pppoe_push(skb, tuple->encap[i].id, + needed_headroom) < 0) return -1; break; } @@ -769,6 +771,7 @@ struct nf_flow_xmit { const void *source; struct net_device *outdev; struct flow_offload_tuple *tuple; + bool needs_gso_segment; }; static void __nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, @@ -789,10 +792,41 @@ static void __nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, dev_queue_xmit(skb); } +static unsigned int nf_flow_encap_gso_xmit(struct net *net, struct sk_buff *skb, + struct nf_flow_xmit *xmit) +{ + struct sk_buff *segs, *nskb; + + segs = skb_gso_segment(skb, 0); + if (IS_ERR(segs)) + return NF_DROP; + + if (segs) + consume_skb(skb); + else + segs = skb; + + skb_list_walk_safe(segs, segs, nskb) { + skb_mark_not_on_list(segs); + + if (nf_flow_encap_push(segs, xmit->tuple, xmit->outdev) < 0) { + kfree_skb(segs); + kfree_skb_list(nskb); + return NF_STOLEN; + } + __nf_flow_queue_xmit(net, segs, xmit); + } + + return NF_STOLEN; +} + static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, struct nf_flow_xmit *xmit) { if (xmit->tuple->encap_num) { + if (skb_is_gso(skb) && xmit->needs_gso_segment) + return nf_flow_encap_gso_xmit(net, skb, xmit); + if (nf_flow_encap_push(skb, xmit->tuple, xmit->outdev) < 0) return NF_DROP; } @@ -876,6 +910,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, return NF_DROP; } xmit.tuple = other_tuple; + xmit.needs_gso_segment = tuplehash->tuple.needs_gso_segment; return nf_flow_queue_xmit(state->net, skb, &xmit); } @@ -1196,6 +1231,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, return NF_DROP; } xmit.tuple = other_tuple; + xmit.needs_gso_segment = tuplehash->tuple.needs_gso_segment; return nf_flow_queue_xmit(state->net, skb, &xmit); } diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c index 6bb9579dcc2abb..9e88ea6a2eef78 100644 --- a/net/netfilter/nf_flow_table_path.c +++ b/net/netfilter/nf_flow_table_path.c @@ -86,6 +86,7 @@ struct nft_forward_info { u8 ingress_vlans; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; + bool needs_gso_segment; enum flow_offload_xmit_type xmit_type; }; @@ -138,8 +139,11 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack, path->encap.proto; info->num_encaps++; } - if (path->type == DEV_PATH_PPPOE) + if (path->type == DEV_PATH_PPPOE) { memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; + info->needs_gso_segment = 1; + } break; case DEV_PATH_BRIDGE: if (is_zero_ether_addr(info->h_source)) @@ -279,6 +283,7 @@ static void nft_dev_forward_path(const struct nft_pktinfo *pkt, memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); route->tuple[dir].xmit_type = info.xmit_type; } + route->tuple[dir].out.needs_gso_segment = info.needs_gso_segment; } int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, From e027c218c482c6a0ae1948129ccda3b0a2033368 Mon Sep 17 00:00:00 2001 From: Robert Marko Date: Tue, 28 Apr 2026 15:41:01 +0200 Subject: [PATCH 155/972] net: phy: micrel: fix LAN8814 QSGMII soft reset LAN8814 QSGMII soft reset was moved into the probe function to avoid triggering it for each of 4 PHY-s in the package. However, that broke QSGMII link between the MAC and PHY on most LAN8814 PHY-s, specificaly for us on the Microchip LAN969x switch. Reading the QSGMII status registers it was visible that lanes were only partially synced. It looks like the reset timing is crucial, so lets move the reset back into the .config_init function but guard it with phy_package_init_once() to avoid it being triggered on each of 4 PHY-s in the package. Change the probe function to use phy_package_probe_once() for coma and PtP setup. Fixes: 96a9178a29a6 ("net: phy: micrel: lan8814 fix reset of the QSGMII interface") Signed-off-by: Robert Marko Link: https://patch.msgid.link/20260428134138.1741253-1-robert.marko@sartura.hr Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 2aa1dedd21b8eb..e211a523c2584e 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -4548,6 +4548,13 @@ static int lan8814_config_init(struct phy_device *phydev) struct kszphy_priv *lan8814 = phydev->priv; int ret; + if (phy_package_init_once(phydev)) + /* Reset the PHY */ + lanphy_modify_page_reg(phydev, LAN8814_PAGE_COMMON_REGS, + LAN8814_QSGMII_SOFT_RESET, + LAN8814_QSGMII_SOFT_RESET_BIT, + LAN8814_QSGMII_SOFT_RESET_BIT); + /* Based on the interface type select how the advertise ability is * encoded, to set as SGMII or as USGMII. */ @@ -4655,13 +4662,7 @@ static int lan8814_probe(struct phy_device *phydev) priv->is_ptp_available = err == LAN8814_REV_LAN8814 || err == LAN8814_REV_LAN8818; - if (phy_package_init_once(phydev)) { - /* Reset the PHY */ - lanphy_modify_page_reg(phydev, LAN8814_PAGE_COMMON_REGS, - LAN8814_QSGMII_SOFT_RESET, - LAN8814_QSGMII_SOFT_RESET_BIT, - LAN8814_QSGMII_SOFT_RESET_BIT); - + if (phy_package_probe_once(phydev)) { err = lan8814_release_coma_mode(phydev); if (err) return err; From 3744b0964d5267c0b651bcd8f8c25db6bf4ccbac Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 29 Apr 2026 17:46:48 +0200 Subject: [PATCH 156/972] ipv6: Implement limits on extension header parsing ipv6_{skip_exthdr,find_hdr}() and ip6_{tnl_parse_tlv_enc_lim, protocol_deliver_rcu}() iterate over IPv6 extension headers until they find a non-extension-header protocol or run out of packet data. The loops have no iteration counter, relying solely on the packet length to bound them. For a crafted packet with 8-byte extension headers filling a 64KB jumbogram, this means a worst case of up to ~8k iterations with a skb_header_pointer call each. ipv6_skip_exthdr(), for example, is used where it parses the inner quoted packet inside an incoming ICMPv6 error: - icmpv6_rcv - checksum validation - case ICMPV6_DEST_UNREACH - icmpv6_notify - pskb_may_pull() <- pull inner IPv6 header - ipv6_skip_exthdr() <- iterates here - pskb_may_pull() - ipprot->err_handler() <- sk lookup The per-iteration cost of ipv6_skip_exthdr itself is generally light, but skb_header_pointer becomes more costly on reassembled packets: the first ~1232 bytes of the inner packet are in the skb's linear area, but the remaining ~63KB are in the frag_list where skb_copy_bits is needed to read data. Initially, the idea was to add a configurable limit via a new sysctl knob with default 8, in line with knobs from commit 47d3d7ac656a ("ipv6: Implement limits on Hop-by-Hop and Destination options"), but two reasons eventually argued against it: - It adds to UAPI that needs to be maintained forever, and upcoming work is restricting extension header ordering anyway, leaving little reason for another sysctl knob - exthdrs_core.c is always built-in even when CONFIG_IPV6=n, where struct net has no .ipv6 member, so the read site would need an ifdef'd fallback to a constant anyway Therefore, just use a constant (IP6_MAX_EXT_HDRS_CNT). All four extension header walking functions are now bound by this limit. Note that the check in ip6_protocol_deliver_rcu() happens right before the goto resubmit, such that we don't have to have a test for ipv6_ext_hdr() in the fast-path. There's an ongoing IETF draft-iurman-6man-eh-occurrences to enforce IPv6 extension headers ordering and occurrence. The latter also discusses security implications. As per RFC8200 section 4.1, the occurrence rules for extension headers provide a practical upper bound which is 8. In order to be conservative, let's define IP6_MAX_EXT_HDRS_CNT as 12 to leave enough room for quirky setups. In the unlikely event that this is still not enough, then we might need to reconsider a sysctl. Signed-off-by: Daniel Borkmann Reviewed-by: Ido Schimmel Reviewed-by: Eric Dumazet Reviewed-by: Justin Iurman Link: https://patch.msgid.link/20260429154648.809751-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 6 ++++++ include/net/ipv6.h | 3 +++ net/ipv6/exthdrs_core.c | 7 +++++++ net/ipv6/ip6_input.c | 5 +++++ net/ipv6/ip6_tunnel.c | 4 ++++ 5 files changed, 25 insertions(+) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index e0ca3904ff8e0c..2f312d1f67d698 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -99,6 +99,7 @@ FN(FRAG_TOO_FAR) \ FN(TCP_MINTTL) \ FN(IPV6_BAD_EXTHDR) \ + FN(IPV6_TOO_MANY_EXTHDRS) \ FN(IPV6_NDISC_FRAG) \ FN(IPV6_NDISC_HOP_LIMIT) \ FN(IPV6_NDISC_BAD_CODE) \ @@ -494,6 +495,11 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_MINTTL, /** @SKB_DROP_REASON_IPV6_BAD_EXTHDR: Bad IPv6 extension header. */ SKB_DROP_REASON_IPV6_BAD_EXTHDR, + /** + * @SKB_DROP_REASON_IPV6_TOO_MANY_EXTHDRS: Number of IPv6 extension + * headers in the packet exceeds IP6_MAX_EXT_HDRS_CNT. + */ + SKB_DROP_REASON_IPV6_TOO_MANY_EXTHDRS, /** @SKB_DROP_REASON_IPV6_NDISC_FRAG: invalid frag (suppress_frag_ndisc). */ SKB_DROP_REASON_IPV6_NDISC_FRAG, /** @SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT: invalid hop limit. */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index d042afe7a24564..1dec81faff282f 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -90,6 +90,9 @@ struct ip_tunnel_info; #define IP6_DEFAULT_MAX_DST_OPTS_LEN INT_MAX /* No limit */ #define IP6_DEFAULT_MAX_HBH_OPTS_LEN INT_MAX /* No limit */ +/* Hard limit on traversed IPv6 extension headers */ +#define IP6_MAX_EXT_HDRS_CNT 12 + /* * Addr type * diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 49e31e4ae7b7f6..9d06d487e8b103 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -73,6 +73,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, __be16 *frag_offp) { u8 nexthdr = *nexthdrp; + int exthdr_cnt = 0; *frag_offp = 0; @@ -82,6 +83,8 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, if (nexthdr == NEXTHDR_NONE) return -1; + if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT)) + return -1; hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -1; @@ -190,6 +193,7 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, { unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; + int exthdr_cnt = 0; bool found; if (fragoff) @@ -216,6 +220,9 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, return -ENOENT; } + if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT)) + return -EBADMSG; + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -EBADMSG; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 967b07aeb6831e..8972863c93ee52 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -403,6 +403,7 @@ INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *)); void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr, bool have_final) { + int exthdr_cnt = IP6CB(skb)->flags & IP6SKB_HOPBYHOP ? 1 : 0; const struct inet6_protocol *ipprot; struct inet6_dev *idev; unsigned int nhoff; @@ -487,6 +488,10 @@ void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr, nexthdr = ret; goto resubmit_final; } else { + if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT)) { + SKB_DR_SET(reason, IPV6_TOO_MANY_EXTHDRS); + goto discard; + } goto resubmit; } } else if (ret == 0) { diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index c468c83af0f20e..9d1037ac082f6a 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -399,11 +399,15 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) unsigned int nhoff = raw - skb->data; unsigned int off = nhoff + sizeof(*ipv6h); u8 nexthdr = ipv6h->nexthdr; + int exthdr_cnt = 0; while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { struct ipv6_opt_hdr *hdr; u16 optlen; + if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT)) + break; + if (!pskb_may_pull(skb, off + sizeof(*hdr))) break; From 26ebd12e67bfc3543d77ce586c33ef29fcafab20 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Wed, 29 Apr 2026 16:19:30 +0800 Subject: [PATCH 157/972] net: enetc: fix VSI mailbox timeout handling and DMA lifecycle In the current VSI mailbox implementation, the VSI allocates a DMA buffer to store the message sent to the PSI. When the PSI receives the message request from the VSI, the hardware copies the message data from this DMA buffer to PSI's DMA buffer for processing. When enetc_msg_vsi_send() times out, two scenarios can occur: 1) Use-after-free: If the hardware hasn't completed message copying when the VSI frees the buffer, the hardware may subsequently copy the data from freed memory to PSI's DMA buffer. 2) Message race: If PSI hasn't processed the previous message when the next message is sent, the VSI may receive the previous message's reply, leading to incorrect handling. To address these issues, implement the following changes: - Check the mailbox busy status before sending a new message. If the mailbox is in busy state, it indicates the previous message is still being processed, so return an error immediately. - Add the 'msg' field to struct enetc_si to preserve the DMA buffer information. The caller of enetc_msg_vsi_send() no longer frees the DMA buffer. Instead, defer freeing until it is safe to do so (when mailbox is not busy on next send). - Add cleanup in enetc_vf_remove() to free the last message buffer. This ensures the DMA buffer remains valid during message copying and prevents message reply mismatches. Fixes: beb74ac878c8 ("enetc: Add vf to pf messaging support") Signed-off-by: Wei Fang Link: https://patch.msgid.link/20260429081930.3259824-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.h | 1 + .../net/ethernet/freescale/enetc/enetc_vf.c | 42 +++++++++++++++---- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index e663bb5e614eff..e691144e875671 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -330,6 +330,7 @@ struct enetc_si { struct workqueue_struct *workqueue; struct work_struct rx_mode_task; struct dentry *debugfs_root; + struct enetc_msg_swbd msg; /* Only valid for VSI */ }; #define ENETC_SI_ALIGN 32 diff --git a/drivers/net/ethernet/freescale/enetc/enetc_vf.c b/drivers/net/ethernet/freescale/enetc/enetc_vf.c index 6c4b374bcb0ef1..df8e95cc47d0fe 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_vf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_vf.c @@ -17,11 +17,36 @@ static void enetc_msg_vsi_write_msg(struct enetc_hw *hw, enetc_wr(hw, ENETC_VSIMSGSNDAR0, val); } +static void enetc_msg_dma_free(struct device *dev, struct enetc_msg_swbd *msg) +{ + if (msg->vaddr) { + dma_free_coherent(dev, msg->size, msg->vaddr, msg->dma); + msg->vaddr = NULL; + } +} + static int enetc_msg_vsi_send(struct enetc_si *si, struct enetc_msg_swbd *msg) { + struct device *dev = &si->pdev->dev; int timeout = 100; u32 vsimsgsr; + /* The VSI mailbox may be busy if last message was not yet processed + * by PSI. So need to check the mailbox status before sending. + */ + vsimsgsr = enetc_rd(&si->hw, ENETC_VSIMSGSR); + if (vsimsgsr & ENETC_VSIMSGSR_MB) { + /* It is safe to free the DMA buffer here, the caller does + * not access the DMA buffer if enetc_msg_vsi_send() fails. + */ + enetc_msg_dma_free(dev, msg); + dev_err(dev, "VSI mailbox is busy\n"); + return -EIO; + } + + /* Free the DMA buffer of the last message */ + enetc_msg_dma_free(dev, &si->msg); + si->msg = *msg; enetc_msg_vsi_write_msg(&si->hw, msg); do { @@ -32,12 +57,15 @@ static int enetc_msg_vsi_send(struct enetc_si *si, struct enetc_msg_swbd *msg) usleep_range(1000, 2000); } while (--timeout); - if (!timeout) + if (!timeout) { + dev_err(dev, "VSI mailbox timeout\n"); + return -ETIMEDOUT; + } /* check for message delivery error */ if (vsimsgsr & ENETC_VSIMSGSR_MS) { - dev_err(&si->pdev->dev, "VSI command execute error: %d\n", + dev_err(dev, "VSI command execute error: %d\n", ENETC_SIMSGSR_GET_MC(vsimsgsr)); return -EIO; } @@ -50,7 +78,6 @@ static int enetc_msg_vsi_set_primary_mac_addr(struct enetc_ndev_priv *priv, { struct enetc_msg_cmd_set_primary_mac *cmd; struct enetc_msg_swbd msg; - int err; msg.size = ALIGN(sizeof(struct enetc_msg_cmd_set_primary_mac), 64); msg.vaddr = dma_alloc_coherent(priv->dev, msg.size, &msg.dma, @@ -67,11 +94,7 @@ static int enetc_msg_vsi_set_primary_mac_addr(struct enetc_ndev_priv *priv, memcpy(&cmd->mac, saddr, sizeof(struct sockaddr)); /* send the command and wait */ - err = enetc_msg_vsi_send(priv->si, &msg); - - dma_free_coherent(priv->dev, msg.size, msg.vaddr, msg.dma); - - return err; + return enetc_msg_vsi_send(priv->si, &msg); } static int enetc_vf_set_mac_addr(struct net_device *ndev, void *addr) @@ -259,6 +282,7 @@ static void enetc_vf_remove(struct pci_dev *pdev) { struct enetc_si *si = pci_get_drvdata(pdev); struct enetc_ndev_priv *priv; + struct enetc_msg_swbd msg; priv = netdev_priv(si->ndev); unregister_netdev(si->ndev); @@ -270,7 +294,9 @@ static void enetc_vf_remove(struct pci_dev *pdev) free_netdev(si->ndev); + msg = si->msg; enetc_pci_remove(pdev); + enetc_msg_dma_free(&pdev->dev, &msg); } static const struct pci_device_id enetc_vf_id_table[] = { From 694de316f607fe2473d52ca0707e3918e72c1562 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Wed, 29 Apr 2026 16:37:42 +0800 Subject: [PATCH 158/972] net: libwx: fix VF illegal register access Register WX_CFG_PORT_ST is a PF restricted register. When a VF is initialized, attempting to read this register triggers an illegal register access, which lead to a system hang. When the device is VF, the bus function ID can be obtained directly from the PCI_FUNC(pdev->devfn). Fixes: a04ea57aae37 ("net: libwx: fix device bus LAN ID") Cc: stable@vger.kernel.org Signed-off-by: Jiawen Wu Link: https://patch.msgid.link/4D1F4452D21DE107+20260429083743.88961-1-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_hw.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index d3772d01e00bc8..2451f6b20b1115 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -2480,8 +2480,11 @@ int wx_sw_init(struct wx *wx) wx->oem_svid = pdev->subsystem_vendor; wx->oem_ssid = pdev->subsystem_device; wx->bus.device = PCI_SLOT(pdev->devfn); - wx->bus.func = FIELD_GET(WX_CFG_PORT_ST_LANID, - rd32(wx, WX_CFG_PORT_ST)); + if (pdev->is_virtfn) + wx->bus.func = PCI_FUNC(pdev->devfn); + else + wx->bus.func = FIELD_GET(WX_CFG_PORT_ST_LANID, + rd32(wx, WX_CFG_PORT_ST)); if (wx->oem_svid == PCI_VENDOR_ID_WANGXUN || pdev->is_virtfn) { From 7a33345153eeeda195c55f15be27074e4c3b5109 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Wed, 29 Apr 2026 16:37:43 +0800 Subject: [PATCH 159/972] net: libwx: use request_irq for VF misc interrupt Currently, request_threaded_irq() is used with a primary handler but a NULL threaded handler, while also setting the IRQF_ONESHOT flag. This specific combination triggers a WARNING since the commit aef30c8d569c ("genirq: Warn about using IRQF_ONESHOT without a threaded handler"). WARNING: kernel/irq/manage.c:1502 at __setup_irq+0x4fa/0x760 Fix the issue by switching to request_irq(), which is the appropriate interface or a non-threaded interrupt handler, and removing the unnecessary IRQF_ONESHOT flag. Fixes: eb4898fde1de ("net: libwx: add wangxun vf common api") Cc: stable@vger.kernel.org Signed-off-by: Jiawen Wu Link: https://patch.msgid.link/786DDC7D5CCA6D0A+20260429083743.88961-2-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_vf_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_vf_common.c b/drivers/net/ethernet/wangxun/libwx/wx_vf_common.c index 29cdbed2e5ecd3..94ff8f5f0b4c8f 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_vf_common.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_vf_common.c @@ -99,8 +99,8 @@ int wx_request_msix_irqs_vf(struct wx *wx) } } - err = request_threaded_irq(wx->msix_entry->vector, wx_msix_misc_vf, - NULL, IRQF_ONESHOT, netdev->name, wx); + err = request_irq(wx->msix_entry->vector, wx_msix_misc_vf, + 0, netdev->name, wx); if (err) { wx_err(wx, "request_irq for msix_other failed: %d\n", err); goto free_queue_irqs; From 75df490c9e8457990c8b227650f6491218ce018b Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 29 Apr 2026 14:02:31 +0200 Subject: [PATCH 160/972] net: airoha: Move entries to queue head in case of DMA mapping failure in airoha_dev_xmit() In order to respect the original descriptor order and avoid any potential IOMMU fault or memory corruption, move pending queue entries to the head of hw queue tx_list if the DMA mapping of current inflight packet fails in airoha_dev_xmit routine. Fixes: 3f47e67dff1f7 ("net: airoha: Add the capability to consume out-of-order DMA tx descriptors") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260429-airoha-xmit-unmap-error-path-v2-1-32e43b7c6d25@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index f8b3d53bccadbd..d0c0c0ec8a80af 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -2120,14 +2120,12 @@ static netdev_tx_t airoha_dev_xmit(struct sk_buff *skb, return NETDEV_TX_OK; error_unmap: - while (!list_empty(&tx_list)) { - e = list_first_entry(&tx_list, struct airoha_queue_entry, - list); + list_for_each_entry(e, &tx_list, list) { dma_unmap_single(dev->dev.parent, e->dma_addr, e->dma_len, DMA_TO_DEVICE); e->dma_addr = 0; - list_move_tail(&e->list, &q->tx_list); } + list_splice(&tx_list, &q->tx_list); spin_unlock_bh(&q->lock); error: From aaadccde312f1f6c752461e015adcaa25d463cbc Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:13 +0530 Subject: [PATCH 161/972] octeontx2-af: npc: cn20k: Propagate MCAM key-type errors on cn20k npc_mcam_idx_2_key_type() can fail; callers used to ignore it and still used kw_type when enabling, configuring, copying, and reading MCAM entries. That could program or decode hardware with an undefined key type. Return -EINVAL when key-type lookup fails. Return -EINVAL from npc_cn20k_copy_mcam_entry() when src and dest key types differ instead of failing silently. Change npc_cn20k_{enable,config,copy,read}_mcam_entry() to return int on success or error. Thread those errors through the cn20k MCAM write and read mbox handlers, the cn20k baseline steer read path, NPC defrag move (disable/copy/enable with dev_err and -EFAULT), and the DMAC update path in rvu_npc_fs.c. Make npc_copy_mcam_entry() return int so the cn20k branch can return npc_cn20k_copy_mcam_entry() without a void/int mismatch, and fail NPC_MCAM_SHIFT_ENTRY when copy fails. Cc: Suman Ghosh Cc: Dan Carpenter Fixes: 6d1e70282f76 ("octeontx2-af: npc: cn20k: Use common APIs") Link: https://lore.kernel.org/netdev/adiQJvuKlEhq2ILx@stanley.mountain/ Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-2-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- .../ethernet/marvell/octeontx2/af/cn20k/npc.c | 122 +++++++++++++----- .../ethernet/marvell/octeontx2/af/cn20k/npc.h | 20 +-- .../ethernet/marvell/octeontx2/af/rvu_npc.c | 18 ++- .../marvell/octeontx2/af/rvu_npc_fs.c | 20 ++- 4 files changed, 124 insertions(+), 56 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c index 7291fdb89b03f6..7170dcf2620071 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c @@ -798,7 +798,7 @@ void npc_cn20k_load_mkex_profile(struct rvu *rvu, int blkaddr, iounmap(mkex_prfl_addr); } -void +int npc_cn20k_enable_mcam_entry(struct rvu *rvu, int blkaddr, int index, bool enable) { @@ -808,7 +808,9 @@ npc_cn20k_enable_mcam_entry(struct rvu *rvu, int blkaddr, u64 cfg, hw_prio; u8 kw_type; - npc_mcam_idx_2_key_type(rvu, index, &kw_type); + if (npc_mcam_idx_2_key_type(rvu, index, &kw_type)) + return -EINVAL; + if (kw_type == NPC_MCAM_KEY_X2) { cfg = rvu_read64(rvu, blkaddr, NPC_AF_CN20K_MCAMEX_BANKX_CFG_EXT(mcam_idx, @@ -819,7 +821,7 @@ npc_cn20k_enable_mcam_entry(struct rvu *rvu, int blkaddr, rvu_write64(rvu, blkaddr, NPC_AF_CN20K_MCAMEX_BANKX_CFG_EXT(mcam_idx, bank), cfg); - return; + return 0; } /* For NPC_CN20K_MCAM_KEY_X4 keys, both the banks @@ -836,6 +838,8 @@ npc_cn20k_enable_mcam_entry(struct rvu *rvu, int blkaddr, NPC_AF_CN20K_MCAMEX_BANKX_CFG_EXT(mcam_idx, bank), cfg); } + + return 0; } void @@ -1042,9 +1046,9 @@ npc_cn20k_set_mcam_bank_cfg(struct rvu *rvu, int blkaddr, int mcam_idx, } } -void npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, - u8 intf, struct cn20k_mcam_entry *entry, - bool enable, u8 hw_prio, u8 req_kw_type) +int npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, + u8 intf, struct cn20k_mcam_entry *entry, + bool enable, u8 hw_prio, u8 req_kw_type) { struct npc_mcam *mcam = &rvu->hw->mcam; int mcam_idx = index % mcam->banksize; @@ -1052,10 +1056,13 @@ void npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, int kw = 0; u8 kw_type; + if (npc_mcam_idx_2_key_type(rvu, index, &kw_type)) + return -EINVAL; + /* Disable before mcam entry update */ - npc_cn20k_enable_mcam_entry(rvu, blkaddr, index, false); + if (npc_cn20k_enable_mcam_entry(rvu, blkaddr, index, false)) + return -EINVAL; - npc_mcam_idx_2_key_type(rvu, index, &kw_type); /* CAM1 takes the comparison value and * CAM0 specifies match for a bit in key being '0' or '1' or 'dontcare'. * CAM1 = 0 & CAM0 = 1 => match if key = 0 @@ -1120,9 +1127,11 @@ void npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, /* PF installing VF rule */ npc_cn20k_set_mcam_bank_cfg(rvu, blkaddr, mcam_idx, bank, kw_type, enable, hw_prio); + + return 0; } -void npc_cn20k_copy_mcam_entry(struct rvu *rvu, int blkaddr, u16 src, u16 dest) +int npc_cn20k_copy_mcam_entry(struct rvu *rvu, int blkaddr, u16 src, u16 dest) { struct npc_mcam *mcam = &rvu->hw->mcam; u64 cfg, sreg, dreg, soff, doff; @@ -1132,10 +1141,15 @@ void npc_cn20k_copy_mcam_entry(struct rvu *rvu, int blkaddr, u16 src, u16 dest) dbank = npc_get_bank(mcam, dest); sbank = npc_get_bank(mcam, src); - npc_mcam_idx_2_key_type(rvu, src, &src_kwtype); - npc_mcam_idx_2_key_type(rvu, dest, &dest_kwtype); + + if (npc_mcam_idx_2_key_type(rvu, src, &src_kwtype)) + return -EINVAL; + + if (npc_mcam_idx_2_key_type(rvu, dest, &dest_kwtype)) + return -EINVAL; + if (src_kwtype != dest_kwtype) - return; + return -EINVAL; src &= (mcam->banksize - 1); dest &= (mcam->banksize - 1); @@ -1170,6 +1184,8 @@ void npc_cn20k_copy_mcam_entry(struct rvu *rvu, int blkaddr, u16 src, u16 dest) if (src_kwtype == NPC_MCAM_KEY_X2) break; } + + return 0; } static void npc_cn20k_fill_entryword(struct cn20k_mcam_entry *entry, int idx, @@ -1179,16 +1195,17 @@ static void npc_cn20k_fill_entryword(struct cn20k_mcam_entry *entry, int idx, entry->kw_mask[idx] = cam1 ^ cam0; } -void npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, - struct cn20k_mcam_entry *entry, - u8 *intf, u8 *ena, u8 *hw_prio) +int npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, + struct cn20k_mcam_entry *entry, + u8 *intf, u8 *ena, u8 *hw_prio) { struct npc_mcam *mcam = &rvu->hw->mcam; u64 cam0, cam1, bank_cfg, cfg; int kw = 0, bank; u8 kw_type; - npc_mcam_idx_2_key_type(rvu, index, &kw_type); + if (npc_mcam_idx_2_key_type(rvu, index, &kw_type)) + return -EINVAL; bank = npc_get_bank(mcam, index); index &= (mcam->banksize - 1); @@ -1298,6 +1315,8 @@ void npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, cfg = rvu_read64(rvu, blkaddr, NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(index, 0, 1)); entry->vtag_action = cfg; + + return 0; } int rvu_mbox_handler_npc_cn20k_mcam_write_entry(struct rvu *rvu, @@ -1335,11 +1354,10 @@ int rvu_mbox_handler_npc_cn20k_mcam_write_entry(struct rvu *rvu, if (is_pffunc_af(req->hdr.pcifunc)) nix_intf = req->intf; - npc_cn20k_config_mcam_entry(rvu, blkaddr, req->entry, nix_intf, - &req->entry_data, req->enable_entry, - req->hw_prio, req->req_kw_type); + rc = npc_cn20k_config_mcam_entry(rvu, blkaddr, req->entry, nix_intf, + &req->entry_data, req->enable_entry, + req->hw_prio, req->req_kw_type); - rc = 0; exit: mutex_unlock(&mcam->lock); return rc; @@ -1361,11 +1379,13 @@ int rvu_mbox_handler_npc_cn20k_mcam_read_entry(struct rvu *rvu, mutex_lock(&mcam->lock); rc = npc_mcam_verify_entry(mcam, pcifunc, req->entry); - if (!rc) - npc_cn20k_read_mcam_entry(rvu, blkaddr, req->entry, - &rsp->entry_data, &rsp->intf, - &rsp->enable, &rsp->hw_prio); + if (rc) + goto fail; + rc = npc_cn20k_read_mcam_entry(rvu, blkaddr, req->entry, + &rsp->entry_data, &rsp->intf, + &rsp->enable, &rsp->hw_prio); +fail: mutex_unlock(&mcam->lock); return rc; } @@ -1375,11 +1395,13 @@ int rvu_mbox_handler_npc_cn20k_mcam_alloc_and_write_entry(struct rvu *rvu, struct npc_mcam_alloc_and_write_entry_rsp *rsp) { struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, req->hdr.pcifunc); + struct npc_mcam_free_entry_req free_req = { 0 }; struct npc_mcam_alloc_entry_req entry_req; struct npc_mcam_alloc_entry_rsp entry_rsp; struct npc_mcam *mcam = &rvu->hw->mcam; u16 entry = NPC_MCAM_ENTRY_INVALID; - int blkaddr, rc; + struct msg_rsp free_rsp; + int blkaddr, rc, err; u8 nix_intf; blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0); @@ -1415,12 +1437,23 @@ int rvu_mbox_handler_npc_cn20k_mcam_alloc_and_write_entry(struct rvu *rvu, else nix_intf = pfvf->nix_rx_intf; - npc_cn20k_config_mcam_entry(rvu, blkaddr, entry, nix_intf, - &req->entry_data, req->enable_entry, - req->hw_prio, req->req_kw_type); + rc = npc_cn20k_config_mcam_entry(rvu, blkaddr, entry, nix_intf, + &req->entry_data, req->enable_entry, + req->hw_prio, req->req_kw_type); mutex_unlock(&mcam->lock); + if (rc) { + free_req.hdr.pcifunc = req->hdr.pcifunc; + free_req.entry = entry_rsp.entry; + err = rvu_mbox_handler_npc_mcam_free_entry(rvu, &free_req, &free_rsp); + if (err) + dev_err(rvu->dev, + "%s: Error to free mcam idx %u\n", + __func__, entry_rsp.entry); + return rc; + } + rsp->entry = entry_rsp.entry; return 0; } @@ -1480,9 +1513,9 @@ int rvu_mbox_handler_npc_cn20k_read_base_steer_rule(struct rvu *rvu, read_entry: /* Read the mcam entry */ - npc_cn20k_read_mcam_entry(rvu, blkaddr, index, - &rsp->entry, &intf, - &enable, &hw_prio); + rc = npc_cn20k_read_mcam_entry(rvu, blkaddr, index, + &rsp->entry, &intf, + &enable, &hw_prio); mutex_unlock(&mcam->lock); out: return rc; @@ -3607,9 +3640,30 @@ int npc_defrag_move_vdx_to_free(struct rvu *rvu, NPC_AF_CN20K_MCAMEX_BANKX_STAT_EXT(midx, bank)); - npc_cn20k_enable_mcam_entry(rvu, blkaddr, old_midx, false); - npc_cn20k_copy_mcam_entry(rvu, blkaddr, old_midx, new_midx); - npc_cn20k_enable_mcam_entry(rvu, blkaddr, new_midx, true); + /* If bug happened during copy/enable mcam, then there is a bug in allocation + * algorithm itself. There is no point in rewinding and returning, as it + * will face further issue. Return error after printing error + */ + if (npc_cn20k_enable_mcam_entry(rvu, blkaddr, old_midx, false)) { + dev_err(rvu->dev, + "%s: Error happened while disabling old_mid=%u\n", + __func__, old_midx); + return -EFAULT; + } + + if (npc_cn20k_copy_mcam_entry(rvu, blkaddr, old_midx, new_midx)) { + dev_err(rvu->dev, + "%s: Error happened while copying old_midx=%u new_midx=%u\n", + __func__, old_midx, new_midx); + return -EFAULT; + } + + if (npc_cn20k_enable_mcam_entry(rvu, blkaddr, new_midx, true)) { + dev_err(rvu->dev, + "%s: Error happened while enabling new_mid=%u\n", + __func__, new_midx); + return -EFAULT; + } midx = new_midx % mcam->banksize; bank = new_midx / mcam->banksize; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h index 815d0b257a7e11..8f3eea9cfb1d30 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h @@ -320,16 +320,16 @@ void npc_cn20k_dft_rules_free(struct rvu *rvu, u16 pcifunc); int npc_cn20k_dft_rules_idx_get(struct rvu *rvu, u16 pcifunc, u16 *bcast, u16 *mcast, u16 *promisc, u16 *ucast); -void npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, - u8 intf, struct cn20k_mcam_entry *entry, - bool enable, u8 hw_prio, u8 req_kw_type); -void npc_cn20k_enable_mcam_entry(struct rvu *rvu, int blkaddr, - int index, bool enable); -void npc_cn20k_copy_mcam_entry(struct rvu *rvu, int blkaddr, - u16 src, u16 dest); -void npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, - struct cn20k_mcam_entry *entry, u8 *intf, - u8 *ena, u8 *hw_prio); +int npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, + u8 intf, struct cn20k_mcam_entry *entry, + bool enable, u8 hw_prio, u8 req_kw_type); +int npc_cn20k_enable_mcam_entry(struct rvu *rvu, int blkaddr, + int index, bool enable); +int npc_cn20k_copy_mcam_entry(struct rvu *rvu, int blkaddr, + u16 src, u16 dest); +int npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, + struct cn20k_mcam_entry *entry, u8 *intf, + u8 *ena, u8 *hw_prio); void npc_cn20k_clear_mcam_entry(struct rvu *rvu, int blkaddr, int bank, int index); int npc_mcam_idx_2_key_type(struct rvu *rvu, u16 mcam_idx, u8 *key_type); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index c2ca5ed1d028a4..ecaf0946b85207 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -241,7 +241,10 @@ void npc_enable_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam, if (index < 0 || index >= mcam->banksize * mcam->banks) return; - return npc_cn20k_enable_mcam_entry(rvu, blkaddr, index, enable); + if (npc_cn20k_enable_mcam_entry(rvu, blkaddr, index, enable)) + dev_err(rvu->dev, "Error to %s mcam %u entry\n", + enable ? "enable" : "disable", index); + return; } index &= (mcam->banksize - 1); @@ -589,8 +592,8 @@ void npc_read_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam, NPC_AF_MCAMEX_BANKX_CFG(src, sbank)) & 1; } -static void npc_copy_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam, - int blkaddr, u16 src, u16 dest) +static int npc_copy_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam, + int blkaddr, u16 src, u16 dest) { int dbank = npc_get_bank(mcam, dest); int sbank = npc_get_bank(mcam, src); @@ -630,6 +633,7 @@ static void npc_copy_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam, NPC_AF_MCAMEX_BANKX_CFG(src, sbank)); rvu_write64(rvu, blkaddr, NPC_AF_MCAMEX_BANKX_CFG(dest, dbank), cfg); + return 0; } u64 npc_get_mcam_action(struct rvu *rvu, struct npc_mcam *mcam, @@ -3266,7 +3270,10 @@ int rvu_mbox_handler_npc_mcam_shift_entry(struct rvu *rvu, npc_enable_mcam_entry(rvu, mcam, blkaddr, new_entry, false); /* Copy rule from old entry to new entry */ - npc_copy_mcam_entry(rvu, mcam, blkaddr, old_entry, new_entry); + if (npc_copy_mcam_entry(rvu, mcam, blkaddr, old_entry, new_entry)) { + rc = NPC_MCAM_INVALID_REQ; + break; + } /* Copy counter mapping, if any */ cntr = mcam->entry2cntr_map[old_entry]; @@ -3284,7 +3291,8 @@ int rvu_mbox_handler_npc_mcam_shift_entry(struct rvu *rvu, /* If shift has failed then report the failed index */ if (index != req->shift_count) { - rc = NPC_MCAM_PERM_DENIED; + if (!rc) + rc = NPC_MCAM_PERM_DENIED; rsp->failed_entry_idx = index; } diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c index b45798d9fdabd3..fe10554b1f0e24 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c @@ -1980,13 +1980,15 @@ static int npc_update_dmac_value(struct rvu *rvu, int npcblkaddr, ether_addr_copy(rule->packet.dmac, pfvf->mac_addr); - if (is_cn20k(rvu->pdev)) - npc_cn20k_read_mcam_entry(rvu, npcblkaddr, rule->entry, - cn20k_entry, &intf, - &enable, &hw_prio); - else + if (is_cn20k(rvu->pdev)) { + if (npc_cn20k_read_mcam_entry(rvu, npcblkaddr, rule->entry, + cn20k_entry, &intf, + &enable, &hw_prio)) + return -EINVAL; + } else { npc_read_mcam_entry(rvu, mcam, npcblkaddr, rule->entry, entry, &intf, &enable); + } npc_update_entry(rvu, NPC_DMAC, &mdata, ether_addr_to_u64(pfvf->mac_addr), 0, @@ -2038,8 +2040,12 @@ void npc_mcam_enable_flows(struct rvu *rvu, u16 target) continue; } - if (rule->vfvlan_cfg) - npc_update_dmac_value(rvu, blkaddr, rule, pfvf); + if (rule->vfvlan_cfg) { + if (npc_update_dmac_value(rvu, blkaddr, rule, pfvf)) + dev_err(rvu->dev, + "Update dmac failed for %u, target=%#x\n", + rule->entry, target); + } if (rule->rx_action.op == NIX_RX_ACTION_DEFAULT) { if (!def_ucast_rule) From 1100af13fd14b523f1b0634c14be497b41c78958 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:14 +0530 Subject: [PATCH 162/972] octeontx2-af: npc: cn20k: Drop debugfs_create_file() error checks in init debugfs is not intended to be checked for allocation failures the way other kernel APIs are: callers should not fail probe or subsystem init because a debugfs node could not be created, including when debugfs is disabled in Kconfig. Replacing NULL checks with IS_ERR() checks is similarly wrong for optional debugfs. Remove dentry checks and -EFAULT returns from npc_cn20k_debugfs_init(). See: https://staticthinking.wordpress.com/2023/07/24/ debugfs-functions-are-not-supposed-to-be-checked/ Cc: Dan Carpenter Fixes: 528530dff56b ("octeontx2-af: npc: cn20k: add debugfs support") Link: https://lore.kernel.org/netdev/adjNGPWKMOk3KgWL@stanley.mountain/ Reviewed-by: Simon Horman Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-3-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- .../marvell/octeontx2/af/cn20k/debugfs.c | 33 ++++++------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/debugfs.c index 3debf2fae1a48e..6f13296303cb54 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/debugfs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/debugfs.c @@ -249,34 +249,21 @@ DEFINE_SHOW_ATTRIBUTE(npc_defrag); int npc_cn20k_debugfs_init(struct rvu *rvu) { struct npc_priv_t *npc_priv = npc_priv_get(); - struct dentry *npc_dentry; - npc_dentry = debugfs_create_file("mcam_layout", 0444, rvu->rvu_dbg.npc, - npc_priv, &npc_mcam_layout_fops); + debugfs_create_file("mcam_layout", 0444, rvu->rvu_dbg.npc, + npc_priv, &npc_mcam_layout_fops); - if (!npc_dentry) - return -EFAULT; + debugfs_create_file("mcam_default", 0444, rvu->rvu_dbg.npc, + rvu, &npc_mcam_default_fops); - npc_dentry = debugfs_create_file("mcam_default", 0444, rvu->rvu_dbg.npc, - rvu, &npc_mcam_default_fops); + debugfs_create_file("vidx2idx", 0444, rvu->rvu_dbg.npc, + npc_priv, &npc_vidx2idx_map_fops); - if (!npc_dentry) - return -EFAULT; + debugfs_create_file("idx2vidx", 0444, rvu->rvu_dbg.npc, + npc_priv, &npc_idx2vidx_map_fops); - npc_dentry = debugfs_create_file("vidx2idx", 0444, rvu->rvu_dbg.npc, - npc_priv, &npc_vidx2idx_map_fops); - if (!npc_dentry) - return -EFAULT; - - npc_dentry = debugfs_create_file("idx2vidx", 0444, rvu->rvu_dbg.npc, - npc_priv, &npc_idx2vidx_map_fops); - if (!npc_dentry) - return -EFAULT; - - npc_dentry = debugfs_create_file("defrag", 0444, rvu->rvu_dbg.npc, - npc_priv, &npc_defrag_fops); - if (!npc_dentry) - return -EFAULT; + debugfs_create_file("defrag", 0444, rvu->rvu_dbg.npc, + npc_priv, &npc_defrag_fops); return 0; } From adb5ff41efbc0a9d86fabf880076973379db6e49 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:15 +0530 Subject: [PATCH 163/972] octeontx2-af: npc: cn20k: Propagate errors in defrag MCAM alloc rollback npc_defrag_alloc_free_slots() allocates MCAM indexes in up to two passes on bank0 then bank1. On failure it rolls back by freeing entries already placed in save[]. __npc_subbank_alloc() can return a negative errno while only part of the indexes are valid. The rollback loop used rc for npc_mcam_idx_2_subbank_idx() as well, so a successful lookup stored zero in rc and a later __npc_subbank_free() failure could still end with return 0 when the allocation path had also left rc at zero (for example shortfall after zero return values from the alloc helpers). Jump to the rollback path immediately when either __npc_subbank_alloc() call fails, preserving its errno. If both calls succeed but the total allocated count is still less than cnt, set rc to -ENOSPC before rollback. Use a separate err variable for npc_mcam_idx_2_subbank_idx() so a successful lookup no longer clears a non-zero rc from the allocation phase. Cc: Dan Carpenter Fixes: 645c6e3c1999 ("octeontx2-af: npc: cn20k: virtual index support") Link: https://lore.kernel.org/netdev/adjNJEpILRZATB2N@stanley.mountain/ Reviewed-by: Simon Horman Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-4-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/marvell/octeontx2/af/cn20k/npc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c index 7170dcf2620071..87da43088b675f 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c @@ -2338,6 +2338,7 @@ static int __npc_subbank_alloc(struct rvu *rvu, struct npc_subbank *sb, __npc_subbank_mark_free(rvu, sb); err1: kfree(save); + *alloc_cnt = 0; return rc; } @@ -3515,7 +3516,7 @@ static int npc_defrag_alloc_free_slots(struct rvu *rvu, { int alloc_cnt1, alloc_cnt2; struct npc_subbank *sb; - int rc, sb_off, i; + int rc, sb_off, i, err; bool deleted; sb = &npc_priv.sb[f->idx]; @@ -3529,6 +3530,7 @@ static int npc_defrag_alloc_free_slots(struct rvu *rvu, NPC_MCAM_LOWER_PRIO, false, cnt, save, cnt, true, &alloc_cnt1); + if (alloc_cnt1 < cnt) { rc = __npc_subbank_alloc(rvu, sb, NPC_MCAM_KEY_X2, sb->b1b, @@ -3544,15 +3546,17 @@ static int npc_defrag_alloc_free_slots(struct rvu *rvu, dev_err(rvu->dev, "%s: Failed to alloc cnt=%u alloc_cnt1=%u alloc_cnt2=%u\n", __func__, cnt, alloc_cnt1, alloc_cnt2); + rc = -ENOSPC; goto fail_free_alloc; } + return 0; fail_free_alloc: for (i = 0; i < alloc_cnt1 + alloc_cnt2; i++) { - rc = npc_mcam_idx_2_subbank_idx(rvu, save[i], - &sb, &sb_off); - if (rc) { + err = npc_mcam_idx_2_subbank_idx(rvu, save[i], + &sb, &sb_off); + if (err) { dev_err(rvu->dev, "%s: Error to find subbank for mcam idx=%u\n", __func__, save[i]); From d7e5940c4c508df73b15d9bc29628a83b3674fff Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:16 +0530 Subject: [PATCH 164/972] octeontx2-af: npc: cn20k: Fix target map and rule npc_defrag_move_vdx_to_free() disables, copies, and enables the MCAM entry at a new index but previously left entry2target_pffunc[] and the mcam_rules list still keyed to the old index. Copy the target PF association to the new slot, clear the old one, and retarget the rule entry so software state matches the relocated hardware context. Fixes: 645c6e3c1999 ("octeontx2-af: npc: cn20k: virtual index support") Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-5-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- .../ethernet/marvell/octeontx2/af/cn20k/npc.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c index 87da43088b675f..70ce3f49adc1a8 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c @@ -3602,9 +3602,10 @@ int npc_defrag_move_vdx_to_free(struct rvu *rvu, struct npc_defrag_node *v, int cnt, u16 *save) { + u16 new_midx, old_midx, vidx, target_pf; struct npc_mcam *mcam = &rvu->hw->mcam; + struct rvu_npc_mcam_rule *rule, *tmp; int i, vidx_cnt, rc, sb_off; - u16 new_midx, old_midx, vidx; struct npc_subbank *sb; bool deleted; u16 pcifunc; @@ -3723,8 +3724,21 @@ int npc_defrag_move_vdx_to_free(struct rvu *rvu, mcam->entry2pfvf_map[new_midx] = pcifunc; /* Counter is not preserved */ mcam->entry2cntr_map[new_midx] = new_midx; + target_pf = mcam->entry2target_pffunc[old_midx]; + mcam->entry2target_pffunc[new_midx] = target_pf; + mcam->entry2target_pffunc[old_midx] = NPC_MCAM_INVALID_MAP; + npc_mcam_set_bit(mcam, new_midx); + /* Note: list order is not functionally required for mcam_rules */ + list_for_each_entry_safe(rule, tmp, &mcam->mcam_rules, list) { + if (rule->entry != old_midx) + continue; + + rule->entry = new_midx; + break; + } + /* Mark as invalid */ v->vidx[vidx_cnt - i - 1] = -1; save[cnt - i - 1] = -1; From d2dabf09632c84b7acdc0fb2eeb6b6fe9c0f9106 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:17 +0530 Subject: [PATCH 165/972] octeontx2-af: npc: cn20k: Clear MCAM entries by index and key width Replace the old four-argument CN20K MCAM clear with a per-bank static helper and npc_cn20k_clear_mcam_entry() that takes a logical MCAM index, resolves the key width via npc_mcam_idx_2_key_type(), and clears either one bank (X2) or every bank (X4). Call it from npc_clear_mcam_entry() on cn20k and log when key-type lookup fails. Use the per-bank helper from npc_cn20k_config_mcam_entry() for pre-program clears. For loopback VFs, use the promisc MCAM index as ucast_idx when copying RSS action for promisc, matching cn20k default-rule layout. Cc: Suman Ghosh Fixes: 6d1e70282f76 ("octeontx2-af: npc: cn20k: Use common APIs") Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-6-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- .../ethernet/marvell/octeontx2/af/cn20k/npc.c | 37 ++++++++++++++++--- .../ethernet/marvell/octeontx2/af/cn20k/npc.h | 3 +- .../ethernet/marvell/octeontx2/af/rvu_npc.c | 17 ++++++++- 3 files changed, 48 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c index 70ce3f49adc1a8..112c37c190b13d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c @@ -842,8 +842,8 @@ npc_cn20k_enable_mcam_entry(struct rvu *rvu, int blkaddr, return 0; } -void -npc_cn20k_clear_mcam_entry(struct rvu *rvu, int blkaddr, int bank, int index) +static void +npc_clear_x2_entry(struct rvu *rvu, int blkaddr, int bank, int index) { rvu_write64(rvu, blkaddr, NPC_AF_CN20K_MCAMEX_BANKX_CAMX_INTF_EXT(index, bank, 1), @@ -877,6 +877,33 @@ npc_cn20k_clear_mcam_entry(struct rvu *rvu, int blkaddr, int bank, int index) NPC_AF_CN20K_MCAMEX_BANKX_STAT_EXT(index, bank), 0); } +int +npc_cn20k_clear_mcam_entry(struct rvu *rvu, int blkaddr, int mcam_idx) +{ + struct npc_mcam *mcam = &rvu->hw->mcam; + int bank = npc_get_bank(mcam, mcam_idx); + u8 kw_type; + int index; + + if (npc_mcam_idx_2_key_type(rvu, mcam_idx, &kw_type)) + return -EINVAL; + + index = mcam_idx & (mcam->banksize - 1); + + if (kw_type == NPC_MCAM_KEY_X2) { + npc_clear_x2_entry(rvu, blkaddr, bank, index); + return 0; + } + + /* For NPC_MCAM_KEY_X4 keys, both the banks + * need to be programmed with the same value. + */ + for (bank = 0; bank < mcam->banks_per_entry; bank++) + npc_clear_x2_entry(rvu, blkaddr, bank, index); + + return 0; +} + static void npc_cn20k_get_keyword(struct cn20k_mcam_entry *entry, int idx, u64 *cam0, u64 *cam1) { @@ -1071,7 +1098,7 @@ int npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, */ if (kw_type == NPC_MCAM_KEY_X2) { /* Clear mcam entry to avoid writes being suppressed by NPC */ - npc_cn20k_clear_mcam_entry(rvu, blkaddr, bank, mcam_idx); + npc_clear_x2_entry(rvu, blkaddr, bank, mcam_idx); npc_cn20k_config_kw_x2(rvu, mcam, blkaddr, mcam_idx, intf, entry, bank, kw_type, kw, req_kw_type); @@ -1096,8 +1123,8 @@ int npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, } /* Clear mcam entry to avoid writes being suppressed by NPC */ - npc_cn20k_clear_mcam_entry(rvu, blkaddr, 0, mcam_idx); - npc_cn20k_clear_mcam_entry(rvu, blkaddr, 1, mcam_idx); + npc_clear_x2_entry(rvu, blkaddr, 0, mcam_idx); + npc_clear_x2_entry(rvu, blkaddr, 1, mcam_idx); npc_cn20k_config_kw_x4(rvu, mcam, blkaddr, mcam_idx, intf, entry, diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h index 8f3eea9cfb1d30..2f761b97f91b11 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h @@ -330,8 +330,7 @@ int npc_cn20k_copy_mcam_entry(struct rvu *rvu, int blkaddr, int npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, struct cn20k_mcam_entry *entry, u8 *intf, u8 *ena, u8 *hw_prio); -void npc_cn20k_clear_mcam_entry(struct rvu *rvu, int blkaddr, - int bank, int index); +int npc_cn20k_clear_mcam_entry(struct rvu *rvu, int blkaddr, int index); int npc_mcam_idx_2_key_type(struct rvu *rvu, u16 mcam_idx, u8 *key_type); u16 npc_cn20k_vidx2idx(u16 index); u16 npc_cn20k_idx2vidx(u16 idx); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index ecaf0946b85207..44ca65efc80f09 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -261,6 +261,13 @@ static void npc_clear_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam, int bank = npc_get_bank(mcam, index); int actbank = bank; + if (is_cn20k(rvu->pdev)) { + if (npc_cn20k_clear_mcam_entry(rvu, blkaddr, index)) + dev_err(rvu->dev, "%s Failed to clear mcam %u\n", + __func__, index); + return; + } + index &= (mcam->banksize - 1); for (; bank < (actbank + mcam->banks_per_entry); bank++) { rvu_write64(rvu, blkaddr, @@ -755,9 +762,15 @@ void rvu_npc_install_promisc_entry(struct rvu *rvu, u16 pcifunc, /* If the corresponding PF's ucast action is RSS, * use the same action for promisc also + * Please note that for lbk(s) "index" and "ucast_idx" + * will be same. */ - ucast_idx = npc_get_nixlf_mcam_index(mcam, pcifunc, - nixlf, NIXLF_UCAST_ENTRY); + if (is_lbk_vf(rvu, pcifunc)) + ucast_idx = index; + else + ucast_idx = npc_get_nixlf_mcam_index(mcam, pcifunc, + nixlf, NIXLF_UCAST_ENTRY); + if (is_mcam_entry_enabled(rvu, mcam, blkaddr, ucast_idx)) *(u64 *)&action = npc_get_mcam_action(rvu, mcam, blkaddr, ucast_idx); From 2b6d6bb7282c34dd8c04ee782393231acf5a26e2 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:18 +0530 Subject: [PATCH 166/972] octeontx2-af: npc: cn20k: Fix bank value For X4 keys its loop reused the bank parameter as the loop counter, so bank no longer reflected the caller's bank after the loop and the control flow was hard to follow. Program NPC_AF_CN20K_MCAMEX_BANKX_CFG_EXT directly in npc_cn20k_config_mcam_entry(): one CFG write for X2 using the computed bank, and one CFG write per bank inside the X4 action loop. Enable the entry at the end with npc_cn20k_enable_mcam_entry(..., true) instead of embedding the enable bit in bank_cfg via the removed helper. Cc: Suman Ghosh Fixes: 4e527f1e5c15 ("octeontx2-af: npc: cn20k: Add new mailboxes for CN20K silicon") Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-7-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- .../ethernet/marvell/octeontx2/af/cn20k/npc.c | 92 ++++++++----------- 1 file changed, 37 insertions(+), 55 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c index 112c37c190b13d..4773277fd40988 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c @@ -1045,34 +1045,6 @@ static void npc_cn20k_config_kw_x4(struct rvu *rvu, struct npc_mcam *mcam, kw, req_kw_type); } -static void -npc_cn20k_set_mcam_bank_cfg(struct rvu *rvu, int blkaddr, int mcam_idx, - int bank, u8 kw_type, bool enable, u8 hw_prio) -{ - struct npc_mcam *mcam = &rvu->hw->mcam; - u64 bank_cfg; - - bank_cfg = (u64)hw_prio << 24; - if (enable) - bank_cfg |= 0x1; - - if (kw_type == NPC_MCAM_KEY_X2) { - rvu_write64(rvu, blkaddr, - NPC_AF_CN20K_MCAMEX_BANKX_CFG_EXT(mcam_idx, bank), - bank_cfg); - return; - } - - /* For NPC_MCAM_KEY_X4 keys, both the banks - * need to be programmed with the same value. - */ - for (bank = 0; bank < mcam->banks_per_entry; bank++) { - rvu_write64(rvu, blkaddr, - NPC_AF_CN20K_MCAMEX_BANKX_CFG_EXT(mcam_idx, bank), - bank_cfg); - } -} - int npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, u8 intf, struct cn20k_mcam_entry *entry, bool enable, u8 hw_prio, u8 req_kw_type) @@ -1080,6 +1052,7 @@ int npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, struct npc_mcam *mcam = &rvu->hw->mcam; int mcam_idx = index % mcam->banksize; int bank = index / mcam->banksize; + u64 bank_cfg = (u64)hw_prio << 24; int kw = 0; u8 kw_type; @@ -1119,41 +1092,50 @@ int npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(mcam_idx, bank, 1), entry->vtag_action); - goto set_cfg; - } - /* Clear mcam entry to avoid writes being suppressed by NPC */ - npc_clear_x2_entry(rvu, blkaddr, 0, mcam_idx); - npc_clear_x2_entry(rvu, blkaddr, 1, mcam_idx); - - npc_cn20k_config_kw_x4(rvu, mcam, blkaddr, - mcam_idx, intf, entry, - kw_type, req_kw_type); - for (bank = 0; bank < mcam->banks_per_entry; bank++) { - /* Set 'action' */ + /* Set HW priority */ rvu_write64(rvu, blkaddr, - NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(mcam_idx, - bank, 0), - entry->action); + NPC_AF_CN20K_MCAMEX_BANKX_CFG_EXT(mcam_idx, bank), + bank_cfg); - /* Set TAG 'action' */ - rvu_write64(rvu, blkaddr, - NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(mcam_idx, - bank, 1), - entry->vtag_action); + } else { + /* Clear mcam entry to avoid writes being suppressed by NPC */ + npc_clear_x2_entry(rvu, blkaddr, 0, mcam_idx); + npc_clear_x2_entry(rvu, blkaddr, 1, mcam_idx); - /* Set 'action2' for inline receive */ - rvu_write64(rvu, blkaddr, - NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(mcam_idx, - bank, 2), - entry->action2); + npc_cn20k_config_kw_x4(rvu, mcam, blkaddr, + mcam_idx, intf, entry, + kw_type, req_kw_type); + for (bank = 0; bank < mcam->banks_per_entry; bank++) { + /* Set 'action' */ + rvu_write64(rvu, blkaddr, + NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(mcam_idx, + bank, 0), + entry->action); + + /* Set TAG 'action' */ + rvu_write64(rvu, blkaddr, + NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(mcam_idx, + bank, 1), + entry->vtag_action); + + /* Set 'action2' for inline receive */ + rvu_write64(rvu, blkaddr, + NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(mcam_idx, + bank, 2), + entry->action2); + + /* Set HW priority */ + rvu_write64(rvu, blkaddr, + NPC_AF_CN20K_MCAMEX_BANKX_CFG_EXT(mcam_idx, bank), + bank_cfg); + } } -set_cfg: /* TODO: */ /* PF installing VF rule */ - npc_cn20k_set_mcam_bank_cfg(rvu, blkaddr, mcam_idx, bank, - kw_type, enable, hw_prio); + if (npc_cn20k_enable_mcam_entry(rvu, blkaddr, index, enable)) + return -EINVAL; return 0; } From f6803eb070bfb9a5114d16ae15053106bc7842ae Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:19 +0530 Subject: [PATCH 167/972] octeontx2-af: npc: cn20k: Fix MCAM actions read npc_cn20k_read_mcam_entry() always reloaded action and vtag_action from bank 0 after programming the CAM words. Use the bank returned by npc_get_bank() for the ACTION reads as well, and read those registers once up front so both X2 and X4 paths share the same metadata. Return directly from the X2 keyword path now that the action fields are already populated. Cc: Suman Ghosh Fixes: 6d1e70282f76 ("octeontx2-af: npc: cn20k: Use common APIs") Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-8-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- .../ethernet/marvell/octeontx2/af/cn20k/npc.c | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c index 4773277fd40988..bb0a9ac7aab3ba 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c @@ -1219,6 +1219,18 @@ int npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, bank = npc_get_bank(mcam, index); index &= (mcam->banksize - 1); + cfg = rvu_read64(rvu, blkaddr, + NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(index, bank, 0)); + entry->action = cfg; + + cfg = rvu_read64(rvu, blkaddr, + NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(index, bank, 1)); + entry->vtag_action = cfg; + + cfg = rvu_read64(rvu, blkaddr, + NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(index, bank, 2)); + entry->action2 = cfg; + cfg = rvu_read64(rvu, blkaddr, NPC_AF_CN20K_MCAMEX_BANKX_CAMX_INTF_EXT(index, bank, 1)) & 3; @@ -1268,7 +1280,7 @@ int npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, bank, 0)); npc_cn20k_fill_entryword(entry, kw + 3, cam0, cam1); - goto read_action; + return 0; } for (bank = 0; bank < mcam->banks_per_entry; bank++, kw = kw + 4) { @@ -1313,18 +1325,6 @@ int npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, npc_cn20k_fill_entryword(entry, kw + 3, cam0, cam1); } -read_action: - /* 'action' is set to same value for both bank '0' and '1'. - * Hence, reading bank '0' should be enough. - */ - cfg = rvu_read64(rvu, blkaddr, - NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(index, 0, 0)); - entry->action = cfg; - - cfg = rvu_read64(rvu, blkaddr, - NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(index, 0, 1)); - entry->vtag_action = cfg; - return 0; } From afb474bd4ffc314de766afc295ac64b42856f48e Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:20 +0530 Subject: [PATCH 168/972] octeontx2-af: npc: cn20k: Initialize default-rule index outputs up front npc_cn20k_dft_rules_idx_get() wrote USHRT_MAX into individual outputs only on some error paths (lbk promisc lookup, VF ucast lookup, and the PF rule walk), which could leave other caller slots stale across retries. Set every non-NULL bcast/mcast/promisc/ucast pointer to USHRT_MAX once at entry, then drop the duplicate assignments on failure. Successful lookups still overwrite the relevant slot before returning. Fixes: 09d3b7a1403f ("octeontx2-af: npc: cn20k: Allocate default MCAM indexes") Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-9-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c index bb0a9ac7aab3ba..b3f34b84c11499 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c @@ -4016,6 +4016,13 @@ int npc_cn20k_dft_rules_idx_get(struct rvu *rvu, u16 pcifunc, u16 *bcast, void *val; int i, j; + for (i = 0; i < ARRAY_SIZE(ptr); i++) { + if (!ptr[i]) + continue; + + *ptr[i] = USHRT_MAX; + } + if (!npc_priv.init_done) return 0; @@ -4031,7 +4038,6 @@ int npc_cn20k_dft_rules_idx_get(struct rvu *rvu, u16 pcifunc, u16 *bcast, npc_dft_rule_name[NPC_DFT_RULE_PROMISC_ID], pcifunc); - *ptr[0] = USHRT_MAX; return -ESRCH; } @@ -4051,7 +4057,6 @@ int npc_cn20k_dft_rules_idx_get(struct rvu *rvu, u16 pcifunc, u16 *bcast, npc_dft_rule_name[NPC_DFT_RULE_UCAST_ID], pcifunc); - *ptr[3] = USHRT_MAX; return -ESRCH; } @@ -4071,7 +4076,6 @@ int npc_cn20k_dft_rules_idx_get(struct rvu *rvu, u16 pcifunc, u16 *bcast, __func__, npc_dft_rule_name[i], pcifunc); - *ptr[j] = USHRT_MAX; continue; } From 013717353c03b65f5b00a5cefa1515b6b45777b7 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:21 +0530 Subject: [PATCH 169/972] octeontx2-af: npc: cn20k: Tear down default MCAM rules explicitly on free npc_cn20k_dft_rules_free() used the NPC MCAM mbox "free all" path, which does not match how cn20k tracks default-rule MCAM slots indexes. Resolve the default-rule indices, then for each valid slot clear the bitmap entry, drop the PF/VF map, disable the MCAM line, clear the target function, and npc_cn20k_idx_free(). Remove any matching software mcam_rules nodes. On hard failure from idx_free, WARN and stop so the box stays up for analysis. In npc_mcam_free_all_entries(), prefetch the same default-rule indices and, on cn20k, skip bitmap clear and idx_free when the scanned entry is one of those reserved defaults (they are released by npc_cn20k_dft_rules_free). Fixes: 09d3b7a1403f ("octeontx2-af: npc: cn20k: Allocate default MCAM indexes") Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-10-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- .../ethernet/marvell/octeontx2/af/cn20k/npc.c | 51 ++++++++++++---- .../ethernet/marvell/octeontx2/af/rvu_npc.c | 59 +++++++++++++------ 2 files changed, 82 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c index b3f34b84c11499..1129565a01bdf4 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c @@ -4178,11 +4178,11 @@ static bool npc_is_cgx_or_lbk(struct rvu *rvu, u16 pcifunc) void npc_cn20k_dft_rules_free(struct rvu *rvu, u16 pcifunc) { - struct npc_mcam_free_entry_req free_req = { 0 }; + struct npc_mcam *mcam = &rvu->hw->mcam; + u16 ptr[4] = {[0 ... 3] = USHRT_MAX}; + struct rvu_npc_mcam_rule *rule, *tmp; unsigned long index; - struct msg_rsp rsp; - u16 ptr[4]; - int rc, i; + int blkaddr, rc, i; void *map; if (!npc_priv.init_done) @@ -4240,14 +4240,43 @@ void npc_cn20k_dft_rules_free(struct rvu *rvu, u16 pcifunc) } free_rules: + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0); + if (blkaddr < 0) + return; + for (int i = 0; i < 4; i++) { + if (ptr[i] == USHRT_MAX) + continue; - free_req.hdr.pcifunc = pcifunc; - free_req.all = 1; - rc = rvu_mbox_handler_npc_mcam_free_entry(rvu, &free_req, &rsp); - if (rc) - dev_err(rvu->dev, - "%s: Error deleting default entries (pcifunc=%#x\n", - __func__, pcifunc); + mutex_lock(&mcam->lock); + npc_mcam_clear_bit(mcam, ptr[i]); + mcam->entry2pfvf_map[ptr[i]] = NPC_MCAM_INVALID_MAP; + npc_cn20k_enable_mcam_entry(rvu, blkaddr, ptr[i], false); + mcam->entry2target_pffunc[ptr[i]] = 0x0; + mutex_unlock(&mcam->lock); + + rc = npc_cn20k_idx_free(rvu, &ptr[i], 1); + if (rc) { + /* Non recoverable error. Let us WARN and return. Keep system alive to + * enable debugging + */ + WARN(1, "%s Error deleting default entries (pcifunc=%#x) mcam_idx=%u\n", + __func__, pcifunc, ptr[i]); + return; + } + } + + mutex_lock(&mcam->lock); + list_for_each_entry_safe(rule, tmp, &mcam->mcam_rules, list) { + for (int i = 0; i < 4; i++) { + if (ptr[i] != rule->entry) + continue; + + list_del(&rule->list); + kfree(rule); + break; + } + } + mutex_unlock(&mcam->lock); } int npc_cn20k_dft_rules_alloc(struct rvu *rvu, u16 pcifunc) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index 44ca65efc80f09..5d349d131fdb63 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -2521,33 +2521,58 @@ void npc_mcam_clear_bit(struct npc_mcam *mcam, u16 index) static void npc_mcam_free_all_entries(struct rvu *rvu, struct npc_mcam *mcam, int blkaddr, u16 pcifunc) { + u16 dft_idxs[NPC_DFT_RULE_MAX_ID] = {[0 ... NPC_DFT_RULE_MAX_ID - 1] = USHRT_MAX}; + bool cn20k_dft_rl; u16 index, cntr; int rc; + npc_cn20k_dft_rules_idx_get(rvu, pcifunc, + &dft_idxs[NPC_DFT_RULE_BCAST_ID], + &dft_idxs[NPC_DFT_RULE_MCAST_ID], + &dft_idxs[NPC_DFT_RULE_PROMISC_ID], + &dft_idxs[NPC_DFT_RULE_UCAST_ID]); + /* Scan all MCAM entries and free the ones mapped to 'pcifunc' */ for (index = 0; index < mcam->bmap_entries; index++) { - if (mcam->entry2pfvf_map[index] == pcifunc) { + if (mcam->entry2pfvf_map[index] != pcifunc) + continue; + + cn20k_dft_rl = false; + + if (is_cn20k(rvu->pdev)) { + if (dft_idxs[NPC_DFT_RULE_BCAST_ID] == index || + dft_idxs[NPC_DFT_RULE_MCAST_ID] == index || + dft_idxs[NPC_DFT_RULE_PROMISC_ID] == index || + dft_idxs[NPC_DFT_RULE_UCAST_ID] == index) { + cn20k_dft_rl = true; + } + } + + /* Disable the entry */ + npc_enable_mcam_entry(rvu, mcam, blkaddr, index, false); + + if (!cn20k_dft_rl) { mcam->entry2pfvf_map[index] = NPC_MCAM_INVALID_MAP; /* Free the entry in bitmap */ npc_mcam_clear_bit(mcam, index); - /* Disable the entry */ - npc_enable_mcam_entry(rvu, mcam, blkaddr, index, false); - - /* Update entry2counter mapping */ - cntr = mcam->entry2cntr_map[index]; - if (cntr != NPC_MCAM_INVALID_MAP) - npc_unmap_mcam_entry_and_cntr(rvu, mcam, - blkaddr, index, - cntr); mcam->entry2target_pffunc[index] = 0x0; - if (is_cn20k(rvu->pdev)) { - rc = npc_cn20k_idx_free(rvu, &index, 1); - if (rc) - dev_err(rvu->dev, - "Failed to free mcam idx=%u pcifunc=%#x\n", - index, pcifunc); - } } + + /* Update entry2counter mapping */ + cntr = mcam->entry2cntr_map[index]; + if (cntr != NPC_MCAM_INVALID_MAP) + npc_unmap_mcam_entry_and_cntr(rvu, mcam, + blkaddr, index, + cntr); + + if (!is_cn20k(rvu->pdev) || cn20k_dft_rl) + continue; + + rc = npc_cn20k_idx_free(rvu, &index, 1); + if (rc) + dev_err(rvu->dev, + "Failed to free mcam idx=%u pcifunc=%#x\n", + index, pcifunc); } } From bc968f61bf0ad4f085559e5e3d168105fdf88204 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:22 +0530 Subject: [PATCH 170/972] octeontx2-af: npc: cn20k: Reject missing default-rule MCAM indices When cn20k default L2 rules are not installed, npc_cn20k_dft_rules_idx_get() leaves broadcast, multicast, promiscuous, and unicast slots at USHRT_MAX. npc_get_nixlf_mcam_index() previously returned that sentinel as a valid MCAM index, so callers could program hardware with an invalid index. Return -EINVAL from the cn20k branches of npc_get_nixlf_mcam_index() when the requested slot is still USHRT_MAX. Harden cn20k NPC MCAM entry helpers to reject out-of-range indices before touching hardware. Drop the early bounds check in npc_enable_mcam_entry() for cn20k so invalid indices are validated inside npc_cn20k_enable_mcam_entry() instead of being silently ignored. In rvu_npc_update_flowkey_alg_idx(), treat negative MCAM indices like out-of-range values, and only update RSS actions for promiscuous and all-multi paths when the resolved index is non-negative. Cc: Suman Ghosh Fixes: 6d1e70282f76 ("octeontx2-af: npc: cn20k: Use common APIs") Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-11-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- .../ethernet/marvell/octeontx2/af/cn20k/npc.c | 14 +- .../ethernet/marvell/octeontx2/af/cn20k/npc.h | 1 + .../ethernet/marvell/octeontx2/af/rvu_nix.c | 3 + .../ethernet/marvell/octeontx2/af/rvu_npc.c | 137 +++++++++++++++++- .../marvell/octeontx2/af/rvu_npc_fs.c | 10 +- 5 files changed, 155 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c index 1129565a01bdf4..6b3f453fd5004e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c @@ -808,6 +808,9 @@ npc_cn20k_enable_mcam_entry(struct rvu *rvu, int blkaddr, u64 cfg, hw_prio; u8 kw_type; + if (index < 0 || index >= mcam->total_entries) + return -EINVAL; + if (npc_mcam_idx_2_key_type(rvu, index, &kw_type)) return -EINVAL; @@ -1056,6 +1059,9 @@ int npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, int kw = 0; u8 kw_type; + if (index < 0 || index >= mcam->total_entries) + return -EINVAL; + if (npc_mcam_idx_2_key_type(rvu, index, &kw_type)) return -EINVAL; @@ -1148,6 +1154,9 @@ int npc_cn20k_copy_mcam_entry(struct rvu *rvu, int blkaddr, u16 src, u16 dest) int bank, i, sb, db; int dbank, sbank; + if (src >= mcam->total_entries || dest >= mcam->total_entries) + return -EINVAL; + dbank = npc_get_bank(mcam, dest); sbank = npc_get_bank(mcam, src); @@ -1213,6 +1222,9 @@ int npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, int kw = 0, bank; u8 kw_type; + if (index >= mcam->total_entries) + return -EINVAL; + if (npc_mcam_idx_2_key_type(rvu, index, &kw_type)) return -EINVAL; @@ -4170,7 +4182,7 @@ int rvu_mbox_handler_npc_get_dft_rl_idxs(struct rvu *rvu, struct msg_req *req, return 0; } -static bool npc_is_cgx_or_lbk(struct rvu *rvu, u16 pcifunc) +bool npc_is_cgx_or_lbk(struct rvu *rvu, u16 pcifunc) { return is_pf_cgxmapped(rvu, rvu_get_pf(rvu->pdev, pcifunc)) || is_lbk_vf(rvu, pcifunc); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h index 2f761b97f91b11..3d5eb952cc0736 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h @@ -335,5 +335,6 @@ int npc_mcam_idx_2_key_type(struct rvu *rvu, u16 mcam_idx, u8 *key_type); u16 npc_cn20k_vidx2idx(u16 index); u16 npc_cn20k_idx2vidx(u16 idx); int npc_cn20k_defrag(struct rvu *rvu); +bool npc_is_cgx_or_lbk(struct rvu *rvu, u16 pcifunc); #endif /* NPC_CN20K_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index ef5b081162ebfb..f977734ae712c6 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -3577,6 +3577,9 @@ static int nix_update_mce_rule(struct rvu *rvu, u16 pcifunc, mcam_index = npc_get_nixlf_mcam_index(mcam, pcifunc & ~RVU_PFVF_FUNC_MASK, nixlf, type); + if (mcam_index < 0) + return -EINVAL; + err = nix_update_mce_list(rvu, pcifunc, mce_list, mce_idx, mcam_index, add); return err; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index 5d349d131fdb63..3c814d157ab988 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -163,14 +163,35 @@ int npc_get_nixlf_mcam_index(struct npc_mcam *mcam, if (rc) return -EFAULT; + if (is_lbk_vf(rvu, pcifunc)) { + if (promisc == USHRT_MAX) + return -EINVAL; + return promisc; + } + + if (is_cgx_vf(rvu, pcifunc)) { + if (ucast == USHRT_MAX) + return -EINVAL; + + return ucast; + } + switch (type) { case NIXLF_BCAST_ENTRY: + if (bcast == USHRT_MAX) + return -EINVAL; return bcast; case NIXLF_ALLMULTI_ENTRY: + if (mcast == USHRT_MAX) + return -EINVAL; return mcast; case NIXLF_PROMISC_ENTRY: + if (promisc == USHRT_MAX) + return -EINVAL; return promisc; case NIXLF_UCAST_ENTRY: + if (ucast == USHRT_MAX) + return -EINVAL; return ucast; default: return -EINVAL; @@ -238,9 +259,6 @@ void npc_enable_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam, int actbank = bank; if (is_cn20k(rvu->pdev)) { - if (index < 0 || index >= mcam->banksize * mcam->banks) - return; - if (npc_cn20k_enable_mcam_entry(rvu, blkaddr, index, enable)) dev_err(rvu->dev, "Error to %s mcam %u entry\n", enable ? "enable" : "disable", index); @@ -434,6 +452,15 @@ static u64 npc_get_default_entry_action(struct rvu *rvu, struct npc_mcam *mcam, index = npc_get_nixlf_mcam_index(mcam, pf_func, nixlf, NIXLF_UCAST_ENTRY); + + if (index < 0) { + dev_err(rvu->dev, + "%s: failed to get ucast entry pcifunc:0x%x\n", + __func__, pf_func); + /* Action 0 is drop */ + return 0; + } + bank = npc_get_bank(mcam, index); index &= (mcam->banksize - 1); @@ -700,6 +727,12 @@ void rvu_npc_install_ucast_entry(struct rvu *rvu, u16 pcifunc, index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_UCAST_ENTRY); + if (index < 0) { + dev_err(rvu->dev, + "%s: Error to get ucast entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } /* Don't change the action if entry is already enabled * Otherwise RSS action may get overwritten. @@ -755,11 +788,21 @@ void rvu_npc_install_promisc_entry(struct rvu *rvu, u16 pcifunc, index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_PROMISC_ENTRY); + /* In cn20k, default indexes are installed only for CGX mapped + * and lbk interfaces + */ if (is_cgx_vf(rvu, pcifunc)) index = npc_get_nixlf_mcam_index(mcam, pcifunc & ~RVU_PFVF_FUNC_MASK, nixlf, NIXLF_PROMISC_ENTRY); + if (index < 0) { + dev_err(rvu->dev, + "%s: Error to get promisc entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } + /* If the corresponding PF's ucast action is RSS, * use the same action for promisc also * Please note that for lbk(s) "index" and "ucast_idx" @@ -770,6 +813,12 @@ void rvu_npc_install_promisc_entry(struct rvu *rvu, u16 pcifunc, else ucast_idx = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_UCAST_ENTRY); + if (ucast_idx < 0) { + dev_err(rvu->dev, + "%s: Error to get ucast/promisc entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } if (is_mcam_entry_enabled(rvu, mcam, blkaddr, ucast_idx)) *(u64 *)&action = npc_get_mcam_action(rvu, mcam, @@ -844,6 +893,14 @@ void rvu_npc_enable_promisc_entry(struct rvu *rvu, u16 pcifunc, index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_PROMISC_ENTRY); + + if (index < 0) { + dev_err(rvu->dev, + "%s: Error to get promisc entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } + npc_enable_mcam_entry(rvu, mcam, blkaddr, index, enable); } @@ -884,6 +941,12 @@ void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc, index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_BCAST_ENTRY); + if (index < 0) { + dev_err(rvu->dev, + "%s: Error to get bcast entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } if (!hw->cap.nix_rx_multicast) { /* Early silicon doesn't support pkt replication, @@ -948,12 +1011,25 @@ void rvu_npc_install_allmulti_entry(struct rvu *rvu, u16 pcifunc, int nixlf, index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_ALLMULTI_ENTRY); + if (index < 0) { + dev_err(rvu->dev, + "%s: Error to get mcast entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } /* If the corresponding PF's ucast action is RSS, * use the same action for multicast entry also */ ucast_idx = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_UCAST_ENTRY); + if (ucast_idx < 0) { + dev_err(rvu->dev, + "%s: Error to get ucast entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } + if (is_mcam_entry_enabled(rvu, mcam, blkaddr, ucast_idx)) *(u64 *)&action = npc_get_mcam_action(rvu, mcam, blkaddr, ucast_idx); @@ -1018,6 +1094,13 @@ void rvu_npc_enable_allmulti_entry(struct rvu *rvu, u16 pcifunc, int nixlf, index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_ALLMULTI_ENTRY); + if (index < 0) { + dev_err(rvu->dev, + "%s: Error to get mcast entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } + npc_enable_mcam_entry(rvu, mcam, blkaddr, index, enable); } @@ -1130,8 +1213,12 @@ void rvu_npc_update_flowkey_alg_idx(struct rvu *rvu, u16 pcifunc, int nixlf, index = mcam_index; } - if (index >= mcam->total_entries) + if (index < 0 || index >= mcam->total_entries) { + dev_err(rvu->dev, + "%s: Invalid mcam index, pcifunc=%#x\n", + __func__, pcifunc); return; + } bank = npc_get_bank(mcam, index); index &= (mcam->banksize - 1); @@ -1175,16 +1262,18 @@ void rvu_npc_update_flowkey_alg_idx(struct rvu *rvu, u16 pcifunc, int nixlf, /* If PF's promiscuous entry is enabled, * Set RSS action for that entry as well */ - npc_update_rx_action_with_alg_idx(rvu, action, pfvf, index, - blkaddr, alg_idx); + if (index >= 0) + npc_update_rx_action_with_alg_idx(rvu, action, pfvf, index, + blkaddr, alg_idx); index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_ALLMULTI_ENTRY); /* If PF's allmulti entry is enabled, * Set RSS action for that entry as well */ - npc_update_rx_action_with_alg_idx(rvu, action, pfvf, index, - blkaddr, alg_idx); + if (index >= 0) + npc_update_rx_action_with_alg_idx(rvu, action, pfvf, index, + blkaddr, alg_idx); } } @@ -1197,12 +1286,22 @@ void npc_enadis_default_mce_entry(struct rvu *rvu, u16 pcifunc, int index, blkaddr, mce_idx; struct rvu_pfvf *pfvf; + /* multicast pkt replication is not enabled for AF's VFs & SDP links */ + if (is_lbk_vf(rvu, pcifunc) || is_sdp_pfvf(rvu, pcifunc)) + return; + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0); if (blkaddr < 0) return; index = npc_get_nixlf_mcam_index(mcam, pcifunc & ~RVU_PFVF_FUNC_MASK, nixlf, type); + if (index < 0) { + dev_err(rvu->dev, + "%s: Error to get entry for pcifunc=%#x, type=%u\n", + __func__, pcifunc, type); + return; + } /* disable MCAM entry when packet replication is not supported by hw */ if (!hw->cap.nix_rx_multicast && !is_vf(pcifunc)) { @@ -1231,6 +1330,10 @@ static void npc_enadis_default_entries(struct rvu *rvu, u16 pcifunc, struct npc_mcam *mcam = &rvu->hw->mcam; int index, blkaddr; + /* only CGX or LBK interfaces have default entries */ + if (is_cn20k(rvu->pdev) && !npc_is_cgx_or_lbk(rvu, pcifunc)) + return; + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0); if (blkaddr < 0) return; @@ -1240,6 +1343,12 @@ static void npc_enadis_default_entries(struct rvu *rvu, u16 pcifunc, pfvf->nix_rx_intf)) { index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_UCAST_ENTRY); + if (index < 0) { + dev_err(rvu->dev, + "%s: Error to get ucast entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } npc_enable_mcam_entry(rvu, mcam, blkaddr, index, enable); } @@ -3897,6 +4006,12 @@ int rvu_mbox_handler_npc_read_base_steer_rule(struct rvu *rvu, /* Read the default ucast entry if there is no pkt steering rule */ index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_UCAST_ENTRY); + if (index < 0) { + mutex_unlock(&mcam->lock); + rc = NIX_AF_ERR_AF_LF_INVALID; + goto out; + } + read_entry: /* Read the mcam entry */ npc_read_mcam_entry(rvu, mcam, blkaddr, index, &rsp->entry, &intf, @@ -3970,6 +4085,12 @@ void rvu_npc_clear_ucast_entry(struct rvu *rvu, int pcifunc, int nixlf) ucast_idx = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_UCAST_ENTRY); + if (ucast_idx < 0) { + dev_err(rvu->dev, + "%s: Error to get ucast entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } npc_enable_mcam_entry(rvu, mcam, blkaddr, ucast_idx, false); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c index fe10554b1f0e24..6ae9cdcb608b0c 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c @@ -1444,7 +1444,7 @@ static int npc_install_flow(struct rvu *rvu, int blkaddr, u16 target, struct msg_rsp write_rsp; struct mcam_entry *entry; bool new = false; - u16 entry_index; + int entry_index; int err; installed_features = req->features; @@ -1477,6 +1477,14 @@ static int npc_install_flow(struct rvu *rvu, int blkaddr, u16 target, if (req->default_rule) { entry_index = npc_get_nixlf_mcam_index(mcam, target, nixlf, NIXLF_UCAST_ENTRY); + + if (entry_index < 0) { + dev_err(rvu->dev, + "%s: Error to get ucast entry for target=%#x\n", + __func__, target); + return -EINVAL; + } + enable = is_mcam_entry_enabled(rvu, mcam, blkaddr, entry_index); } From a2e5b58811c7bb7d63f366f586cc7317f20e62e7 Mon Sep 17 00:00:00 2001 From: Avi Radinsky Date: Wed, 29 Apr 2026 18:35:23 -0400 Subject: [PATCH 171/972] Documentation: riscv: cmodx: fix typos Fix typos in the dynamic ftrace section: atmoic -> atomic (twice), pacthable -> patchable, derect -> directed. Signed-off-by: Avi Radinsky Acked-by: Randy Dunlap Link: https://patch.msgid.link/391d16fb-5f11-45fa-8f3b-1debe095695e@tennr.com Signed-off-by: Paul Walmsley --- Documentation/arch/riscv/cmodx.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/arch/riscv/cmodx.rst b/Documentation/arch/riscv/cmodx.rst index 40ba53bed5dffd..cbfa812a11b455 100644 --- a/Documentation/arch/riscv/cmodx.rst +++ b/Documentation/arch/riscv/cmodx.rst @@ -21,13 +21,13 @@ call at each patchable function entry, and patches it dynamically at runtime to enable or disable the redirection. In the case of RISC-V, 2 instructions, AUIPC + JALR, are required to compose a function call. However, it is impossible to patch 2 instructions and expect that a concurrent read-side executes them -without a race condition. This series makes atmoic code patching possible in +without a race condition. This series makes atomic code patching possible in RISC-V ftrace. Kernel preemption makes things even worse as it allows the old state to persist across the patching process with stop_machine(). In order to get rid of stop_machine() and run dynamic ftrace with full kernel preemption, we partially initialize each patchable function entry at boot-time, -setting the first instruction to AUIPC, and the second to NOP. Now, atmoic +setting the first instruction to AUIPC, and the second to NOP. Now, atomic patching is possible because the kernel only has to update one instruction. According to Ziccif, as long as an instruction is naturally aligned, the ISA guarantee an atomic update. @@ -36,8 +36,8 @@ By fixing down the first instruction, AUIPC, the range of the ftrace trampoline is limited to +-2K from the predetermined target, ftrace_caller, due to the lack of immediate encoding space in RISC-V. To address the issue, we introduce CALL_OPS, where an 8B naturally align metadata is added in front of each -pacthable function. The metadata is resolved at the first trampoline, then the -execution can be derect to another custom trampoline. +patchable function. The metadata is resolved at the first trampoline, then the +execution can be directed to another custom trampoline. CMODX in the User Space ----------------------- From 4d2b03699460b8fd5df34408a03a84a1a7ff8aa1 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Thu, 9 Apr 2026 09:11:39 +0000 Subject: [PATCH 172/972] riscv: errata: Fix bitwise vs logical AND in MIPS errata patching The condition checking whether a specific errata needs patching uses logical AND (&&) instead of bitwise AND (&). Since logical AND only checks that both operands are non-zero, this causes all errata patches to be applied whenever any single errata is detected, rather than only applying the matching one. The SiFive errata implementation correctly uses bitwise AND for the same check. Fixes: 0b0ca959d206 ("riscv: errata: Fix the PAUSE Opcode for MIPS P8700") Signed-off-by: Michael Neuling Assisted-by: Cursor:claude-4.6-opus-high-thinking Link: https://patch.msgid.link/20260409091143.1348853-2-mikey@neuling.org [pjw@kernel.org: fixed checkpatch warning] Signed-off-by: Paul Walmsley --- arch/riscv/errata/mips/errata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/errata/mips/errata.c b/arch/riscv/errata/mips/errata.c index e984a8152208c3..2c3dc2259e93e9 100644 --- a/arch/riscv/errata/mips/errata.c +++ b/arch/riscv/errata/mips/errata.c @@ -57,7 +57,7 @@ void mips_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, } tmp = (1U << alt->patch_id); - if (cpu_req_errata && tmp) { + if (cpu_req_errata & tmp) { mutex_lock(&text_mutex); patch_text_nosync(ALT_OLD_PTR(alt), ALT_ALT_PTR(alt), alt->alt_len); From baa3c65435fb3f450b262672bc06db887a92d397 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 30 Apr 2026 21:55:01 +0200 Subject: [PATCH 173/972] netfilter: flowtable: use skb_pull_rcsum() to pop vlan/pppoe header This adjusts the checksum, if required, after pulling the layer 2 header, either the pppoe header or the inner vlan header in the double-tagged vlan packets. Fixes: 4cd91f7c290f ("netfilter: flowtable: add vlan support") Fixes: 72efd585f714 ("netfilter: flowtable: add pppoe support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 2eba64eb393a2e..9c05a50d601384 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -445,13 +445,13 @@ static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx, switch (skb->protocol) { case htons(ETH_P_8021Q): vlan_hdr = (struct vlan_hdr *)skb->data; - __skb_pull(skb, VLAN_HLEN); + skb_pull_rcsum(skb, VLAN_HLEN); vlan_set_encap_proto(skb, vlan_hdr); skb_reset_network_header(skb); break; case htons(ETH_P_PPP_SES): skb->protocol = __nf_flow_pppoe_proto(skb); - skb_pull(skb, PPPOE_SES_HLEN); + skb_pull_rcsum(skb, PPPOE_SES_HLEN); skb_reset_network_header(skb); break; } From 845db023a8aeba8b14315a846dcfba31ee727fb1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 1 May 2026 19:23:12 +0800 Subject: [PATCH 174/972] ublk: don't issue uring_cmd from fallback task work When ublk_ch_uring_cmd_cb() runs as fallback task work (e.g., because the submitting task is exiting), the command should not be issued as current is a kworker, not the daemon task. This can cause io->task to capture the wrong task in __ublk_fetch(), leading to a task mismatch warning in ublk_uring_cmd_cancel_fn(). Check tw.cancel and return -ECANCELED instead of issuing the command from fallback context. Fixes: 3421c7f68bba ("ublk: make sure io cmd handled in submitter task context") Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260501112312.947327-1-tom.leiming@gmail.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 8e5f3738c20330..d10460d29e4abc 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -3496,8 +3496,10 @@ static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw) { unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); - int ret = ublk_ch_uring_cmd_local(cmd, issue_flags); + int ret = -ECANCELED; + if (!tw.cancel) + ret = ublk_ch_uring_cmd_local(cmd, issue_flags); if (ret != -EIOCBQUEUED) io_uring_cmd_done(cmd, ret, issue_flags); } From 56722cfbb78d7eb41756cd78dc5192d08bd14f3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A1mon=20van=20Raaij?= Date: Thu, 30 Apr 2026 21:12:24 +0200 Subject: [PATCH 175/972] ALSA: hda/realtek: Add codec SSID quirk for Lenovo Yoga Pro 9 16IMH9 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Yoga Pro 9 16IMH9 (codec SSID 17aa:38d6) shares PCI audio device subsystem ID 17aa:3811 with the Legion S7 15IMH05. The existing SND_PCI_QUIRK entry for the Legion routes both machines to ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS, which does not bind the TAS2781 smart amplifiers, resulting in near-silent built-in speakers. Add an HDA_CODEC_QUIRK entry immediately before the conflicting PCI quirk that matches the Yoga Pro 9's unique codec SSID and routes it to ALC287_FIXUP_TAS2781_I2C. Codec quirks are evaluated after PCI quirks and take precedence, leaving the Legion S7 15IMH05 entry unaffected. This follows the same pattern used to disambiguate PCI SSID 17aa:3847 (shared between Yoga Pro 7 14IMH9 and Legion 7 16ACHG6), where a HDA_CODEC_QUIRK for codec SSID 17aa:38cf resolves the conflict. Signed-off-by: Rámon van Raaij Link: https://patch.msgid.link/20260430191224.patch1-ramon@vanraaij.eu Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 9a5747c2e359b6..e061673d3c2eb7 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7647,6 +7647,10 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3801, "Lenovo Yoga9 14IAP7", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), HDA_CODEC_QUIRK(0x17aa, 0x3802, "DuetITL 2021", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3802, "Lenovo Yoga Pro 9 14IRP8", ALC287_FIXUP_TAS2781_I2C), + /* Yoga Pro 9 16IMH9 shares PCI SSID 17aa:3811 with Legion S7 15IMH05; + * use codec SSID to distinguish them + */ + HDA_CODEC_QUIRK(0x17aa, 0x38d6, "Lenovo Yoga Pro 9 16IMH9", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x3811, "Legion S7 15IMH05", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3813, "Legion 7i 15IMHG05", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3818, "Lenovo C940 / Yoga Duet 7", ALC298_FIXUP_LENOVO_C940_DUET7), From ad39a189bfebb3de580f390bc000f9e121c6aca3 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Thu, 30 Apr 2026 23:49:50 +0300 Subject: [PATCH 176/972] ALSA: usb-audio: add min_mute quirk for Razer Nommo V2 X ID 1532:055e Razer USA, Ltd Razer Nommo V2 X is tested to have muted min playback volume. Apply quirk for that. Link: https://gitlab.freedesktop.org/pipewire/pipewire/-/work_items/5235 Signed-off-by: Pauli Virtanen Link: https://patch.msgid.link/94449577332d14d7974864903825f27e5824ddbc.1777579951.git.pav@iki.fi Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 0b4ecc2c6bcc40..f77ff05dff0be9 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2366,6 +2366,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_IGNORE_CTL_ERROR), DEVICE_FLG(0x152a, 0x880a, /* NeuralDSP Quad Cortex */ 0), /* Doesn't have the vendor quirk which would otherwise apply */ + DEVICE_FLG(0x1532, 0x055e, /* Razer Nommo V2 X */ + QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE), DEVICE_FLG(0x154e, 0x1002, /* Denon DCD-1500RE */ QUIRK_FLAG_ITF_USB_DSD_DAC | QUIRK_FLAG_CTL_MSG_DELAY), DEVICE_FLG(0x154e, 0x1003, /* Denon DA-300USB */ From 2e42a17b8f6bc3c0cd69d7556b588011d3ec2394 Mon Sep 17 00:00:00 2001 From: Eliot Courtney Date: Thu, 23 Apr 2026 21:36:52 +0900 Subject: [PATCH 177/972] rust: drm: gem: clean up GEM state in init failure case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, if `drm_gem_object_init` fails, the object is freed without any cleanup. Perform the cleanup in that case. Cc: stable@vger.kernel.org Fixes: c284d3e42338 ("rust: drm: gem: Add GEM object abstraction") Signed-off-by: Eliot Courtney Reviewed-by: Alice Ryhl Reviewed-by: Onur Özkan Link: https://patch.msgid.link/20260423-fix-gem-1-v1-1-e12e35f7bba9@nvidia.com [ Move safety comment closer to unsafe block to avoid a clippy warning. - Danilo ] Signed-off-by: Danilo Krummrich --- rust/kernel/drm/gem/mod.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/rust/kernel/drm/gem/mod.rs b/rust/kernel/drm/gem/mod.rs index 75acda7ba50016..01b5bd47a3332d 100644 --- a/rust/kernel/drm/gem/mod.rs +++ b/rust/kernel/drm/gem/mod.rs @@ -277,8 +277,17 @@ impl Object { // SAFETY: `obj.as_raw()` is guaranteed to be valid by the initialization above. unsafe { (*obj.as_raw()).funcs = &Self::OBJECT_FUNCS }; - // SAFETY: The arguments are all valid per the type invariants. - to_result(unsafe { bindings::drm_gem_object_init(dev.as_raw(), obj.obj.get(), size) })?; + if let Err(err) = + // SAFETY: The arguments are all valid per the type invariants. + to_result(unsafe { + bindings::drm_gem_object_init(dev.as_raw(), obj.obj.get(), size) + }) + { + // SAFETY: `drm_gem_object_init()` initializes the private GEM object state before + // failing, so `drm_gem_private_object_fini()` is the matching cleanup. + unsafe { bindings::drm_gem_private_object_fini(obj.obj.get()) }; + return Err(err); + } // SAFETY: We will never move out of `Self` as `ARef` is always treated as pinned. let ptr = KBox::into_raw(unsafe { Pin::into_inner_unchecked(obj) }); From 41ca998fbe30755191342b58e4f642cf3052ef2b Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 1 May 2026 19:07:19 +0200 Subject: [PATCH 178/972] parisc: Fix 64-bit kernel build when CONFIG_COMPAT=n VDSO32_SYMBOL() is used in signal.c, defining the value to zero avoids liker issues when CONFIG_COMPAT=n. Signed-off-by: Helge Deller --- arch/parisc/include/asm/vdso.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/parisc/include/asm/vdso.h b/arch/parisc/include/asm/vdso.h index 5501560f5ffe0f..e5cca3c9c8e7aa 100644 --- a/arch/parisc/include/asm/vdso.h +++ b/arch/parisc/include/asm/vdso.h @@ -6,13 +6,14 @@ #ifdef CONFIG_64BIT #include +#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name)) #endif #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) #include -#endif - -#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name)) #define VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name)) +#else +#define VDSO32_SYMBOL(tsk, name) 0UL +#endif #endif /* __ASSEMBLER__ */ From cb48828f06afa232cc330f0f4d6be101067810b3 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 23 Apr 2026 20:17:45 +0100 Subject: [PATCH 179/972] selftests/rseq: Don't run tests with runner scripts outside of the scripts The rseq selftests include two runner scripts run_param_test.sh and run_syscall_errors_test.sh which set up the environment for test binaries and run them with various parameters. Currently we list these test binaries in TEST_GEN_PROGS but this results in the kselftest framework running them directly as well as via the runners, resulting in duplication and spurious failures when the environment is not correctly set up (eg, if glibc tries to use rseq). Move the binaries the runners invoke to TEST_GEN_PROGS_EXTENDED, binaries listed there are built but not run by the framework. The param_test benchmarks are not moved since they are not run by run_param_test.sh. Fixes: 830969e7821a ("selftests/rseq: Implement time slice extension test") Signed-off-by: Mark Brown Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260423-selftests-rseq-use-runner-v1-1-e13a133754c1@kernel.org Cc: stable@vger.kernel.org --- tools/testing/selftests/rseq/Makefile | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile index 4ef90823b6526f..0d1947c0d6235f 100644 --- a/tools/testing/selftests/rseq/Makefile +++ b/tools/testing/selftests/rseq/Makefile @@ -14,12 +14,15 @@ LDLIBS += -lpthread -ldl # still track changes to header files and depend on shared object. OVERRIDE_TARGETS = 1 -TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test param_test \ - param_test_benchmark param_test_compare_twice param_test_mm_cid \ - param_test_mm_cid_benchmark param_test_mm_cid_compare_twice \ - syscall_errors_test slice_test - -TEST_GEN_PROGS_EXTENDED = librseq.so +TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test \ + param_test_benchmark param_test_mm_cid_benchmark slice_test + +TEST_GEN_PROGS_EXTENDED = librseq.so \ + param_test \ + param_test_compare_twice \ + param_test_mm_cid \ + param_test_mm_cid_compare_twice \ + syscall_errors_test TEST_PROGS = run_param_test.sh run_syscall_errors_test.sh From 2cb68e45120dfc66404c7547d95b8ac6ff0b25ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Apr 2026 10:10:19 +0200 Subject: [PATCH 180/972] rseq: Set rseq::cpu_id_start to 0 on unregistration The RSEQ rework changed that to RSEQ_CPU_UNINITILIZED, which is obviously incompatible. Revert back to the original behavior. Fixes: 0f085b41880e ("rseq: Provide and use rseq_set_ids()") Reported-by: Dmitry Vyukov Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.271566313%40kernel.org Cc: stable@vger.kernel.org --- kernel/rseq.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/kernel/rseq.c b/kernel/rseq.c index 38d3ef540760f4..b9f11931ef7852 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -236,11 +236,6 @@ static int __init rseq_debugfs_init(void) } __initcall(rseq_debugfs_init); -static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) -{ - return rseq_set_ids_get_csaddr(t, ids, node_id, NULL); -} - static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) { struct rseq __user *urseq = t->rseq.usrptr; @@ -384,19 +379,22 @@ void rseq_syscall(struct pt_regs *regs) static bool rseq_reset_ids(void) { - struct rseq_ids ids = { - .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, - .mm_cid = 0, - }; + struct rseq __user *rseq = current->rseq.usrptr; /* * If this fails, terminate it because this leaves the kernel in * stupid state as exit to user space will try to fixup the ids * again. */ - if (rseq_set_ids(current, &ids, 0)) - return true; + scoped_user_rw_access(rseq, efault) { + unsafe_put_user(0, &rseq->cpu_id_start, efault); + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); + unsafe_put_user(0, &rseq->node_id, efault); + unsafe_put_user(0, &rseq->mm_cid, efault); + } + return true; +efault: force_sig(SIGSEGV); return false; } From e9766e6f7d330dce7530918d8c6e3ec96d6c6e24 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Apr 2026 10:14:41 +0200 Subject: [PATCH 181/972] rseq: Protect rseq_reset() against interrupts rseq_reset() uses memset() to clear the tasks rseq data. That's racy against membarrier() and preemption. Guard it with irqsave to cure this. Fixes: faba9d250eae ("rseq: Introduce struct rseq_data") Reported-by: Dmitry Vyukov Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.353887714%40kernel.org Cc: stable@vger.kernel.org --- include/linux/rseq.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/rseq.h b/include/linux/rseq.h index b9d62fc2140dd1..f446909551df05 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -119,6 +119,8 @@ static inline void rseq_virt_userspace_exit(void) static inline void rseq_reset(struct task_struct *t) { + /* Protect against preemption and membarrier IPI */ + guard(irqsave)(); memset(&t->rseq, 0, sizeof(t->rseq)); t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED; } From 010b7723c0a3b9ad58f50b715dbe2e7781d29400 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Apr 2026 09:34:45 +0200 Subject: [PATCH 182/972] rseq: Don't advertise time slice extensions if disabled If time slice extensions have been disabled on the kernel command line, then advertising them in RSEQ flags is wrong. Adjust the conditionals to reflect reality, fixup the misleading comments about the gap of these flags and the rseq::flags field. Fixes: d6200245c75e ("rseq: Allow registering RSEQ with slice extension") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.437059375%40kernel.org Cc: stable@vger.kernel.org --- include/uapi/linux/rseq.h | 5 ++++- kernel/rseq.c | 9 +++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index f69344fe6c0863..ca6fe1f9d05e7e 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -28,7 +28,7 @@ enum rseq_cs_flags_bit { RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0, RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1, RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2, - /* (3) Intentional gap to put new bits into a separate byte */ + /* (3) Intentional gap to keep new bits separate */ /* User read only feature flags */ RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT = 4, @@ -161,6 +161,9 @@ struct rseq { * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE + * + * It is now used for feature status advertisement by the kernel. + * See: enum rseq_cs_flags_bit for further information. */ __u32 flags; diff --git a/kernel/rseq.c b/kernel/rseq.c index b9f11931ef7852..586f58f652c6e7 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -462,10 +462,11 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 return -EFAULT; if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { - rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; - if (rseq_slice_extension_enabled() && - (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)) - rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + if (rseq_slice_extension_enabled()) { + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + if (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON) + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + } } scoped_user_write_access(rseq, efault) { From 01eb80b767430ae868c48ad106c60eb61a508c85 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Fri, 10 Apr 2026 04:20:12 -0700 Subject: [PATCH 183/972] accel/qaic: fix incorrect counter check in RAS message decode The UE and UE_NF cases check ce_count against UINT_MAX before incrementing their respective counters. This is logically incorrect and prevents ue_count and ue_nf_count from incrementing when ce_count reaches UINT_MAX. Fixes: c11a50b170e7 ("accel/qaic: Add Reliability, Accessibility, Serviceability (RAS)") Signed-off-by: Alok Tiwari Reviewed-by: Jeff Hugo Signed-off-by: Jeff Hugo Link: https://patch.msgid.link/20260410112015.592546-1-alok.a.tiwari@oracle.com --- drivers/accel/qaic/qaic_ras.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/accel/qaic/qaic_ras.c b/drivers/accel/qaic/qaic_ras.c index cc0b75461e1ac3..6791af366cba73 100644 --- a/drivers/accel/qaic/qaic_ras.c +++ b/drivers/accel/qaic/qaic_ras.c @@ -497,11 +497,11 @@ static void decode_ras_msg(struct qaic_device *qdev, struct ras_data *msg) qdev->ce_count++; break; case UE: - if (qdev->ce_count != UINT_MAX) + if (qdev->ue_count != UINT_MAX) qdev->ue_count++; break; case UE_NF: - if (qdev->ce_count != UINT_MAX) + if (qdev->ue_nf_count != UINT_MAX) qdev->ue_nf_count++; break; default: From 5234094c0150338c35280a75cc7842015f76b725 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 29 Apr 2026 15:43:35 +0200 Subject: [PATCH 184/972] smb: smbdirect: make use of DEFAULT_SYMBOL_NAMESPACE and EXPORT_SYMBOL_GPL This is a better solution than EXPORT_SYMBOL_FOR_MODULES(__sym, "cifs,ksmbd") as it makes it possible to rebuild smbdirect.ko against a running kernel and then load the existing cifs.ko and ksmbd.ko from the running kernel. Suggested-by: Christoph Hellwig Link: https://lore.kernel.org/linux-cifs/aehrPuY60VMcYGU8@infradead.org/ Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: Namjae Jeon Cc: Christoph Hellwig Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 2 ++ fs/smb/server/transport_rdma.c | 2 ++ fs/smb/smbdirect/accept.c | 2 +- fs/smb/smbdirect/connect.c | 4 ++-- fs/smb/smbdirect/connection.c | 16 ++++++++-------- fs/smb/smbdirect/debug.c | 2 +- fs/smb/smbdirect/devices.c | 2 +- fs/smb/smbdirect/internal.h | 1 + fs/smb/smbdirect/listen.c | 2 +- fs/smb/smbdirect/mr.c | 6 +++--- fs/smb/smbdirect/public.h | 2 -- fs/smb/smbdirect/rw.c | 2 +- fs/smb/smbdirect/socket.c | 20 ++++++++++---------- 13 files changed, 33 insertions(+), 30 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 75f9f91a7ec968..b9826185de18ba 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -558,3 +558,5 @@ void smbd_debug_proc_show(struct TCP_Server_Info *server, struct seq_file *m) server->rdma_readwrite_threshold, m); } + +MODULE_IMPORT_NS("SMBDIRECT"); diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index a8242c00096f37..346c051e31f502 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -540,3 +540,5 @@ static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = { .rdma_write = smb_direct_rdma_write, .free_transport = smb_direct_free_transport, }; + +MODULE_IMPORT_NS("SMBDIRECT"); diff --git a/fs/smb/smbdirect/accept.c b/fs/smb/smbdirect/accept.c index 704b271af3a8c0..5297400058385e 100644 --- a/fs/smb/smbdirect/accept.c +++ b/fs/smb/smbdirect/accept.c @@ -854,4 +854,4 @@ struct smbdirect_socket *smbdirect_socket_accept(struct smbdirect_socket *lsc, return nsc; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_accept); +EXPORT_SYMBOL_GPL(smbdirect_socket_accept); diff --git a/fs/smb/smbdirect/connect.c b/fs/smb/smbdirect/connect.c index 8addee43a38113..cd726b399afecb 100644 --- a/fs/smb/smbdirect/connect.c +++ b/fs/smb/smbdirect/connect.c @@ -60,7 +60,7 @@ int smbdirect_connect(struct smbdirect_socket *sc, const struct sockaddr *dst) */ return 0; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connect); +EXPORT_SYMBOL_GPL(smbdirect_connect); static int smbdirect_connect_setup_connection(struct smbdirect_socket *sc) { @@ -922,4 +922,4 @@ int smbdirect_connect_sync(struct smbdirect_socket *sc, return 0; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connect_sync); +EXPORT_SYMBOL_GPL(smbdirect_connect_sync); diff --git a/fs/smb/smbdirect/connection.c b/fs/smb/smbdirect/connection.c index 822366718d457d..fe9912e53da69b 100644 --- a/fs/smb/smbdirect/connection.c +++ b/fs/smb/smbdirect/connection.c @@ -706,7 +706,7 @@ bool smbdirect_connection_is_connected(struct smbdirect_socket *sc) return false; return true; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_is_connected); +EXPORT_SYMBOL_GPL(smbdirect_connection_is_connected); int smbdirect_connection_wait_for_connected(struct smbdirect_socket *sc) { @@ -779,7 +779,7 @@ int smbdirect_connection_wait_for_connected(struct smbdirect_socket *sc) return 0; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_wait_for_connected); +EXPORT_SYMBOL_GPL(smbdirect_connection_wait_for_connected); void smbdirect_connection_idle_timer_work(struct work_struct *work) { @@ -958,7 +958,7 @@ int smbdirect_connection_send_batch_flush(struct smbdirect_socket *sc, return ret; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_batch_flush); +EXPORT_SYMBOL_GPL(smbdirect_connection_send_batch_flush); struct smbdirect_send_batch * smbdirect_init_send_batch_storage(struct smbdirect_send_batch_storage *storage, @@ -976,7 +976,7 @@ smbdirect_init_send_batch_storage(struct smbdirect_send_batch_storage *storage, return batch; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_init_send_batch_storage); +EXPORT_SYMBOL_GPL(smbdirect_init_send_batch_storage); static int smbdirect_connection_wait_for_send_bcredit(struct smbdirect_socket *sc, struct smbdirect_send_batch *batch) @@ -1263,7 +1263,7 @@ int smbdirect_connection_send_single_iter(struct smbdirect_socket *sc, bcredit_failed: return ret; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_single_iter); +EXPORT_SYMBOL_GPL(smbdirect_connection_send_single_iter); int smbdirect_connection_send_wait_zero_pending(struct smbdirect_socket *sc) { @@ -1288,7 +1288,7 @@ int smbdirect_connection_send_wait_zero_pending(struct smbdirect_socket *sc) return 0; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_wait_zero_pending); +EXPORT_SYMBOL_GPL(smbdirect_connection_send_wait_zero_pending); int smbdirect_connection_send_iter(struct smbdirect_socket *sc, struct iov_iter *iter, @@ -1373,7 +1373,7 @@ int smbdirect_connection_send_iter(struct smbdirect_socket *sc, return total_count; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_iter); +EXPORT_SYMBOL_GPL(smbdirect_connection_send_iter); static void smbdirect_connection_send_io_done(struct ib_cq *cq, struct ib_wc *wc) { @@ -1937,7 +1937,7 @@ int smbdirect_connection_recvmsg(struct smbdirect_socket *sc, goto again; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_recvmsg); +EXPORT_SYMBOL_GPL(smbdirect_connection_recvmsg); static bool smbdirect_map_sges_single_page(struct smbdirect_map_sges *state, struct page *page, size_t off, size_t len) diff --git a/fs/smb/smbdirect/debug.c b/fs/smb/smbdirect/debug.c index a66a19d4a46341..05ba7c8d165eec 100644 --- a/fs/smb/smbdirect/debug.c +++ b/fs/smb/smbdirect/debug.c @@ -85,4 +85,4 @@ void smbdirect_connection_legacy_debug_proc_show(struct smbdirect_socket *sc, atomic_read(&sc->mr_io.ready.count), atomic_read(&sc->mr_io.used.count)); } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_legacy_debug_proc_show); +EXPORT_SYMBOL_GPL(smbdirect_connection_legacy_debug_proc_show); diff --git a/fs/smb/smbdirect/devices.c b/fs/smb/smbdirect/devices.c index 44962f221c352f..7adacbdfe12e71 100644 --- a/fs/smb/smbdirect/devices.c +++ b/fs/smb/smbdirect/devices.c @@ -238,7 +238,7 @@ u8 smbdirect_netdev_rdma_capable_node_type(struct net_device *netdev) return RDMA_NODE_UNSPECIFIED; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_netdev_rdma_capable_node_type); +EXPORT_SYMBOL_GPL(smbdirect_netdev_rdma_capable_node_type); __init int smbdirect_devices_init(void) { diff --git a/fs/smb/smbdirect/internal.h b/fs/smb/smbdirect/internal.h index 2d5acf2c21bc55..82529b41708b33 100644 --- a/fs/smb/smbdirect/internal.h +++ b/fs/smb/smbdirect/internal.h @@ -6,6 +6,7 @@ #ifndef __FS_SMB_COMMON_SMBDIRECT_INTERNAL_H__ #define __FS_SMB_COMMON_SMBDIRECT_INTERNAL_H__ +#define DEFAULT_SYMBOL_NAMESPACE "SMBDIRECT" #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "smbdirect.h" diff --git a/fs/smb/smbdirect/listen.c b/fs/smb/smbdirect/listen.c index 143a7618d95f3c..2f78bcaedbf82e 100644 --- a/fs/smb/smbdirect/listen.c +++ b/fs/smb/smbdirect/listen.c @@ -90,7 +90,7 @@ int smbdirect_socket_listen(struct smbdirect_socket *sc, int backlog) */ return 0; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_listen); +EXPORT_SYMBOL_GPL(smbdirect_socket_listen); static int smbdirect_new_rdma_event_handler(struct rdma_cm_id *new_id, struct rdma_cm_event *event) diff --git a/fs/smb/smbdirect/mr.c b/fs/smb/smbdirect/mr.c index 5228e699cd5d41..9e6ac9d8717ab8 100644 --- a/fs/smb/smbdirect/mr.c +++ b/fs/smb/smbdirect/mr.c @@ -380,7 +380,7 @@ smbdirect_connection_register_mr_io(struct smbdirect_socket *sc, mutex_unlock(&mr->mutex); return NULL; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_register_mr_io); +EXPORT_SYMBOL_GPL(smbdirect_connection_register_mr_io); void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr, struct smbdirect_buffer_descriptor_v1 *v1) @@ -397,7 +397,7 @@ void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr, } mutex_unlock(&mr->mutex); } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_mr_io_fill_buffer_descriptor); +EXPORT_SYMBOL_GPL(smbdirect_mr_io_fill_buffer_descriptor); /* * Deregister a MR after I/O is done @@ -490,4 +490,4 @@ void smbdirect_connection_deregister_mr_io(struct smbdirect_mr_io *mr) if (!kref_put(&mr->kref, smbdirect_mr_io_free_locked)) mutex_unlock(&mr->mutex); } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_deregister_mr_io); +EXPORT_SYMBOL_GPL(smbdirect_connection_deregister_mr_io); diff --git a/fs/smb/smbdirect/public.h b/fs/smb/smbdirect/public.h index 50088155e7c37f..d4fb36e65254a8 100644 --- a/fs/smb/smbdirect/public.h +++ b/fs/smb/smbdirect/public.h @@ -13,8 +13,6 @@ struct smbdirect_socket; struct smbdirect_send_batch; struct smbdirect_mr_io; -#define __SMBDIRECT_EXPORT_SYMBOL__(__sym) EXPORT_SYMBOL_FOR_MODULES(__sym, "cifs,ksmbd") - #include u8 smbdirect_netdev_rdma_capable_node_type(struct net_device *netdev); diff --git a/fs/smb/smbdirect/rw.c b/fs/smb/smbdirect/rw.c index c2f46b17731ec0..6fe38042cfb968 100644 --- a/fs/smb/smbdirect/rw.c +++ b/fs/smb/smbdirect/rw.c @@ -252,4 +252,4 @@ int smbdirect_connection_rdma_xmit(struct smbdirect_socket *sc, kfree(msg); goto out; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_rdma_xmit); +EXPORT_SYMBOL_GPL(smbdirect_connection_rdma_xmit); diff --git a/fs/smb/smbdirect/socket.c b/fs/smb/smbdirect/socket.c index 1b4ab01b745e65..39cca7219c4df8 100644 --- a/fs/smb/smbdirect/socket.c +++ b/fs/smb/smbdirect/socket.c @@ -20,7 +20,7 @@ bool smbdirect_frwr_is_supported(const struct ib_device_attr *attrs) return false; return true; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_frwr_is_supported); +EXPORT_SYMBOL_GPL(smbdirect_frwr_is_supported); static void smbdirect_socket_cleanup_work(struct work_struct *work); @@ -107,7 +107,7 @@ int smbdirect_socket_create_kern(struct net *net, struct smbdirect_socket **_sc) alloc_failed: return ret; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_create_kern); +EXPORT_SYMBOL_GPL(smbdirect_socket_create_kern); int smbdirect_socket_init_accepting(struct rdma_cm_id *id, struct smbdirect_socket *sc) { @@ -148,7 +148,7 @@ int smbdirect_socket_create_accepting(struct rdma_cm_id *id, struct smbdirect_so alloc_failed: return ret; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_create_accepting); +EXPORT_SYMBOL_GPL(smbdirect_socket_create_accepting); int smbdirect_socket_set_initial_parameters(struct smbdirect_socket *sc, const struct smbdirect_socket_parameters *sp) @@ -189,14 +189,14 @@ int smbdirect_socket_set_initial_parameters(struct smbdirect_socket *sc, return 0; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_set_initial_parameters); +EXPORT_SYMBOL_GPL(smbdirect_socket_set_initial_parameters); const struct smbdirect_socket_parameters * smbdirect_socket_get_current_parameters(struct smbdirect_socket *sc) { return &sc->parameters; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_get_current_parameters); +EXPORT_SYMBOL_GPL(smbdirect_socket_get_current_parameters); int smbdirect_socket_set_kernel_settings(struct smbdirect_socket *sc, enum ib_poll_context poll_ctx, @@ -220,7 +220,7 @@ int smbdirect_socket_set_kernel_settings(struct smbdirect_socket *sc, return 0; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_set_kernel_settings); +EXPORT_SYMBOL_GPL(smbdirect_socket_set_kernel_settings); void smbdirect_socket_set_logging(struct smbdirect_socket *sc, void *private_ptr, @@ -240,7 +240,7 @@ void smbdirect_socket_set_logging(struct smbdirect_socket *sc, sc->logging.needed = needed; sc->logging.vaprintf = vaprintf; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_set_logging); +EXPORT_SYMBOL_GPL(smbdirect_socket_set_logging); static void smbdirect_socket_wake_up_all(struct smbdirect_socket *sc) { @@ -663,13 +663,13 @@ int smbdirect_socket_bind(struct smbdirect_socket *sc, struct sockaddr *addr) return 0; } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_bind); +EXPORT_SYMBOL_GPL(smbdirect_socket_bind); void smbdirect_socket_shutdown(struct smbdirect_socket *sc) { smbdirect_socket_schedule_cleanup_lvl(sc, SMBDIRECT_LOG_INFO, -ESHUTDOWN); } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_shutdown); +EXPORT_SYMBOL_GPL(smbdirect_socket_shutdown); static void smbdirect_socket_release_disconnect(struct kref *kref) { @@ -712,7 +712,7 @@ void smbdirect_socket_release(struct smbdirect_socket *sc) */ kref_put(&sc->refs.destroy, smbdirect_socket_release_destroy); } -__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_release); +EXPORT_SYMBOL_GPL(smbdirect_socket_release); int smbdirect_socket_wait_for_credits(struct smbdirect_socket *sc, enum smbdirect_socket_status expected_status, From e768103cfbac30a49860aca08a7710d39dbdd470 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 29 Apr 2026 15:43:36 +0200 Subject: [PATCH 185/972] smb: smbdirect: introduce and use include/linux/smbdirect.h This makes it easier to rebuild cifs.ko and ksmbd.ko against a running kernel. Suggested-by: Christoph Hellwig Link: https://lore.kernel.org/linux-cifs/aehrPuY60VMcYGU8@infradead.org/ Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: Namjae Jeon Cc: Christoph Hellwig Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- MAINTAINERS | 1 + fs/smb/client/smbdirect.c | 1 - fs/smb/client/smbdirect.h | 2 +- fs/smb/server/transport_rdma.c | 1 - fs/smb/server/transport_rdma.h | 2 +- fs/smb/smbdirect/internal.h | 3 +- fs/smb/smbdirect/smbdirect.h | 52 ------------------- .../public.h => include/linux/smbdirect.h | 50 ++++++++++++++++-- 8 files changed, 49 insertions(+), 63 deletions(-) delete mode 100644 fs/smb/smbdirect/smbdirect.h rename fs/smb/smbdirect/public.h => include/linux/smbdirect.h (76%) diff --git a/MAINTAINERS b/MAINTAINERS index 2c67ee25ffe6ac..060dca38dbf742 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24650,6 +24650,7 @@ S: Maintained F: fs/smb/client/smbdirect.* F: fs/smb/smbdirect/ F: fs/smb/server/transport_rdma.* +F: include/linux/smbdirect.h SMC91x ETHERNET DRIVER M: Nicolas Pitre diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index b9826185de18ba..563ef488a22585 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -9,7 +9,6 @@ #include "cifs_debug.h" #include "cifsproto.h" #include "smb2proto.h" -#include "../smbdirect/public.h" /* Port numbers for SMBD transport */ #define SMB_PORT 445 diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 287ac849213d40..be205ec02077e6 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -12,7 +12,7 @@ #include "cifsglob.h" -#include "../smbdirect/smbdirect.h" +#include extern int rdma_readwrite_threshold; extern int smbd_max_frmr_depth; diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 346c051e31f502..b6d63ff8a8a3d5 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -18,7 +18,6 @@ #include "smb_common.h" #include "../common/smb2status.h" #include "transport_rdma.h" -#include "../smbdirect/public.h" #define SMB_DIRECT_PORT_IWARP 5445 diff --git a/fs/smb/server/transport_rdma.h b/fs/smb/server/transport_rdma.h index bde3d88aecc719..8b78917a179580 100644 --- a/fs/smb/server/transport_rdma.h +++ b/fs/smb/server/transport_rdma.h @@ -25,6 +25,6 @@ static inline void init_smbd_max_io_size(unsigned int sz) { } static inline unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) { return 0; } #endif -#include "../smbdirect/smbdirect.h" +#include #endif /* __KSMBD_TRANSPORT_RDMA_H__ */ diff --git a/fs/smb/smbdirect/internal.h b/fs/smb/smbdirect/internal.h index 82529b41708b33..e9959e6dc13ae8 100644 --- a/fs/smb/smbdirect/internal.h +++ b/fs/smb/smbdirect/internal.h @@ -9,9 +9,8 @@ #define DEFAULT_SYMBOL_NAMESPACE "SMBDIRECT" #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include "smbdirect.h" +#include #include "pdu.h" -#include "public.h" #include diff --git a/fs/smb/smbdirect/smbdirect.h b/fs/smb/smbdirect/smbdirect.h deleted file mode 100644 index bbab5f7f7cc9b1..00000000000000 --- a/fs/smb/smbdirect/smbdirect.h +++ /dev/null @@ -1,52 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2025 Stefan Metzmacher - */ - -#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ -#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ - -#include - -/* SMB-DIRECT buffer descriptor V1 structure [MS-SMBD] 2.2.3.1 */ -struct smbdirect_buffer_descriptor_v1 { - __le64 offset; - __le32 token; - __le32 length; -} __packed; - -/* - * Connection parameters mostly from [MS-SMBD] 3.1.1.1 - * - * These are setup and negotiated at the beginning of a - * connection and remain constant unless explicitly changed. - * - * Some values are important for the upper layer. - */ -struct smbdirect_socket_parameters { - __u64 flags; -#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB ((__u64)0x1) -#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW ((__u64)0x2) - __u32 resolve_addr_timeout_msec; - __u32 resolve_route_timeout_msec; - __u32 rdma_connect_timeout_msec; - __u32 negotiate_timeout_msec; - __u16 initiator_depth; /* limited to U8_MAX */ - __u16 responder_resources; /* limited to U8_MAX */ - __u16 recv_credit_max; - __u16 send_credit_target; - __u32 max_send_size; - __u32 max_fragmented_send_size; - __u32 max_recv_size; - __u32 max_fragmented_recv_size; - __u32 max_read_write_size; - __u32 max_frmr_depth; - __u32 keepalive_interval_msec; - __u32 keepalive_timeout_msec; -} __packed; - -#define SMBDIRECT_FLAG_PORT_RANGE_MASK ( \ - SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB | \ - SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW) - -#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ */ diff --git a/fs/smb/smbdirect/public.h b/include/linux/smbdirect.h similarity index 76% rename from fs/smb/smbdirect/public.h rename to include/linux/smbdirect.h index d4fb36e65254a8..97f5ba730fa74c 100644 --- a/fs/smb/smbdirect/public.h +++ b/include/linux/smbdirect.h @@ -3,11 +3,51 @@ * Copyright (C) 2025, Stefan Metzmacher */ -#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__ -#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__ +#ifndef __LINUX_SMBDIRECT_H__ +#define __LINUX_SMBDIRECT_H__ -struct smbdirect_buffer_descriptor_v1; -struct smbdirect_socket_parameters; +#include + +/* SMB-DIRECT buffer descriptor V1 structure [MS-SMBD] 2.2.3.1 */ +struct smbdirect_buffer_descriptor_v1 { + __le64 offset; + __le32 token; + __le32 length; +} __packed; + +/* + * Connection parameters mostly from [MS-SMBD] 3.1.1.1 + * + * These are setup and negotiated at the beginning of a + * connection and remain constant unless explicitly changed. + * + * Some values are important for the upper layer. + */ +struct smbdirect_socket_parameters { + __u64 flags; +#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB ((__u64)0x1) +#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW ((__u64)0x2) + __u32 resolve_addr_timeout_msec; + __u32 resolve_route_timeout_msec; + __u32 rdma_connect_timeout_msec; + __u32 negotiate_timeout_msec; + __u16 initiator_depth; /* limited to U8_MAX */ + __u16 responder_resources; /* limited to U8_MAX */ + __u16 recv_credit_max; + __u16 send_credit_target; + __u32 max_send_size; + __u32 max_fragmented_send_size; + __u32 max_recv_size; + __u32 max_fragmented_recv_size; + __u32 max_read_write_size; + __u32 max_frmr_depth; + __u32 keepalive_interval_msec; + __u32 keepalive_timeout_msec; +} __packed; + +#define SMBDIRECT_FLAG_PORT_RANGE_MASK ( \ + SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB | \ + SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW) struct smbdirect_socket; struct smbdirect_send_batch; @@ -143,4 +183,4 @@ void smbdirect_connection_legacy_debug_proc_show(struct smbdirect_socket *sc, unsigned int rdma_readwrite_threshold, struct seq_file *m); -#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__ */ +#endif /* __LINUX_SMBDIRECT_H__ */ From 9900b9fee5a0e0f72d7c744b37c7c851d5785ac6 Mon Sep 17 00:00:00 2001 From: Yi Kuo Date: Wed, 29 Apr 2026 18:00:11 +0800 Subject: [PATCH 186/972] smb: smbdirect: fix MR registration for coalesced SG lists ib_dma_map_sg() modifies the provided scatterlist and returns the number of mapped entries, which can be fewer than the requested mr->sgt.nents if the DMA controller coalesces contiguous memory segments. Passing the original, uncoalesced count to ib_map_mr_sg() causes memory registration failures if coalescing actually occurs. Capture the actual mapped count returned by ib_dma_map_sg() and pass it to ib_map_mr_sg() to ensure correct MR registration. Also update the ib_dma_map_sg() error logging to drop the error pointer formatting, since the return value is an integer count rather than an error code. Ensure a proper error code (-EIO) is assigned when DMA mapping or MR registration fails. Fixes: de5ef8ec3c46 ("smb: smbdirect: introduce smbdirect_mr.c with client mr code") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=221408 Reviewed-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Yi Kuo Signed-off-by: Steve French --- fs/smb/smbdirect/mr.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/smb/smbdirect/mr.c b/fs/smb/smbdirect/mr.c index 9e6ac9d8717ab8..15c6363a2f97af 100644 --- a/fs/smb/smbdirect/mr.c +++ b/fs/smb/smbdirect/mr.c @@ -269,7 +269,7 @@ smbdirect_connection_register_mr_io(struct smbdirect_socket *sc, { const struct smbdirect_socket_parameters *sp = &sc->parameters; struct smbdirect_mr_io *mr; - int ret, num_pages; + int ret, num_pages, num_mapped; struct ib_reg_wr *reg_wr; num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1); @@ -300,19 +300,22 @@ smbdirect_connection_register_mr_io(struct smbdirect_socket *sc, num_pages, iov_iter_count(iter), sp->max_frmr_depth); smbdirect_iter_to_sgt(iter, &mr->sgt, sp->max_frmr_depth); - ret = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); - if (!ret) { + num_mapped = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); + if (!num_mapped) { smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR, - "ib_dma_map_sg num_pages=%u dir=%x ret=%d (%1pe)\n", - num_pages, mr->dir, ret, SMBDIRECT_DEBUG_ERR_PTR(ret)); + "ib_dma_map_sg num_pages=%u dir=%x num_mapped=%d\n", + num_pages, mr->dir, num_mapped); + ret = -EIO; goto dma_map_error; } - ret = ib_map_mr_sg(mr->mr, mr->sgt.sgl, mr->sgt.nents, NULL, PAGE_SIZE); - if (ret != mr->sgt.nents) { + ret = ib_map_mr_sg(mr->mr, mr->sgt.sgl, num_mapped, NULL, PAGE_SIZE); + if (ret != num_mapped) { smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR, - "ib_map_mr_sg failed ret = %d nents = %u\n", - ret, mr->sgt.nents); + "ib_map_mr_sg failed ret = %d num_mapped = %u\n", + ret, num_mapped); + if (ret >= 0) + ret = -EIO; goto map_mr_error; } From f93836b236773862e9ee268a82e3614caf77ea01 Mon Sep 17 00:00:00 2001 From: Aleksander Jan Bajkowski Date: Thu, 30 Apr 2026 23:34:33 +0200 Subject: [PATCH 187/972] net: usb: r8152: add TRENDnet TUC-ET2G v2.0 The TRENDnet TUC-ET2G V2.0 is an RTL8156B based 2.5G Ethernet controller. Add the vendor and product ID values to the driver. This makes Ethernet work with the adapter. Signed-off-by: Aleksander Jan Bajkowski Reviewed-by: Andrew Lunn Reviewed-by: Birger Koblitz Link: https://patch.msgid.link/20260430213435.21821-1-olek2@wp.pl Signed-off-by: Jakub Kicinski --- drivers/net/usb/r8152.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 7337bf1b7d6ad0..1ace1d2398c9c1 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -10138,6 +10138,7 @@ static const struct usb_device_id rtl8152_table[] = { { USB_DEVICE(VENDOR_ID_DELL, 0xb097) }, { USB_DEVICE(VENDOR_ID_ASUS, 0x1976) }, { USB_DEVICE(VENDOR_ID_TRENDNET, 0xe02b) }, + { USB_DEVICE(VENDOR_ID_TRENDNET, 0xe02c) }, {} }; From 4f34002e2e37639133693c13a2a9977fab86880d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2026 07:06:11 +0000 Subject: [PATCH 188/972] ipmr: prevent info-leak in pmr_cache_report() Yiming Qian reported: ipmr_cache_report()` allocates a report skb with `alloc_skb(128, GFP_ATOMIC)` and appends a `struct igmphdr` using `skb_put()`. In the non-`IGMPMSG_WHOLEPKT` path it initializes only: - `igmp->type` - `igmp->code` but does not initialize: - `igmp->csum` - `igmp->group` Later, `igmpmsg_netlink_event()` copies the bytes after `sizeof(struct igmpmsg)` into the `IPMRA_CREPORT_PKT` netlink attribute and emits `RTM_NEWCACHEREPORT` on `RTNLGRP_IPV4_MROUTE_R`. As a result, 6 bytes of stale heap data from the skb head are disclosed to userspace. Let's use skb_put_zero() instead of skb_put() to fix this bug. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Yiming Qian Signed-off-by: Eric Dumazet Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260430070611.4004529-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 2058ca860294b0..05fb6eefe0beb3 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1112,11 +1112,12 @@ static int ipmr_cache_report(const struct mr_table *mrt, msg->im_vif_hi = vifi >> 8; ipv4_pktinfo_prepare(mroute_sk, pkt, false); memcpy(skb->cb, pkt->cb, sizeof(skb->cb)); - /* Add our header */ - igmp = skb_put(skb, sizeof(struct igmphdr)); + /* Add our header. + * Note that code, csum and group fields are cleared. + */ + igmp = skb_put_zero(skb, sizeof(struct igmphdr)); igmp->type = assert; msg->im_msgtype = assert; - igmp->code = 0; ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ skb->transport_header = skb->network_header; } From 4b9e327991815e128ad3af75c3a04630a63ce3e0 Mon Sep 17 00:00:00 2001 From: Kai Zen Date: Thu, 30 Apr 2026 18:26:48 +0300 Subject: [PATCH 189/972] net: rtnetlink: zero ifla_vf_broadcast to avoid stack infoleak in rtnl_fill_vfinfo rtnl_fill_vfinfo() declares struct ifla_vf_broadcast on the stack without initialisation: struct ifla_vf_broadcast vf_broadcast; The struct contains a single fixed 32-byte field: /* include/uapi/linux/if_link.h */ struct ifla_vf_broadcast { __u8 broadcast[32]; }; The function then copies dev->broadcast into it using dev->addr_len as the length: memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); On Ethernet devices (the overwhelming majority of SR-IOV NICs) dev->addr_len is 6, so only the first 6 bytes of broadcast[] are written. The remaining 26 bytes retain whatever was previously on the kernel stack. The full struct is then handed to userspace via: nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) leaking up to 26 bytes of uninitialised kernel stack per VF per RTM_GETLINK request, repeatable. The other vf_* structs in the same function are explicitly zeroed for exactly this reason - see the memset() calls for ivi, vf_vlan_info, node_guid and port_guid a few lines above. vf_broadcast was simply missed when it was added. Reachability: any unprivileged local process can open AF_NETLINK / NETLINK_ROUTE without capabilities and send RTM_GETLINK with an IFLA_EXT_MASK attribute carrying RTEXT_FILTER_VF. The kernel walks each VF and emits IFLA_VF_BROADCAST, leaking 26 bytes of stack per VF per request. Stack residue at this call site can include return addresses and transient sensitive data; KASAN with stack instrumentation, or KMSAN, will flag the nla_put() when reproduced. Zero the on-stack struct before the partial memcpy, matching the existing pattern used for the other vf_* structs in the same function. Fixes: 75345f888f70 ("ipoib: show VF broadcast address") Cc: stable@vger.kernel.org Signed-off-by: Kai Zen Link: https://patch.msgid.link/3c506e8f936e52b57620269b55c348af05d413a2.1777557228.git.kai.aizen.dev@gmail.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b613bb6e07df65..df042da422ef31 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1572,6 +1572,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, port_guid.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + memset(&vf_broadcast, 0, sizeof(vf_broadcast)); memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; From c6bebaa744f7579eb72800a262fbfeb93e40db04 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2026 16:48:36 +0000 Subject: [PATCH 190/972] ipv4: igmp: annotate data-races in igmp_heard_query() Multiple cpus can run igmp_heard_query() concurrently. Add missing READ_ONCE()/WRITE_ONCE() over following in_dev fields. - mr_qrv - mr_qi - mr_qri - mr_v1_seen - mr_v2_seen Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+ae9a171f239b14485310@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/69f38675.050a0220.3cbe47.0002.GAE@google.com Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260430164836.872079-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/igmp.c | 58 ++++++++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index a674fb44ec25ba..a9ad39064f3bb7 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -122,16 +122,29 @@ * contradict to specs provided this delay is small enough. */ -#define IGMP_V1_SEEN(in_dev) \ - (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \ - IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \ - ((in_dev)->mr_v1_seen && \ - time_before(jiffies, (in_dev)->mr_v1_seen))) -#define IGMP_V2_SEEN(in_dev) \ - (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \ - IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \ - ((in_dev)->mr_v2_seen && \ - time_before(jiffies, (in_dev)->mr_v2_seen))) +static bool IGMP_V1_SEEN(const struct in_device *in_dev) +{ + unsigned long seen; + + if (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1) + return true; + if (IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1) + return true; + seen = READ_ONCE(in_dev->mr_v1_seen); + return seen && time_before(jiffies, seen); +} + +static bool IGMP_V2_SEEN(const struct in_device *in_dev) +{ + unsigned long seen; + + if (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2) + return true; + if (IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2) + return true; + seen = READ_ONCE(in_dev->mr_v2_seen); + return seen && time_before(jiffies, seen); +} static int unsolicited_report_interval(struct in_device *in_dev) { @@ -954,23 +967,21 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, int max_delay; int mark = 0; struct net *net = dev_net(in_dev->dev); - + unsigned long seen; if (len == 8) { + seen = jiffies + READ_ONCE(in_dev->mr_qrv) * READ_ONCE(in_dev->mr_qi) + + READ_ONCE(in_dev->mr_qri); if (ih->code == 0) { /* Alas, old v1 router presents here. */ max_delay = IGMP_QUERY_RESPONSE_INTERVAL; - in_dev->mr_v1_seen = jiffies + - (in_dev->mr_qrv * in_dev->mr_qi) + - in_dev->mr_qri; + WRITE_ONCE(in_dev->mr_v1_seen, seen); group = 0; } else { /* v2 router present */ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); - in_dev->mr_v2_seen = jiffies + - (in_dev->mr_qrv * in_dev->mr_qi) + - in_dev->mr_qri; + WRITE_ONCE(in_dev->mr_v2_seen, seen); } /* cancel the interface change timer */ WRITE_ONCE(in_dev->mr_ifc_count, 0); @@ -995,6 +1006,8 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ } else { /* v3 */ + unsigned long mr_qi; + if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) return true; @@ -1015,15 +1028,16 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, * received value was zero, use the default or statically * configured value. */ - in_dev->mr_qrv = ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); - in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL; - + WRITE_ONCE(in_dev->mr_qrv, + ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv)); + mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL; + WRITE_ONCE(in_dev->mr_qi, mr_qi); /* RFC3376, 8.3. Query Response Interval: * The number of seconds represented by the [Query Response * Interval] must be less than the [Query Interval]. */ - if (in_dev->mr_qri >= in_dev->mr_qi) - in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ; + if (READ_ONCE(in_dev->mr_qri) >= mr_qi) + WRITE_ONCE(in_dev->mr_qri, (mr_qi/HZ - 1) * HZ); if (!group) { /* general query */ if (ih3->nsrcs) From a5148bc2fa27092862ac4b9e7b5c8340d60cff34 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Wed, 29 Apr 2026 18:57:39 +0100 Subject: [PATCH 191/972] net: usb: cdc_ncm: add Apple Mac USB-C direct networking quirk Apple Silicon Macs expose two CDC NCM "private" data interfaces over USB-C with VID:PID 0x05ac:0x1905 and product string "Mac". This is the same protocol Apple already ships on iPhone (0x05ac:0x12a8) and iPad (0x05ac:0x12ab) for RemoteXPC since iOS 17 -- both data interfaces lack an interrupt status endpoint, so they rely on the FLAG_LINK_INTR- conditional bind path introduced in commit 3ec8d7572a69 ("CDC-NCM: add support for Apple's private interface"). The id_table currently has entries for iPhone and iPad but not for the Mac. Without a match, cdc_ncm falls through to the generic CDC NCM class-match entry, which uses the FLAG_LINK_INTR-having cdc_ncm_info struct, so bind_common() fails on the missing status endpoint and no netdev appears. Add id_table entries for both interface numbers (0 and 2) of the Mac, bound to the existing apple_private_interface_info driver_info. Verified empirically on a Mac Studio M3 Ultra running macOS 26.5: when a Mac is connected via USB-C, ioreg shows VID 0x05ac, PID 0x1905, product string "Mac", with two NCM data interfaces at numbers 0 and 2. The same PID is presented by all current Apple Silicon Mac models (MacBook Pro/Air, Mac mini, Mac Studio across the M-series), mirroring Apple's single-PID-per-family pattern from iPhone/iPad. After this patch, plugging a Mac into a Linux host running the patched kernel produces two enx... interfaces (one per data interface), "ip -br link" lists them as UP, and standard userspace networking (DHCP, NetworkManager shared mode, etc.) works without any modprobe overrides or out-of-tree modules. Signed-off-by: Alex Cheema Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260429175739.34426-1-alex@exolabs.net Signed-off-by: Jakub Kicinski --- drivers/net/usb/cdc_ncm.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index bb9929727eb932..0223a172851ecf 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -2012,6 +2012,14 @@ static const struct usb_device_id cdc_devs[] = { .driver_info = (unsigned long)&apple_private_interface_info, }, + /* Mac */ + { USB_DEVICE_INTERFACE_NUMBER(0x05ac, 0x1905, 0), + .driver_info = (unsigned long)&apple_private_interface_info, + }, + { USB_DEVICE_INTERFACE_NUMBER(0x05ac, 0x1905, 2), + .driver_info = (unsigned long)&apple_private_interface_info, + }, + /* Ericsson MBM devices like F5521gw */ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_VENDOR, From 6d4106e8df94c0c52cf3ca6a6a0d01567fb3844e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2026 08:00:56 +0000 Subject: [PATCH 192/972] net/sched: sch_pie: annotate more data-races in pie_dump_stats() My prior patch missed few READ_ONCE()/WRITE_ONCE() annotations. Fixes: 5154561d9b11 ("net/sched: sch_pie: annotate data-races in pie_dump_stats()") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260430080056.35104-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_pie.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index fb53fbf0e32857..b41f2def2e2cc5 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -219,16 +219,14 @@ void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params, * packet timestamp. */ if (!params->dq_rate_estimator) { - vars->qdelay = now - pie_get_enqueue_time(skb); + WRITE_ONCE(vars->qdelay, + backlog ? now - pie_get_enqueue_time(skb) : 0); if (vars->dq_tstamp != DTIME_INVALID) dtime = now - vars->dq_tstamp; vars->dq_tstamp = now; - if (backlog == 0) - vars->qdelay = 0; - if (dtime == 0) return; @@ -376,7 +374,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC))) delta += MAX_PROB / (100 / 2); - vars->prob += delta; + WRITE_ONCE(vars->prob, vars->prob + delta); if (delta > 0) { /* prevent overflow */ @@ -401,7 +399,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, if (qdelay == 0 && qdelay_old == 0 && update_prob) /* Reduce drop probability to 98.4% */ - vars->prob -= vars->prob / 64; + WRITE_ONCE(vars->prob, vars->prob - vars->prob / 64); WRITE_ONCE(vars->qdelay, qdelay); vars->backlog_old = backlog; @@ -501,7 +499,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct pie_sched_data *q = qdisc_priv(sch); struct tc_pie_xstats st = { - .prob = q->vars.prob << BITS_PER_BYTE, + .prob = READ_ONCE(q->vars.prob) << BITS_PER_BYTE, .delay = ((u32)PSCHED_TICKS2NS(READ_ONCE(q->vars.qdelay))) / NSEC_PER_USEC, .packets_in = READ_ONCE(q->stats.packets_in), @@ -512,7 +510,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) }; /* avg_dq_rate is only valid if dq_rate_estimator is enabled */ - st.dq_rate_estimating = q->params.dq_rate_estimator; + st.dq_rate_estimating = READ_ONCE(q->params.dq_rate_estimator); /* unscale and return dq_rate in bytes per sec */ if (st.dq_rate_estimating) From 4bc852006b62eae8ea77e797192d089367e854ff Mon Sep 17 00:00:00 2001 From: Sagarika Sharma Date: Thu, 30 Apr 2026 20:09:00 +0000 Subject: [PATCH 193/972] ipv6: update route serial number on NETDEV_CHANGE When using IPv6 ECMP routes, if a netdev listed as a nexthop experiences a carrier change event (e.g., a bond device generating a NETDEV_CHANGE event after its slaves go linkdown), established connections utilizing that nexthop fail to fail over to other available nexthops. Instead, these connections stall or drop. This happens because the IPv6 FIB code does not invalidate the socket's cached destination when a NETDEV_CHANGE event occurs. While fib6_ifdown() correctly marks the nexthop with RTNH_F_LINKDOWN, it leaves the route's serial number unchanged. As a result, sockets with a previously cached dst do not realize the route is no longer viable and continue to try using the non-functional nexthop. This behavior contrasts with IPv4, which actively flushes cached destinations on a NETDEV_CHANGE event (see fib_netdev_event() in net/ipv4/fib_frontend.c). Fix this by updating the route serial number in fib6_ifdown() when setting RTNH_F_LINKDOWN. This invalidates stale cached destinations, forcing sockets to perform a new route lookup and fail over to a functioning nexthop. Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)") Signed-off-by: Sagarika Sharma Reviewed-by: Kuniyuki Iwashima Reviewed-by: Ido Schimmel Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260430200909.527827-2-sharmasagarika@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 19eb6b70222785..0dc0316530ca18 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4995,6 +4995,7 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; + fib6_update_sernum(net, rt); rt6_multipath_rebalance(rt); break; } From d1ae37dc6881a6a9113c8545cdbba731393d8dcd Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 30 Apr 2026 20:09:01 +0000 Subject: [PATCH 194/972] selftest: net: Add test for TCP flow failover with ECMP routes. Without the previous commit, TCP failed to switch to alternative IPv6 routes immediately upon carrier loss. It would persist with the dead route until reaching the threshold net.ipv4.tcp_retries1, leading to unnecessary delays in failover. Let's add a selftest for this scenario to ensure TCP fails over immediately upon a carrier loss event. Before: TEST: TCP IPv4 failover [ OK ] TEST: TCP IPv6 failover [FAIL] After: TEST: TCP IPv4 failover [ OK ] TEST: TCP IPv6 failover [ OK ] Signed-off-by: Kuniyuki Iwashima Signed-off-by: Sagarika Sharma Link: https://patch.msgid.link/20260430200909.527827-3-sharmasagarika@google.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + .../selftests/net/tcp_ecmp_failover.sh | 216 ++++++++++++++++++ 2 files changed, 217 insertions(+) create mode 100755 tools/testing/selftests/net/tcp_ecmp_failover.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index a275ed5840265c..f3da38c54d276d 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -96,6 +96,7 @@ TEST_PROGS := \ srv6_hl2encap_red_l2vpn_test.sh \ srv6_iptunnel_cache.sh \ stress_reuseport_listen.sh \ + tcp_ecmp_failover.sh \ tcp_fastopen_backup_key.sh \ test_bpf.sh \ test_bridge_backup_port.sh \ diff --git a/tools/testing/selftests/net/tcp_ecmp_failover.sh b/tools/testing/selftests/net/tcp_ecmp_failover.sh new file mode 100755 index 00000000000000..5768aa8bff6a61 --- /dev/null +++ b/tools/testing/selftests/net/tcp_ecmp_failover.sh @@ -0,0 +1,216 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright 2026 Google LLC. +# +# This test verifies TCP flow failover between ECMP routes +# upon carrier loss on the active device. +# +# socat -----------------------------> socat +# | +# .-- veth-c1 -|- veth-s1 --. +# dummy0 -| | |-- dummy0 +# '-- veth-c2 -|- veth-s2 --' +# | +# + +REQUIRE_JQ=no +REQUIRE_MZ=no +NUM_NETIFS=0 + +source forwarding/lib.sh + +CLIENT_IP="10.0.59.1" +SERVER_IP="10.0.92.1" +CLIENT_IP6="2001:db8:5a9a::1" +SERVER_IP6="2001:db8:9292::1" + +setup_server() +{ + IP="ip -n $server" + NS_EXEC="ip netns exec $server" + + $IP link add dummy0 type dummy + $IP link set dummy0 up + + $IP -4 addr add $SERVER_IP/32 dev dummy0 + $IP -6 addr add $SERVER_IP6/128 dev dummy0 nodad + + $IP link set veth-s1 up + $IP link set veth-s2 up + + $IP -4 addr add 192.168.1.2/24 dev veth-s1 + $IP -4 addr add 192.168.2.2/24 dev veth-s2 + + $IP -4 route add $CLIENT_IP/32 \ + nexthop via 192.168.1.1 dev veth-s1 weight 1 \ + nexthop via 192.168.2.1 dev veth-s2 weight 1 + + $IP -6 addr add 2001:db8:1::2/64 dev veth-s1 nodad + $IP -6 addr add 2001:db8:2::2/64 dev veth-s2 nodad + + $IP -6 route add $CLIENT_IP6/128 \ + nexthop via 2001:db8:1::1 dev veth-s1 weight 1 \ + nexthop via 2001:db8:2::1 dev veth-s2 weight 1 +} + +setup_client() +{ + IP="ip -n $client" + NS_EXEC="ip netns exec $client" + + $IP link add dummy0 type dummy + $IP link set dummy0 up + + $IP -4 addr add $CLIENT_IP/32 dev dummy0 + $IP -6 addr add $CLIENT_IP6/128 dev dummy0 nodad + + $IP link set veth-c1 up + $IP link set veth-c2 up + + $IP -4 addr add 192.168.1.1/24 dev veth-c1 + $IP -4 addr add 192.168.2.1/24 dev veth-c2 + + $IP -4 route add $SERVER_IP/32 \ + nexthop via 192.168.1.2 dev veth-c1 weight 1 \ + nexthop via 192.168.2.2 dev veth-c2 weight 1 + + $IP -6 addr add 2001:db8:1::1/64 dev veth-c1 nodad + $IP -6 addr add 2001:db8:2::1/64 dev veth-c2 nodad + + $IP -6 route add $SERVER_IP6/128 \ + nexthop via 2001:db8:1::2 dev veth-c1 weight 1 \ + nexthop via 2001:db8:2::2 dev veth-c2 weight 1 + + # By default, tcp_retries1=3 triggers a route refresh + # after 3 retransmits (~5s). Ensure this never occurs + # for test stability. + $NS_EXEC sysctl -qw net.ipv4.tcp_retries1=100 + + # When NETDEV_CHANGE is issued for a dev tied to an ECMP + # route, RTNH_F_LINKDOWN is flagged and the sernum is + # bumped to invalidate the route via sk_dst_check(). + # + # Without ignore_routes_with_linkdown=1, subsequent + # lookups may still select the same RTNH_F_LINKDOWN route. + $NS_EXEC sysctl -qw net.ipv4.conf.veth-c1.ignore_routes_with_linkdown=1 + $NS_EXEC sysctl -qw net.ipv4.conf.veth-c2.ignore_routes_with_linkdown=1 + + $NS_EXEC sysctl -qw net.ipv6.conf.veth-c1.ignore_routes_with_linkdown=1 + $NS_EXEC sysctl -qw net.ipv6.conf.veth-c2.ignore_routes_with_linkdown=1 +} + +setup() +{ + setup_ns client server + + ip -n "$client" link add veth-c1 type veth peer veth-s1 netns "$server" + ip -n "$client" link add veth-c2 type veth peer veth-s2 netns "$server" + + setup_server + setup_client +} + +cleanup() +{ + cleanup_all_ns > /dev/null 2>&1 +} + +tcp_ecmp_failover() +{ + local pf=$1; shift + local server_ip=$1; shift + local client_ip=$1; shift + + RET=0 + + tcpdump_start veth-s1 "$server" + tcpdump_start veth-s2 "$server" + + ip netns exec "$server" \ + socat -u TCP-LISTEN:8080,pf="$pf",bind="$server_ip",reuseaddr /dev/null & + server_pid=$! + + # Wait for server to start listening. + # Sometimes client fails without this sleep. + sleep 1 + + ip netns exec "$client" \ + socat -u /dev/zero TCP:"$server_ip":8080,pf="$pf",bind="$client_ip" & + client_pid=$! + + # To capture enough packets. + sleep 3 + + tcpdump_stop veth-s1 + tcpdump_stop veth-s2 + + pkts_s1=$(tcpdump_show veth-s1 | wc -l) + pkts_s2=$(tcpdump_show veth-s2 | wc -l) + + tcpdump_cleanup veth-s1 + tcpdump_cleanup veth-s2 + + # Detect the device chosen by the client + if [ "$pkts_s1" -gt "$pkts_s2" ]; then + veth_down=veth-s1 + veth_up=veth-s2 + else + veth_down=veth-s2 + veth_up=veth-s1 + fi + + # Taking down $veth_down causes its peer to lose carrier, + # triggering NETDEV_CHANGE. This flags RTNH_F_LINKDOWN + # and bumps the sernum for the route associated with that + # peer, invalidating the cached dst in the TCP socket. + # + # Consequently, sk_dst_check() fails, forcing the subsequent + # lookup to select the remaining healthy route via $veth_up. + ip -n "$server" link set "$veth_down" down + + tcpdump_start "$veth_up" "$server" + + # To capture enough packets. + sleep 3 + + tcpdump_stop "$veth_up" + + kill -9 "$client_pid" > /dev/null 2>&1 + kill -9 "$server_pid" > /dev/null 2>&1 + wait 2> /dev/null + + pkts=$(tcpdump_show $veth_up | wc -l) + + tcpdump_cleanup "$veth_up" + + if [ "$pkts" -lt 1000 ]; then + RET=$ksft_fail + fi +} + +test_ipv4() +{ + setup + tcp_ecmp_failover IPv4 $SERVER_IP $CLIENT_IP + log_test "TCP IPv4 failover" + cleanup +} + +test_ipv6() +{ + setup + tcp_ecmp_failover IPv6 "[$SERVER_IP6]" "[$CLIENT_IP6]" + log_test "TCP IPv6 failover" + cleanup +} + +require_command socat +require_command tcpdump + +trap cleanup EXIT + +test_ipv4 +test_ipv6 + +exit "$EXIT_STATUS" From b1f1e80620deb49daf63c2e677046599b693dc1f Mon Sep 17 00:00:00 2001 From: DaeMyung Kang Date: Tue, 28 Apr 2026 23:08:54 +0900 Subject: [PATCH 195/972] ksmbd: centralize ksmbd_conn final release to plug transport leak ksmbd_conn_free() is one of four sites that can observe the last refcount drop of a struct ksmbd_conn. The other three fs/smb/server/connection.c ksmbd_conn_r_count_dec() fs/smb/server/oplock.c __free_opinfo() fs/smb/server/vfs_cache.c session_fd_check() end the conn with a bare kfree(), skipping ida_destroy(&conn->async_ida) and conn->transport->ops->free_transport(conn->transport). Whenever one of them is the last putter, the embedded async_ida and the entire transport struct leak -- for TCP, that is also the struct socket and the kvec iov. __free_opinfo() being a final putter is not theoretical. opinfo_put() queues the callback via call_rcu(&opinfo->rcu, free_opinfo_rcu), so ksmbd_server_terminate_conn() can deposit N opinfo releases in RCU and have ksmbd_conn_free() run in the handler thread before any of them fire. ksmbd_conn_free() then observes refcnt > 0 and short-circuits; the last RCU-delivered __free_opinfo() falls onto its bare kfree(conn) branch and the transport is lost. A/B validation in a QEMU/virtme guest, mounting //127.0.0.1/testshare: each iteration holds 8 files open via sleep processes, force-closes TCP with "ss -K sport = :445", kills the holders, lazy-umounts; repeated 10 times, then ksmbd shutdown and kmemleak scan. state conn_alloc conn_free tcp_free opi_rcu kmemleak ---------- ---------- --------- -------- ------- -------- pre-patch 20 20 10 160 7 with patch 20 20 20 160 0 Pre-patch conn_free=20 with tcp_free=10 directly demonstrates the bare-kfree paths skipping transport cleanup; kmemleak backtraces point into struct tcp_transport / iov. With this patch tcp_free matches conn_free at 20/20 and kmemleak is clean. Move the per-struct final release into __ksmbd_conn_release_work() and route the three bare-kfree final-put sites through a new ksmbd_conn_put(). Those sites now pair ida_destroy() and free_transport() with kfree(conn) regardless of which holder happens to release the last reference. stop_sessions() only triggers the transport shutdown and does not itself drop the last conn reference, so it is unaffected. The centralized release reaches sock_release() -> tcp_close() -> lock_sock_nested() (might_sleep) from every final putter, including __free_opinfo() invoked from an RCU softirq callback, which trips CONFIG_DEBUG_ATOMIC_SLEEP. Defer the release to a dedicated ksmbd_conn_wq workqueue so ksmbd_conn_put() is safe from any non-sleeping context. Make ksmbd_file own a strong connection reference while fp->conn is non-NULL so durable-preserve and final-close paths cannot dereference a stale connection. ksmbd_open_fd() and ksmbd_reopen_durable_fd() take the reference via ksmbd_conn_get() (the latter also reorders the fp->conn / fp->tcon assignments before __open_id() so the published fp is never observed with fp->conn == NULL); session_fd_check() and __ksmbd_close_fd() drop it via ksmbd_conn_put(). With that invariant, session_fd_check() can take a local conn pointer once and use it across the m_op_list and lock_list iterations even though op->conn puts may otherwise drop the last reference. At module exit the workqueue is flushed and destroyed after rcu_barrier(), so any release queued by a trailing RCU callback is drained before the inode hash and module text go away. Fixes: ee426bfb9d09 ("ksmbd: add refcnt to ksmbd_conn struct") Signed-off-by: DaeMyung Kang Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/connection.c | 101 +++++++++++++++++++++++++++++++------ fs/smb/server/connection.h | 6 +++ fs/smb/server/oplock.c | 7 +-- fs/smb/server/server.c | 12 +++++ fs/smb/server/vfs_cache.c | 60 ++++++++++++++++++---- 5 files changed, 156 insertions(+), 30 deletions(-) diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index c5aac4946cbe52..95654a80816286 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -79,6 +79,81 @@ static int create_proc_clients(void) { return 0; } static void delete_proc_clients(void) {} #endif +static struct workqueue_struct *ksmbd_conn_wq; + +int ksmbd_conn_wq_init(void) +{ + ksmbd_conn_wq = alloc_workqueue("ksmbd-conn-release", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0); + if (!ksmbd_conn_wq) + return -ENOMEM; + return 0; +} + +void ksmbd_conn_wq_destroy(void) +{ + if (ksmbd_conn_wq) { + destroy_workqueue(ksmbd_conn_wq); + ksmbd_conn_wq = NULL; + } +} + +/* + * __ksmbd_conn_release_work() - perform the final, once-per-struct cleanup + * of a ksmbd_conn whose refcount has just dropped to zero. + * + * This is the common release path used by ksmbd_conn_put() for the embedded + * state that outlives the connection thread: async_ida and the attached + * transport (which owns the socket and iov for TCP). Called from a workqueue + * so that sleep-allowed teardown (sock_release -> tcp_close -> + * lock_sock_nested) never runs from an RCU softirq callback (free_opinfo_rcu) + * or any other non-sleeping putter context. + */ +static void __ksmbd_conn_release_work(struct work_struct *work) +{ + struct ksmbd_conn *conn = + container_of(work, struct ksmbd_conn, release_work); + + ida_destroy(&conn->async_ida); + conn->transport->ops->free_transport(conn->transport); + kfree(conn); +} + +/** + * ksmbd_conn_get() - take a reference on @conn and return it. + * + * Returns @conn unchanged so callers can write + * "fp->conn = ksmbd_conn_get(work->conn);" in one expression. Returns NULL + * if @conn is NULL. + */ +struct ksmbd_conn *ksmbd_conn_get(struct ksmbd_conn *conn) +{ + if (!conn) + return NULL; + + atomic_inc(&conn->refcnt); + return conn; +} + +/** + * ksmbd_conn_put() - drop a reference and, if it was the last, queue the + * release onto ksmbd_conn_wq so it runs from process context. + * + * Callable from any context including RCU softirq callbacks and non-sleeping + * locks; the actual release is deferred to the workqueue. ksmbd_conn_wq is + * created in ksmbd_server_init() before any conn can be allocated and is + * destroyed in ksmbd_server_exit() after rcu_barrier(), so it is always + * non-NULL while a conn reference is held. + */ +void ksmbd_conn_put(struct ksmbd_conn *conn) +{ + if (!conn) + return; + + if (atomic_dec_and_test(&conn->refcnt)) + queue_work(ksmbd_conn_wq, &conn->release_work); +} + /** * ksmbd_conn_free() - free resources of the connection instance * @@ -93,23 +168,19 @@ void ksmbd_conn_free(struct ksmbd_conn *conn) hash_del(&conn->hlist); up_write(&conn_list_lock); + /* + * request_buf / preauth_info / mechToken are only ever accessed by the + * connection handler thread that owns @conn. ksmbd_conn_free() is + * called from the transport free_transport() path when that thread is + * exiting, so it is safe to release them unconditionally even when + * ksmbd_conn_put() below is not the final putter (oplock / ksmbd_file + * holders only retain the conn pointer, not these per-thread buffers). + */ xa_destroy(&conn->sessions); kvfree(conn->request_buf); kfree(conn->preauth_info); kfree(conn->mechToken); - if (atomic_dec_and_test(&conn->refcnt)) { - /* - * async_ida is embedded in struct ksmbd_conn, so pair - * ida_destroy() with the final kfree() rather than with - * the unconditional field teardown above. This keeps - * the IDA valid for the entire lifetime of the struct, - * even while other refcount holders (oplock / vfs - * durable handles) still reference the connection. - */ - ida_destroy(&conn->async_ida); - conn->transport->ops->free_transport(conn->transport); - kfree(conn); - } + ksmbd_conn_put(conn); } /** @@ -136,6 +207,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) conn->um = ERR_PTR(-EOPNOTSUPP); if (IS_ERR(conn->um)) conn->um = NULL; + INIT_WORK(&conn->release_work, __ksmbd_conn_release_work); atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); atomic_set(&conn->refcnt, 1); @@ -512,8 +584,7 @@ void ksmbd_conn_r_count_dec(struct ksmbd_conn *conn) if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q)) wake_up(&conn->r_count_q); - if (atomic_dec_and_test(&conn->refcnt)) - kfree(conn); + ksmbd_conn_put(conn); } int ksmbd_conn_transport_init(void) diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index de2d46941c930b..e074be9425823b 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -16,6 +16,7 @@ #include #include #include +#include #include "smb_common.h" #include "ksmbd_work.h" @@ -120,6 +121,7 @@ struct ksmbd_conn { bool binding; atomic_t refcnt; bool is_aapl; + struct work_struct release_work; }; struct ksmbd_conn_ops { @@ -164,6 +166,10 @@ void ksmbd_conn_wait_idle(struct ksmbd_conn *conn); int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 sess_id); struct ksmbd_conn *ksmbd_conn_alloc(void); void ksmbd_conn_free(struct ksmbd_conn *conn); +struct ksmbd_conn *ksmbd_conn_get(struct ksmbd_conn *conn); +void ksmbd_conn_put(struct ksmbd_conn *conn); +int ksmbd_conn_wq_init(void); +void ksmbd_conn_wq_destroy(void); bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c); int ksmbd_conn_write(struct ksmbd_work *work); int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index cd3f28b0e7cb24..8feca02ddbf259 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -30,7 +30,6 @@ static DEFINE_RWLOCK(lease_list_lock); static struct oplock_info *alloc_opinfo(struct ksmbd_work *work, u64 id, __u16 Tid) { - struct ksmbd_conn *conn = work->conn; struct ksmbd_session *sess = work->sess; struct oplock_info *opinfo; @@ -39,7 +38,7 @@ static struct oplock_info *alloc_opinfo(struct ksmbd_work *work, return NULL; opinfo->sess = sess; - opinfo->conn = conn; + opinfo->conn = ksmbd_conn_get(work->conn); opinfo->level = SMB2_OPLOCK_LEVEL_NONE; opinfo->op_state = OPLOCK_STATE_NONE; opinfo->pending_break = 0; @@ -50,7 +49,6 @@ static struct oplock_info *alloc_opinfo(struct ksmbd_work *work, init_waitqueue_head(&opinfo->oplock_brk); atomic_set(&opinfo->refcount, 1); atomic_set(&opinfo->breaking_cnt, 0); - atomic_inc(&opinfo->conn->refcnt); return opinfo; } @@ -132,8 +130,7 @@ static void __free_opinfo(struct oplock_info *opinfo) { if (opinfo->is_lease) free_lease(opinfo); - if (opinfo->conn && atomic_dec_and_test(&opinfo->conn->refcnt)) - kfree(opinfo->conn); + ksmbd_conn_put(opinfo->conn); kfree(opinfo); } diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 58ef02c423fcef..5d799b2d4c62f4 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -596,8 +596,14 @@ static int __init ksmbd_server_init(void) if (ret) goto err_crypto_destroy; + ret = ksmbd_conn_wq_init(); + if (ret) + goto err_workqueue_destroy; + return 0; +err_workqueue_destroy: + ksmbd_workqueue_destroy(); err_crypto_destroy: ksmbd_crypto_destroy(); err_release_inode_hash: @@ -623,6 +629,12 @@ static void __exit ksmbd_server_exit(void) { ksmbd_server_shutdown(); rcu_barrier(); + /* + * ksmbd_conn_put() defers the final release onto ksmbd_conn_wq, + * so drain it after rcu_barrier() has fired any pending RCU + * callbacks that may have queued a release. + */ + ksmbd_conn_wq_destroy(); ksmbd_release_inode_hash(); } diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index 3551f01a3fa035..4a18107937cc94 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -475,6 +475,17 @@ static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp) kfree(smb_lock); } + /* + * Drop fp's strong reference on conn (taken in ksmbd_open_fd() / + * ksmbd_reopen_durable_fd()). Durable fps that reached the + * scavenger have already had fp->conn cleared by session_fd_check(), + * in which case there is nothing to drop here. + */ + if (fp->conn) { + ksmbd_conn_put(fp->conn); + fp->conn = NULL; + } + if (ksmbd_stream_fd(fp)) kfree(fp->stream.name); kfree(fp->owner.name); @@ -752,7 +763,14 @@ struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp) atomic_set(&fp->refcount, 1); fp->filp = filp; - fp->conn = work->conn; + /* + * fp owns a strong reference on fp->conn for as long as fp->conn is + * non-NULL, so session_fd_check() and __ksmbd_close_fd() never + * dereference a dangling pointer. Paired with ksmbd_conn_put() in + * session_fd_check() (durable preserve), in __ksmbd_close_fd() + * (final close), and on the error paths below. + */ + fp->conn = ksmbd_conn_get(work->conn); fp->tcon = work->tcon; fp->volatile_id = KSMBD_NO_FID; fp->persistent_id = KSMBD_NO_FID; @@ -774,6 +792,8 @@ struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp) return fp; err_out: + /* fp->conn was set and refcounted before every branch here. */ + ksmbd_conn_put(fp->conn); kmem_cache_free(filp_cache, fp); return ERR_PTR(ret); } @@ -1062,25 +1082,32 @@ static bool session_fd_check(struct ksmbd_tree_connect *tcon, if (!is_reconnectable(fp)) return false; + if (WARN_ON_ONCE(!fp->conn)) + return false; + if (ksmbd_vfs_copy_durable_owner(fp, user)) return false; + /* + * fp owns a strong reference on fp->conn (taken in ksmbd_open_fd() + * / ksmbd_reopen_durable_fd()), so conn stays valid for the whole + * body of this function regardless of any op->conn puts below. + */ conn = fp->conn; ci = fp->f_ci; down_write(&ci->m_lock); list_for_each_entry_rcu(op, &ci->m_op_list, op_entry) { if (op->conn != conn) continue; - if (op->conn && atomic_dec_and_test(&op->conn->refcnt)) - kfree(op->conn); + ksmbd_conn_put(op->conn); op->conn = NULL; } up_write(&ci->m_lock); list_for_each_entry_safe(smb_lock, tmp_lock, &fp->lock_list, flist) { - spin_lock(&fp->conn->llist_lock); + spin_lock(&conn->llist_lock); list_del_init(&smb_lock->clist); - spin_unlock(&fp->conn->llist_lock); + spin_unlock(&conn->llist_lock); } fp->conn = NULL; @@ -1091,6 +1118,8 @@ static bool session_fd_check(struct ksmbd_tree_connect *tcon, fp->durable_scavenger_timeout = jiffies_to_msecs(jiffies) + fp->durable_timeout; + /* Drop fp's own reference on conn. */ + ksmbd_conn_put(conn); return true; } @@ -1178,15 +1207,27 @@ int ksmbd_reopen_durable_fd(struct ksmbd_work *work, struct ksmbd_file *fp) old_f_state = fp->f_state; fp->f_state = FP_NEW; + + /* + * Initialize fp's connection binding before publishing fp into the + * session's file table. If __open_id() is ordered first, a + * concurrent teardown that iterates the table can observe a valid + * volatile_id with fp->conn == NULL and preserve a + * partially-initialized fp. fp owns a strong reference on the new + * conn (see ksmbd_open_fd()); undo it on __open_id() failure. + */ + fp->conn = ksmbd_conn_get(conn); + fp->tcon = work->tcon; + __open_id(&work->sess->file_table, fp, OPEN_ID_TYPE_VOLATILE_ID); if (!has_file_id(fp->volatile_id)) { + fp->conn = NULL; + fp->tcon = NULL; + ksmbd_conn_put(conn); fp->f_state = old_f_state; return -EBADF; } - fp->conn = conn; - fp->tcon = work->tcon; - list_for_each_entry(smb_lock, &fp->lock_list, flist) { spin_lock(&conn->llist_lock); list_add_tail(&smb_lock->clist, &conn->lock_list); @@ -1198,8 +1239,7 @@ int ksmbd_reopen_durable_fd(struct ksmbd_work *work, struct ksmbd_file *fp) list_for_each_entry_rcu(op, &ci->m_op_list, op_entry) { if (op->conn) continue; - op->conn = fp->conn; - atomic_inc(&op->conn->refcnt); + op->conn = ksmbd_conn_get(fp->conn); } up_write(&ci->m_lock); From a42896bebfcc287ed1e61d820a888e33b1eb80ce Mon Sep 17 00:00:00 2001 From: DaeMyung Kang Date: Tue, 28 Apr 2026 23:08:55 +0900 Subject: [PATCH 196/972] ksmbd: harden file lifetime during session teardown __close_file_table_ids() is the per-session teardown that closes every fp belonging to a session (or to one tree connect on that session) by walking the session's volatile-id idr. The current loop has three related problems on busy or racing workloads: * Sleeping under ft->lock. The session-teardown skip callback, session_fd_check(), already sleeps in ksmbd_vfs_copy_durable_owner() -> kstrdup(GFP_KERNEL) and down_write(&fp->f_ci->m_lock) (a rw_semaphore). Running the callback inside write_lock(&ft->lock) trips CONFIG_DEBUG_ATOMIC_SLEEP / CONFIG_PROVE_LOCKING on a durable-fd workload. * Refcount accounting blind to f_state. The unconditional atomic_dec_and_test(&fp->refcount) does not distinguish FP_INITED (idr-owned reference still intact) from FP_CLOSED (an earlier ksmbd_close_fd() already consumed the idr-owned reference while leaving fp in the idr because a holder kept refcount non-zero). When the latter races with teardown the same path over-decrements into a holder reference and ksmbd_fd_put() later UAFs that holder. * FP_NEW window. Between __open_id() publishing fp into the session idr and ksmbd_update_fstate(..., FP_INITED) committing the transition at the end of smb2_open(), an fp is in FP_NEW and an intervening teardown that takes a transient reference and unpublishes the volatile id leaves the original idr-owned reference orphaned -- the opener is unaware that fp has been unpublished, returns success to the client, and the fp leaks at refcount = 1. Refactor __close_file_table_ids() to take a transient reference on fp and unpublish fp from the session idr *under ft->lock* before calling skip() outside the lock. A transient ref protects lifetime but not concurrent field mutation, so the idr_remove() is what keeps __ksmbd_lookup_fd() through this session's idr from granting a new ksmbd_fp_get() reference to an fp whose fp->conn / fp->tcon / fp->volatile_id / op->conn / lock_list links are about to be rewritten by session_fd_check(). Durable reconnect is unaffected because it reaches fp through the global durable table (ksmbd_lookup_durable_fd -> global_ft). Decide n_to_drop together with any FP_INITED -> FP_CLOSED transition under ft->lock so teardown and ksmbd_close_fd() never both consume the idr-owned reference. See ksmbd_mark_fp_closed() for the per-state accounting. For the FP_NEW path to be safe, the opener has to learn that fp was unpublished: ksmbd_update_fstate() now returns -ENOENT when an FP_NEW -> FP_INITED transition finds f_state already advanced or the volatile id cleared (both committed by teardown under ft->lock); smb2_open() propagates that as STATUS_OBJECT_NAME_INVALID and drops the original reference via ksmbd_fd_put(). The list removal cannot be left for a deferred final putter because fp->volatile_id has already been cleared and __ksmbd_remove_fd() will intentionally skip both idr_remove() and list_del_init(). Move the m_fp_list unlink in __ksmbd_remove_fd() above the volatile-id check so that an FP_NEW fp that happened to be added to m_fp_list (smb2_open() adds fp->node before ksmbd_update_fstate() runs) is still cleaned up on the deferred putter path; list_del_init() on an empty node is a no-op and remains safe for fps that were never added. Add a defensive guard in session_fd_check() that refuses non-FP_INITED fps so that even if a teardown reaches an FP_NEW fp it falls into the close branch (where the n_to_drop = 1 accounting keeps the opener's reference alive) instead of the durable-preserve branch (which mutates fp->conn / fp->tcon). Validation on a debug kernel additionally built with CONFIG_DEBUG_LIST and CONFIG_DEBUG_OBJECTS_WORK used a same-session two-tcon workload (open/write storm on one tcon, 50 tree disconnects on the other) and reported no list-corruption, work_struct ODEBUG, sleep-in-atomic, lockdep or kmemleak reports. Reverting only the __close_file_table_ids() hunk while keeping a forced-is_reconnectable() harness produced the expected sleep-in-atomic at vfs_cache.c:1095, confirming the ft->lock-out-of-sleepable-skip discipline. KASAN-enabled direct SMB2 coverage with durable handles enabled exercised ksmbd_close_tree_conn_fds(), ksmbd_close_session_fds(), the FP_NEW failure path, tree_conn_fd_check(), and a non-zero session_fd_check() durable-preserve return. This produced no KASAN, DEBUG_LIST, ODEBUG, or WARNING reports. Fixes: f44158485826 ("cifsd: add file operations") Signed-off-by: DaeMyung Kang Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 6 +- fs/smb/server/vfs_cache.c | 179 +++++++++++++++++++++++++++++++++----- fs/smb/server/vfs_cache.h | 4 +- 3 files changed, 164 insertions(+), 25 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 47b7af631f7b35..62d4399a993d80 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -3767,8 +3767,10 @@ int smb2_open(struct ksmbd_work *work) err_out2: if (!rc) { - ksmbd_update_fstate(&work->sess->file_table, fp, FP_INITED); - rc = ksmbd_iov_pin_rsp(work, (void *)rsp, iov_len); + rc = ksmbd_update_fstate(&work->sess->file_table, fp, + FP_INITED); + if (!rc) + rc = ksmbd_iov_pin_rsp(work, (void *)rsp, iov_len); } if (rc) { if (rc == -EINVAL) diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index 4a18107937cc94..dc4037ef1834ac 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -431,13 +431,13 @@ static void ksmbd_remove_durable_fd(struct ksmbd_file *fp) static void __ksmbd_remove_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp) { - if (!has_file_id(fp->volatile_id)) - return; - down_write(&fp->f_ci->m_lock); list_del_init(&fp->node); up_write(&fp->f_ci->m_lock); + if (!has_file_id(fp->volatile_id)) + return; + write_lock(&ft->lock); idr_remove(ft->idr, fp->volatile_id); write_unlock(&ft->lock); @@ -798,15 +798,58 @@ struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp) return ERR_PTR(ret); } -void ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp, - unsigned int state) +/** + * ksmbd_update_fstate() - update an fp state under the file-table lock + * @ft: file table that publishes @fp's volatile id + * @fp: file pointer to update + * @state: new state + * + * Return: 0 on success. The FP_NEW -> FP_INITED transition is special: + * -ENOENT if teardown already unpublished @fp by advancing the state or + * clearing the volatile id. Other state updates preserve the historical + * fire-and-forget behavior. + */ +int ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp, + unsigned int state) { + int ret; + if (!fp) - return; + return -ENOENT; write_lock(&ft->lock); - fp->f_state = state; + if (state == FP_INITED && + (fp->f_state != FP_NEW || !has_file_id(fp->volatile_id))) { + ret = -ENOENT; + } else { + fp->f_state = state; + ret = 0; + } write_unlock(&ft->lock); + + return ret; +} + +/* + * ksmbd_mark_fp_closed() - mark fp closed under ft->lock and return how many + * refs the teardown path owns. + * + * FP_INITED has a normal idr-owned reference, so teardown owns both that + * reference and the transient lookup reference. FP_NEW is still owned by the + * in-flight opener/reopener, which will drop the original reference after + * ksmbd_update_fstate(..., FP_INITED) observes the cleared volatile id. + * FP_CLOSED on entry means an earlier ksmbd_close_fd() already consumed the + * idr-owned ref. + */ +static int ksmbd_mark_fp_closed(struct ksmbd_file *fp) +{ + if (fp->f_state == FP_INITED) { + set_close_state_blocked_works(fp); + fp->f_state = FP_CLOSED; + return 2; + } + + return 1; } static int @@ -814,7 +857,8 @@ __close_file_table_ids(struct ksmbd_session *sess, struct ksmbd_tree_connect *tcon, bool (*skip)(struct ksmbd_tree_connect *tcon, struct ksmbd_file *fp, - struct ksmbd_user *user)) + struct ksmbd_user *user), + bool skip_preserves_fp) { struct ksmbd_file_table *ft = &sess->file_table; struct ksmbd_file *fp; @@ -822,32 +866,120 @@ __close_file_table_ids(struct ksmbd_session *sess, int num = 0; while (1) { + int n_to_drop; + write_lock(&ft->lock); fp = idr_get_next(ft->idr, &id); if (!fp) { write_unlock(&ft->lock); break; } - - if (skip(tcon, fp, sess->user) || - !atomic_dec_and_test(&fp->refcount)) { + if (!atomic_inc_not_zero(&fp->refcount)) { id++; write_unlock(&ft->lock); continue; } - set_close_state_blocked_works(fp); - idr_remove(ft->idr, fp->volatile_id); - fp->volatile_id = KSMBD_NO_FID; - write_unlock(&ft->lock); + if (skip_preserves_fp) { + /* + * Session teardown: skip() is session_fd_check(), + * which may sleep and mutates fp->conn / fp->tcon / + * fp->volatile_id when it chooses to preserve fp + * for durable reconnect. Unpublish fp from the + * session idr here, under ft->lock, so that + * __ksmbd_lookup_fd() through this session cannot + * grant a new ksmbd_fp_get() reference to an fp + * whose fields are about to be rewritten outside + * the lock. Durable reconnect still reaches fp via + * global_ft. + */ + idr_remove(ft->idr, id); + fp->volatile_id = KSMBD_NO_FID; + write_unlock(&ft->lock); + + if (skip(tcon, fp, sess->user)) { + /* + * session_fd_check() has converted fp to + * durable-preserve state and cleared its + * per-conn fields. fp is already unpublished + * above; the original idr-owned ref keeps it + * alive for the durable scavenger. Drop only + * the transient ref. atomic_dec() is safe -- + * atomic_inc_not_zero() succeeded on a + * positive value and we added one more, so + * refcount cannot be zero here. + */ + atomic_dec(&fp->refcount); + id++; + continue; + } + + /* + * Keep the close-state decision under the same lock + * observed by ksmbd_update_fstate(), which is how an + * in-flight FP_NEW opener learns that teardown has + * cleared its volatile id. + */ + write_lock(&ft->lock); + n_to_drop = ksmbd_mark_fp_closed(fp); + write_unlock(&ft->lock); + } else { + /* + * Tree teardown: skip() is tree_conn_fd_check(), a + * cheap pointer compare that doesn't sleep and has + * no side effects, so keep the skip decision plus + * the unpublish-and-mark-closed sequence atomic + * under ft->lock. fps belonging to other tree + * connects (skip() == true) stay fully published in + * the session idr with no lock window. + */ + if (skip(tcon, fp, sess->user)) { + atomic_dec(&fp->refcount); + write_unlock(&ft->lock); + id++; + continue; + } + idr_remove(ft->idr, id); + fp->volatile_id = KSMBD_NO_FID; + n_to_drop = ksmbd_mark_fp_closed(fp); + write_unlock(&ft->lock); + } + /* + * fp->volatile_id is already cleared to prevent stale idr + * removal from a deferred final close. Remove fp from + * m_fp_list here because __ksmbd_remove_fd() will skip the + * list unlink when volatile_id is KSMBD_NO_FID. + */ down_write(&fp->f_ci->m_lock); list_del_init(&fp->node); up_write(&fp->f_ci->m_lock); - __ksmbd_close_fd(ft, fp); - - num++; + /* + * Drop the references this iteration owns: + * + * n_to_drop == 2: we observed FP_INITED and committed + * the FP_CLOSED transition ourselves, so we own the + * transient (+1) and the still-intact idr-owned ref. + * + * n_to_drop == 1: either a prior ksmbd_close_fd() + * already consumed the idr-owned ref, or fp was still + * FP_NEW and the in-flight opener/reopener must keep + * the original reference until ksmbd_update_fstate() + * observes the cleared volatile id. + * + * If we end up as the final putter, finalize fp and + * account the open_files_count decrement via the caller's + * atomic_sub(num, ...). Otherwise the remaining user's + * ksmbd_fd_put() reaches __put_fd_final(), which does its + * own atomic_dec(&open_files_count), so we must not count + * this fp here -- doing so would double-decrement the + * connection-wide counter. + */ + if (atomic_sub_and_test(n_to_drop, &fp->refcount)) { + __ksmbd_close_fd(NULL, fp); + num++; + } id++; } @@ -1082,6 +1214,9 @@ static bool session_fd_check(struct ksmbd_tree_connect *tcon, if (!is_reconnectable(fp)) return false; + if (fp->f_state != FP_INITED) + return false; + if (WARN_ON_ONCE(!fp->conn)) return false; @@ -1127,7 +1262,8 @@ void ksmbd_close_tree_conn_fds(struct ksmbd_work *work) { int num = __close_file_table_ids(work->sess, work->tcon, - tree_conn_fd_check); + tree_conn_fd_check, + false); atomic_sub(num, &work->conn->stats.open_files_count); } @@ -1136,7 +1272,8 @@ void ksmbd_close_session_fds(struct ksmbd_work *work) { int num = __close_file_table_ids(work->sess, work->tcon, - session_fd_check); + session_fd_check, + true); atomic_sub(num, &work->conn->stats.open_files_count); } @@ -1268,7 +1405,7 @@ void ksmbd_destroy_file_table(struct ksmbd_session *sess) if (!ft->idr) return; - __close_file_table_ids(sess, NULL, session_fd_check); + __close_file_table_ids(sess, NULL, session_fd_check, true); idr_destroy(ft->idr); kfree(ft->idr); ft->idr = NULL; diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h index 866f32c10d4dda..e6871266a94bad 100644 --- a/fs/smb/server/vfs_cache.h +++ b/fs/smb/server/vfs_cache.h @@ -172,8 +172,8 @@ int ksmbd_close_inode_fds(struct ksmbd_work *work, struct inode *inode); int ksmbd_init_global_file_table(void); void ksmbd_free_global_file_table(void); void ksmbd_set_fd_limit(unsigned long limit); -void ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp, - unsigned int state); +int ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp, + unsigned int state); bool ksmbd_vfs_compare_durable_owner(struct ksmbd_file *fp, struct ksmbd_user *user); From bf736184d063da1a552ffeff0481813599a182cc Mon Sep 17 00:00:00 2001 From: DaeMyung Kang Date: Tue, 28 Apr 2026 23:08:56 +0900 Subject: [PATCH 197/972] ksmbd: close durable scavenger races against m_fp_list lookups ksmbd_durable_scavenger() has two related races against any walker that iterates f_ci->m_fp_list, including ksmbd_lookup_fd_inode() (used by ksmbd_vfs_rename) and the share-mode checks in fs/smb/server/smb_common.c. (1) fp->node list-head reuse. Durable-preserved handles can remain linked on f_ci->m_fp_list after session teardown so share-mode checks still see them while the handle is reconnectable. The scavenger collected expired handles by adding fp->node to a local scavenger_list after removing them from the global durable idr. Because fp->node is the same list_head used by m_fp_list, list_add(&fp->node, &scavenger_list) overwrites the m_fp_list links and corrupts both lists. CONFIG_DEBUG_LIST can report this on the share-mode walk path. (2) Refcount race against m_fp_list walkers. The scavenger qualifies an expired durable handle with atomic_read(&fp->refcount) > 1 and fp->conn under global_ft.lock, removes fp from global_ft, then drops global_ft.lock before unlinking fp from m_fp_list and freeing it. During that gap fp is still linked on m_fp_list with f_state == FP_INITED. ksmbd_lookup_fd_inode() under m_lock read calls ksmbd_fp_get() (atomic_inc_not_zero on refcount that is still 1) and takes a live reference; the scavenger then unlinks and frees fp while the holder owns a reference, leading to UAF on the holder's subsequent ksmbd_fd_put() and on any field reads performed by a concurrent share-mode walker that iterates m_fp_list without taking ksmbd_fp_get() (smb_check_perm_dleases-like paths). Fix both: * Stop reusing fp->node as a scavenger-private list node. Remove one expired handle from global_ft under global_ft.lock, take an explicit transient reference, drop the lock, unlink fp->node from m_fp_list under f_ci->m_lock, then drop both the durable lifetime and transient references with atomic_sub_and_test(2, &fp->refcount). If the scavenger is the last putter the close runs there; otherwise an in-flight holder that already raced through the m_fp_list lookup owns the final close via its ksmbd_fd_put() path. The one-at-a-time disposal can rescan the durable idr when multiple handles expire in the same pass, but durable scavenging is a background expiration path and the final full scan recomputes min_timeout before the next wait. * Clear fp->persistent_id inside __ksmbd_remove_durable_fd() right after idr_remove(), so a delayed final close from a holder that snatched fp does not re-issue idr_remove() on a persistent id that idr_alloc_cyclic() in ksmbd_open_durable_fd() may have already handed out to a brand-new durable handle. * Bypass the per-conn open_files_count decrement in __put_fd_final() when fp is detached from any session table (fp->conn cleared by session_fd_check() at durable preserve -- paired with the volatile_id clear at unpublish, so checking fp->conn alone is sufficient). The walker that owns the final close runs from an unrelated work->conn whose stats.open_files_count never tracked this durable fp; without this guard the holder would underflow that unrelated counter. The two races are folded into one patch because patch (1) alone cleans up the corrupted list but leaves a deterministic UAF window for m_fp_list walkers that the transient-reference and persistent_id discipline in (2) close; bisecting onto an intermediate state would land on a UAF that pre-patch chaos merely made less reproducible. Validation: * CONFIG_DEBUG_LIST coverage for the list_head reuse path. * KASAN-enabled direct SMB2 durable-handle coverage that exercised ksmbd_durable_scavenger() and non-NULL ksmbd_lookup_fd_inode() returns while durable handles expired under concurrent rename lookups, with no KASAN, UAF, list-corruption, ODEBUG, or WARNING reports. * checkpatch --strict * make -j$(nproc) M=fs/smb/server Fixes: d484d621d40f ("ksmbd: add durable scavenger timer") Signed-off-by: DaeMyung Kang Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/vfs_cache.c | 102 ++++++++++++++++++++++++++++---------- 1 file changed, 76 insertions(+), 26 deletions(-) diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index dc4037ef1834ac..354c4d8a1cfbd1 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -418,6 +418,14 @@ static void __ksmbd_remove_durable_fd(struct ksmbd_file *fp) return; idr_remove(global_ft.idr, fp->persistent_id); + /* + * Clear persistent_id so a later __ksmbd_close_fd() that runs from a + * delayed putter (e.g. when a concurrent ksmbd_lookup_fd_inode() + * walker held the final reference) does not re-issue idr_remove() on + * an id that idr_alloc_cyclic() may have already handed out to a new + * durable handle. + */ + fp->persistent_id = KSMBD_NO_FID; } static void ksmbd_remove_durable_fd(struct ksmbd_file *fp) @@ -521,6 +529,20 @@ static struct ksmbd_file *__ksmbd_lookup_fd(struct ksmbd_file_table *ft, static void __put_fd_final(struct ksmbd_work *work, struct ksmbd_file *fp) { + /* + * Detached durable fp -- session_fd_check() cleared fp->conn at + * preserve, so this fp is no longer tracked by any conn's + * stats.open_files_count. This happens when + * ksmbd_scavenger_dispose_dh() hands the final close off to an + * m_fp_list walker (e.g. ksmbd_lookup_fd_inode()) whose work->conn + * is unrelated to the conn that originally opened the handle; close + * via the NULL-ft path so we do not underflow that unrelated + * counter. + */ + if (!fp->conn) { + __ksmbd_close_fd(NULL, fp); + return; + } __ksmbd_close_fd(&work->sess->file_table, fp); atomic_dec(&work->conn->stats.open_files_count); } @@ -1033,24 +1055,37 @@ static bool ksmbd_durable_scavenger_alive(void) return true; } -static void ksmbd_scavenger_dispose_dh(struct list_head *head) +static void ksmbd_scavenger_dispose_dh(struct ksmbd_file *fp) { - while (!list_empty(head)) { - struct ksmbd_file *fp; + /* + * Durable-preserved fp can remain linked on f_ci->m_fp_list for + * share-mode checks. Unlink it before final close; fp->node is not + * available as a scavenger-private list node because re-adding it to + * another list corrupts m_fp_list. + */ + down_write(&fp->f_ci->m_lock); + list_del_init(&fp->node); + up_write(&fp->f_ci->m_lock); - fp = list_first_entry(head, struct ksmbd_file, node); - list_del_init(&fp->node); + /* + * Drop both the durable lifetime reference and the transient reference + * taken by the scavenger under global_ft.lock. If a concurrent + * ksmbd_lookup_fd_inode() (or any other m_fp_list walker) snatched fp + * before the unlink above, that holder owns the final close via + * ksmbd_fd_put() -> __ksmbd_close_fd(). Otherwise the scavenger is + * the last putter and finalises fp here. + */ + if (atomic_sub_and_test(2, &fp->refcount)) __ksmbd_close_fd(NULL, fp); - } } static int ksmbd_durable_scavenger(void *dummy) { struct ksmbd_file *fp = NULL; + struct ksmbd_file *expired_fp; unsigned int id; unsigned int min_timeout = 1; bool found_fp_timeout; - LIST_HEAD(scavenger_list); unsigned long remaining_jiffies; __module_get(THIS_MODULE); @@ -1060,8 +1095,6 @@ static int ksmbd_durable_scavenger(void *dummy) if (try_to_freeze()) continue; - found_fp_timeout = false; - remaining_jiffies = wait_event_timeout(dh_wq, ksmbd_durable_scavenger_alive() == false, __msecs_to_jiffies(min_timeout)); @@ -1070,23 +1103,39 @@ static int ksmbd_durable_scavenger(void *dummy) else min_timeout = DURABLE_HANDLE_MAX_TIMEOUT; - write_lock(&global_ft.lock); - idr_for_each_entry(global_ft.idr, fp, id) { - if (!fp->durable_timeout) - continue; + do { + expired_fp = NULL; + found_fp_timeout = false; - if (atomic_read(&fp->refcount) > 1 || - fp->conn) - continue; - - found_fp_timeout = true; - if (fp->durable_scavenger_timeout <= - jiffies_to_msecs(jiffies)) { - __ksmbd_remove_durable_fd(fp); - list_add(&fp->node, &scavenger_list); - } else { + write_lock(&global_ft.lock); + idr_for_each_entry(global_ft.idr, fp, id) { unsigned long durable_timeout; + if (!fp->durable_timeout) + continue; + + if (atomic_read(&fp->refcount) > 1 || + fp->conn) + continue; + + found_fp_timeout = true; + if (fp->durable_scavenger_timeout <= + jiffies_to_msecs(jiffies)) { + __ksmbd_remove_durable_fd(fp); + /* + * Take a transient reference so fp + * cannot be freed by an in-flight + * ksmbd_lookup_fd_inode() that found + * it through f_ci->m_fp_list while we + * drop global_ft.lock and reach the + * m_fp_list unlink in + * ksmbd_scavenger_dispose_dh(). + */ + atomic_inc(&fp->refcount); + expired_fp = fp; + break; + } + durable_timeout = fp->durable_scavenger_timeout - jiffies_to_msecs(jiffies); @@ -1094,10 +1143,11 @@ static int ksmbd_durable_scavenger(void *dummy) if (min_timeout > durable_timeout) min_timeout = durable_timeout; } - } - write_unlock(&global_ft.lock); + write_unlock(&global_ft.lock); - ksmbd_scavenger_dispose_dh(&scavenger_list); + if (expired_fp) + ksmbd_scavenger_dispose_dh(expired_fp); + } while (expired_fp); if (found_fp_timeout == false) break; From a74668eb2c0b866d7ac4823be6006ab2e227bc03 Mon Sep 17 00:00:00 2001 From: Shuhao Fu Date: Wed, 29 Apr 2026 16:59:56 +0800 Subject: [PATCH 198/972] ksmbd: fail share config requests when path allocation fails Non-pipe shares must have a duplicated backing path before they can be published. share_config_request() currently calls kstrndup() for that path, but if the allocation fails it leaves ret unchanged. If veto list parsing succeeds and share->name exists, the partially built share is still inserted into the global share table with share->path left NULL. A later share-root SMB2 create uses tree_conn->share_conf->path as the lookup root. If the share was published with path == NULL, that request passes a NULL pathname into do_getname_kernel()/strlen() and can crash the ksmbd worker. Set ret = -ENOMEM when path duplication fails so the incomplete share is destroyed before publication. Fixes: e2f34481b24d ("cifsd: add server-side procedures for SMB3") Signed-off-by: Shuhao Fu Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/mgmt/share_config.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c index 53f44ff4d376f3..6f97f8d39657cd 100644 --- a/fs/smb/server/mgmt/share_config.c +++ b/fs/smb/server/mgmt/share_config.c @@ -167,7 +167,10 @@ static struct ksmbd_share_config *share_config_request(struct ksmbd_work *work, share->path = kstrndup(ksmbd_share_config_path(resp), path_len, KSMBD_DEFAULT_GFP); - if (share->path) { + if (!share->path) { + ret = -ENOMEM; + } else { + ret = 0; share->path_sz = strlen(share->path); while (share->path_sz > 1 && share->path[share->path_sz - 1] == '/') @@ -179,9 +182,10 @@ static struct ksmbd_share_config *share_config_request(struct ksmbd_work *work, share->force_directory_mode = resp->force_directory_mode; share->force_uid = resp->force_uid; share->force_gid = resp->force_gid; - ret = parse_veto_list(share, - KSMBD_SHARE_CONFIG_VETO_LIST(resp), - resp->veto_list_sz); + if (!ret) + ret = parse_veto_list(share, + KSMBD_SHARE_CONFIG_VETO_LIST(resp), + resp->veto_list_sz); if (!ret && share->path) { if (__ksmbd_override_fsids(work, share)) { kill_share(share); From 6fd7dd4e44d7840cb1ba0c3a895e9f576af3fe5c Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Fri, 1 May 2026 08:34:55 +0900 Subject: [PATCH 199/972] ksmbd: fix kernel-doc warnings from ksmbd_conn_get/put() The kernel test robot reported W=1 build warnings for ksmbd_conn_get() and ksmbd_conn_put() due to missing parameter descriptions. Add the @conn description to fix these warnings. Reported-by: kernel test robot Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/connection.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 95654a80816286..8347495dbc6284 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -122,6 +122,8 @@ static void __ksmbd_conn_release_work(struct work_struct *work) /** * ksmbd_conn_get() - take a reference on @conn and return it. * + * @conn: connection instance to get a reference to + * * Returns @conn unchanged so callers can write * "fp->conn = ksmbd_conn_get(work->conn);" in one expression. Returns NULL * if @conn is NULL. @@ -139,6 +141,8 @@ struct ksmbd_conn *ksmbd_conn_get(struct ksmbd_conn *conn) * ksmbd_conn_put() - drop a reference and, if it was the last, queue the * release onto ksmbd_conn_wq so it runs from process context. * + * @conn: connection instance to put a reference to + * * Callable from any context including RCU softirq callbacks and non-sleeping * locks; the actual release is deferred to the workqueue. ksmbd_conn_wq is * created in ksmbd_server_init() before any conn can be allocated and is From 996454bc0da84d5a1dedb1a7861823087e01a7ae Mon Sep 17 00:00:00 2001 From: Shota Zaizen Date: Tue, 28 Apr 2026 19:02:55 +0900 Subject: [PATCH 200/972] ksmbd: validate inherited ACE SID length smb_inherit_dacl() walks the parent directory DACL loaded from the security descriptor xattr. It verifies that each ACE contains the fixed SID header before using it, but does not verify that the variable-length SID described by sid.num_subauth is fully contained in the ACE. A malformed inheritable ACE can advertise more subauthorities than are present in the ACE. compare_sids() may then read past the ACE. smb_set_ace() also clamps the copied destination SID, but used the unchecked source SID count to compute the inherited ACE size. That could advance the temporary inherited ACE buffer pointer and nt_size accounting past the allocated buffer. Fix this by validating the parent ACE SID count and SID length before using the SID during inheritance. Compute the inherited ACE size from the copied SID so the size matches the bounded destination SID. Reject the inherited DACL if size accumulation would overflow smb_acl.size or the security descriptor allocation size. Fixes: e2f34481b24d ("cifsd: add server-side procedures for SMB3") Signed-off-by: Shota Zaizen Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smbacl.c | 66 +++++++++++++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 14 deletions(-) diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index 4bbc2c27e6805e..c1d1f34581d69d 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -1068,7 +1068,26 @@ static void smb_set_ace(struct smb_ace *ace, const struct smb_sid *sid, u8 type, ace->flags = flags; ace->access_req = access_req; smb_copy_sid(&ace->sid, sid); - ace->size = cpu_to_le16(1 + 1 + 2 + 4 + 1 + 1 + 6 + (sid->num_subauth * 4)); + ace->size = cpu_to_le16(1 + 1 + 2 + 4 + 1 + 1 + 6 + + (ace->sid.num_subauth * 4)); +} + +static int smb_append_inherited_ace(struct smb_ace **ace, int *nt_size, + u16 *ace_cnt, const struct smb_sid *sid, + u8 type, u8 flags, __le32 access_req) +{ + int ace_size; + + smb_set_ace(*ace, sid, type, flags, access_req); + ace_size = le16_to_cpu((*ace)->size); + /* pdacl->size is __le16 and includes struct smb_acl. */ + if (check_add_overflow(*nt_size, ace_size, nt_size) || + *nt_size > U16_MAX - (int)sizeof(struct smb_acl)) + return -EINVAL; + + (*ace_cnt)++; + *ace = (struct smb_ace *)((char *)*ace + ace_size); + return 0; } int smb_inherit_dacl(struct ksmbd_conn *conn, @@ -1157,6 +1176,12 @@ int smb_inherit_dacl(struct ksmbd_conn *conn, CIFS_SID_BASE_SIZE) break; + if (parent_aces->sid.num_subauth > SID_MAX_SUB_AUTHORITIES || + pace_size < offsetof(struct smb_ace, sid) + + CIFS_SID_BASE_SIZE + + sizeof(__le32) * parent_aces->sid.num_subauth) + break; + aces_size -= pace_size; flags = parent_aces->flags; @@ -1186,22 +1211,24 @@ int smb_inherit_dacl(struct ksmbd_conn *conn, } if (is_dir && creator && flags & CONTAINER_INHERIT_ACE) { - smb_set_ace(aces, psid, parent_aces->type, inherited_flags, - parent_aces->access_req); - nt_size += le16_to_cpu(aces->size); - ace_cnt++; - aces = (struct smb_ace *)((char *)aces + le16_to_cpu(aces->size)); + rc = smb_append_inherited_ace(&aces, &nt_size, &ace_cnt, + psid, parent_aces->type, + inherited_flags, + parent_aces->access_req); + if (rc) + goto free_aces_base; flags |= INHERIT_ONLY_ACE; psid = creator; } else if (is_dir && !(parent_aces->flags & NO_PROPAGATE_INHERIT_ACE)) { psid = &parent_aces->sid; } - smb_set_ace(aces, psid, parent_aces->type, flags | inherited_flags, - parent_aces->access_req); - nt_size += le16_to_cpu(aces->size); - aces = (struct smb_ace *)((char *)aces + le16_to_cpu(aces->size)); - ace_cnt++; + rc = smb_append_inherited_ace(&aces, &nt_size, &ace_cnt, psid, + parent_aces->type, + flags | inherited_flags, + parent_aces->access_req); + if (rc) + goto free_aces_base; pass: parent_aces = (struct smb_ace *)((char *)parent_aces + pace_size); } @@ -1211,7 +1238,7 @@ int smb_inherit_dacl(struct ksmbd_conn *conn, struct smb_acl *pdacl; struct smb_sid *powner_sid = NULL, *pgroup_sid = NULL; int powner_sid_size = 0, pgroup_sid_size = 0, pntsd_size; - int pntsd_alloc_size; + size_t pntsd_alloc_size; if (parent_pntsd->osidoffset) { powner_sid = (struct smb_sid *)((char *)parent_pntsd + @@ -1224,8 +1251,19 @@ int smb_inherit_dacl(struct ksmbd_conn *conn, pgroup_sid_size = 1 + 1 + 6 + (pgroup_sid->num_subauth * 4); } - pntsd_alloc_size = sizeof(struct smb_ntsd) + powner_sid_size + - pgroup_sid_size + sizeof(struct smb_acl) + nt_size; + if (check_add_overflow(sizeof(struct smb_ntsd), + (size_t)powner_sid_size, + &pntsd_alloc_size) || + check_add_overflow(pntsd_alloc_size, + (size_t)pgroup_sid_size, + &pntsd_alloc_size) || + check_add_overflow(pntsd_alloc_size, sizeof(struct smb_acl), + &pntsd_alloc_size) || + check_add_overflow(pntsd_alloc_size, (size_t)nt_size, + &pntsd_alloc_size)) { + rc = -EINVAL; + goto free_aces_base; + } pntsd = kzalloc(pntsd_alloc_size, KSMBD_DEFAULT_GFP); if (!pntsd) { From 6ebcbb53fc9bc30843054ed99fd60b8e542628f4 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Fri, 1 May 2026 06:23:20 +0000 Subject: [PATCH 201/972] riscv: Fix register corruption from uninitialized cregs on error compat_riscv_gpr_set() calls cregs_to_regs() unconditionally, even when user_regset_copyin() fails. Since cregs is an uninitialized stack variable, a copyin failure causes uninitialized stack data to be written into the target task's pt_regs, corrupting its register state and potentially leaking kernel stack contents. compat_restore_sigcontext() has the same issue: it calls cregs_to_regs() even when __copy_from_user() fails, leading to the same corruption of the signal-returning task's register state on error. Only call cregs_to_regs() when the user copy succeeds. Fixes: 4608c159594f ("riscv: compat: ptrace: Add compat_arch_ptrace implement") Fixes: 7383ee05314b ("riscv: compat: signal: Add rt_frame implementation") Signed-off-by: Michael Neuling Assisted-by: Cursor:claude-4.6-opus-high-thinking Link: https://patch.msgid.link/20260501062320.2339562-1-mikey@neuling.org Signed-off-by: Paul Walmsley --- arch/riscv/kernel/compat_signal.c | 2 ++ arch/riscv/kernel/ptrace.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/compat_signal.c b/arch/riscv/kernel/compat_signal.c index 6ec4e34255a9ab..cf3eb33a11e464 100644 --- a/arch/riscv/kernel/compat_signal.c +++ b/arch/riscv/kernel/compat_signal.c @@ -107,6 +107,8 @@ static long compat_restore_sigcontext(struct pt_regs *regs, /* sc_regs is structured the same as the start of pt_regs */ err = __copy_from_user(&cregs, &sc->sc_regs, sizeof(sc->sc_regs)); + if (unlikely(err)) + return err; cregs_to_regs(&cregs, regs); diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 93de2e7a30747d..793bcee4618282 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -577,8 +577,8 @@ static int compat_riscv_gpr_set(struct task_struct *target, struct compat_user_regs_struct cregs; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &cregs, 0, -1); - - cregs_to_regs(&cregs, task_pt_regs(target)); + if (!ret) + cregs_to_regs(&cregs, task_pt_regs(target)); return ret; } From db909bd7986c10da074917af3dae83a60fa65093 Mon Sep 17 00:00:00 2001 From: "Guo Ren (Alibaba DAMO Academy)" Date: Sun, 25 Jan 2026 00:52:12 -0500 Subject: [PATCH 202/972] riscv: mm: Fixup no5lvl failure when vaddr is invalid MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unlike no4lvl, no5lvl still continues to detect satp, which requires va=pa mapping. When pa=0x800000000000, no5lvl would fail in Sv48 mode due to an illegal VA value of 0x800000000000. So, prevent detecting the satp flow for no5lvl, when vaddr is invalid. Add the is_vaddr_valid() function for checking. Fixes: 26e7aacb83df ("riscv: Allow to downgrade paging mode from the command line") Cc: Alexandre Ghiti Cc: Björn Töpel Signed-off-by: Guo Ren (Alibaba DAMO Academy) Tested-by: Fangyu Yu Link: https://patch.msgid.link/20260125055212.433163-1-guoren@kernel.org [pjw@kernel.org: cleaned up commit message] Signed-off-by: Paul Walmsley --- arch/riscv/mm/init.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index decd7df40fa421..fa8d2f6f554b57 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -792,6 +792,27 @@ static void __init set_mmap_rnd_bits_max(void) mmap_rnd_bits_max = MMAP_VA_BITS - PAGE_SHIFT - 3; } +static bool __init is_vaddr_valid(unsigned long va) +{ + unsigned long up = 0; + + switch (satp_mode) { + case SATP_MODE_39: + up = 1UL << 38; + break; + case SATP_MODE_48: + up = 1UL << 47; + break; + case SATP_MODE_57: + up = 1UL << 56; + break; + default: + return false; + } + + return (va < up) || (va >= (ULONG_MAX - up + 1)); +} + /* * There is a simple way to determine if 4-level is supported by the * underlying hardware: establish 1:1 mapping in 4-level page table mode @@ -833,6 +854,9 @@ static __init void set_satp_mode(uintptr_t dtb_pa) set_satp_mode_pmd + PMD_SIZE, PMD_SIZE, PAGE_KERNEL_EXEC); retry: + if (!is_vaddr_valid(set_satp_mode_pmd)) + goto out; + create_pgd_mapping(early_pg_dir, set_satp_mode_pmd, pgtable_l5_enabled ? @@ -855,6 +879,7 @@ static __init void set_satp_mode(uintptr_t dtb_pa) disable_pgtable_l4(); } +out: memset(early_pg_dir, 0, PAGE_SIZE); memset(early_p4d, 0, PAGE_SIZE); memset(early_pud, 0, PAGE_SIZE); From 0c7ae130698e70107430254e79fbe996b4d37ab5 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Sat, 2 May 2026 12:12:40 +0200 Subject: [PATCH 203/972] tools/headers: Regenerate stddef.h to fix BPF selftests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With commit dacbfc167808 ("crypto: af_alg - Annotate struct af_alg_iv with __counted_by"), two selftests, test_tag and crypto_sanity, now indirectly rely on the __counted_by macro. On systems with commit dacbfc167808 in the installed UAPI headers, the selftests build fails with: In file included from tools/testing/selftests/bpf/prog_tests/crypto_sanity.c:7: /usr/include/linux/if_alg.h:45:22: error: expected ‘:’, ‘,’, ‘;’, ‘}’ or ‘__attribute__’ before ‘__counted_by’ 45 | __u8 iv[] __counted_by(ivlen); | ^~~~~~~~~~~~ This patch fixes it by regenerating stddef.h in tools/include using the instructions from commit a778f5d46b62 ("tools/headers: Pull in stddef.h to uapi to fix BPF selftests build in CI"). Fixes: dacbfc167808 ("crypto: af_alg - Annotate struct af_alg_iv with __counted_by") Signed-off-by: Paul Chaignon Reviewed-by: Alan Maguire Tested-by: Ihor Solodrai Link: https://lore.kernel.org/r/8da8ef16055aa452d940668ed5359ce54adc6b0b.1777715500.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/stddef.h | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/stddef.h b/tools/include/uapi/linux/stddef.h index c53cde425406b7..45749825949464 100644 --- a/tools/include/uapi/linux/stddef.h +++ b/tools/include/uapi/linux/stddef.h @@ -3,7 +3,6 @@ #define _LINUX_STDDEF_H - #ifndef __always_inline #define __always_inline __inline__ #endif @@ -36,6 +35,11 @@ struct __struct_group_tag(TAG) { MEMBERS } ATTRS NAME; \ } ATTRS +#ifdef __cplusplus +/* sizeof(struct{}) is 1 in C++, not 0, can't use C version of the macro. */ +#define __DECLARE_FLEX_ARRAY(T, member) \ + T member[0] +#else /** * __DECLARE_FLEX_ARRAY() - Declare a flexible array usable in a union * @@ -52,3 +56,23 @@ TYPE NAME[]; \ } #endif + +#ifndef __counted_by +#define __counted_by(m) +#endif + +#ifndef __counted_by_le +#define __counted_by_le(m) +#endif + +#ifndef __counted_by_be +#define __counted_by_be(m) +#endif + +#ifndef __counted_by_ptr +#define __counted_by_ptr(m) +#endif + +#define __kernel_nonstring + +#endif /* _LINUX_STDDEF_H */ From ddca6da148b8ced3e6d3d7fb3b2e5b4ed6359dc2 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 27 Apr 2026 11:23:35 +0100 Subject: [PATCH 204/972] MAINTAINERS: Add self for the DEC LANCE network driver Like with the rest of DECstation and TURBOchannel hardware I have been handling the DEC LANCE network driver for some 25 years now anyway. Signed-off-by: Maciej W. Rozycki Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/alpine.DEB.2.21.2604271113520.28583@angie.orcam.me.uk Signed-off-by: Jakub Kicinski --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 27a073f53cea63..ec8661b446fb73 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7077,6 +7077,12 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core/debugobjec F: include/linux/debugobjects.h F: lib/debugobjects.c +DEC LANCE NETWORK DRIVER +M: "Maciej W. Rozycki" +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/ethernet/amd/declance.c + DECSTATION PLATFORM SUPPORT M: "Maciej W. Rozycki" L: linux-mips@vger.kernel.org From 1a57efe250a13906396c2a4792f0090f142f9844 Mon Sep 17 00:00:00 2001 From: Holger Brunck Date: Wed, 29 Apr 2026 13:42:07 +0200 Subject: [PATCH 205/972] net: wan: fsl_ucc_hdlc: fix uhdlc_memclean Unmapping of uf_regs is done from ucc_fast_free and doesn't need to be done explicitly. If already unmapped ucc_fast_free will crash. Fixes: c19b6d246a35 ("drivers/net: support hdlc function for QE-UCC") Signed-off-by: Holger Brunck Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- drivers/net/wan/fsl_ucc_hdlc.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c index 3bd57527b1be63..8155e92af14e23 100644 --- a/drivers/net/wan/fsl_ucc_hdlc.c +++ b/drivers/net/wan/fsl_ucc_hdlc.c @@ -773,11 +773,6 @@ static void uhdlc_memclean(struct ucc_hdlc_private *priv) kfree(priv->tx_skbuff); priv->tx_skbuff = NULL; - if (priv->uf_regs) { - iounmap(priv->uf_regs); - priv->uf_regs = NULL; - } - if (priv->uccf) { ucc_fast_free(priv->uccf); priv->uccf = NULL; From 851bba8068d15f5a386da544096f7ed6bc16e551 Mon Sep 17 00:00:00 2001 From: Holger Brunck Date: Wed, 29 Apr 2026 13:42:08 +0200 Subject: [PATCH 206/972] net: wan: fsl_ucc_hdlc: fix ucc_hdlc_remove If the driver is used in a non tdm mode priv->utdm is a NULL pointer. Therefore we need to check this pointer first before checking si_regs. Fixes: c19b6d246a35 ("drivers/net: support hdlc function for QE-UCC") Signed-off-by: Holger Brunck Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- drivers/net/wan/fsl_ucc_hdlc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c index 8155e92af14e23..15bfb78381d4cb 100644 --- a/drivers/net/wan/fsl_ucc_hdlc.c +++ b/drivers/net/wan/fsl_ucc_hdlc.c @@ -1250,12 +1250,12 @@ static void ucc_hdlc_remove(struct platform_device *pdev) uhdlc_memclean(priv); - if (priv->utdm->si_regs) { + if (priv->utdm && priv->utdm->si_regs) { iounmap(priv->utdm->si_regs); priv->utdm->si_regs = NULL; } - if (priv->utdm->siram) { + if (priv->utdm && priv->utdm->siram) { iounmap(priv->utdm->siram); priv->utdm->siram = NULL; } From 383d0fb8946921b4914ea0f360342e221d419d40 Mon Sep 17 00:00:00 2001 From: Gregory Fuchedgi Date: Wed, 29 Apr 2026 14:54:14 -0700 Subject: [PATCH 207/972] amd-xgbe: fix PTP addend overflow causing frozen clock XGBE_PTP_ACT_CLK_FREQ and XGBE_V2_PTP_ACT_CLK_FREQ were 10x too large (500MHz/1GHz instead of 50MHz/100MHz), causing the computed addend to overflow the 32-bit tstamp_addend. In the general case this would result in the clock advancing at the wrong rate. For v2 (PCI), ptpclk_rate is hardcoded to 125MHz, so the addend formula (ACT_CLK_FREQ << 32) / ptpclk_rate yields exactly 8 * 2^32, and when stored to the 32-bit tstamp_addend the value is zero. With addend = 0 the hardware accumulator never overflows and the PTP clock is fully stopped. For v1 (platform), ptpclk_rate is read from ACPI/DT so the exact overflow behavior depends on the firmware-reported frequency. Define the constants as NSEC_PER_SEC / SSINC so the relationship is explicit and cannot drift out of sync. Fixes: fbd47be098b5 ("amd-xgbe: add hardware PTP timestamping support") Tested-by: Gregory Fuchedgi Signed-off-by: Gregory Fuchedgi Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260429-fix-xgbe-ptp-addend-v1-1-fca5b0ca5e62@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/xgbe/xgbe.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h index 60b7e53206d1e1..3d3b09010d48fb 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe.h @@ -135,11 +135,11 @@ */ #define XGBE_TSTAMP_SSINC 20 #define XGBE_TSTAMP_SNSINC 0 -#define XGBE_PTP_ACT_CLK_FREQ 500000000 +#define XGBE_PTP_ACT_CLK_FREQ (NSEC_PER_SEC / XGBE_TSTAMP_SSINC) #define XGBE_V2_TSTAMP_SSINC 0xA #define XGBE_V2_TSTAMP_SNSINC 0 -#define XGBE_V2_PTP_ACT_CLK_FREQ 1000000000 +#define XGBE_V2_PTP_ACT_CLK_FREQ (NSEC_PER_SEC / XGBE_V2_TSTAMP_SSINC) /* Define maximum supported values */ #define XGBE_MAX_PPS_OUT 4 From 458d5615272d3de535748342eb68ca492343048c Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Thu, 30 Apr 2026 11:29:55 -0400 Subject: [PATCH 208/972] net/sched: sch_red: Replace direct dequeue call with peek and qdisc_dequeue_peeked When red qdisc has children (eg qfq qdisc) whose peek() callback is qdisc_peek_dequeued(), we could get a kernel panic. When the parent of such qdiscs (eg illustrated in patch #3 as tbf) wants to retrieve an skb from its child (red in this case), it will do the following: 1a. do a peek() - and when sensing there's an skb the child can offer, then - the child in this case(red) calls its child's (qfq) peek. qfq does the right thing and will return the gso_skb queue packet. Note: if there wasnt a gso_skb entry then qfq will store it there. 1b. invoke a dequeue() on the child (red). And herein lies the problem. - red will call the child's dequeue() which will essentially just try to grab something of qfq's queue. [ 78.667668][ T363] KASAN: null-ptr-deref in range [0x0000000000000048-0x000000000000004f] [ 78.667927][ T363] CPU: 1 UID: 0 PID: 363 Comm: ping Not tainted 7.1.0-rc1-00033-g46f74a3f7d57-dirty #790 PREEMPT(full) [ 78.668263][ T363] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 78.668486][ T363] RIP: 0010:qfq_dequeue+0x446/0xc90 [sch_qfq] [ 78.668718][ T363] Code: 54 c0 e8 dd 90 00 f1 48 c7 c7 e0 03 54 c0 48 89 de e8 ce 90 00 f1 48 8d 7b 48 b8 ff ff 37 00 48 89 fa 48 c1 e0 2a 48 c1 ea 03 <80> 3c 02 00 74 05 e8 ef a1 e1 f1 48 8b 7b 48 48 8d 54 24 58 48 8d [ 78.669312][ T363] RSP: 0018:ffff88810de573e0 EFLAGS: 00010216 [ 78.669533][ T363] RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000000 [ 78.669790][ T363] RDX: 0000000000000009 RSI: 0000000000000004 RDI: 0000000000000048 [ 78.670044][ T363] RBP: ffff888110dc4000 R08: ffffffffb1b0885a R09: fffffbfff6ba9078 [ 78.670297][ T363] R10: 0000000000000003 R11: ffff888110e31c80 R12: 0000001880000000 [ 78.670560][ T363] R13: ffff888110dc4150 R14: ffff888110dc42b8 R15: 0000000000000200 [ 78.670814][ T363] FS: 00007f66a8f09c40(0000) GS:ffff888163428000(0000) knlGS:0000000000000000 [ 78.671110][ T363] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 78.671324][ T363] CR2: 000055db4c6a30a8 CR3: 000000010da67000 CR4: 0000000000750ef0 [ 78.671585][ T363] PKRU: 55555554 [ 78.671713][ T363] Call Trace: [ 78.671843][ T363] [ 78.671936][ T363] ? __pfx_qfq_dequeue+0x10/0x10 [sch_qfq] [ 78.672148][ T363] ? __pfx__printk+0x10/0x10 [ 78.672322][ T363] ? srso_alias_return_thunk+0x5/0xfbef5 [ 78.672496][ T363] ? lockdep_hardirqs_on_prepare+0xa8/0x1a0 [ 78.672706][ T363] ? srso_alias_return_thunk+0x5/0xfbef5 [ 78.672875][ T363] ? trace_hardirqs_on+0x19/0x1a0 [ 78.673047][ T363] red_dequeue+0x65/0x270 [sch_red] [ 78.673217][ T363] ? srso_alias_return_thunk+0x5/0xfbef5 [ 78.673385][ T363] tbf_dequeue.cold+0xb0/0x70c [sch_tbf] [ 78.673566][ T363] __qdisc_run+0x169/0x1900 The right thing to do in #1b is to grab the skb off gso_skb queue. This patchset fixes that issue by changing #1b to use qdisc_dequeue_peeked() method instead. Fixes: 77be155cba4e ("pkt_sched: Add peek emulation for non-work-conserving qdiscs.") Reported-by: Manas Reported-by: Rakshit Awasthi Signed-off-by: Jamal Hadi Salim Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260430152957.194015-2-jhs@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/sch_red.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 432b8a3000a57b..4d0e44a2e7c664 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -162,7 +162,7 @@ static struct sk_buff *red_dequeue(struct Qdisc *sch) struct red_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; - skb = child->dequeue(child); + skb = qdisc_dequeue_peeked(child); if (skb) { qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); From 1b9bc71153b01dbde8045b9edede4240f4f5520e Mon Sep 17 00:00:00 2001 From: Victor Nogueria Date: Thu, 30 Apr 2026 11:29:56 -0400 Subject: [PATCH 209/972] net/sched: sch_sfb: Replace direct dequeue call with peek and qdisc_dequeue_peeked When sfb has children (eg qfq qdisc) whose peek() callback is qdisc_peek_dequeued(), we could get a kernel panic. When the parent of such qdiscs (eg illustrated in patch #3 as tbf) wants to retrieve an skb from its child (sfb in this case), it will do the following: 1a. do a peek() - and when sensing there's an skb the child can offer, then - the child in this case(sfb) calls its child's (qfq) peek. qfq does the right thing and will return the gso_skb queue packet. Note: if there wasnt a gso_skb entry then qfq will store it there. 1b. invoke a dequeue() on the child (sfb). And herein lies the problem. - sfb will call the child's dequeue() which will essentially just try to grab something of qfq's queue. [ 127.594489][ T453] KASAN: null-ptr-deref in range [0x0000000000000048-0x000000000000004f] [ 127.594741][ T453] CPU: 2 UID: 0 PID: 453 Comm: ping Not tainted 7.1.0-rc1-00035-gac961974495b-dirty #793 PREEMPT(full) [ 127.595059][ T453] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 127.595254][ T453] RIP: 0010:qfq_dequeue+0x35c/0x1650 [sch_qfq] [ 127.595461][ T453] Code: 00 fc ff df 80 3c 02 00 0f 85 17 0e 00 00 4c 8d 73 48 48 89 9d b8 02 00 00 48 b8 00 00 00 00 00 fc ff df 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 76 0c 00 00 48 b8 00 00 00 00 00 fc ff df 4c 8b [ 127.596081][ T453] RSP: 0018:ffff88810e5af440 EFLAGS: 00010216 [ 127.596337][ T453] RAX: dffffc0000000000 RBX: 0000000000000000 RCX: dffffc0000000000 [ 127.596623][ T453] RDX: 0000000000000009 RSI: 0000001880000000 RDI: ffff888104fd82b0 [ 127.596917][ T453] RBP: ffff888104fd8000 R08: ffff888104fd8280 R09: 1ffff110211893a3 [ 127.597165][ T453] R10: 1ffff110211893a6 R11: 1ffff110211893a7 R12: 0000001880000000 [ 127.597404][ T453] R13: ffff888104fd82b8 R14: 0000000000000048 R15: 0000000040000000 [ 127.597644][ T453] FS: 00007fc380cbfc40(0000) GS:ffff88816f2a8000(0000) knlGS:0000000000000000 [ 127.597956][ T453] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 127.598160][ T453] CR2: 00005610aa9890a8 CR3: 000000010369e000 CR4: 0000000000750ef0 [ 127.598390][ T453] PKRU: 55555554 [ 127.598509][ T453] Call Trace: [ 127.598629][ T453] [ 127.598718][ T453] ? mark_held_locks+0x40/0x70 [ 127.598890][ T453] ? srso_alias_return_thunk+0x5/0xfbef5 [ 127.599053][ T453] sfb_dequeue+0x88/0x4d0 [ 127.599174][ T453] ? ktime_get+0x137/0x230 [ 127.599328][ T453] ? srso_alias_return_thunk+0x5/0xfbef5 [ 127.599480][ T453] ? qdisc_peek_dequeued+0x7b/0x350 [sch_qfq] [ 127.599670][ T453] ? srso_alias_return_thunk+0x5/0xfbef5 [ 127.599831][ T453] tbf_dequeue+0x6b1/0x1098 [sch_tbf] [ 127.599988][ T453] __qdisc_run+0x169/0x1900 The right thing to do in #1b is to grab the skb off gso_skb queue. This patchset fixes that issue by changing #1b to use qdisc_dequeue_peeked() method instead. Fixes: e13e02a3c68d ("net_sched: SFB flow scheduler") Signed-off-by: Victor Nogueria Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260430152957.194015-3-jhs@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/sch_sfb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index bd5ef561030fea..d3ee8e5479b35e 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -441,7 +441,7 @@ static struct sk_buff *sfb_dequeue(struct Qdisc *sch) struct Qdisc *child = q->qdisc; struct sk_buff *skb; - skb = child->dequeue(q->qdisc); + skb = qdisc_dequeue_peeked(child); if (skb) { qdisc_bstats_update(sch, skb); From 3a3a30c14d7f04206916824a79878cfefac6c8e2 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Thu, 30 Apr 2026 11:29:57 -0400 Subject: [PATCH 210/972] selftests/tc-testing: Add tests that force red and sfb to dequeue from child's gso_skb Create 4 test cases: - Force red to dequeue from its child's gso_skb with qfq leaf - Force sfb to dequeue from its child's gso_skb with qfq leaf - Force red to dequeue from its child's gso_skb with dualpi2 leaf - Force sfb to dequeue from its child's gso_skb with dualpi2 leaf All of them have tbf followed by red (or sfb) followed by qfq (or dualpi2). Since tbf calls its child's peek followed by qdisc_dequeue_peeked, it will force red/sfb to call their child's peek. In this case, since the child (qfq/dualpi2) has qdisc_peek_dequeued as its peek callback, the packet will be stored in its gso_skb queue. During the subsequent call to qdisc_dequeue_peeked, red/sfb will have to dequeue from the child's gso_skb to retrieve the packet. Not doing so will cause a NULL ptr deref which was happening before a recent fix. Acked-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Link: https://patch.msgid.link/20260430152957.194015-4-jhs@mojatatu.com Signed-off-by: Jakub Kicinski --- .../tc-testing/tc-tests/infra/qdiscs.json | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index eefadd0546d3b6..b1f856cf62c126 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -1136,5 +1136,153 @@ "teardown": [ "$TC qdisc del dev $DUMMY handle 1: root" ] + }, + { + "id": "7a5f", + "name": "Force red to dequeue from its child's gso_skb with qfq leaf", + "category": [ + "qdisc", + "tbf", + "red", + "qfq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 88bit burst 1661b peakrate 2257333 minburst 1024 limit 7b", + "$TC qdisc add dev $DUMMY parent 1: handle 2: red limit 757 min 16 max 24 avpkt 16", + "$TC qdisc add dev $DUMMY parent 2: handle 3: qfq", + "$TC class add dev $DUMMY classid 3:1 parent 3: qfq maxpkt 512 weight 1", + "$TC filter add dev $DUMMY parent 3: protocol ip prio 1 matchall classid 3:1 action ok" + ], + "cmdUnderTest": "ping -c 1 10.10.10.1 -W0.01 -I$DUMMY || true", + "expExitCode": "0", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY parent 1:", + "matchJSON": [ + { + "kind": "red", + "handle": "2:", + "bytes": 98, + "packets": 1, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "cdae", + "name": "Force sfb to dequeue from its child's gso_skb with qfq leaf", + "category": [ + "qdisc", + "tbf", + "sfb", + "qfq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 88bit burst 1661b peakrate 2257333 minburst 1024 limit 7b", + "$TC qdisc add dev $DUMMY parent 1: handle 2: sfb", + "$TC qdisc add dev $DUMMY parent 2: handle 3: qfq", + "$TC class add dev $DUMMY classid 3:1 parent 3: qfq maxpkt 512 weight 1", + "$TC filter add dev $DUMMY parent 3: protocol ip prio 1 matchall classid 3:1 action ok" + ], + "cmdUnderTest": "ping -c 1 10.10.10.1 -W0.01 -I$DUMMY || true", + "expExitCode": "0", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY parent 1:", + "matchJSON": [ + { + "kind": "sfb", + "handle": "2:", + "bytes": 98, + "packets": 1, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "291d", + "name": "Force red to dequeue from its child's gso_skb with dualpi2 leaf", + "category": [ + "qdisc", + "tbf", + "red", + "dualpi2" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 88bit burst 1661b peakrate 2257333 minburst 1024 limit 7b", + "$TC qdisc add dev $DUMMY parent 1: handle 2: red limit 757 min 16 max 24 avpkt 16", + "$TC qdisc add dev $DUMMY parent 2: handle 3: dualpi2" + ], + "cmdUnderTest": "ping -c 1 10.10.10.1 -W0.01 -I$DUMMY || true", + "expExitCode": "0", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY parent 1:", + "matchJSON": [ + { + "kind": "red", + "handle": "2:", + "bytes": 98, + "packets": 1, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "9c6d", + "name": "Force sfb to dequeue from its child's gso_skb with dualpi2 leaf", + "category": [ + "qdisc", + "tbf", + "sfb", + "dualpi2" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 88bit burst 1661b peakrate 2257333 minburst 1024 limit 7b", + "$TC qdisc add dev $DUMMY parent 1: handle 2: sfb", + "$TC qdisc add dev $DUMMY parent 2: handle 3: dualpi2" + ], + "cmdUnderTest": "ping -c 1 10.10.10.1 -W0.01 -I$DUMMY || true", + "expExitCode": "0", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY parent 1:", + "matchJSON": [ + { + "kind": "sfb", + "handle": "2:", + "bytes": 98, + "packets": 1, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] } ] From 1d324c2f43f70c965f25c58cc3611c779adbe47e Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Thu, 30 Apr 2026 18:33:18 +0800 Subject: [PATCH 211/972] ip6_gre: Use cached t->net in ip6erspan_changelink(). After commit 5e72ce3e3980 ("net: ipv6: Use link netns in newlink() of rtnl_link_ops"), ip6erspan_newlink() correctly resolves the per-netns ip6gre hash via link_net. ip6erspan_changelink() was not converted in that series and still uses dev_net(dev), which diverges from the device's creation netns after IFLA_NET_NS_FD migration. This re-inserts the tunnel into the wrong per-netns hash. The original netns keeps a stale entry. When that netns is later destroyed, ip6gre_exit_rtnl_net() walks the stale entry, producing a slab-use-after-free reported by KASAN, followed by a kernel BUG at net/core/dev.c (LIST_POISON1) in unregister_netdevice_many_notify(). Reachable from an unprivileged user namespace (unshare --user --map-root-user --net). ip6gre_changelink() earlier in the same file already uses the cached t->net; only ip6erspan_changelink() has the wrong shape. Fixes: 2d665034f239 ("net: ip6_gre: Fix ip6erspan hlen calculation") Cc: stable@vger.kernel.org # v5.15+ Signed-off-by: Maoyi Xie Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260430103318.3206018-1-maoyi.xie@ntu.edu.sg Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_gre.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 63fc8556b475e5..365b4059eb2035 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -2262,10 +2262,11 @@ static int ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id); + struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm p; - struct ip6_tnl *t; + struct ip6gre_net *ign; + ign = net_generic(t->net, ip6gre_net_id); t = ip6gre_changelink_common(dev, tb, data, &p, extack); if (IS_ERR(t)) return PTR_ERR(t); From 70f780edcd1e86350202d8a409de026b2d2e2067 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:34 -0300 Subject: [PATCH 212/972] RDMA/ionic: Fix typo in format string Applying the corrupted patch by hand mangled the format string, put the s in the right place. Cc: stable@vger.kernel.org Fixes: 654a27f25530 ("RDMA/ionic: bound node_desc sysfs read with %.64s") Link: https://patch.msgid.link/r/1-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Reported-by: Brad Spengler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/ionic/ionic_ibdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ionic/ionic_ibdev.c b/drivers/infiniband/hw/ionic/ionic_ibdev.c index 0382a64839d26a..73a616ae350236 100644 --- a/drivers/infiniband/hw/ionic/ionic_ibdev.c +++ b/drivers/infiniband/hw/ionic/ionic_ibdev.c @@ -185,7 +185,7 @@ static ssize_t hca_type_show(struct device *device, struct ionic_ibdev *dev = rdma_device_to_drv_device(device, struct ionic_ibdev, ibdev); - return sysfs_emit(buf, "%s.64\n", dev->ibdev.node_desc); + return sysfs_emit(buf, "%.64s\n", dev->ibdev.node_desc); } static DEVICE_ATTR_RO(hca_type); From 641858d52f2372124d9312a407e2124915d846ee Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:35 -0300 Subject: [PATCH 213/972] RDMA/mlx5: Restore zero-init to mlx5_ib_modify_qp() ucmd Sashiko points out the check for inlen==0 got missed, the ={} was not redundant, put it back. Fixes: a9cd442a5347 ("RDMA: Remove redundant = {} for udata req structs") Link: https://patch.msgid.link/r/2-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 1611a704c1b332..8fd05532c09cc7 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4697,7 +4697,7 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_modify_qp_resp resp = {}; struct mlx5_ib_qp *qp = to_mqp(ibqp); - struct mlx5_ib_modify_qp ucmd; + struct mlx5_ib_modify_qp ucmd = {}; enum ib_qp_type qp_type; enum ib_qp_state cur_state, new_state; int err = -EINVAL; From 45e8ebc9ede73543c55d597bb53b6bbb7e8b7327 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:36 -0300 Subject: [PATCH 214/972] RDMA/mlx5: Add missing store/release for lock elision pattern mlx5 has a common pattern implementing a device-global singleton resource where it checks the resource pointer for !NULL and then skips obtaining the lock. This is not ordered properly as observing !NULL doesn't mean that all the data under that pointer is also visible on this CPU when the lock is not taken. Use a release/acquire pairing to explicitly manage this. Pointed out by sashiko, Codex found more cases. Fixes: 5895e70f2e6e ("IB/mlx5: Allocate resources just before first QP/SRQ is created") Fixes: 638420115cc4 ("IB/mlx5: Create UMR QP just before first reg_mr occurs") Link: https://sashiko.dev/#/patchset/SYBPR01MB7881E1E0970268BD69C0BA75AF2B2%40SYBPR01MB7881.ausprd01.prod.outlook.com Link: https://patch.msgid.link/r/3-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Assisted-by: Codex:GPT-5.5 Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 8 ++++---- drivers/infiniband/hw/mlx5/umr.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 8115ae869ef209..61078281953d6c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3310,7 +3310,7 @@ int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev) * devr->c0 is set once, never changed until device unload. * Avoid taking the mutex if initialization is already done. */ - if (devr->c0) + if (smp_load_acquire(&devr->c0)) return 0; mutex_lock(&devr->cq_lock); @@ -3336,7 +3336,7 @@ int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev) } devr->p0 = pd; - devr->c0 = cq; + smp_store_release(&devr->c0, cq); unlock: mutex_unlock(&devr->cq_lock); @@ -3354,7 +3354,7 @@ int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) * devr->s1 is set once, never changed until device unload. * Avoid taking the mutex if initialization is already done. */ - if (devr->s1) + if (smp_load_acquire(&devr->s1)) return 0; mutex_lock(&devr->srq_lock); @@ -3396,7 +3396,7 @@ int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev) } devr->s0 = s0; - devr->s1 = s1; + smp_store_release(&devr->s1, s1); unlock: mutex_unlock(&devr->srq_lock); diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index 29488fba21a034..f2139474be3751 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -147,7 +147,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) * UMR qp is set once, never changed until device unload. * Avoid taking the mutex if initialization is already done. */ - if (dev->umrc.qp) + if (smp_load_acquire(&dev->umrc.qp)) return 0; mutex_lock(&dev->umrc.init_lock); @@ -185,7 +185,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) sema_init(&dev->umrc.sem, MAX_UMR_WR); mutex_init(&dev->umrc.lock); dev->umrc.state = MLX5_UMR_STATE_ACTIVE; - dev->umrc.qp = qp; + smp_store_release(&dev->umrc.qp, qp); mutex_unlock(&dev->umrc.init_lock); return 0; From 6dd2d4ad9c8429523b1c220c5132bd551c006425 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:37 -0300 Subject: [PATCH 215/972] RDMA/mana: Validate rx_hash_key_len Sashiko points out that rx_hash_key_len comes from a uAPI structure and is blindly passed to memcpy, allowing the userspace to trash kernel memory. Bounds check it so the memcpy cannot overflow. Cc: stable@vger.kernel.org Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter") Link: https://sashiko.dev/#/patchset/0-v2-1c49eeb88c48%2B91-rdma_udata_rep_jgg%40nvidia.com?part=1 Link: https://patch.msgid.link/r/4-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Reviewed-by: Long Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mana/qp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 645581359cee0b..f7bb0d1f0f8034 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -21,6 +21,9 @@ static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev, gc = mdev_to_gc(dev); + if (rx_hash_key_len > sizeof(req->hashkey)) + return -EINVAL; + req_buf_size = struct_size(req, indir_tab, MANA_INDIRECT_TABLE_DEF_SIZE); req = kzalloc(req_buf_size, GFP_KERNEL); if (!req) From 159f2efabc89d3f931d38f2d35876535d4abf0a3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:38 -0300 Subject: [PATCH 216/972] RDMA/mana: Remove user triggerable WARN_ON() in mana_ib_create_qp_rss() Sashiko points out that the user can specify WQs sharing the same CQ as a part of the uAPI and this will trigger the WARN_ON() then go on to corrupt the kernel. Just reject it outright and fail the QP creation. Cc: stable@vger.kernel.org Fixes: c15d7802a424 ("RDMA/mana_ib: Add CQ interrupt support for RAW QP") Link: https://sashiko.dev/#/patchset/0-v2-1c49eeb88c48%2B91-rdma_udata_rep_jgg%40nvidia.com?part=1 Link: https://patch.msgid.link/r/5-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Reviewed-by: Long Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mana/cq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c index f4cbe21763bf11..2d682428ef202a 100644 --- a/drivers/infiniband/hw/mana/cq.c +++ b/drivers/infiniband/hw/mana/cq.c @@ -137,8 +137,9 @@ int mana_ib_install_cq_cb(struct mana_ib_dev *mdev, struct mana_ib_cq *cq) if (cq->queue.id >= gc->max_num_cqs) return -EINVAL; - /* Create CQ table entry */ - WARN_ON(gc->cq_table[cq->queue.id]); + /* Create CQ table entry, sharing a CQ between WQs is not supported */ + if (gc->cq_table[cq->queue.id]) + return -EINVAL; if (cq->queue.kmem) gdma_cq = cq->queue.kmem; else From 34ecf795692ee57c393109f4a24ccc313091e137 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:39 -0300 Subject: [PATCH 217/972] RDMA/mana: Fix mana_destroy_wq_obj() cleanup in mana_ib_create_qp_rss() Sashiko points out there are two bugs here in the error unwind flow, both related to how the WQ table is unwound. First there is a double i-- on the first failure path due to the while loop having a i--, remove it. Second if mana_ib_install_cq_cb() fails then mana_create_wq_obj() is not undone due to the above i--. Cc: stable@vger.kernel.org Fixes: c15d7802a424 ("RDMA/mana_ib: Add CQ interrupt support for RAW QP") Link: https://sashiko.dev/#/patchset/0-v2-1c49eeb88c48%2B91-rdma_udata_rep_jgg%40nvidia.com?part=1 Link: https://patch.msgid.link/r/6-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Reviewed-by: Long Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mana/qp.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index f7bb0d1f0f8034..8e1f052d0ec976 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -176,11 +176,8 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ, &wq_spec, &cq_spec, &wq->rx_object); - if (ret) { - /* Do cleanup starting with index i-1 */ - i--; + if (ret) goto fail; - } /* The GDMA regions are now owned by the WQ object */ wq->queue.gdma_region = GDMA_INVALID_DMA_REGION; @@ -200,8 +197,10 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, /* Create CQ table entry */ ret = mana_ib_install_cq_cb(mdev, cq); - if (ret) + if (ret) { + mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object); goto fail; + } } resp.num_entries = i; From 6aaa978c6b6218cfac15fe1dab17c76fe229ce3f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:40 -0300 Subject: [PATCH 218/972] RDMA/mana: Fix error unwind in mana_ib_create_qp_rss() Sashiko points out that mana_ib_cfg_vport_steering() is leaked, the normal destroy path cleans it up. Cc: stable@vger.kernel.org Fixes: 0266a177631d ("RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter") Link: https://sashiko.dev/#/patchset/0-v1-e911b76a94d1%2B65d95-rdma_udata_rep_jgg%40nvidia.com?part=4 Link: https://patch.msgid.link/r/7-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Reviewed-by: Long Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mana/qp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 8e1f052d0ec976..0fbcf449c134b5 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -217,13 +217,15 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, ibdev_dbg(&mdev->ib_dev, "Failed to copy to udata create rss-qp, %d\n", ret); - goto fail; + goto err_disable_vport_rx; } kfree(mana_ind_table); return 0; +err_disable_vport_rx: + mana_disable_vport_rx(mpc); fail: while (i-- > 0) { ibwq = ind_tbl->ind_tbl[i]; From ea4e4b168a531c522d2850719816b6f583b1738b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:41 -0300 Subject: [PATCH 219/972] RDMA/ocrdma: Clarify the mm_head searching The intention of this code is to find matching entries exactly, the driver never creates phys_addr's with different lens so the current expression is not a bug, but it doesn't make sense and confuses review tooling. Search for exact match instead. Link: https://sashiko.dev/#/patchset/0-v1-e911b76a94d1%2B65d95-rdma_udata_rep_jgg%40nvidia.com?part=4 Link: https://patch.msgid.link/r/8-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index c17e2a54dbcaf9..463c9a5703fc4e 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -215,7 +215,7 @@ static void ocrdma_del_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr, mutex_lock(&uctx->mm_list_lock); list_for_each_entry_safe(mm, tmp, &uctx->mm_head, entry) { - if (len != mm->key.len && phy_addr != mm->key.phy_addr) + if (len != mm->key.len || phy_addr != mm->key.phy_addr) continue; list_del(&mm->entry); @@ -233,7 +233,7 @@ static bool ocrdma_search_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr, mutex_lock(&uctx->mm_list_lock); list_for_each_entry(mm, &uctx->mm_head, entry) { - if (len != mm->key.len && phy_addr != mm->key.phy_addr) + if (len != mm->key.len || phy_addr != mm->key.phy_addr) continue; found = true; From 34fbf48cf3b410d2a6e8c586fa952a36331ca5ba Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:42 -0300 Subject: [PATCH 220/972] RDMA/ocrdma: Don't NULL deref uctx on errors in ocrdma_copy_pd_uresp() Sashiko points out that pd->uctx isn't initialized until late in the function so all these error flow references are NULL and will crash. Use the uctx that isn't NULL. Cc: stable@vger.kernel.org Fixes: fe2caefcdf58 ("RDMA/ocrdma: Add driver for Emulex OneConnect IBoE RDMA adapter") Link: https://sashiko.dev/#/patchset/0-v1-e911b76a94d1%2B65d95-rdma_udata_rep_jgg%40nvidia.com?part=4 Link: https://patch.msgid.link/r/9-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 463c9a5703fc4e..a88cc5d84af828 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -620,9 +620,9 @@ static int ocrdma_copy_pd_uresp(struct ocrdma_dev *dev, struct ocrdma_pd *pd, ucopy_err: if (pd->dpp_enabled) - ocrdma_del_mmap(pd->uctx, dpp_page_addr, PAGE_SIZE); + ocrdma_del_mmap(uctx, dpp_page_addr, PAGE_SIZE); dpp_map_err: - ocrdma_del_mmap(pd->uctx, db_page_addr, db_page_size); + ocrdma_del_mmap(uctx, db_page_addr, db_page_size); return status; } From e38e86995df27f1f854063dab1f0c6a513db3faf Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:43 -0300 Subject: [PATCH 221/972] RDMA/vmw_pvrdma: Fix double free on pvrdma_alloc_ucontext() error path Sashiko points out that pvrdma_uar_free() is already called within pvrdma_dealloc_ucontext(), so calling it before triggers a double free. Cc: stable@vger.kernel.org Fixes: 29c8d9eba550 ("IB: Add vmw_pvrdma driver") Link: https://sashiko.dev/#/patchset/0-v1-e911b76a94d1%2B65d95-rdma_udata_rep_jgg%40nvidia.com?part=4 Link: https://patch.msgid.link/r/10-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c index bcd43dc30e21c6..c7c2b41060e526 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c @@ -322,7 +322,7 @@ int pvrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) uresp.qp_tab_size = vdev->dsr->caps.max_qp; ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (ret) { - pvrdma_uar_free(vdev, &context->uar); + /* pvrdma_dealloc_ucontext() also frees the UAR */ pvrdma_dealloc_ucontext(&context->ibucontext); return -EFAULT; } From c54c7e4cb679c0aaa1cb489b9c3f2cd98e63a44c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:44 -0300 Subject: [PATCH 222/972] RDMA/mlx4: Fix resource leak on error in mlx4_ib_create_srq() Sashiko points out that mlx4_srq_alloc() was not undone during error unwind, add the missing call to mlx4_srq_free(). Cc: stable@vger.kernel.org Fixes: 225c7b1feef1 ("IB/mlx4: Add a driver Mellanox ConnectX InfiniBand adapters") Link: https://sashiko.dev/#/patchset/0-v1-e911b76a94d1%2B65d95-rdma_udata_rep_jgg%40nvidia.com?part=8 Link: https://patch.msgid.link/r/11-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/srq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 5b23e5f8b84aca..767840736d583b 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -194,13 +194,15 @@ int mlx4_ib_create_srq(struct ib_srq *ib_srq, if (udata) if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) { err = -EFAULT; - goto err_wrid; + goto err_srq; } init_attr->attr.max_wr = srq->msrq.max - 1; return 0; +err_srq: + mlx4_srq_free(dev->dev, &srq->msrq); err_wrid: if (udata) mlx4_ib_db_unmap_user(ucontext, &srq->db); From c9341307ea16b9395c2e4c9c94d8499d91fe31d0 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:45 -0300 Subject: [PATCH 223/972] RDMA/mlx4: Fix mis-use of RCU in mlx4_srq_event() Sashiko points out the radix_tree itself is RCU safe, but nothing ever frees the mlx4_srq struct with RCU, and it isn't even accessed within the RCU critical section. It also will crash if an event is delivered before the srq object is finished initializing. Use the spinlock since it isn't easy to make RCU work, use refcount_inc_not_zero() to protect against partially initialized objects, and order the refcount_set() to be after the srq is fully initialized. Cc: stable@vger.kernel.org Fixes: 30353bfc43a1 ("net/mlx4_core: Use RCU to perform radix tree lookup for SRQ") Link: https://sashiko.dev/#/patchset/0-v2-1c49eeb88c48%2B91-rdma_udata_rep_jgg%40nvidia.com?part=5 Link: https://patch.msgid.link/r/12-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Signed-off-by: Jason Gunthorpe --- drivers/net/ethernet/mellanox/mlx4/srq.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/srq.c b/drivers/net/ethernet/mellanox/mlx4/srq.c index dd890f5d7b725c..8711689120f302 100644 --- a/drivers/net/ethernet/mellanox/mlx4/srq.c +++ b/drivers/net/ethernet/mellanox/mlx4/srq.c @@ -44,13 +44,14 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type) { struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; struct mlx4_srq *srq; + unsigned long flags; - rcu_read_lock(); + spin_lock_irqsave(&srq_table->lock, flags); srq = radix_tree_lookup(&srq_table->tree, srqn & (dev->caps.num_srqs - 1)); - rcu_read_unlock(); - if (srq) - refcount_inc(&srq->refcount); - else { + if (!srq || !refcount_inc_not_zero(&srq->refcount)) + srq = NULL; + spin_unlock_irqrestore(&srq_table->lock, flags); + if (!srq) { mlx4_warn(dev, "Async event for bogus SRQ %08x\n", srqn); return; } @@ -203,8 +204,8 @@ int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd, if (err) goto err_radix; - refcount_set(&srq->refcount, 1); init_completion(&srq->free); + refcount_set_release(&srq->refcount, 1); return 0; From 48973c6c938737bb900d15dc82b91dfe3586cb0f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:46 -0300 Subject: [PATCH 224/972] RDMA/hns: Fix xarray race in hns_roce_create_srq() Sashiko points out that once the srq memory is stored into the xarray by alloc_srqc() it can immediately be looked up by: xa_lock(&srq_table->xa); srq = xa_load(&srq_table->xa, srqn & (hr_dev->caps.num_srqs - 1)); if (srq) refcount_inc(&srq->refcount); xa_unlock(&srq_table->xa); Which will fail refcount debug because the refcount is 0 and then crash: srq->event(srq, event_type); Because event is NULL. Use refcount_inc_not_zero() instead to ensure a partially prepared srq is never retrieved from the event handler and fix the ordering of the initialization so refcount becomes 1 only after it is fully ready. All the initialization must be done before calling free_srqc() since it depends on the completion and refcount. Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver") Link: https://sashiko.dev/#/patchset/0-v1-e911b76a94d1%2B65d95-rdma_udata_rep_jgg%40nvidia.com?part=3 Link: https://patch.msgid.link/r/13-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Reviewed-by: Junxian Huang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_srq.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index cb848e8e6bbd76..8b94cbdfa54dfa 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -16,8 +16,8 @@ void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type) xa_lock(&srq_table->xa); srq = xa_load(&srq_table->xa, srqn & (hr_dev->caps.num_srqs - 1)); - if (srq) - refcount_inc(&srq->refcount); + if (srq && !refcount_inc_not_zero(&srq->refcount)) + srq = NULL; xa_unlock(&srq_table->xa); if (!srq) { @@ -470,6 +470,10 @@ int hns_roce_create_srq(struct ib_srq *ib_srq, if (ret) goto err_srqn; + srq->event = hns_roce_ib_srq_event; + init_completion(&srq->free); + refcount_set_release(&srq->refcount, 1); + if (udata) { resp.cap_flags = srq->cap_flags; resp.srqn = srq->srqn; @@ -480,10 +484,6 @@ int hns_roce_create_srq(struct ib_srq *ib_srq, } } - srq->event = hns_roce_ib_srq_event; - refcount_set(&srq->refcount, 1); - init_completion(&srq->free); - return 0; err_srqc: From 7d51783d82fea000a9ce96fa1dcf3e0a8cedc4fb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:47 -0300 Subject: [PATCH 225/972] RDMA/hns: Fix xarray race in hns_roce_create_qp_common() Similar to the SRQ case the hr_qp is stored in the xarray before it is fully initialized. Unlike the SRQ case the error unwinds do not wait for the completion so keep the refcount 0 until the function succeeds. Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver") Link: https://patch.msgid.link/r/14-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Suggested-by: Junxian Huang Reviewed-by: Junxian Huang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_qp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index a27ea85bb06323..f94ba98871f0d0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -47,8 +47,8 @@ static struct hns_roce_qp *hns_roce_qp_lookup(struct hns_roce_dev *hr_dev, xa_lock_irqsave(&hr_dev->qp_table_xa, flags); qp = __hns_roce_qp_lookup(hr_dev, qpn); - if (qp) - refcount_inc(&qp->refcount); + if (qp && !refcount_inc_not_zero(&qp->refcount)) + qp = NULL; xa_unlock_irqrestore(&hr_dev->qp_table_xa, flags); if (!qp) @@ -1251,8 +1251,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, hr_qp->ibqp.qp_num = hr_qp->qpn; hr_qp->event = hns_roce_ib_qp_event; - refcount_set(&hr_qp->refcount, 1); init_completion(&hr_qp->free); + refcount_set_release(&hr_qp->refcount, 1); return 0; From 0c99acbc8b6c6dd526ae475a48ee1897b61072fb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 28 Apr 2026 13:17:48 -0300 Subject: [PATCH 226/972] RDMA/hns: Fix unlocked call to hns_roce_qp_remove() Sashiko points out that hns_roce_qp_remove() requires the caller to hold locks. The error flow in hns_roce_create_qp_common() doesn't hold those locks for the error unwind so it risks corrupting memory. Grab the same locks the other two callers use. Cc: stable@vger.kernel.org Fixes: e088a685eae9 ("RDMA/hns: Support rq record doorbell for the user space") Link: https://sashiko.dev/#/patchset/0-v2-1c49eeb88c48%2B91-rdma_udata_rep_jgg%40nvidia.com?part=9 Link: https://patch.msgid.link/r/15-v1-41f3135e5565+9d2-rdma_ai_fixes1_jgg@nvidia.com Reviewed-by: Junxian Huang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_qp.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index f94ba98871f0d0..bf04ee84a94392 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -1171,6 +1171,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, struct hns_roce_ib_create_qp_resp resp = {}; struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_ib_create_qp ucmd = {}; + unsigned long flags; int ret; mutex_init(&hr_qp->mutex); @@ -1257,7 +1258,13 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, return 0; err_flow_ctrl: + spin_lock_irqsave(&hr_dev->qp_list_lock, flags); + hns_roce_lock_cqs(init_attr->send_cq ? to_hr_cq(init_attr->send_cq) : NULL, + init_attr->recv_cq ? to_hr_cq(init_attr->recv_cq) : NULL); hns_roce_qp_remove(hr_dev, hr_qp); + hns_roce_unlock_cqs(init_attr->send_cq ? to_hr_cq(init_attr->send_cq) : NULL, + init_attr->recv_cq ? to_hr_cq(init_attr->recv_cq) : NULL); + spin_unlock_irqrestore(&hr_dev->qp_list_lock, flags); err_store: free_qpc(hr_dev, hr_qp); err_qpc: From 0799e5943611006b346b8813c7daf7dd5aa26bfd Mon Sep 17 00:00:00 2001 From: Lyes Bourennani Date: Wed, 22 Apr 2026 00:20:22 +0200 Subject: [PATCH 227/972] batman-adv: fix integer overflow on buff_pos Fixing an integer overflow present in batadv_iv_ogm_send_to_if. The size check is done using the int type in batadv_iv_ogm_aggr_packet whereas the buff_pos variable uses the s16 type. This could lead to an out-of-bound read. Cc: stable@vger.kernel.org Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Signed-off-by: Lyes Bourennani Signed-off-by: Alexis Pinson Signed-off-by: Sven Eckelmann --- net/batman-adv/bat_iv_ogm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index f28e9cbf8ad5f2..618d1889c04e75 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -335,7 +335,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); const char *fwd_str; u8 packet_num; - s16 buff_pos; + int buff_pos; struct batadv_ogm_packet *batadv_ogm_packet; struct sk_buff *skb; u8 *packet_pos; From 3243543592425beec83d453793e9d27caa0d8e66 Mon Sep 17 00:00:00 2001 From: Jiexun Wang Date: Mon, 27 Apr 2026 14:43:33 +0800 Subject: [PATCH 228/972] batman-adv: reject new tp_meter sessions during teardown Prevent tp_meter from starting new sender or receiver sessions after mesh_state has left BATADV_MESH_ACTIVE. Fixes: 33a3bb4a3345 ("batman-adv: throughput meter implementation") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Co-developed-by: Luxing Yin Signed-off-by: Luxing Yin Signed-off-by: Jiexun Wang Signed-off-by: Ren Wei Signed-off-by: Sven Eckelmann --- net/batman-adv/tp_meter.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 2e42f6b348c83d..d9a80e459c2e4a 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -947,6 +947,13 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, /* look for an already existing test towards this node */ spin_lock_bh(&bat_priv->tp_list_lock); + if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) { + spin_unlock_bh(&bat_priv->tp_list_lock); + batadv_tp_batctl_error_notify(BATADV_TP_REASON_DST_UNREACHABLE, + dst, bat_priv, session_cookie); + return; + } + tp_vars = batadv_tp_list_find(bat_priv, dst); if (tp_vars) { spin_unlock_bh(&bat_priv->tp_list_lock); @@ -1329,9 +1336,12 @@ static struct batadv_tp_vars * batadv_tp_init_recv(struct batadv_priv *bat_priv, const struct batadv_icmp_tp_packet *icmp) { - struct batadv_tp_vars *tp_vars; + struct batadv_tp_vars *tp_vars = NULL; spin_lock_bh(&bat_priv->tp_list_lock); + if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) + goto out_unlock; + tp_vars = batadv_tp_list_find_session(bat_priv, icmp->orig, icmp->session); if (tp_vars) @@ -1464,6 +1474,9 @@ void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb) { struct batadv_icmp_tp_packet *icmp; + if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) + goto out; + icmp = (struct batadv_icmp_tp_packet *)skb->data; switch (icmp->subtype) { @@ -1478,6 +1491,8 @@ void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb) "Received unknown TP Metric packet type %u\n", icmp->subtype); } + +out: consume_skb(skb); } From 3d3cf6a7314aca4df0a6dde28ce784a2a30d0166 Mon Sep 17 00:00:00 2001 From: Jiexun Wang Date: Mon, 27 Apr 2026 14:43:34 +0800 Subject: [PATCH 229/972] batman-adv: stop tp_meter sessions during mesh teardown TP meter sessions remain linked on bat_priv->tp_list after the netlink request has already finished. When the mesh interface is removed, batadv_mesh_free() currently tears down the mesh without first draining these sessions. A running sender thread or a late incoming tp_meter packet can then keep processing against a mesh instance which is already shutting down. Synchronize tp_meter with the mesh lifetime by stopping all active sessions from batadv_mesh_free() and waiting for sender threads to exit before teardown continues. Fixes: 33a3bb4a3345 ("batman-adv: throughput meter implementation") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Co-developed-by: Luxing Yin Signed-off-by: Luxing Yin Signed-off-by: Jiexun Wang Signed-off-by: Ren Wei Signed-off-by: Sven Eckelmann --- net/batman-adv/main.c | 1 + net/batman-adv/tp_meter.c | 94 +++++++++++++++++++++++++++++++-------- net/batman-adv/tp_meter.h | 1 + net/batman-adv/types.h | 4 ++ 4 files changed, 82 insertions(+), 18 deletions(-) diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 3a35aadd8b4191..a4d33ee0fda59e 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -249,6 +249,7 @@ void batadv_mesh_free(struct net_device *mesh_iface) atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); batadv_purge_outstanding_packets(bat_priv, NULL); + batadv_tp_stop_all(bat_priv); batadv_gw_node_free(bat_priv); diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index d9a80e459c2e4a..58ca59a2799ed1 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -365,23 +366,38 @@ static void batadv_tp_vars_put(struct batadv_tp_vars *tp_vars) } /** - * batadv_tp_sender_cleanup() - cleanup sender data and drop and timer - * @bat_priv: the bat priv with all the mesh interface information - * @tp_vars: the private data of the current TP meter session to cleanup + * batadv_tp_list_detach() - remove tp session from mesh session list once + * @tp_vars: the private data of the current TP meter session */ -static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv, - struct batadv_tp_vars *tp_vars) +static void batadv_tp_list_detach(struct batadv_tp_vars *tp_vars) { - cancel_delayed_work(&tp_vars->finish_work); + bool detached = false; spin_lock_bh(&tp_vars->bat_priv->tp_list_lock); - hlist_del_rcu(&tp_vars->list); + if (!hlist_unhashed(&tp_vars->list)) { + hlist_del_init_rcu(&tp_vars->list); + detached = true; + } spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock); + if (!detached) + return; + + atomic_dec(&tp_vars->bat_priv->tp_num); + /* drop list reference */ batadv_tp_vars_put(tp_vars); +} - atomic_dec(&tp_vars->bat_priv->tp_num); +/** + * batadv_tp_sender_cleanup() - cleanup sender data and drop and timer + * @tp_vars: the private data of the current TP meter session to cleanup + */ +static void batadv_tp_sender_cleanup(struct batadv_tp_vars *tp_vars) +{ + cancel_delayed_work_sync(&tp_vars->finish_work); + + batadv_tp_list_detach(tp_vars); /* kill the timer and remove its reference */ timer_delete_sync(&tp_vars->timer); @@ -886,7 +902,8 @@ static int batadv_tp_send(void *arg) batadv_orig_node_put(orig_node); batadv_tp_sender_end(bat_priv, tp_vars); - batadv_tp_sender_cleanup(bat_priv, tp_vars); + batadv_tp_sender_cleanup(tp_vars); + complete(&tp_vars->finished); batadv_tp_vars_put(tp_vars); @@ -918,7 +935,8 @@ static void batadv_tp_start_kthread(struct batadv_tp_vars *tp_vars) batadv_tp_vars_put(tp_vars); /* cleanup of failed tp meter variables */ - batadv_tp_sender_cleanup(bat_priv, tp_vars); + batadv_tp_sender_cleanup(tp_vars); + complete(&tp_vars->finished); return; } @@ -1024,6 +1042,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, tp_vars->start_time = jiffies; init_waitqueue_head(&tp_vars->more_bytes); + init_completion(&tp_vars->finished); spin_lock_init(&tp_vars->unacked_lock); INIT_LIST_HEAD(&tp_vars->unacked_list); @@ -1126,14 +1145,7 @@ static void batadv_tp_receiver_shutdown(struct timer_list *t) "Shutting down for inactivity (more than %dms) from %pM\n", BATADV_TP_RECV_TIMEOUT, tp_vars->other_end); - spin_lock_bh(&tp_vars->bat_priv->tp_list_lock); - hlist_del_rcu(&tp_vars->list); - spin_unlock_bh(&tp_vars->bat_priv->tp_list_lock); - - /* drop list reference */ - batadv_tp_vars_put(tp_vars); - - atomic_dec(&bat_priv->tp_num); + batadv_tp_list_detach(tp_vars); spin_lock_bh(&tp_vars->unacked_lock); list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) { @@ -1496,6 +1508,52 @@ void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb) consume_skb(skb); } +/** + * batadv_tp_stop_all() - stop all currently running tp meter sessions + * @bat_priv: the bat priv with all the mesh interface information + */ +void batadv_tp_stop_all(struct batadv_priv *bat_priv) +{ + struct batadv_tp_vars *tp_vars[BATADV_TP_MAX_NUM]; + struct batadv_tp_vars *tp_var; + size_t count = 0; + size_t i; + + spin_lock_bh(&bat_priv->tp_list_lock); + hlist_for_each_entry(tp_var, &bat_priv->tp_list, list) { + if (WARN_ON_ONCE(count >= BATADV_TP_MAX_NUM)) + break; + + if (!kref_get_unless_zero(&tp_var->refcount)) + continue; + + tp_vars[count++] = tp_var; + } + spin_unlock_bh(&bat_priv->tp_list_lock); + + for (i = 0; i < count; i++) { + tp_var = tp_vars[i]; + + switch (tp_var->role) { + case BATADV_TP_SENDER: + batadv_tp_sender_shutdown(tp_var, + BATADV_TP_REASON_CANCEL); + wake_up(&tp_var->more_bytes); + wait_for_completion(&tp_var->finished); + break; + case BATADV_TP_RECEIVER: + batadv_tp_list_detach(tp_var); + if (timer_shutdown_sync(&tp_var->timer)) + batadv_tp_vars_put(tp_var); + break; + } + + batadv_tp_vars_put(tp_var); + } + + synchronize_net(); +} + /** * batadv_tp_meter_init() - initialize global tp_meter structures */ diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index f0046d366eac65..4e97cd10cd0259 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -17,6 +17,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, u32 test_length, u32 *cookie); void batadv_tp_stop(struct batadv_priv *bat_priv, const u8 *dst, u8 return_value); +void batadv_tp_stop_all(struct batadv_priv *bat_priv); void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb); #endif /* _NET_BATMAN_ADV_TP_METER_H_ */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 8fc5fe0e9b0539..daa06f42115429 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -1328,6 +1329,9 @@ struct batadv_tp_vars { /** @finish_work: work item for the finishing procedure */ struct delayed_work finish_work; + /** @finished: completion signaled when a sender thread exits */ + struct completion finished; + /** @test_length: test length in milliseconds */ u32 test_length; From 046111a1a35a1720748f254377d3d1663664ea61 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2026 06:16:09 +0000 Subject: [PATCH 230/972] net/sched: sch_cake: annotate data-races in cake_dump_class_stats (I) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cake_dump_class_stats() runs without qdisc spinlock being held. In this first patch, I add READ_ONCE()/WRITE_ONCE() annotations for: - flow->head - flow->dropped - b->backlogs[] Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc") Signed-off-by: Eric Dumazet Acked-by: Toke Høiland-Jørgensen Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260430061610.3503483-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 13c6d1869a1447..806eb73d6a05e1 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -914,7 +914,7 @@ static struct sk_buff *dequeue_head(struct cake_flow *flow) struct sk_buff *skb = flow->head; if (skb) { - flow->head = skb->next; + WRITE_ONCE(flow->head, skb->next); skb_mark_not_on_list(skb); } @@ -926,7 +926,7 @@ static struct sk_buff *dequeue_head(struct cake_flow *flow) static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) { if (!flow->head) - flow->head = skb; + WRITE_ONCE(flow->head, skb); else flow->tail->next = skb; flow->tail = skb; @@ -1357,7 +1357,7 @@ static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, if (elig_ack_prev) elig_ack_prev->next = elig_ack->next; else - flow->head = elig_ack->next; + WRITE_ONCE(flow->head, elig_ack->next); skb_mark_not_on_list(elig_ack); @@ -1595,11 +1595,11 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) len = qdisc_pkt_len(skb); q->buffer_used -= skb->truesize; - b->backlogs[idx] -= len; WRITE_ONCE(b->tin_backlog, b->tin_backlog - len); + WRITE_ONCE(b->backlogs[idx], b->backlogs[idx] - len); sch->qstats.backlog -= len; - flow->dropped++; + WRITE_ONCE(flow->dropped, flow->dropped + 1); WRITE_ONCE(b->tin_dropped, b->tin_dropped + 1); if (q->config->rate_flags & CAKE_FLAG_INGRESS) @@ -1824,11 +1824,11 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, } /* stats */ - b->backlogs[idx] += slen; sch->qstats.backlog += slen; q->avg_window_bytes += slen; WRITE_ONCE(b->bytes, b->bytes + slen); WRITE_ONCE(b->tin_backlog, b->tin_backlog + slen); + WRITE_ONCE(b->backlogs[idx], b->backlogs[idx] + slen); qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); consume_skb(skb); @@ -1861,11 +1861,11 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* stats */ WRITE_ONCE(b->packets, b->packets + 1); - b->backlogs[idx] += len - ack_pkt_len; sch->qstats.backlog += len - ack_pkt_len; q->avg_window_bytes += len - ack_pkt_len; WRITE_ONCE(b->bytes, b->bytes + len - ack_pkt_len); WRITE_ONCE(b->tin_backlog, b->tin_backlog + len - ack_pkt_len); + WRITE_ONCE(b->backlogs[idx], b->backlogs[idx] + len - ack_pkt_len); } if (q->overflow_timeout) @@ -1977,7 +1977,7 @@ static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) if (flow->head) { skb = dequeue_head(flow); len = qdisc_pkt_len(skb); - b->backlogs[q->cur_flow] -= len; + WRITE_ONCE(b->backlogs[q->cur_flow], b->backlogs[q->cur_flow] - len); WRITE_ONCE(b->tin_backlog, b->tin_backlog - len); sch->qstats.backlog -= len; q->buffer_used -= skb->truesize; @@ -2235,7 +2235,7 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) flow->deficit -= len; b->tin_deficit -= len; } - flow->dropped++; + WRITE_ONCE(flow->dropped, flow->dropped + 1); WRITE_ONCE(b->tin_dropped, b->tin_dropped + 1); qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); @@ -3137,7 +3137,7 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, flow = &b->flows[idx % CAKE_QUEUES]; - if (flow->head) { + if (READ_ONCE(flow->head)) { sch_tree_lock(sch); skb = flow->head; while (skb) { @@ -3146,8 +3146,8 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, } sch_tree_unlock(sch); } - qs.backlog = b->backlogs[idx % CAKE_QUEUES]; - qs.drops = flow->dropped; + qs.backlog = READ_ONCE(b->backlogs[idx % CAKE_QUEUES]); + qs.drops = READ_ONCE(flow->dropped); } if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) return -1; From 67dc6c56b871617deac85b9f72500b69b1fdf835 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2026 06:16:10 +0000 Subject: [PATCH 231/972] net/sched: sch_cake: annotate data-races in cake_dump_class_stats (II) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cake_dump_class_stats() runs without qdisc spinlock being held. In this second patch, I add READ_ONCE()/WRITE_ONCE() annotations for: - flow->deficit - flow->cvars.dropping - flow->cvars.count - flow->cvars.p_drop - flow->cvars.blue_timer - flow->cvars.drop_next Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc") Signed-off-by: Eric Dumazet Acked-by: Toke Høiland-Jørgensen Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260430061610.3503483-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 131 +++++++++++++++++++++++-------------------- 1 file changed, 71 insertions(+), 60 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 806eb73d6a05e1..5862933be8d746 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -399,14 +399,14 @@ static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust); * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 */ -static void cobalt_newton_step(struct cobalt_vars *vars) +static void cobalt_newton_step(struct cobalt_vars *vars, u32 count) { u32 invsqrt, invsqrt2; u64 val; invsqrt = vars->rec_inv_sqrt; invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; - val = (3LL << 32) - ((u64)vars->count * invsqrt2); + val = (3LL << 32) - ((u64)count * invsqrt2); val >>= 2; /* avoid overflow in following multiply */ val = (val * invsqrt) >> (32 - 2 + 1); @@ -414,12 +414,12 @@ static void cobalt_newton_step(struct cobalt_vars *vars) vars->rec_inv_sqrt = val; } -static void cobalt_invsqrt(struct cobalt_vars *vars) +static void cobalt_invsqrt(struct cobalt_vars *vars, u32 count) { - if (vars->count < REC_INV_SQRT_CACHE) - vars->rec_inv_sqrt = inv_sqrt_cache[vars->count]; + if (count < REC_INV_SQRT_CACHE) + vars->rec_inv_sqrt = inv_sqrt_cache[count]; else - cobalt_newton_step(vars); + cobalt_newton_step(vars, count); } static void cobalt_vars_init(struct cobalt_vars *vars) @@ -449,16 +449,19 @@ static bool cobalt_queue_full(struct cobalt_vars *vars, bool up = false; if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { - up = !vars->p_drop; - vars->p_drop += p->p_inc; - if (vars->p_drop < p->p_inc) - vars->p_drop = ~0; - vars->blue_timer = now; - } - vars->dropping = true; - vars->drop_next = now; + u32 p_drop = vars->p_drop; + + up = !p_drop; + p_drop += p->p_inc; + if (p_drop < p->p_inc) + p_drop = ~0; + WRITE_ONCE(vars->p_drop, p_drop); + WRITE_ONCE(vars->blue_timer, now); + } + WRITE_ONCE(vars->dropping, true); + WRITE_ONCE(vars->drop_next, now); if (!vars->count) - vars->count = 1; + WRITE_ONCE(vars->count, 1); return up; } @@ -475,20 +478,20 @@ static bool cobalt_queue_empty(struct cobalt_vars *vars, if (vars->p_drop && ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { if (vars->p_drop < p->p_dec) - vars->p_drop = 0; + WRITE_ONCE(vars->p_drop, 0); else - vars->p_drop -= p->p_dec; - vars->blue_timer = now; + WRITE_ONCE(vars->p_drop, vars->p_drop - p->p_dec); + WRITE_ONCE(vars->blue_timer, now); down = !vars->p_drop; } - vars->dropping = false; + WRITE_ONCE(vars->dropping, false); if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { - vars->count--; - cobalt_invsqrt(vars); - vars->drop_next = cobalt_control(vars->drop_next, - p->interval, - vars->rec_inv_sqrt); + WRITE_ONCE(vars->count, vars->count - 1); + cobalt_invsqrt(vars, vars->count); + WRITE_ONCE(vars->drop_next, + cobalt_control(vars->drop_next, p->interval, + vars->rec_inv_sqrt)); } return down; @@ -507,6 +510,7 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars, bool next_due, over_target; ktime_t schedule; u64 sojourn; + u32 count; /* The 'schedule' variable records, in its sign, whether 'now' is before or * after 'drop_next'. This allows 'drop_next' to be updated before the next @@ -528,21 +532,22 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars, over_target = sojourn > p->target && sojourn > p->mtu_time * bulk_flows * 2 && sojourn > p->mtu_time * 4; - next_due = vars->count && ktime_to_ns(schedule) >= 0; + count = vars->count; + next_due = count && ktime_to_ns(schedule) >= 0; vars->ecn_marked = false; if (over_target) { if (!vars->dropping) { - vars->dropping = true; - vars->drop_next = cobalt_control(now, - p->interval, - vars->rec_inv_sqrt); + WRITE_ONCE(vars->dropping, true); + WRITE_ONCE(vars->drop_next, + cobalt_control(now, p->interval, + vars->rec_inv_sqrt)); } - if (!vars->count) - vars->count = 1; + if (!count) + count = 1; } else if (vars->dropping) { - vars->dropping = false; + WRITE_ONCE(vars->dropping, false); } if (next_due && vars->dropping) { @@ -550,23 +555,23 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars, if (!(vars->ecn_marked = INET_ECN_set_ce(skb))) reason = QDISC_DROP_CONGESTED; - vars->count++; - if (!vars->count) - vars->count--; - cobalt_invsqrt(vars); - vars->drop_next = cobalt_control(vars->drop_next, - p->interval, - vars->rec_inv_sqrt); + count++; + if (!count) + count--; + cobalt_invsqrt(vars, count); + WRITE_ONCE(vars->drop_next, + cobalt_control(vars->drop_next, p->interval, + vars->rec_inv_sqrt)); schedule = ktime_sub(now, vars->drop_next); } else { while (next_due) { - vars->count--; - cobalt_invsqrt(vars); - vars->drop_next = cobalt_control(vars->drop_next, - p->interval, - vars->rec_inv_sqrt); + count--; + cobalt_invsqrt(vars, count); + WRITE_ONCE(vars->drop_next, + cobalt_control(vars->drop_next, p->interval, + vars->rec_inv_sqrt)); schedule = ktime_sub(now, vars->drop_next); - next_due = vars->count && ktime_to_ns(schedule) >= 0; + next_due = count && ktime_to_ns(schedule) >= 0; } } @@ -575,11 +580,12 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars, get_random_u32() < vars->p_drop) reason = QDISC_DROP_FLOOD_PROTECTION; + WRITE_ONCE(vars->count, count); /* Overload the drop_next field as an activity timeout */ - if (!vars->count) - vars->drop_next = ktime_add_ns(now, p->interval); + if (!count) + WRITE_ONCE(vars->drop_next, ktime_add_ns(now, p->interval)); else if (ktime_to_ns(schedule) > 0 && reason == QDISC_DROP_UNSPEC) - vars->drop_next = now; + WRITE_ONCE(vars->drop_next, now); return reason; } @@ -1924,7 +1930,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, flow->set = CAKE_SET_SPARSE; WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count + 1); - flow->deficit = cake_get_flow_quantum(b, flow, q->config->flow_mode); + WRITE_ONCE(flow->deficit, cake_get_flow_quantum(b, flow, q->config->flow_mode)); } else if (flow->set == CAKE_SET_SPARSE_WAIT) { /* this flow was empty, accounted as a sparse flow, but actually * in the bulk rotation. @@ -2166,7 +2172,8 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) } } - flow->deficit += cake_get_flow_quantum(b, flow, q->config->flow_mode); + WRITE_ONCE(flow->deficit, + flow->deficit + cake_get_flow_quantum(b, flow, q->config->flow_mode)); list_move_tail(&flow->flowchain, &b->old_flows); goto retry; @@ -2232,7 +2239,7 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) if (q->config->rate_flags & CAKE_FLAG_INGRESS) { len = cake_advance_shaper(q, b, skb, now, true); - flow->deficit -= len; + WRITE_ONCE(flow->deficit, flow->deficit - len); b->tin_deficit -= len; } WRITE_ONCE(flow->dropped, flow->dropped + 1); @@ -2259,7 +2266,7 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) delay < b->base_delay ? 2 : 8)); len = cake_advance_shaper(q, b, skb, now, false); - flow->deficit -= len; + WRITE_ONCE(flow->deficit, flow->deficit - len); b->tin_deficit -= len; if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { @@ -3153,6 +3160,8 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, return -1; if (flow) { ktime_t now = ktime_get(); + bool dropping; + u32 p_drop; stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); if (!stats) @@ -3167,21 +3176,23 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, goto nla_put_failure; \ } while (0) - PUT_STAT_S32(DEFICIT, flow->deficit); - PUT_STAT_U32(DROPPING, flow->cvars.dropping); - PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); - PUT_STAT_U32(P_DROP, flow->cvars.p_drop); - if (flow->cvars.p_drop) { + PUT_STAT_S32(DEFICIT, READ_ONCE(flow->deficit)); + dropping = READ_ONCE(flow->cvars.dropping); + PUT_STAT_U32(DROPPING, dropping); + PUT_STAT_U32(COBALT_COUNT, READ_ONCE(flow->cvars.count)); + p_drop = READ_ONCE(flow->cvars.p_drop); + PUT_STAT_U32(P_DROP, p_drop); + if (p_drop) { PUT_STAT_S32(BLUE_TIMER_US, ktime_to_us( ktime_sub(now, - flow->cvars.blue_timer))); + READ_ONCE(flow->cvars.blue_timer)))); } - if (flow->cvars.dropping) { + if (dropping) { PUT_STAT_S32(DROP_NEXT_US, ktime_to_us( ktime_sub(now, - flow->cvars.drop_next))); + READ_ONCE(flow->cvars.drop_next)))); } if (nla_nest_end(d->skb, stats) < 0) From 7e7be31bfdb066c1c780dcd6b1224078fc54063f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 Apr 2026 15:29:38 -0700 Subject: [PATCH 232/972] net: tls: fix silent data drop under pipe back-pressure tls_sw_splice_read() uses len when advancing rxm->offset / rxm->full_len after skb_splice_bits(), rather than copied (the actual number of bytes successfully spliced into the pipe). When the destination pipe cannot accept all the requested bytes, splice_to_pipe() returns fewer bytes than len, and 'len - copied' of data is effectively skipped over. Fixes: e062fe99cccd ("tls: splice_read: fix accessing pre-processed records") Link: https://patch.msgid.link/20260429222944.2139041-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 798243eabb1f87..2590e855f6a5a9 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2317,9 +2317,9 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, if (copied < 0) goto splice_requeue; - if (chunk < rxm->full_len) { - rxm->offset += len; - rxm->full_len -= len; + if (copied < rxm->full_len) { + rxm->offset += copied; + rxm->full_len -= copied; goto splice_requeue; } From bd3a4795d5744f59a1f485379f1303e5e606f377 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 Apr 2026 15:29:39 -0700 Subject: [PATCH 233/972] selftests: tls: add test for data loss on small pipe Add selftest for data loss on short splice. Link: https://patch.msgid.link/20260429222944.2139041-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tls.c | 43 +++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 9e2ccea13d7023..30a236b8e9f733 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -946,6 +946,49 @@ TEST_F(tls, peek_and_splice) EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0); } +TEST_F(tls, splice_to_pipe_small) +{ + int send_len = TLS_PAYLOAD_MAX_LEN; + char mem_send[TLS_PAYLOAD_MAX_LEN]; + char mem_recv[TLS_PAYLOAD_MAX_LEN]; + size_t total = 0; + int p[2]; + + memrnd(mem_send, sizeof(mem_send)); + + ASSERT_GE(pipe(p), 0); + + /* Shrink pipe to 1 page (typically 4096 bytes) to force multiple + * splice iterations for a 16384-byte TLS record. + */ + EXPECT_GE(fcntl(p[1], F_SETPIPE_SZ, 4096), 4096); + + EXPECT_EQ(send(self->fd, mem_send, send_len, 0), send_len); + + while (total < (size_t)send_len) { + ssize_t spliced, drained; + + spliced = splice(self->cfd, NULL, p[1], NULL, + send_len - total, 0); + EXPECT_GT(spliced, 0); + if (spliced <= 0) + break; + + drained = read(p[0], mem_recv + total, spliced); + EXPECT_EQ(drained, spliced); + if (drained <= 0) + break; + + total += drained; + } + + EXPECT_EQ(total, (size_t)send_len); + EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0); + + close(p[0]); + close(p[1]); +} + #define MAX_FRAGS 48 TEST_F(tls, splice_short) { From 0a69ac25bd596d50823d530d0a2004336668c0df Mon Sep 17 00:00:00 2001 From: Eliot Courtney Date: Fri, 1 May 2026 19:49:37 +0900 Subject: [PATCH 234/972] rust: drm: fix unsound initialization in drm::Device::new If pinned initialization of drm::Device::Data fails, it calls drm::Device::release via drm_dev_put. This materializes a reference to &drm::Device, but it's not fully constructed yet, because initializing `data` failed. It should not be dropped either. Instead, if pinned initialization fails, make sure drm::Device::release isn't called. Fixes: 2e9fdbe5ec7a ("rust: drm: device: drop_in_place() the drm::Device in release()") Signed-off-by: Eliot Courtney Reviewed-by: Gary Guo Link: https://patch.msgid.link/20260501-fix-drm-1-v2-1-5c4f681837bc@nvidia.com Signed-off-by: Danilo Krummrich --- rust/kernel/drm/device.rs | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/rust/kernel/drm/device.rs b/rust/kernel/drm/device.rs index adbafe8db54d11..403fc35353c740 100644 --- a/rust/kernel/drm/device.rs +++ b/rust/kernel/drm/device.rs @@ -119,13 +119,20 @@ impl Device { // compatible `Layout`. let layout = Kmalloc::aligned_layout(Layout::new::()); + // Use a temporary vtable without a `release` callback until `data` is initialized, so + // init failure can release the DRM device without dropping uninitialized fields. + let alloc_vtable = bindings::drm_driver { + release: None, + ..Self::VTABLE + }; + // SAFETY: - // - `VTABLE`, as a `const` is pinned to the read-only section of the compilation, + // - `alloc_vtable` reference remains valid until no longer used, // - `dev` is valid by its type invarants, let raw_drm: *mut Self = unsafe { bindings::__drm_dev_alloc( dev.as_raw(), - &Self::VTABLE, + &alloc_vtable, layout.size(), mem::offset_of!(Self, dev), ) @@ -133,6 +140,10 @@ impl Device { .cast(); let raw_drm = NonNull::new(from_err_ptr(raw_drm)?).ok_or(ENOMEM)?; + // SAFETY: `raw_drm` is a valid pointer to `Self`, given that `__drm_dev_alloc` was + // successful. + let drm_dev = unsafe { Self::into_drm_device(raw_drm) }; + // SAFETY: `raw_drm` is a valid pointer to `Self`. let raw_data = unsafe { ptr::addr_of_mut!((*raw_drm.as_ptr()).data) }; @@ -140,15 +151,14 @@ impl Device { // - `raw_data` is a valid pointer to uninitialized memory. // - `raw_data` will not move until it is dropped. unsafe { data.__pinned_init(raw_data) }.inspect_err(|_| { - // SAFETY: `raw_drm` is a valid pointer to `Self`, given that `__drm_dev_alloc` was - // successful. - let drm_dev = unsafe { Self::into_drm_device(raw_drm) }; - // SAFETY: `__drm_dev_alloc()` was successful, hence `drm_dev` must be valid and the // refcount must be non-zero. unsafe { bindings::drm_dev_put(drm_dev) }; })?; + // SAFETY: `drm_dev` is still private to this function. + unsafe { (*drm_dev).driver = const { &Self::VTABLE } }; + // SAFETY: The reference count is one, and now we take ownership of that reference as a // `drm::Device`. Ok(unsafe { ARef::from_raw(raw_drm) }) From 464af6fc2b1dcc74005b7f58ee3812b17777efee Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 27 Apr 2026 14:25:40 +0200 Subject: [PATCH 235/972] KVM: x86: check for nEPT/nNPT in slow flush hypercalls Checking is_guest_mode(vcpu) is incorrect, because translate_nested_gpa() is only valid if an L2 guest is running *with nested EPT/NPT enabled*. Instead use the same condition as translate_nested_gpa() itself. Cc: stable@vger.kernel.org Reviewed-by: Sean Christopherson Fixes: aee738236dca ("KVM: x86: Prepare kvm_hv_flush_tlb() to handle L2's GPAs", 2022-11-18) Link: https://patch.msgid.link/20260503200905.106077-1-pbonzini@redhat.com/ Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 9b140bbdc1d83b..4438ecac9a89bb 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2040,7 +2040,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) * flush). Translate the address here so the memory can be uniformly * read with kvm_read_guest(). */ - if (!hc->fast && is_guest_mode(vcpu)) { + if (!hc->fast && mmu_is_nested(vcpu)) { hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL); if (unlikely(hc->ingpa == INVALID_GPA)) return HV_STATUS_INVALID_HYPERCALL_INPUT; From 33fd0ccd2590b470b65adcca288615ad3b5e3e06 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 3 May 2026 19:19:32 +0200 Subject: [PATCH 236/972] KVM: x86: Do IRR scan in __kvm_apic_update_irr even if PIR is empty Fall back to apic_find_highest_vector() when PID.ON is set but PIR turns out to be empty, to correctly report the highest pending interrupt from the existing IRR. In a nested VM stress test, the following WARNING fires in vmx_check_nested_events() when kvm_cpu_has_interrupt() reports a pending interrupt but the subsequent kvm_apic_has_interrupt() (which invokes vmx_sync_pir_to_irr() again) returns -1: WARNING: CPU: 99 PID: 57767 at arch/x86/kvm/vmx/nested.c:4449 vmx_check_nested_events+0x6bf/0x6e0 [kvm_intel] Call Trace: kvm_check_and_inject_events vcpu_enter_guest.constprop.0 vcpu_run kvm_arch_vcpu_ioctl_run kvm_vcpu_ioctl __x64_sys_ioctl do_syscall_64 entry_SYSCALL_64_after_hwframe The root cause is a race between vmx_sync_pir_to_irr() on the target vCPU and __vmx_deliver_posted_interrupt() on a sender vCPU. The sender performs two individually-atomic operations that are not a single transaction: 1. pi_test_and_set_pir(vector) -- sets the PIR bit 2. pi_test_and_set_on() -- sets PID.ON The following interleaving triggers the bug: Sender vCPU (IPI): Target vCPU (1st sync_pir_to_irr): B1: set PIR[vector] A1: pi_clear_on() A2: pi_harvest_pir() -> sees B1 bit A3: xchg() -> consumes bit, PIR=0 (1st sync returns correct max_irr) B2: set PID.ON = 1 Target vCPU (2nd sync_pir_to_irr): C1: pi_test_on() -> TRUE (from B2) C2: pi_clear_on() -> ON=0 C3: pi_harvest_pir() -> PIR empty C4: *max_irr = -1, early return IRR NOT SCANNED The interrupt is not lost (it resides in the IRR from the first sync and is recovered on the next vcpu_enter_guest() iteration), but the incorrect max_irr causes a spurious WARNING and a wasted L2 VM-Enter/VM-Exit cycle. Fixes: b41f8638b9d3 ("KVM: VMX: Isolate pure loads from atomic XCHG when processing PIR") Reported-by: Farrah Chen Analyzed-by: Chenyi Qiang Cc: stable@vger.kernel.org Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/kvm/20260428070349.1633238-1-chenyi.qiang@intel.com/T/ Link: https://patch.msgid.link/20260503201703.108231-2-pbonzini@redhat.com/ Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e3ec4d8607c19f..5ee14d6bc2883f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -669,12 +669,14 @@ bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) u32 irr_val, prev_irr_val; int max_updated_irr; + if (!pi_harvest_pir(pir, pir_vals)) { + *max_irr = apic_find_highest_vector(regs + APIC_IRR); + return false; + } + max_updated_irr = -1; *max_irr = -1; - if (!pi_harvest_pir(pir, pir_vals)) - return false; - for (i = vec = 0; i <= 7; i++, vec += 32) { u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10); From 0aec99f9bf0213f7910688472e1ccf517c0e8b5a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 28 Apr 2026 08:50:43 -0700 Subject: [PATCH 237/972] KVM: x86: Fix misleading variable names and add more comments for PIR=>IRR flow Rename kvm_apic_update_irr()'s "irr_updated" and vmx_sync_pir_to_irr()'s "got_posted_interrupt" to a more accurate "max_irr_is_from_pir", as neither "irr_updated" nor "got_posted_interrupt" is accurate. __kvm_apic_update_irr() and thus kvm_apic_update_irr() specifically return true if and only if the highest priority IRQ, i.e. max_irr, is a "new" pending IRQ from the PIR. I.e. it's possible for the IRR to be updated, i.e. for a posted IRQ to be "got", *without* the APIs returning true. Expand vmx_sync_pir_to_irr()'s comment to explain why it's necessary to set KVM_REQ_EVENT only if a "new" IRQ was found, and to explain why it's safe to do so only if a new IRQ is also the highest priority pending IRQ. No functional change intended. Signed-off-by: Sean Christopherson Link: https://patch.msgid.link/20260503201703.108231-3-pbonzini@redhat.com/ Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 16 ++++++++-------- arch/x86/kvm/vmx/vmx.c | 40 ++++++++++++++++++++++++++++++++-------- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5ee14d6bc2883f..4078e624ca6675 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -667,14 +667,14 @@ bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) u32 *__pir = (void *)pir_vals; u32 i, vec; u32 irr_val, prev_irr_val; - int max_updated_irr; + int max_new_irr; if (!pi_harvest_pir(pir, pir_vals)) { *max_irr = apic_find_highest_vector(regs + APIC_IRR); return false; } - max_updated_irr = -1; + max_new_irr = -1; *max_irr = -1; for (i = vec = 0; i <= 7; i++, vec += 32) { @@ -690,25 +690,25 @@ bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) !try_cmpxchg(p_irr, &prev_irr_val, irr_val)); if (prev_irr_val != irr_val) - max_updated_irr = __fls(irr_val ^ prev_irr_val) + vec; + max_new_irr = __fls(irr_val ^ prev_irr_val) + vec; } if (irr_val) *max_irr = __fls(irr_val) + vec; } - return ((max_updated_irr != -1) && - (max_updated_irr == *max_irr)); + return max_new_irr != -1 && max_new_irr == *max_irr; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_apic_update_irr); bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr) { struct kvm_lapic *apic = vcpu->arch.apic; - bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr); + bool max_irr_is_from_pir; - if (unlikely(!apic->apicv_active && irr_updated)) + max_irr_is_from_pir = __kvm_apic_update_irr(pir, apic->regs, max_irr); + if (unlikely(!apic->apicv_active && max_irr_is_from_pir)) apic->irr_pending = true; - return irr_updated; + return max_irr_is_from_pir; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apic_update_irr); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a29896a9ef1456..5c2c33a5f7dc9a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7029,8 +7029,8 @@ static void vmx_set_rvi(int vector) int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vt *vt = to_vt(vcpu); + bool max_irr_is_from_pir; int max_irr; - bool got_posted_interrupt; if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) return -EIO; @@ -7042,17 +7042,22 @@ int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); - got_posted_interrupt = - kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr); + max_irr_is_from_pir = kvm_apic_update_irr(vcpu, vt->pi_desc.pir, + &max_irr); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); - got_posted_interrupt = false; + max_irr_is_from_pir = false; } /* - * Newly recognized interrupts are injected via either virtual interrupt - * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is - * disabled in two cases: + * If APICv is enabled and L2 is not active, then update the Requesting + * Virtual Interrupt (RVI) portion of vmcs01.GUEST_INTR_STATUS with the + * highest priority IRR to deliver the IRQ via Virtual Interrupt + * Delivery. Note, this is required even if the highest priority IRQ + * was already pending in the IRR, as RVI isn't updated in lockstep with + * the IRR (unlike apic->irr_pending). + * + * For the cases where Virtual Interrupt Delivery can't be used: * * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a @@ -7063,10 +7068,29 @@ int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) * 2) If APICv is disabled for this vCPU, assigned devices may still * attempt to post interrupts. The posted interrupt vector will cause * a VM-Exit and the subsequent entry will call sync_pir_to_irr. + * + * In both cases, set KVM_REQ_EVENT if and only if the highest priority + * pending IRQ came from the PIR, as setting KVM_REQ_EVENT if any IRQ + * is pending may put the vCPU into an infinite loop, e.g. if the IRQ + * is blocked, then it will stay pending until an IRQ window is opened. + * + * Note! It's possible that one or more IRQs were moved from the PIR + * to the IRR _without_ max_irr_is_from_pir being true! I.e. if there + * was a higher priority IRQ already pending in the IRR. Not setting + * KVM_REQ_EVENT in this case is intentional and safe. If APICv is + * inactive, or L2 is running with exit-on-interrupt off (in vmcs12), + * i.e. without nested virtual interrupt delivery, then there's no need + * to request an IRQ window as the lower priority IRQ only needs to be + * delivered when the higher priority IRQ is dismissed from the ISR, + * i.e. on the next EOI, and EOIs are always intercepted if APICv is + * disabled or if L2 is running without nested VID. If L2 is running + * exit-on-interrupt on (in vmcs12), then the higher priority IRQ will + * trigger a nested VM-Exit, at which point KVM will re-evaluate L1's + * pending IRQs. */ if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) vmx_set_rvi(max_irr); - else if (got_posted_interrupt) + else if (max_irr_is_from_pir) kvm_make_request(KVM_REQ_EVENT, vcpu); return max_irr; From 0cb2af2ea66ad8ff195c156ea690f11216285bdf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 15 Apr 2026 00:52:34 +0000 Subject: [PATCH 238/972] KVM: x86: Fix shadow paging use-after-free due to unexpected GFN The shadow MMU computes GFNs for direct shadow pages using sp->gfn plus the SPTE index. This assumption breaks for shadow paging if the guest page tables are modified between VM entries (similar to commit aad885e77496, "KVM: x86/mmu: Drop/zap existing present SPTE even when creating an MMIO SPTE", 2026-03-27). The flow is as follows: - a PDE is installed for a 2MB mapping, and a page in that area is accessed. KVM creates a kvm_mmu_page consisting of 512 4KB pages; the kvm_mmu_page is marked by FNAME(fetch) as direct-mapped because the guest's mapping is a huge page (and thus contiguous). - the PDE mapping is changed from outside the guest. - the guest accesses another page in the same 2MB area. KVM installs a new leaf SPTE and rmap entry; the SPTE uses the "correct" GFN (i.e. based on the new mapping, as changed in the previous step) but that GFN is outside of the [sp->gfn, sp->gfn + 511] range; therefore the rmap entry cannot be found and removed when the kvm_mmu_page is zapped. - the memslot that covers the first 2MB mapping is deleted, and the kvm_mmu_page for the now-invalid GPA is zapped. However, rmap_remove() only looks at the [sp->gfn, sp->gfn + 511] range established in step 1, and fails to find the rmap entry that was recorded by step 3. - any operation that causes an rmap walk for the same page accessed by step 3 then walks a stale rmap and dereferences a freed kvm_mmu_page. This includes dirty logging or MMU notifier invalidations (e.g., from MADV_DONTNEED). The underlying issue is that KVM's walking of shadow PTEs assumes that if a SPTE is present when KVM wants to install a non-leaf SPTE, then the existing kvm_mmu_page must be for the correct gfn. Because the only way for the gfn to be wrong is if KVM messed up and failed to zap a SPTE... which shouldn't happen, but *actually* only happens in response to a guest write. That bug dates back literally forever, as even the first version of KVM assumes that the GFN matches and walks into the "wrong" shadow page. However, that was only an imprecision until 2032a93d66fa ("KVM: MMU: Don't allocate gfns page for direct mmu pages") came along. Fix it by checking for a target gfn mismatch and zapping the existing SPTE. That way the old SP and rmap entries are gone, KVM installs the rmap in the right location, and everyone is happy. Fixes: 2032a93d66fa ("KVM: MMU: Don't allocate gfns page for direct mmu pages") Fixes: 6aa8b732ca01 ("kvm: userspace interface") Reported-by: Alexander Bulekov Reported-by: Fred Griffoul Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Link: https://patch.msgid.link/20260503201029.106481-1-pbonzini@redhat.com/ Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 24fbc9ea502a30..892246204435c5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -182,6 +182,8 @@ static struct kmem_cache *pte_list_desc_cache; struct kmem_cache *mmu_page_header_cache; static void mmu_spte_set(u64 *sptep, u64 spte); +static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, + u64 *spte, struct list_head *invalid_list); struct kvm_mmu_role_regs { const unsigned long cr0; @@ -1287,19 +1289,6 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } -static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) -{ - struct kvm_mmu_page *sp; - - sp = sptep_to_sp(sptep); - WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K); - - drop_spte(kvm, sptep); - - if (flush) - kvm_flush_remote_tlbs_sptep(kvm, sptep); -} - /* * Write-protect on the specified @sptep, @pt_protect indicates whether * spte write-protection is caused by protecting shadow page table. @@ -2466,7 +2455,8 @@ static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, { union kvm_mmu_page_role role; - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep) && + spte_to_child_sp(*sptep) && spte_to_child_sp(*sptep)->gfn == gfn) return ERR_PTR(-EEXIST); role = kvm_mmu_child_role(sptep, direct, access); @@ -2544,13 +2534,16 @@ static void __link_shadow_page(struct kvm *kvm, BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); - /* - * If an SPTE is present already, it must be a leaf and therefore - * a large one. Drop it, and flush the TLB if needed, before - * installing sp. - */ - if (is_shadow_present_pte(*sptep)) - drop_large_spte(kvm, sptep, flush); + if (is_shadow_present_pte(*sptep)) { + struct kvm_mmu_page *parent_sp; + LIST_HEAD(invalid_list); + + parent_sp = sptep_to_sp(sptep); + WARN_ON_ONCE(parent_sp->role.level == PG_LEVEL_4K); + + mmu_page_zap_pte(kvm, parent_sp, sptep, &invalid_list); + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, true); + } spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); From 4808e5cc4fe1f40c4d690d0ce5df0f413fb02938 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Mon, 4 May 2026 09:00:00 +0800 Subject: [PATCH 239/972] LoongArch: Make CONFIG_64BIT as the default option CONFIG_64BIT is the mandatory option before v7.0, but in v7.1-rc1 both CONFIG_32BIT and CONFIG_64BIT are selectable and CONFIG_32BIT became the default option. This breaks existing configurations, so explicitly make CONFIG_64BIT as the default option to keep existing behavior. Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 3b042dbb2c4123..606597da46b8df 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -220,6 +220,7 @@ menu "Kernel type and options" choice prompt "Kernel type" + default 64BIT # Keep existing behavior config 32BIT bool "32-bit kernel" From 5643c6b2c8308b206cb01cbfd0e6ac80f9f1bc9a Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Mon, 4 May 2026 09:00:01 +0800 Subject: [PATCH 240/972] LoongArch: Specify -m32/-m64 explicitly for 32BIT/64BIT Clang/LLVM build needs -m32/-m64 to switch triple variants (i.e. the --target=xxx parameter). Otherwise we get build errors for CONFIG_32BIT. GCC doesn't support -m32/-m64 now, but maybe support in future, so use cc-option to specify them. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202604232041.ESJDwVG4-lkp@intel.com/ Suggested-by: Nathan Chancellor Signed-off-by: Huacai Chen --- arch/loongarch/Makefile | 2 ++ arch/loongarch/vdso/Makefile | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index 47516aeea9d29f..54fcfa1eac1f89 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -55,9 +55,11 @@ endif ifdef CONFIG_32BIT tool-archpref = $(32bit-tool-archpref) UTS_MACHINE := loongarch32 +cflags-y += $(call cc-option,-m32) else tool-archpref = $(64bit-tool-archpref) UTS_MACHINE := loongarch64 +cflags-y += $(call cc-option,-m64) endif ifneq ($(SUBARCH),$(ARCH)) diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile index 42aa96249828c9..9c9181bb40715b 100644 --- a/arch/loongarch/vdso/Makefile +++ b/arch/loongarch/vdso/Makefile @@ -12,6 +12,8 @@ obj-vdso-$(CONFIG_GENERIC_GETTIMEOFDAY) += vgettimeofday.o ccflags-vdso := \ $(filter -I%,$(KBUILD_CFLAGS)) \ $(filter -E%,$(KBUILD_CFLAGS)) \ + $(filter -m32,$(KBUILD_CFLAGS)) \ + $(filter -m64,$(KBUILD_CFLAGS)) \ $(filter -march=%,$(KBUILD_CFLAGS)) \ $(filter -m%-float,$(KBUILD_CFLAGS)) \ $(CLANG_FLAGS) \ From 98b8aebb14fdc0133939fd8fe07d0d98333dc976 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Mon, 4 May 2026 09:00:01 +0800 Subject: [PATCH 241/972] LoongArch: Fix SYM_SIGFUNC_START definition for 32BIT The SYM_SIGFUNC_START definition should match sigcontext that the length of GPRs are 8 bytes for both 32BIT and 64BIT. So replace SZREG with 8 to fix it. Cc: stable@vger.kernel.org Fixes: e4878c37f6679fde ("LoongArch: vDSO: Emit GNU_EH_FRAME correctly") Suggested-by: Xi Ruoyao Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/linkage.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/include/asm/linkage.h b/arch/loongarch/include/asm/linkage.h index a1bd6a3ee03a19..ae937d1708b247 100644 --- a/arch/loongarch/include/asm/linkage.h +++ b/arch/loongarch/include/asm/linkage.h @@ -69,7 +69,7 @@ 9, 10, 11, 12, 13, 14, 15, 16, \ 17, 18, 19, 20, 21, 22, 23, 24, \ 25, 26, 27, 28, 29, 30, 31; \ - .cfi_offset \num, SC_REGS + \num * SZREG; \ + .cfi_offset \num, SC_REGS + \num * 8; \ .endr; \ \ nop; \ From 49f33840dcc907d21313d369e34872880846b61c Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Mon, 4 May 2026 09:00:20 +0800 Subject: [PATCH 242/972] LoongArch: Use per-root-bridge PCIH flag to skip mem resource fixup When firmware enables 64-bit PCI host bridge support, some root bridges already provide valid 64-bit mem resource windows through ACPI. In this case, the LoongArch-specific mem resource high-bits fixup in acpi_prepare_root_resources() should not be applied unconditionally. Otherwise, the kernel may override the native resource layout derived from firmware, and later BAR assignment can fail to place device BARs into the intended 64-bit address space correctly. Add a per-root-bridge ACPI flag, PCIH, and evaluate it from the current root bridge device scope. When PCIH is set, skip the mem resource high- bits fixup path and let the kernel use the firmware-provided resource description directly. When PCIH is absent or cleared, keep the existing behavior and continue filling the high address bits from the host bridge address. This makes the behavior per-root-bridge configurable and avoids breaking valid 64-bit BAR space allocation on bridges whose 64-bit windows have already been fully described by firmware. Cc: stable@vger.kernel.org Suggested-by: Chao Li Tested-by: Dongyan Qian Signed-off-by: Dongyan Qian Signed-off-by: Huacai Chen --- arch/loongarch/pci/acpi.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/loongarch/pci/acpi.c b/arch/loongarch/pci/acpi.c index 0dde3ddcd54436..b02698a338eefe 100644 --- a/arch/loongarch/pci/acpi.c +++ b/arch/loongarch/pci/acpi.c @@ -61,11 +61,16 @@ static void acpi_release_root_info(struct acpi_pci_root_info *ci) static int acpi_prepare_root_resources(struct acpi_pci_root_info *ci) { int status; + unsigned long long pci_h = 0; struct resource_entry *entry, *tmp; struct acpi_device *device = ci->bridge; status = acpi_pci_probe_root_resources(ci); if (status > 0) { + acpi_evaluate_integer(device->handle, "PCIH", NULL, &pci_h); + if (pci_h) + return status; + resource_list_for_each_entry_safe(entry, tmp, &ci->resources) { if (entry->res->flags & IORESOURCE_MEM) { entry->offset = ci->root->mcfg_addr & GENMASK_ULL(63, 40); From 8dfa2f8780e486d05b9a0ffce70b8f5fbd62053e Mon Sep 17 00:00:00 2001 From: Wentao Guan Date: Mon, 4 May 2026 09:00:20 +0800 Subject: [PATCH 243/972] LoongArch: Fix potential ADE in loongson_gpu_fixup_dma_hang() The switch case in loongson_gpu_fixup_dma_hang() may not DC2 or DC3, and readl(crtc_reg) will access with random address, because the "device" is from "base+PCI_DEVICE_ID", "base" is from "pdev->devfn+1". This is wrong when my platform inserts a discrete GPU: lspci -tv -[0000:00]-+-00.0 Loongson Technology LLC Hyper Transport Bridge Controller ... +-06.0 Loongson Technology LLC LG100 GPU +-06.2 Loongson Technology LLC Device 7a37 ... Add a default switch case to fix the panic as below: Kernel ade access[#1]: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.6.136-loong64-desktop-hwe+ #4 pc 90000000017e5534 ra 90000000017e54c0 tp 90000001002f8000 sp 90000001002fb6c0 a0 80000efe00003100 a1 0000000000003100 a2 0000000000000000 a3 0000000000000002 a4 90000001002fb6b4 a5 900000087cdb58fd a6 90000000027af000 a7 0000000000000001 t0 00000000000085b9 t1 000000000000ffff t2 0000000000000000 t3 0000000000000000 t4 fffffffffffffffd t5 00000000fffb6d9c t6 0000000000083b00 t7 00000000000070c0 t8 900000087cdb4d94 u0 900000087cdb58fd s9 90000001002fb826 s0 90000000031c12c8 s1 7fffffffffffff00 s2 90000000031c12d0 s3 0000000000002710 s4 0000000000000000 s5 0000000000000000 s6 9000000100053000 s7 7fffffffffffff00 s8 90000000030d4000 ra: 90000000017e54c0 loongson_gpu_fixup_dma_hang+0x40/0x210 ERA: 90000000017e5534 loongson_gpu_fixup_dma_hang+0xb4/0x210 CRMD: 000000b0 (PLV0 -IE -DA +PG DACF=CC DACM=CC -WE) PRMD: 00000004 (PPLV0 +PIE -PWE) EUEN: 00000000 (-FPE -SXE -ASXE -BTE) ECFG: 00071c1d (LIE=0,2-4,10-12 VS=7) ESTAT: 00480000 [ADEM] (IS= ECode=8 EsubCode=1) BADV: 7fffffffffffff00 PRID: 0014d000 (Loongson-64bit, Loongson-3A6000-HV) Modules linked in: Process swapper/0 (pid: 1, threadinfo=(____ptrval____), task=(____ptrval____)) Stack : 0000000000000006 90000001002fb778 90000001002fb704 0000000000000007 0000000016a65700 90000000017e5690 000000000000ffff ffffffffffffffff 900000000209f7c0 9000000100053000 900000000209f7a8 9000000000eebc08 0000000000000000 0000000000000000 0000000000000006 90000001002fb778 90000001000530b8 90000000027af000 0000000000000000 9000000100054000 9000000100053000 9000000000ebb70c 9000000100004c00 9000000004000001 90000001002fb7e4 bae765461f31cb12 0000000000000000 0000000000000000 0000000000000006 90000000027af000 0000000000000030 90000000027af000 900000087cd6f800 9000000100053000 0000000000000000 9000000000ebc560 7a2500147cdaf720 bae765461f31cb12 0000000000000001 0000000000000030 ... Call Trace: [<90000000017e5534>] loongson_gpu_fixup_dma_hang+0xb4/0x210 [<9000000000eebc08>] pci_fixup_device+0x108/0x280 [<9000000000ebb70c>] pci_setup_device+0x24c/0x690 [<9000000000ebc560>] pci_scan_single_device+0xe0/0x140 [<9000000000ebc684>] pci_scan_slot+0xc4/0x280 [<9000000000ebdd00>] pci_scan_child_bus_extend+0x60/0x3f0 [<9000000000f5bc94>] acpi_pci_root_create+0x2b4/0x420 [<90000000017e5e74>] pci_acpi_scan_root+0x2d4/0x440 [<9000000000f5b02c>] acpi_pci_root_add+0x21c/0x3a0 [<9000000000f4ee54>] acpi_bus_attach+0x1a4/0x3c0 [<90000000010e200c>] device_for_each_child+0x6c/0xe0 [<9000000000f4bbf4>] acpi_dev_for_each_child+0x44/0x70 [<9000000000f4ef40>] acpi_bus_attach+0x290/0x3c0 [<90000000010e200c>] device_for_each_child+0x6c/0xe0 [<9000000000f4bbf4>] acpi_dev_for_each_child+0x44/0x70 [<9000000000f4ef40>] acpi_bus_attach+0x290/0x3c0 [<9000000000f5211c>] acpi_bus_scan+0x6c/0x280 [<900000000189c028>] acpi_scan_init+0x194/0x310 [<900000000189bc6c>] acpi_init+0xcc/0x140 [<9000000000220cdc>] do_one_initcall+0x4c/0x310 [<90000000018618fc>] kernel_init_freeable+0x258/0x2d4 [<900000000184326c>] kernel_init+0x28/0x13c [<9000000000222008>] ret_from_kernel_thread+0xc/0xa4 Cc: stable@vger.kernel.org Fixes: 95db0c9f526d ("LoongArch: Workaround LS2K/LS7A GPU DMA hang bug") Link: https://gist.github.com/opsiff/ebf2dac51b4013d22462f2124c55f807 Link: https://gist.github.com/opsiff/a62f2a73db0492b3c49bf223a339b133 Signed-off-by: Wentao Guan Signed-off-by: Huacai Chen --- arch/loongarch/pci/pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/loongarch/pci/pci.c b/arch/loongarch/pci/pci.c index d233ea2218fe0a..f33c7ea1443d94 100644 --- a/arch/loongarch/pci/pci.c +++ b/arch/loongarch/pci/pci.c @@ -132,6 +132,9 @@ static void loongson_gpu_fixup_dma_hang(struct pci_dev *pdev, bool on) crtc_reg = regbase; crtc_offset = 0x400; break; + default: + iounmap(regbase); + return; } for (i = 0; i < CRTC_NUM_MAX; i++, crtc_reg += crtc_offset) { From 7e2c41bc62e436f465ee1ff7ebc14e35c99d95fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 4 May 2026 09:00:20 +0800 Subject: [PATCH 244/972] LoongArch: vDSO: Drop custom __arch_vdso_hres_capable() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The custom definition is identical to the generic fallback one. So remove it. Signed-off-by: Thomas Weißschuh Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/vdso/gettimeofday.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/loongarch/include/asm/vdso/gettimeofday.h b/arch/loongarch/include/asm/vdso/gettimeofday.h index bae76767c693a9..18ba403e1ed944 100644 --- a/arch/loongarch/include/asm/vdso/gettimeofday.h +++ b/arch/loongarch/include/asm/vdso/gettimeofday.h @@ -85,12 +85,6 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, return count; } -static inline bool loongarch_vdso_hres_capable(void) -{ - return true; -} -#define __arch_vdso_hres_capable loongarch_vdso_hres_capable - #endif /* CONFIG_GENERIC_GETTIMEOFDAY */ #endif /* !__ASSEMBLER__ */ From 5203012fa6045aac4b69d4e7c212e16dcf38ef10 Mon Sep 17 00:00:00 2001 From: Xianglai Li Date: Mon, 4 May 2026 09:00:37 +0800 Subject: [PATCH 245/972] LoongArch: KVM: Compile switch.S directly into the kernel If we directly compile the switch.S file into the kernel, the address of the kvm_exc_entry function will definitely be within the DMW memory area. Therefore, we will no longer need to perform a copy relocation of the kvm_exc_entry. So this patch compiles switch.S directly into the kernel, and then remove the copy relocation execution logic for the kvm_exc_entry function. Cc: stable@vger.kernel.org Signed-off-by: Xianglai Li Signed-off-by: Huacai Chen --- arch/loongarch/Kbuild | 2 +- arch/loongarch/include/asm/asm-prototypes.h | 20 ++++++++++++ arch/loongarch/include/asm/kvm_host.h | 3 -- arch/loongarch/kvm/Makefile | 3 +- arch/loongarch/kvm/main.c | 35 ++------------------- arch/loongarch/kvm/switch.S | 20 +++++++++--- 6 files changed, 41 insertions(+), 42 deletions(-) diff --git a/arch/loongarch/Kbuild b/arch/loongarch/Kbuild index beb8499dd8ed84..1c7a0dbe5e72f2 100644 --- a/arch/loongarch/Kbuild +++ b/arch/loongarch/Kbuild @@ -3,7 +3,7 @@ obj-y += mm/ obj-y += net/ obj-y += vdso/ -obj-$(CONFIG_KVM) += kvm/ +obj-$(subst m,y,$(CONFIG_KVM)) += kvm/ # for cleaning subdir- += boot diff --git a/arch/loongarch/include/asm/asm-prototypes.h b/arch/loongarch/include/asm/asm-prototypes.h index 704066b4f7368b..de0c17f3f49c2c 100644 --- a/arch/loongarch/include/asm/asm-prototypes.h +++ b/arch/loongarch/include/asm/asm-prototypes.h @@ -20,3 +20,23 @@ asmlinkage void noinstr __no_stack_protector ret_from_kernel_thread(struct task_ struct pt_regs *regs, int (*fn)(void *), void *fn_arg); + +struct kvm_run; +struct kvm_vcpu; +struct loongarch_fpu; + +void kvm_exc_entry(void); +int kvm_enter_guest(struct kvm_run *run, struct kvm_vcpu *vcpu); + +void kvm_save_fpu(struct loongarch_fpu *fpu); +void kvm_restore_fpu(struct loongarch_fpu *fpu); + +#ifdef CONFIG_CPU_HAS_LSX +void kvm_save_lsx(struct loongarch_fpu *fpu); +void kvm_restore_lsx(struct loongarch_fpu *fpu); +#endif + +#ifdef CONFIG_CPU_HAS_LASX +void kvm_save_lasx(struct loongarch_fpu *fpu); +void kvm_restore_lasx(struct loongarch_fpu *fpu); +#endif diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index 130cedbb6b39b3..776bc487a70527 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -87,7 +87,6 @@ struct kvm_context { struct kvm_world_switch { int (*exc_entry)(void); int (*enter_guest)(struct kvm_run *run, struct kvm_vcpu *vcpu); - unsigned long page_order; }; #define MAX_PGTABLE_LEVELS 4 @@ -359,8 +358,6 @@ void kvm_exc_entry(void); int kvm_enter_guest(struct kvm_run *run, struct kvm_vcpu *vcpu); extern unsigned long vpid_mask; -extern const unsigned long kvm_exception_size; -extern const unsigned long kvm_enter_guest_size; extern struct kvm_world_switch *kvm_loongarch_ops; #define SW_GCSR (1 << 0) diff --git a/arch/loongarch/kvm/Makefile b/arch/loongarch/kvm/Makefile index ae469edec99c16..a4d044da3aa7c2 100644 --- a/arch/loongarch/kvm/Makefile +++ b/arch/loongarch/kvm/Makefile @@ -7,11 +7,12 @@ include $(srctree)/virt/kvm/Makefile.kvm obj-$(CONFIG_KVM) += kvm.o +obj-y += switch.o + kvm-y += exit.o kvm-y += interrupt.o kvm-y += main.o kvm-y += mmu.o -kvm-y += switch.o kvm-y += timer.o kvm-y += tlb.o kvm-y += vcpu.o diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c index 76ebff2faeddc9..f105a86143f5ba 100644 --- a/arch/loongarch/kvm/main.c +++ b/arch/loongarch/kvm/main.c @@ -348,8 +348,7 @@ void kvm_arch_disable_virtualization_cpu(void) static int kvm_loongarch_env_init(void) { - int cpu, order, ret; - void *addr; + int cpu, ret; struct kvm_context *context; vmcs = alloc_percpu(struct kvm_context); @@ -365,30 +364,8 @@ static int kvm_loongarch_env_init(void) return -ENOMEM; } - /* - * PGD register is shared between root kernel and kvm hypervisor. - * So world switch entry should be in DMW area rather than TLB area - * to avoid page fault reenter. - * - * In future if hardware pagetable walking is supported, we won't - * need to copy world switch code to DMW area. - */ - order = get_order(kvm_exception_size + kvm_enter_guest_size); - addr = (void *)__get_free_pages(GFP_KERNEL, order); - if (!addr) { - free_percpu(vmcs); - vmcs = NULL; - kfree(kvm_loongarch_ops); - kvm_loongarch_ops = NULL; - return -ENOMEM; - } - - memcpy(addr, kvm_exc_entry, kvm_exception_size); - memcpy(addr + kvm_exception_size, kvm_enter_guest, kvm_enter_guest_size); - flush_icache_range((unsigned long)addr, (unsigned long)addr + kvm_exception_size + kvm_enter_guest_size); - kvm_loongarch_ops->exc_entry = addr; - kvm_loongarch_ops->enter_guest = addr + kvm_exception_size; - kvm_loongarch_ops->page_order = order; + kvm_loongarch_ops->exc_entry = (void *)kvm_exc_entry; + kvm_loongarch_ops->enter_guest = (void *)kvm_enter_guest; vpid_mask = read_csr_gstat(); vpid_mask = (vpid_mask & CSR_GSTAT_GIDBIT) >> CSR_GSTAT_GIDBIT_SHIFT; @@ -428,16 +405,10 @@ static int kvm_loongarch_env_init(void) static void kvm_loongarch_env_exit(void) { - unsigned long addr; - if (vmcs) free_percpu(vmcs); if (kvm_loongarch_ops) { - if (kvm_loongarch_ops->exc_entry) { - addr = (unsigned long)kvm_loongarch_ops->exc_entry; - free_pages(addr, kvm_loongarch_ops->page_order); - } kfree(kvm_loongarch_ops); } diff --git a/arch/loongarch/kvm/switch.S b/arch/loongarch/kvm/switch.S index f1768b7a619497..1d3ba7190154dc 100644 --- a/arch/loongarch/kvm/switch.S +++ b/arch/loongarch/kvm/switch.S @@ -4,9 +4,11 @@ */ #include +#include #include #include #include +#include #include #include @@ -100,8 +102,13 @@ * - is still in guest mode, such as pgd table/vmid registers etc, * - will fix with hw page walk enabled in future * load kvm_vcpu from reserved CSR KVM_VCPU_KS, and save a2 to KVM_TEMP_KS + * + * PGD register is shared between root kernel and kvm hypervisor. + * So world switch entry should be in DMW area rather than TLB area + * to avoid page fault re-enter. */ .text + .p2align PAGE_SHIFT .cfi_sections .debug_frame SYM_CODE_START(kvm_exc_entry) UNWIND_HINT_UNDEFINED @@ -190,8 +197,8 @@ ret_to_host: kvm_restore_host_gpr a2 jr ra -SYM_INNER_LABEL(kvm_exc_entry_end, SYM_L_LOCAL) SYM_CODE_END(kvm_exc_entry) +EXPORT_SYMBOL_FOR_KVM(kvm_exc_entry) /* * int kvm_enter_guest(struct kvm_run *run, struct kvm_vcpu *vcpu) @@ -215,8 +222,8 @@ SYM_FUNC_START(kvm_enter_guest) /* Save kvm_vcpu to kscratch */ csrwr a1, KVM_VCPU_KS kvm_switch_to_guest -SYM_INNER_LABEL(kvm_enter_guest_end, SYM_L_LOCAL) SYM_FUNC_END(kvm_enter_guest) +EXPORT_SYMBOL_FOR_KVM(kvm_enter_guest) SYM_FUNC_START(kvm_save_fpu) fpu_save_csr a0 t1 @@ -224,6 +231,7 @@ SYM_FUNC_START(kvm_save_fpu) fpu_save_cc a0 t1 t2 jr ra SYM_FUNC_END(kvm_save_fpu) +EXPORT_SYMBOL_FOR_KVM(kvm_save_fpu) SYM_FUNC_START(kvm_restore_fpu) fpu_restore_double a0 t1 @@ -231,6 +239,7 @@ SYM_FUNC_START(kvm_restore_fpu) fpu_restore_cc a0 t1 t2 jr ra SYM_FUNC_END(kvm_restore_fpu) +EXPORT_SYMBOL_FOR_KVM(kvm_restore_fpu) #ifdef CONFIG_CPU_HAS_LSX SYM_FUNC_START(kvm_save_lsx) @@ -239,6 +248,7 @@ SYM_FUNC_START(kvm_save_lsx) lsx_save_data a0 t1 jr ra SYM_FUNC_END(kvm_save_lsx) +EXPORT_SYMBOL_FOR_KVM(kvm_save_lsx) SYM_FUNC_START(kvm_restore_lsx) lsx_restore_data a0 t1 @@ -246,6 +256,7 @@ SYM_FUNC_START(kvm_restore_lsx) fpu_restore_csr a0 t1 t2 jr ra SYM_FUNC_END(kvm_restore_lsx) +EXPORT_SYMBOL_FOR_KVM(kvm_restore_lsx) #endif #ifdef CONFIG_CPU_HAS_LASX @@ -255,6 +266,7 @@ SYM_FUNC_START(kvm_save_lasx) lasx_save_data a0 t1 jr ra SYM_FUNC_END(kvm_save_lasx) +EXPORT_SYMBOL_FOR_KVM(kvm_save_lasx) SYM_FUNC_START(kvm_restore_lasx) lasx_restore_data a0 t1 @@ -262,10 +274,8 @@ SYM_FUNC_START(kvm_restore_lasx) fpu_restore_csr a0 t1 t2 jr ra SYM_FUNC_END(kvm_restore_lasx) +EXPORT_SYMBOL_FOR_KVM(kvm_restore_lasx) #endif - .section ".rodata" -SYM_DATA(kvm_exception_size, .quad kvm_exc_entry_end - kvm_exc_entry) -SYM_DATA(kvm_enter_guest_size, .quad kvm_enter_guest_end - kvm_enter_guest) #ifdef CONFIG_CPU_HAS_LBT STACK_FRAME_NON_STANDARD kvm_restore_fpu From b323a441da602dfdfc24f30d3190cac786ffebf2 Mon Sep 17 00:00:00 2001 From: Xianglai Li Date: Mon, 4 May 2026 09:00:37 +0800 Subject: [PATCH 246/972] LoongArch: KVM: Fix "unreliable stack" for kvm_exc_entry Insert the appropriate UNWIND hint into the kvm_exc_entry assembly function to guide the generation of correct ORC table entries, thereby solving the timeout problem ("unreliable stack") while loading the livepatch-sample module on a physical machine running virtual machines with multiple vcpus. Cc: stable@vger.kernel.org Signed-off-by: Xianglai Li Signed-off-by: Huacai Chen --- arch/loongarch/kvm/switch.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/kvm/switch.S b/arch/loongarch/kvm/switch.S index 1d3ba7190154dc..936e4ae3e40859 100644 --- a/arch/loongarch/kvm/switch.S +++ b/arch/loongarch/kvm/switch.S @@ -111,7 +111,7 @@ .p2align PAGE_SHIFT .cfi_sections .debug_frame SYM_CODE_START(kvm_exc_entry) - UNWIND_HINT_UNDEFINED + UNWIND_HINT_END_OF_STACK csrwr a2, KVM_TEMP_KS csrrd a2, KVM_VCPU_KS addi.d a2, a2, KVM_VCPU_ARCH From b3e31a6650d4cab63f0814c37c0b360372c6ee9e Mon Sep 17 00:00:00 2001 From: Qiang Ma Date: Mon, 4 May 2026 09:00:37 +0800 Subject: [PATCH 247/972] LoongArch: KVM: Cap KVM_CAP_NR_VCPUS by KVM_CAP_MAX_VCPUS It doesn't make sense to return the recommended maximum number of vCPUs which exceeds the maximum possible number of vCPUs. Other architectures have already done this, such as commit 57a2e13ebdda ("KVM: MIPS: Cap KVM_CAP_NR_VCPUS by KVM_CAP_MAX_VCPUS") Cc: stable@vger.kernel.org Reviewed-by: Bibo Mao Signed-off-by: Qiang Ma Signed-off-by: Huacai Chen --- arch/loongarch/kvm/vm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/kvm/vm.c b/arch/loongarch/kvm/vm.c index 8cc5ee1c53efbe..1317c718f896af 100644 --- a/arch/loongarch/kvm/vm.c +++ b/arch/loongarch/kvm/vm.c @@ -125,7 +125,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; case KVM_CAP_NR_VCPUS: - r = num_online_cpus(); + r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS); break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; From f26faae96c411a70641e4d21b759475caa6122d5 Mon Sep 17 00:00:00 2001 From: Tao Cui Date: Mon, 4 May 2026 09:00:38 +0800 Subject: [PATCH 248/972] LoongArch: KVM: Fix missing EMULATE_FAIL in kvm_emu_mmio_read() In the ldptr (0x24...0x27) opcode decoding path, the default case only breaks out but without setting "ret" value to EMULATE_FAIL. This leaves run->mmio.len uninitialized (stale from a previous MMIO operation) while "ret" value remains EMULATE_DO_MMIO, causing the code to proceed with an incorrect MMIO length. Add "ret = EMULATE_FAIL" to match the other default branches in the same function (e.g. the 0x28...0x2e and 0x38 cases). Cc: stable@vger.kernel.org Reviewed-by: Bibo Mao Signed-off-by: Tao Cui Signed-off-by: Huacai Chen --- arch/loongarch/kvm/exit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index da0ad89f2eb746..3b95cd0f989b08 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -390,6 +390,7 @@ int kvm_emu_mmio_read(struct kvm_vcpu *vcpu, larch_inst inst) run->mmio.len = 8; break; default: + ret = EMULATE_FAIL; break; } break; From 81e18777d61440511451866c7c80b34a8bdd6b33 Mon Sep 17 00:00:00 2001 From: Tao Cui Date: Mon, 4 May 2026 09:00:38 +0800 Subject: [PATCH 249/972] LoongArch: KVM: Use kvm_set_pte() in kvm_flush_pte() kvm_flush_pte() is the only caller that directly assigns *pte instead of using the kvm_set_pte() wrapper. Use the wrapper for consistency with the rest of the file. No functional change intended. Cc: stable@vger.kernel.org Reviewed-by: Bibo Mao Signed-off-by: Tao Cui Signed-off-by: Huacai Chen --- arch/loongarch/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index a7fa458e33605e..e104897aa53285 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -95,7 +95,7 @@ static int kvm_flush_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx) else kvm->stat.pages--; - *pte = ctx->invalid_entry; + kvm_set_pte(pte, ctx->invalid_entry); return 1; } From 6debfff78584f0adedf7355fe5263198a3fc6b19 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Mon, 4 May 2026 09:00:48 +0800 Subject: [PATCH 250/972] LoongArch: KVM: Move AVEC interrupt injection into switch loop When AVEC interrupt controller is emulated in user space, AVEC interrupt is injected by software like SIP0/SIP1/TI/IPI interrupts. Here also move the AVEC interrupt injection in switch loop. Cc: stable@vger.kernel.org Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/interrupt.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/loongarch/kvm/interrupt.c b/arch/loongarch/kvm/interrupt.c index 32930959f7c256..53dac8ab8fb130 100644 --- a/arch/loongarch/kvm/interrupt.c +++ b/arch/loongarch/kvm/interrupt.c @@ -33,13 +33,12 @@ static int kvm_irq_deliver(struct kvm_vcpu *vcpu, unsigned int priority) if (priority < EXCCODE_INT_NUM) irq = priority_to_irq[priority]; - if (kvm_guest_has_msgint(&vcpu->arch) && (priority == INT_AVEC)) { - dmsintc_inject_irq(vcpu); - set_gcsr_estat(irq); - return 1; - } - switch (priority) { + case INT_AVEC: + if (!kvm_guest_has_msgint(&vcpu->arch)) + break; + dmsintc_inject_irq(vcpu); + fallthrough; case INT_TI: case INT_IPI: case INT_SWI0: @@ -66,12 +65,11 @@ static int kvm_irq_clear(struct kvm_vcpu *vcpu, unsigned int priority) if (priority < EXCCODE_INT_NUM) irq = priority_to_irq[priority]; - if (kvm_guest_has_msgint(&vcpu->arch) && (priority == INT_AVEC)) { - clear_gcsr_estat(irq); - return 1; - } - switch (priority) { + case INT_AVEC: + if (!kvm_guest_has_msgint(&vcpu->arch)) + break; + fallthrough; case INT_TI: case INT_IPI: case INT_SWI0: From 2433f3f5724b3af569d9fb411ba728629524738b Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Mon, 4 May 2026 09:00:48 +0800 Subject: [PATCH 251/972] LoongArch: KVM: Fix HW timer interrupt lost when inject interrupt by software With passthrough HW timer, timer interrupt is injected by HW. When inject emulated CPU interrupt by software such SIP0/SIP1/IPI, HW timer interrupt may be lost. Here check whether there is timer tick value inversion before and after injecting emulated CPU interrupt by software, timer enabling by reading timer cfg register is skipped. If the timer tick value is detected with changing, then timer should be enabled. And inject a timer interrupt by software if there is. Cc: Fixes: f45ad5b8aa93 ("LoongArch: KVM: Implement vcpu interrupt operations"). Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/interrupt.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/loongarch/kvm/interrupt.c b/arch/loongarch/kvm/interrupt.c index 53dac8ab8fb130..a18c60dffbbaea 100644 --- a/arch/loongarch/kvm/interrupt.c +++ b/arch/loongarch/kvm/interrupt.c @@ -28,6 +28,7 @@ static unsigned int priority_to_irq[EXCCODE_INT_NUM] = { static int kvm_irq_deliver(struct kvm_vcpu *vcpu, unsigned int priority) { unsigned int irq = 0; + unsigned long old, new; clear_bit(priority, &vcpu->arch.irq_pending); if (priority < EXCCODE_INT_NUM) @@ -43,7 +44,13 @@ static int kvm_irq_deliver(struct kvm_vcpu *vcpu, unsigned int priority) case INT_IPI: case INT_SWI0: case INT_SWI1: + old = kvm_read_hw_gcsr(LOONGARCH_CSR_TVAL); set_gcsr_estat(irq); + new = kvm_read_hw_gcsr(LOONGARCH_CSR_TVAL); + + /* Inject TI if TVAL inverted */ + if (new > old) + set_gcsr_estat(CPU_TIMER); break; case INT_HWI0 ... INT_HWI7: @@ -60,6 +67,7 @@ static int kvm_irq_deliver(struct kvm_vcpu *vcpu, unsigned int priority) static int kvm_irq_clear(struct kvm_vcpu *vcpu, unsigned int priority) { unsigned int irq = 0; + unsigned long old, new; clear_bit(priority, &vcpu->arch.irq_clear); if (priority < EXCCODE_INT_NUM) @@ -74,7 +82,13 @@ static int kvm_irq_clear(struct kvm_vcpu *vcpu, unsigned int priority) case INT_IPI: case INT_SWI0: case INT_SWI1: + old = kvm_read_hw_gcsr(LOONGARCH_CSR_TVAL); clear_gcsr_estat(irq); + new = kvm_read_hw_gcsr(LOONGARCH_CSR_TVAL); + + /* Inject TI if TVAL inverted */ + if (new > old) + set_gcsr_estat(CPU_TIMER); break; case INT_HWI0 ... INT_HWI7: From 5a873d77ba792410a796595a917be6a440f9b7d2 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Mon, 4 May 2026 09:00:48 +0800 Subject: [PATCH 252/972] LoongArch: KVM: Move unconditional delay into timer clear scenery When timer interrupt arrives in guest kernel, guest kernel clears the timer interrupt and program timer with the next incoming event. During this stage, timer tick is -1 and timer interrupt status is disabled in ESTAT register. KVM hypervisor need write zero with timer tick register and wait timer interrupt injection from HW side, and then clear timer interrupt. So there is 2 cycle delay in KVM hypervisor to emulate such scenery, and the delay is unnecessary if there is no need to clear the timer interrupt. Here move 2 cycle delay into timer clear scenery and add timer ESTAT checking after delay, and set max timer expire value if timer interrupt does not arrive still. Cc: stable@vger.kernel.org Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/timer.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/kvm/timer.c b/arch/loongarch/kvm/timer.c index 29c2aaba63c33b..8356fce0043f60 100644 --- a/arch/loongarch/kvm/timer.c +++ b/arch/loongarch/kvm/timer.c @@ -96,15 +96,21 @@ void kvm_restore_timer(struct kvm_vcpu *vcpu) * and set CSR TVAL with -1 */ write_gcsr_timertick(0); - __delay(2); /* Wait cycles until timer interrupt injected */ /* * Writing CSR_TINTCLR_TI to LOONGARCH_CSR_TINTCLR will clear * timer interrupt, and CSR TVAL keeps unchanged with -1, it * avoids spurious timer interrupt */ - if (!(estat & CPU_TIMER)) + if (!(estat & CPU_TIMER)) { + __delay(2); /* Wait cycles until timer interrupt injected */ + + /* Write TVAL with max value if no TI shot */ + estat = kvm_read_hw_gcsr(LOONGARCH_CSR_ESTAT); + if (!(estat & CPU_TIMER)) + write_gcsr_timertick(CSR_TCFG_VAL); gcsr_write(CSR_TINTCLR_TI, LOONGARCH_CSR_TINTCLR); + } return; } From d68ce834f8cf6cb2e77f3331df65166b35466b53 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Tue, 28 Apr 2026 21:37:47 +0530 Subject: [PATCH 253/972] cifs: abort open_cached_dir if we don't request leases It is possible that SMB2_open_init may not set lease context based on the requested oplock level. This can happen when leases have been temporarily or permanently disabled. When this happens, we will have open_cached_dir making an open without lease context and the response will anyway be rejected by open_cached_dir (thereby forcing a close to discard this open). That's unnecessary two round-trips to the server. This change adds a check before making the open request to the server to make sure that SMB2_open_init did add the expected lease context to the open in open_cached_dir. Cc: Reviewed-by: Bharath SM Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/cached_dir.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 02791ec3c5a169..88d5e9a32f28b3 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -286,6 +286,14 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, &rqst[0], &oplock, &oparms, utf16_path); if (rc) goto oshr_free; + + if (oplock != SMB2_OPLOCK_LEVEL_II) { + rc = -EINVAL; + cifs_dbg(FYI, "%s: Oplock level %d not suitable for cached directory\n", + __func__, oplock); + goto oshr_free; + } + smb2_set_next_command(tcon, &rqst[0]); memset(&qi_iov, 0, sizeof(qi_iov)); From 5e489c6c47a2ac15edbaca153b9348e42c1eacab Mon Sep 17 00:00:00 2001 From: Bjoern Doebel Date: Thu, 30 Apr 2026 08:57:17 +0000 Subject: [PATCH 254/972] smb: client: use kzalloc to zero-initialize security descriptor buffer Commit 62e7dd0a39c2d ("smb: common: change the data type of num_aces to le16") split struct smb_acl's __le32 num_aces field into __le16 num_aces and __le16 reserved. The reserved field corresponds to Sbz2 in the MS-DTYP ACL wire format, which must be zero [1]. When building an ACL descriptor in build_sec_desc(), we are using a kmalloc()'ed descriptor buffer and writing the fields explicitly using le16() writes now. This never writes to the 2 byte reserved field, leaving it as uninitialized heap data. When the reserved field happens to contain non-zero slab garbage, Samba rejects the security descriptor with "ndr_pull_security_descriptor failed: Range Error", causing chmod to fail with EINVAL. Change kmalloc() to kzalloc() to ensure the entire buffer is zero-initialized. Fixes: 62e7dd0a39c2d ("smb: common: change the data type of num_aces to le16") Cc: stable@vger.kernel.org Signed-off-by: Bjoern Doebel Assisted-by: Kiro:claude-opus-4.6 [1] https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-dtyp/20233ed8-a6c6-4097-aafa-dd545ed24428 Signed-off-by: Steve French --- fs/smb/client/cifsacl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index ec5d477793040c..a2750f1e3d90bd 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -1732,7 +1732,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, * descriptor parameters, and security descriptor itself */ nsecdesclen = max_t(u32, nsecdesclen, DEFAULT_SEC_DESC_LEN); - pnntsd = kmalloc(nsecdesclen, GFP_KERNEL); + pnntsd = kzalloc(nsecdesclen, GFP_KERNEL); if (!pnntsd) { kfree(pntsd); cifs_put_tlink(tlink); From 84d5d76c4e8e2750fa17869b7272f189d2bdd40b Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 1 May 2026 23:53:38 -0700 Subject: [PATCH 255/972] drm/ttm: Fix GPU MM stats during pool shrinking TTM pool shrinking frees pages by calling __free_pages() directly, which bypasses updates to NR_GPU_ACTIVE and leaves GPU MM accounting out of sync. Introduce a helper, __free_pages_gpu_account(), and use it for all page frees in ttm_pool.c so GPU MM statistics are updated consistently. Reported-by: Kenneth Crudup Fixes: ae80122f3896 ("drm/ttm: use gpu mm stats to track gpu memory allocations. (v4)") Cc: Christian Koenig Cc: Huang Rui Cc: Matthew Auld Cc: David Airlie Cc: dri-devel@lists.freedesktop.org Signed-off-by: Matthew Brost Tested-by: Kenneth Crudup Reviewed-by: Dave Airlie Link: https://patch.msgid.link/20260502065338.2720646-1-matthew.brost@intel.com --- drivers/gpu/drm/ttm/ttm_pool.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index 26a3689e5fd90a..278bbe7a11add3 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -206,6 +206,14 @@ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags, return NULL; } +static void __free_pages_gpu_account(struct page *p, unsigned int order, + bool reclaim) +{ + mod_lruvec_page_state(p, reclaim ? NR_GPU_RECLAIM : NR_GPU_ACTIVE, + -(1 << order)); + __free_pages(p, order); +} + /* Reset the caching and pages of size 1 << order */ static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching, unsigned int order, struct page *p, bool reclaim) @@ -223,9 +231,7 @@ static void ttm_pool_free_page(struct ttm_pool *pool, enum ttm_caching caching, #endif if (!pool || !ttm_pool_uses_dma_alloc(pool)) { - mod_lruvec_page_state(p, reclaim ? NR_GPU_RECLAIM : NR_GPU_ACTIVE, - -(1 << order)); - __free_pages(p, order); + __free_pages_gpu_account(p, order, reclaim); return; } @@ -606,7 +612,7 @@ static int ttm_pool_restore_commit(struct ttm_pool_tt_restore *restore, */ ttm_pool_split_for_swap(restore->pool, p); copy_highpage(restore->alloced_page + i, p); - __free_pages(p, 0); + __free_pages_gpu_account(p, 0, false); } restore->restored_pages++; @@ -1068,7 +1074,7 @@ long ttm_pool_backup(struct ttm_pool *pool, struct ttm_tt *tt, if (flags->purge) { shrunken += num_pages; page->private = 0; - __free_pages(page, order); + __free_pages_gpu_account(page, order, false); memset(tt->pages + i, 0, num_pages * sizeof(*tt->pages)); } @@ -1109,7 +1115,7 @@ long ttm_pool_backup(struct ttm_pool *pool, struct ttm_tt *tt, } handle = shandle; tt->pages[i] = ttm_backup_handle_to_page_ptr(handle); - put_page(page); + __free_pages_gpu_account(page, 0, false); shrunken++; } From b8c2e9e27636b92dc96c12f16894cbc60c58a306 Mon Sep 17 00:00:00 2001 From: Yufan Chen Date: Mon, 4 May 2026 01:56:10 +0800 Subject: [PATCH 256/972] io_uring/napi: clear tracked NAPI entries on unregister IORING_UNREGISTER_NAPI disables NAPI busy polling, but it currently leaves any previously tracked NAPI IDs on the ring context. The normal wait path only checks whether the list is empty before entering the busy poll helper, so an unregistered ring can still observe stale entries and run an unexpected busy poll pass. Make unregister switch the context to inactive and free the tracked entries. Do the same inactive transition while changing the tracking strategy, and recheck the expected tracking mode under napi_lock before inserting a newly learned NAPI ID. This prevents a racing poll path from repopulating the list after unregister or reconfiguration. Also make the busy poll dispatcher ignore inactive mode explicitly. Signed-off-by: Yufan Chen Fixes: 6bf90bd8c58a ("io_uring/napi: add static napi tracking strategy") Link: https://patch.msgid.link/20260503175610.35521-1-yufan.chen@linux.dev Signed-off-by: Jens Axboe --- io_uring/napi.c | 27 ++++++++++++++++++++------- io_uring/napi.h | 8 +++++--- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/io_uring/napi.c b/io_uring/napi.c index 8d68366a4b9039..bfc77144591203 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -38,7 +38,8 @@ static inline ktime_t net_to_ktime(unsigned long t) return ns_to_ktime(t << 10); } -int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id) +int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id, + unsigned int mode) { struct hlist_head *hash_list; struct io_napi_entry *e; @@ -69,6 +70,11 @@ int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id) * kfree() */ spin_lock(&ctx->napi_lock); + if (unlikely(READ_ONCE(ctx->napi_track_mode) != mode)) { + spin_unlock(&ctx->napi_lock); + kfree(e); + return -EINVAL; + } if (unlikely(io_napi_hash_find(hash_list, napi_id))) { spin_unlock(&ctx->napi_lock); kfree(e); @@ -196,9 +202,14 @@ __io_napi_do_busy_loop(struct io_ring_ctx *ctx, bool (*loop_end)(void *, unsigned long), void *loop_end_arg) { - if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC) + switch (READ_ONCE(ctx->napi_track_mode)) { + case IO_URING_NAPI_TRACKING_STATIC: return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); - return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); + case IO_URING_NAPI_TRACKING_DYNAMIC: + return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); + default: + return false; + } } static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, @@ -273,13 +284,13 @@ static int io_napi_register_napi(struct io_ring_ctx *ctx, default: return -EINVAL; } - /* clean the napi list for new settings */ + WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE); io_napi_free(ctx); - WRITE_ONCE(ctx->napi_track_mode, napi->op_param); /* cap NAPI at 10 msec of spin time */ napi->busy_poll_to = min(10000, napi->busy_poll_to); WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC); WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll); + WRITE_ONCE(ctx->napi_track_mode, napi->op_param); return 0; } @@ -315,7 +326,8 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) case IO_URING_NAPI_STATIC_ADD_ID: if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) return -EINVAL; - return __io_napi_add_id(ctx, napi.op_param); + return __io_napi_add_id(ctx, napi.op_param, + IO_URING_NAPI_TRACKING_STATIC); case IO_URING_NAPI_STATIC_DEL_ID: if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) return -EINVAL; @@ -343,9 +355,10 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) if (arg && copy_to_user(arg, &curr, sizeof(curr))) return -EFAULT; + WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE); WRITE_ONCE(ctx->napi_busy_poll_dt, 0); WRITE_ONCE(ctx->napi_prefer_busy_poll, false); - WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE); + io_napi_free(ctx); return 0; } diff --git a/io_uring/napi.h b/io_uring/napi.h index fa742f42e09b47..e0aecccc5065b9 100644 --- a/io_uring/napi.h +++ b/io_uring/napi.h @@ -15,7 +15,8 @@ void io_napi_free(struct io_ring_ctx *ctx); int io_register_napi(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg); -int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id); +int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id, + unsigned int mode); void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); @@ -43,13 +44,14 @@ static inline void io_napi_add(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct socket *sock; + unsigned int mode = IO_URING_NAPI_TRACKING_DYNAMIC; - if (READ_ONCE(ctx->napi_track_mode) != IO_URING_NAPI_TRACKING_DYNAMIC) + if (READ_ONCE(ctx->napi_track_mode) != mode) return; sock = sock_from_file(req->file); if (sock && sock->sk) - __io_napi_add_id(ctx, READ_ONCE(sock->sk->sk_napi_id)); + __io_napi_add_id(ctx, READ_ONCE(sock->sk->sk_napi_id), mode); } #else From 04fe9aeb4f3c0999e6715385664c677469dfd8f4 Mon Sep 17 00:00:00 2001 From: Yufan Chen Date: Mon, 4 May 2026 01:57:10 +0800 Subject: [PATCH 257/972] io_uring/eventfd: reset deferred signal state Recursive eventfd wakeups must defer io_uring eventfd signaling because eventfd_signal_mask() rejects reentry from eventfd wakeup handlers. The io_ev_fd ops bit tracks an outstanding deferred signal so that the same rcu_head is not queued twice. That bit is only set today. Once the first deferred callback runs, later recursive notifications still see the bit set and skip queueing another deferred signal. This can leave new completions without a matching eventfd wake after the first recursive deferral. Clear the pending bit before issuing the deferred signal. If the wakeup path recurses while the callback runs, a new signal can be queued for the next RCU grace period while the current callback keeps its reference until it returns. Signed-off-by: Yufan Chen Fixes: 60b6c075e8eb ("io_uring/eventfd: move to more idiomatic RCU free usage") Link: https://patch.msgid.link/20260503175710.37209-1-yufan.chen@linux.dev Signed-off-by: Jens Axboe --- io_uring/eventfd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index 3da028500f76a6..d656cc2a0b9ba5 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -43,6 +43,7 @@ static void io_eventfd_do_signal(struct rcu_head *rcu) { struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); + atomic_andnot(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops); eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); io_eventfd_put(ev_fd); } From 0cfff13c94cb5fa818bb374945ff280e08dc1bb9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 4 May 2026 08:54:27 +0200 Subject: [PATCH 258/972] wifi: mac80211: tests: mark HT check strict The HT check now only applies in strict mode since APs were found to be broken. Mark it as such. Fixes: 711a9c018ad2 ("wifi: mac80211: skip ieee80211_verify_sta_ht_mcs_support check in non-strict mode") Signed-off-by: Johannes Berg --- net/mac80211/tests/chan-mode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/tests/chan-mode.c b/net/mac80211/tests/chan-mode.c index adc069065e73dd..fa370831d61711 100644 --- a/net/mac80211/tests/chan-mode.c +++ b/net/mac80211/tests/chan-mode.c @@ -65,6 +65,7 @@ static const struct determine_chan_mode_case { .ht_capa_mask = { .mcs.rx_mask[0] = 0xf7, }, + .strict = true, }, { .desc = "Masking out a RX rate in VHT capabilities", .conn_mode = IEEE80211_CONN_MODE_EHT, From 65493f27a6008bf84bd11bd41c5e1ea6b0bf3c3d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 30 Apr 2026 10:44:15 -0700 Subject: [PATCH 259/972] wifi: cw1200: Revert "Fix locking in error paths" Revert commit d98c24617a83 ("wifi: cw1200: Fix locking in error paths") because it introduces a locking bug instead of fixing a locking bug. cw1200_wow_resume() unlocks priv->conf_mutex. Hence, adding mutex_unlock(&priv->conf_mutex) just after cw1200_wow_resume() is wrong. Reported-by: Ben Hutchings Closes: https://lore.kernel.org/all/408661f69f263266b028713e1412ba36d457e63d.camel@decadent.org.uk/ Fixes: d98c24617a83 ("wifi: cw1200: Fix locking in error paths") Signed-off-by: Bart Van Assche Link: https://patch.msgid.link/20260430174418.1845431-1-bvanassche@acm.org Signed-off-by: Johannes Berg --- drivers/net/wireless/st/cw1200/pm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/wireless/st/cw1200/pm.c b/drivers/net/wireless/st/cw1200/pm.c index 84eb15d729c70d..120f0379f81dd5 100644 --- a/drivers/net/wireless/st/cw1200/pm.c +++ b/drivers/net/wireless/st/cw1200/pm.c @@ -264,14 +264,12 @@ int cw1200_wow_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) wiphy_err(priv->hw->wiphy, "PM request failed: %d. WoW is disabled.\n", ret); cw1200_wow_resume(hw); - mutex_unlock(&priv->conf_mutex); return -EBUSY; } /* Force resume if event is coming from the device. */ if (atomic_read(&priv->bh_rx)) { cw1200_wow_resume(hw); - mutex_unlock(&priv->conf_mutex); return -EAGAIN; } From 8bc6b14aab669f0c301febcf4aa5249b9393bf51 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Thu, 30 Apr 2026 11:08:10 +0200 Subject: [PATCH 260/972] i2c: testunit: Replace system_long_wq with system_dfl_long_wq Currently the code enqueue work items using {queue|mod}_delayed_work(), using system_long_wq. This workqueue should be used when long works are expected, but it is a per-cpu workqueue. This is important because queue_delayed_work() queue the work using: queue_delayed_work_on(WORK_CPU_UNBOUND, ...); Note that WORK_CPU_UNBOUND = NR_CPUS. This would end up calling __queue_delayed_work() that does: if (housekeeping_enabled(HK_TYPE_TIMER)) { // [....] } else { if (likely(cpu == WORK_CPU_UNBOUND)) add_timer_global(timer); else add_timer_on(timer, cpu); } So when cpu == WORK_CPU_UNBOUND the timer is global and is not using a specific CPU. Later, when __queue_work() is called: if (req_cpu == WORK_CPU_UNBOUND) { if (wq->flags & WQ_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); else cpu = raw_smp_processor_id(); } Because the wq is not unbound, it takes the CPU where the timer fired and enqueue the work on that CPU. The consequence of all of this is that the work can run anywhere, depending on where the timer fired. Recently, a new unbound workqueue specific for long running work has been added: c116737e972e ("workqueue: Add system_dfl_long_wq for long unbound works") So change system_long_wq with system_dfl_long_wq so that the work may benefit from scheduler task placement. Signed-off-by: Marco Crivellari [wsa: remove FIXME as well] Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-slave-testunit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/i2c-slave-testunit.c b/drivers/i2c/i2c-slave-testunit.c index 6de4307050dde6..871c58461ebcc1 100644 --- a/drivers/i2c/i2c-slave-testunit.c +++ b/drivers/i2c/i2c-slave-testunit.c @@ -15,7 +15,7 @@ #include #include #include -#include /* FIXME: is system_long_wq the best choice? */ +#include #define TU_VERSION_MAX_LENGTH 128 @@ -124,7 +124,7 @@ static int i2c_slave_testunit_slave_cb(struct i2c_client *client, case I2C_SLAVE_STOP: if (tu->reg_idx == TU_NUM_REGS) { set_bit(TU_FLAG_IN_PROCESS, &tu->flags); - queue_delayed_work(system_long_wq, &tu->worker, + queue_delayed_work(system_dfl_long_wq, &tu->worker, msecs_to_jiffies(10 * tu->regs[TU_REG_DELAY])); } From 9a937ca22741eebe2bf10b18657b8b9aed9c009b Mon Sep 17 00:00:00 2001 From: Ronald Claveau Date: Fri, 24 Apr 2026 16:17:33 +0200 Subject: [PATCH 261/972] dt-bindings: i2c: amlogic: Add compatible for T7 SOC Add the T7 SOC compatible which fallback to AXG compatible. Acked-by: Rob Herring (Arm) Signed-off-by: Ronald Claveau Signed-off-by: Wolfram Sang --- .../devicetree/bindings/i2c/amlogic,meson6-i2c.yaml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/i2c/amlogic,meson6-i2c.yaml b/Documentation/devicetree/bindings/i2c/amlogic,meson6-i2c.yaml index c4cc8af1828078..7b59b60b62e5bb 100644 --- a/Documentation/devicetree/bindings/i2c/amlogic,meson6-i2c.yaml +++ b/Documentation/devicetree/bindings/i2c/amlogic,meson6-i2c.yaml @@ -16,10 +16,15 @@ allOf: properties: compatible: - enum: - - amlogic,meson6-i2c # Meson6, Meson8 and compatible SoCs - - amlogic,meson-gxbb-i2c # GXBB and compatible SoCs - - amlogic,meson-axg-i2c # AXG and compatible SoCs + oneOf: + - items: + - enum: + - amlogic,t7-i2c + - const: amlogic,meson-axg-i2c + - enum: + - amlogic,meson6-i2c # Meson6, Meson8 and compatible SoCs + - amlogic,meson-gxbb-i2c # GXBB and compatible SoCs + - amlogic,meson-axg-i2c # AXG and compatible SoCs reg: maxItems: 1 From 02b7cd683892d8d8464815d69a3a76579909a726 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sat, 2 May 2026 17:31:54 +0200 Subject: [PATCH 262/972] i2c: stm32f7: reinit_completion() per transfer not per msg Currently, the driver may repeatedly call reinit_completion() during transfer which contains multiple messages, while another thread is waiting for the completion. This happens during transfer with more than 1 message, invoked via stm32f7_i2c_xfer_core() -> stm32f7_i2c_xfer_msg(). After invoking the stm32f7_i2c_xfer_msg() to start transfer, stm32f7_i2c_xfer_core() calls wait_for_completion_timeout() to wait for completion of the transfer of all messages. When the first message transfer completes, the hard IRQ handler triggers, and detects transfer completion, which leads to stm32f7_i2c_isr_event_thread() IRQ thread being started. The stm32f7_i2c_isr_event_thread() calls stm32f7_i2c_xfer_msg() in case there are more messages. Without this change, the second and later stm32f7_i2c_xfer_msg() would call reinit_completion() on the completion which is still being waited for in stm32f7_i2c_xfer_core(). Fix this by moving the reinit_completion() into stm32f7_i2c_xfer_core(), together with wait_for_completion_timeout(). Since stm32f7_i2c_xfer_core() now waits for completion of the entire transfer, increase the default timeout. This fixes sporadic transfer timeouts on STM32MP25xx during kernel boot. Fixes: aeb068c57214 ("i2c: i2c-stm32f7: add driver") Signed-off-by: Marek Vasut [wsa: reworded commit subject] Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-stm32f7.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c index 70cb5822bf17bf..53d9df70ebe4de 100644 --- a/drivers/i2c/busses/i2c-stm32f7.c +++ b/drivers/i2c/busses/i2c-stm32f7.c @@ -895,8 +895,6 @@ static void stm32f7_i2c_xfer_msg(struct stm32f7_i2c_dev *i2c_dev, f7_msg->result = 0; f7_msg->stop = (i2c_dev->msg_id >= i2c_dev->msg_num - 1); - reinit_completion(&i2c_dev->complete); - cr1 = readl_relaxed(base + STM32F7_I2C_CR1); cr2 = readl_relaxed(base + STM32F7_I2C_CR2); @@ -1728,6 +1726,8 @@ static int stm32f7_i2c_xfer_core(struct i2c_adapter *i2c_adap, if (ret) goto pm_free; + reinit_completion(&i2c_dev->complete); + stm32f7_i2c_xfer_msg(i2c_dev, msgs); if (!i2c_dev->atomic) @@ -2253,7 +2253,7 @@ static int stm32f7_i2c_probe(struct platform_device *pdev) snprintf(adap->name, sizeof(adap->name), "STM32F7 I2C(%pa)", &res->start); adap->owner = THIS_MODULE; - adap->timeout = 2 * HZ; + adap->timeout = 8 * HZ; adap->retries = 3; adap->algo = &stm32f7_i2c_algo; adap->dev.parent = &pdev->dev; From 10161b4a791d5c4b7ea16512c1ddb133c3f8f953 Mon Sep 17 00:00:00 2001 From: Weinan Liu Date: Thu, 30 Apr 2026 23:28:51 +0000 Subject: [PATCH 263/972] iommu/amd: Fix precedence order in set_dte_passthrough() Bitwise OR | operator has a higher precedence than the ternary ?: operatior. It will be incorrectly evaluated as: new->data[1] |= (FIELD_PREP(...) | dev_data->ats_enabled) ? DTE_FLAG_IOTLB : 0; Wrap the conditional operation in parentheses to enforce the correct evaluation order. Fixes: 93eee2a49c1b ("iommu/amd: Refactor logic to program the host page table in DTE") Signed-off-by: Weinan Liu Reviewed-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 157505d96fdd28..f78e23f03938d6 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2149,7 +2149,8 @@ static void set_dte_passthrough(struct iommu_dev_data *dev_data, new->data[0] |= DTE_FLAG_TV | DTE_FLAG_IR | DTE_FLAG_IW; new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, domain->id) | - (dev_data->ats_enabled) ? DTE_FLAG_IOTLB : 0; + (dev_data->ats_enabled ? DTE_FLAG_IOTLB : 0); + } static void set_dte_entry(struct amd_iommu *iommu, From c5f25f5800f56f1754d9eeb3ced7c1e08c29119a Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Fri, 20 Mar 2026 13:23:24 +0100 Subject: [PATCH 264/972] dt-bindings: i2c: apple,i2c: Add t8122 compatible The i2c block on the Apple silicon t8122 (M3) SoC is compatible with the existing driver. Add "apple,t8122-i2c" as SoC specific compatible under "apple,t8103-i2c" used by the deriver. Signed-off-by: Janne Grunau Acked-by: Andi Shyti Acked-by: Rob Herring (Arm) Signed-off-by: Wolfram Sang --- Documentation/devicetree/bindings/i2c/apple,i2c.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/i2c/apple,i2c.yaml b/Documentation/devicetree/bindings/i2c/apple,i2c.yaml index 500a965bdb7a84..9e59200ad37b63 100644 --- a/Documentation/devicetree/bindings/i2c/apple,i2c.yaml +++ b/Documentation/devicetree/bindings/i2c/apple,i2c.yaml @@ -22,7 +22,9 @@ properties: compatible: oneOf: - items: - - const: apple,t6020-i2c + - enum: + - apple,t6020-i2c + - apple,t8122-i2c - const: apple,t8103-i2c - items: - enum: From 8de779dc40d35d39fa07387b6f921eb11df0f511 Mon Sep 17 00:00:00 2001 From: Rajat Gupta Date: Sun, 3 May 2026 20:51:10 -0700 Subject: [PATCH 265/972] fbdev: udlfb: add vm_ops to dlfb_ops_mmap to prevent use-after-free dlfb_ops_mmap() uses remap_pfn_range() to map vmalloc framebuffer pages to userspace but sets no vm_ops on the VMA. This means the kernel cannot track active mmaps. When dlfb_realloc_framebuffer() replaces the backing buffer via FBIOPUT_VSCREENINFO, existing mmap PTEs are not invalidated. On USB disconnect, dlfb_ops_destroy() calls vfree() on the old pages while userspace PTEs still reference them, resulting in a use-after-free: the process retains read/write access to freed kernel pages. Add vm_operations_struct with open/close callbacks that maintain an atomic mmap_count on struct dlfb_data. In dlfb_realloc_framebuffer(), check mmap_count and return -EBUSY if the buffer is currently mapped, preventing buffer replacement while userspace holds stale PTEs. Tested with PoC using dummy_hcd + raw_gadget USB device emulation. Signed-off-by: Rajat Gupta Acked-by: Greg Kroah-Hartman Cc: stable@vger.kernel.org Signed-off-by: Helge Deller --- drivers/video/fbdev/udlfb.c | 31 ++++++++++++++++++++++++++++++- include/video/udlfb.h | 1 + 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/video/fbdev/udlfb.c b/drivers/video/fbdev/udlfb.c index c341d76bc5646b..fdbb8671a810c7 100644 --- a/drivers/video/fbdev/udlfb.c +++ b/drivers/video/fbdev/udlfb.c @@ -321,12 +321,32 @@ static int dlfb_set_video_mode(struct dlfb_data *dlfb, return retval; } +static void dlfb_vm_open(struct vm_area_struct *vma) +{ + struct dlfb_data *dlfb = vma->vm_private_data; + + atomic_inc(&dlfb->mmap_count); +} + +static void dlfb_vm_close(struct vm_area_struct *vma) +{ + struct dlfb_data *dlfb = vma->vm_private_data; + + atomic_dec(&dlfb->mmap_count); +} + +static const struct vm_operations_struct dlfb_vm_ops = { + .open = dlfb_vm_open, + .close = dlfb_vm_close, +}; + static int dlfb_ops_mmap(struct fb_info *info, struct vm_area_struct *vma) { unsigned long start = vma->vm_start; unsigned long size = vma->vm_end - vma->vm_start; unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; unsigned long page, pos; + struct dlfb_data *dlfb = info->par; if (info->fbdefio) return fb_deferred_io_mmap(info, vma); @@ -358,6 +378,9 @@ static int dlfb_ops_mmap(struct fb_info *info, struct vm_area_struct *vma) size = 0; } + vma->vm_ops = &dlfb_vm_ops; + vma->vm_private_data = dlfb; + atomic_inc(&dlfb->mmap_count); return 0; } @@ -1176,7 +1199,6 @@ static void dlfb_deferred_vfree(struct dlfb_data *dlfb, void *mem) /* * Assumes &info->lock held by caller - * Assumes no active clients have framebuffer open */ static int dlfb_realloc_framebuffer(struct dlfb_data *dlfb, struct fb_info *info, u32 new_len) { @@ -1188,6 +1210,13 @@ static int dlfb_realloc_framebuffer(struct dlfb_data *dlfb, struct fb_info *info new_len = PAGE_ALIGN(new_len); if (new_len > old_len) { + if (atomic_read(&dlfb->mmap_count) > 0) { + dev_warn(info->dev, + "refusing realloc: %d active mmaps\n", + atomic_read(&dlfb->mmap_count)); + return -EBUSY; + } + /* * Alloc system memory for virtual framebuffer */ diff --git a/include/video/udlfb.h b/include/video/udlfb.h index 58fb5732831a43..ab34790d57ecd6 100644 --- a/include/video/udlfb.h +++ b/include/video/udlfb.h @@ -56,6 +56,7 @@ struct dlfb_data { spinlock_t damage_lock; struct work_struct damage_work; struct fb_ops ops; + atomic_t mmap_count; /* blit-only rendering path metrics, exposed through sysfs */ atomic_t bytes_rendered; /* raw pixel-bytes driver asked to render */ atomic_t bytes_identical; /* saved effort with backbuffer comparison */ From 9998e388be9930c106eb5904c23ecf2162407527 Mon Sep 17 00:00:00 2001 From: Niels Franke Date: Sat, 18 Apr 2026 07:37:19 +0200 Subject: [PATCH 266/972] i2c: acpi: Add ELAN0678 to i2c_acpi_force_100khz_device_ids The ELAN0678 touchpad (04F3:3195) found in the Lenovo ThinkPad X13 exhibits excessive smoothing when the I2C bus runs at 400KHz, making the touchpad feel sluggish when plugged into AC power. This is the same issue previously fixed for ELAN06FA. The device's ACPI table (Lenovo TP-R22) specifies 0x00061A80 (400KHz) for the I2cSerialBusV2 descriptor. Forcing the bus to 100KHz eliminates the sluggish behavior. Signed-off-by: Niels Franke Acked-by: Mika Westerberg [wsa: kept the sorting] Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-acpi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c index 2cbd31f77667aa..28c0e4884a7f2e 100644 --- a/drivers/i2c/i2c-core-acpi.c +++ b/drivers/i2c/i2c-core-acpi.c @@ -371,6 +371,7 @@ static const struct acpi_device_id i2c_acpi_force_100khz_device_ids[] = { * a 400KHz frequency. The root cause of the issue is not known. */ { "DLL0945", 0 }, + { "ELAN0678", 0 }, { "ELAN06FA", 0 }, {} }; From 32c91e8ee039777d0b95b914633fc6a42607959c Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 24 Apr 2026 12:49:10 +0200 Subject: [PATCH 267/972] staging: vme_user: fix root device leak on init failure Make sure to deregister and free the root device in case module initialisation fails. Fixes: 658bcdae9c67 ("vme: Adding Fake VME driver") Cc: stable@vger.kernel.org # 4.9 Cc: Martyn Welch Signed-off-by: Johan Hovold Link: https://patch.msgid.link/20260424104910.2619349-1-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/staging/vme_user/vme_fake.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/staging/vme_user/vme_fake.c b/drivers/staging/vme_user/vme_fake.c index be4ad47ed526a9..8abaa3165fbb7b 100644 --- a/drivers/staging/vme_user/vme_fake.c +++ b/drivers/staging/vme_user/vme_fake.c @@ -1230,6 +1230,8 @@ static int __init fake_init(void) err_driver: kfree(fake_bridge); err_struct: + root_device_unregister(vme_root); + return retval; } From 617eb7c0961a8dfcfc811844a6396e406b2923ea Mon Sep 17 00:00:00 2001 From: Mingyu Wang <25181214217@stu.xidian.edu.cn> Date: Mon, 27 Apr 2026 10:57:45 +0800 Subject: [PATCH 268/972] i2c: dev: prevent integer overflow in I2C_TIMEOUT ioctl While fuzzing with Syzkaller, a persistent `schedule_timeout: wrong timeout value` warning was observed, accompanied by SMBus controller state machine corruption. The I2C_TIMEOUT ioctl accepts a user-provided timeout in multiples of 10 ms. The user argument is checked against INT_MAX, but it is subsequently multiplied by 10 before being passed to msecs_to_jiffies(). A malicious user can pass a large value (e.g., 429496729) that passes the `arg > INT_MAX` check but overflows when multiplied by 10. This results in a truncated 32-bit unsigned value that bypasses the internal `(int)m < 0` check in `msecs_to_jiffies()`. The truncated value is then assigned to `client->adapter->timeout` (a signed 32-bit int), which is reinterpreted as a negative number. When passed to wait_for_completion_timeout(), this negative value undergoes sign extension to a 64-bit unsigned long, triggering the `schedule_timeout` warning and causing premature returns. This leaves the SMBus state machine in an unrecoverable state, constituting a local Denial of Service (DoS). Fix this by bounding the user argument to `INT_MAX / 10`. Signed-off-by: Mingyu Wang <25181214217@stu.xidian.edu.cn> [wsa: move the comment as well] Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-dev.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/i2c-dev.c b/drivers/i2c/i2c-dev.c index 7bbe0263411eb7..ccaac5e29f906b 100644 --- a/drivers/i2c/i2c-dev.c +++ b/drivers/i2c/i2c-dev.c @@ -487,12 +487,13 @@ static long i2cdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) client->adapter->retries = arg; break; case I2C_TIMEOUT: - if (arg > INT_MAX) + /* + * For historical reasons, user-space sets the timeout value in + * units of 10 ms. + */ + if (arg > INT_MAX / 10) return -EINVAL; - /* For historical reasons, user-space sets the timeout - * value in units of 10 ms. - */ client->adapter->timeout = msecs_to_jiffies(arg * 10); break; default: From bc851db06045a40c18233dd76ef0562d7f8bb6db Mon Sep 17 00:00:00 2001 From: Shyam Sunder Reddy Padira Date: Tue, 14 Apr 2026 12:43:06 +0530 Subject: [PATCH 269/972] staging: rtl8723bs: os_dep: avoid NULL pointer dereference in rtw_cbuf_alloc The return value of kzalloc_flex() is used without ensuring that the allocation succeeded, and the pointer is dereferenced unconditionally. Guard the access to the allocated structure to avoid a potential NULL pointer dereference if the allocation fails. Fixes: 980cd426a257 ("staging: rtl8723bs: replace rtw_zmalloc() with kzalloc()") Cc: stable Signed-off-by: Shyam Sunder Reddy Padira Reviewed-by: Dan Carpenter Link: https://patch.msgid.link/20260414071308.4781-2-shyamsunderreddypadira@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8723bs/os_dep/osdep_service.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/staging/rtl8723bs/os_dep/osdep_service.c b/drivers/staging/rtl8723bs/os_dep/osdep_service.c index 7959daeabc6ff0..4cfdf7c623440a 100644 --- a/drivers/staging/rtl8723bs/os_dep/osdep_service.c +++ b/drivers/staging/rtl8723bs/os_dep/osdep_service.c @@ -194,7 +194,8 @@ struct rtw_cbuf *rtw_cbuf_alloc(u32 size) struct rtw_cbuf *cbuf; cbuf = kzalloc_flex(*cbuf, bufs, size); - cbuf->size = size; + if (cbuf) + cbuf->size = size; return cbuf; } From 37b0dc5e279f35036fb638d1e187197b6c05a76d Mon Sep 17 00:00:00 2001 From: Hongling Zeng Date: Sun, 3 May 2026 12:17:44 +0800 Subject: [PATCH 270/972] parisc: Fix IRQ leak in LASI driver When request_irq() succeeds but gsc_common_setup() fails later, the IRQ is never released. Fix this by adding proper error handling with goto labels to ensure resources are released in LIFO order. Detected by Smatch: drivers/parisc/lasi.c:216 lasi_init_chip() warn: 'lasi->gsc_irq.irq' from request_irq() not released on lines: 207. Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202604180957.4QdAIxP6-lkp@intel.com/ Signed-off-by: Hongling Zeng Cc: stable@vger.kernel.org Signed-off-by: Helge Deller --- drivers/parisc/lasi.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/parisc/lasi.c b/drivers/parisc/lasi.c index ef6125d838788b..a5b80cd5cc37d7 100644 --- a/drivers/parisc/lasi.c +++ b/drivers/parisc/lasi.c @@ -193,8 +193,7 @@ static int __init lasi_init_chip(struct parisc_device *dev) ret = request_irq(lasi->gsc_irq.irq, gsc_asic_intr, 0, "lasi", lasi); if (ret < 0) { - kfree(lasi); - return ret; + goto err_free; } /* enable IRQ's for devices below LASI */ @@ -203,8 +202,7 @@ static int __init lasi_init_chip(struct parisc_device *dev) /* Done init'ing, register this driver */ ret = gsc_common_setup(dev, lasi); if (ret) { - kfree(lasi); - return ret; + goto err_irq; } gsc_fixup_irqs(dev, lasi, lasi_choose_irq); @@ -214,6 +212,12 @@ static int __init lasi_init_chip(struct parisc_device *dev) SYS_OFF_PRIO_DEFAULT, lasi_power_off, lasi); return ret; + +err_irq: + free_irq(lasi->gsc_irq.irq, lasi); +err_free: + kfree(lasi); + return ret; } static struct parisc_device_id lasi_tbl[] __initdata = { From b47bc7c022ddab7c79a84dd5f3f0d07fe09ec786 Mon Sep 17 00:00:00 2001 From: "Nikola Z. Ivanov" Date: Wed, 15 Apr 2026 23:50:21 +0300 Subject: [PATCH 271/972] i2c: Compare the return value of gpiod_get_direction against GPIO_LINE_DIRECTION_OUT The GPIO_LINE_DIRECTION_* definitions have just recently been exposed to gpio consumers.h by breaking them out in a separate defs.h file. Use this to validate the gpio direction instead of the hard-coded literal. Signed-off-by: Nikola Z. Ivanov Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-base.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index 9c46147e3506d1..a2132d70fb360d 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -445,8 +445,7 @@ static int i2c_init_recovery(struct i2c_adapter *adap) bri->set_scl = set_scl_gpio_value; if (bri->sda_gpiod) { bri->get_sda = get_sda_gpio_value; - /* FIXME: add proper flag instead of '0' once available */ - if (gpiod_get_direction(bri->sda_gpiod) == 0) + if (gpiod_get_direction(bri->sda_gpiod) == GPIO_LINE_DIRECTION_OUT) bri->set_sda = set_sda_gpio_value; } } else if (bri->recover_bus == i2c_generic_scl_recovery) { From 088f65e206087bf903743bd18417261d7a4c9644 Mon Sep 17 00:00:00 2001 From: Ivan Hu Date: Thu, 30 Apr 2026 15:41:07 +0800 Subject: [PATCH 272/972] x86/efi: Fix graceful fault handling after FPU softirq changes Since commit d02198550423 ("x86/fpu: Improve crypto performance by making kernel-mode FPU reliably usable in softirqs"), kernel_fpu_begin() calls fpregs_lock() which uses local_bh_disable() instead of the previous preempt_disable(). This sets SOFTIRQ_OFFSET in preempt_count during the entire EFI runtime service call, causing in_interrupt() to return true in normal task context. The graceful page fault handler efi_crash_gracefully_on_page_fault() uses in_interrupt() to bail out for faults in real interrupt context. With SOFTIRQ_OFFSET now set, the handler always bails out, leaving EFI firmware page faults unhandled. This escalates to die() which also sees in_interrupt() as true and calls panic("Fatal exception in interrupt"), resulting in a hard system freeze. On systems with buggy firmware that triggers page faults during EFI runtime calls (e.g., accessing unmapped memory in GetTime()), this causes an unrecoverable hang instead of the expected graceful EFI_ABORTED recovery. Fix by replacing in_interrupt() with !in_task(). This preserves the original intent of bailing for interrupts or NMI faults, while no longer falsely triggering from the FPU code path's local_bh_disable(). Fixes: d02198550423 ("x86/fpu: Improve crypto performance by making kernel-mode FPU reliably usable in softirqs") Cc: Signed-off-by: Ivan Hu [ardb: Sashiko spotted that using 'in_hardirq() || in_nmi()' leaves a window where a softirq may be taken before fpregs_lock() is called, but after efi_rts_work.efi_rts_id has been assigned, and any page faults occurring in that window will then be misidentified as having been caused by the firmware. Instead, use !in_task(), which incorporates in_serving_softirq(). ] Signed-off-by: Ard Biesheuvel --- arch/x86/platform/efi/quirks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index df24ffc6105d64..c8f5e094ed9d74 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -770,7 +770,7 @@ void efi_crash_gracefully_on_page_fault(unsigned long phys_addr) * If we get an interrupt/NMI while processing an EFI runtime service * then this is a regular OOPS, not an EFI failure. */ - if (in_interrupt()) + if (!in_task()) return; /* From 6036b5067a8199ba7a2dc7b377d4b9dd276d5f9e Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Wed, 15 Apr 2026 01:23:39 +0800 Subject: [PATCH 273/972] i2c: stub: Reject I2C block transfers with invalid length The I2C_SMBUS_I2C_BLOCK_DATA case in stub_xfer() uses data->block[0] as the transfer length. The existing check only clamps it to avoid overrunning the chip->words[256] register array, but does not validate it against I2C_SMBUS_BLOCK_MAX (32), which is the limit of the union i2c_smbus_data.block buffer (34 bytes total). The driver is a development/test tool (CONFIG_I2C_STUB=m, not built by default) that must be loaded with a chip_addr= parameter. A local user with access to /dev/i2c-* can issue an I2C_SMBUS ioctl with I2C_SMBUS_I2C_BLOCK_DATA and data->block[0] > 32, causing stub_xfer() to read or write past the end of the union i2c_smbus_data.block buffer: BUG: KASAN: stack-out-of-bounds in stub_xfer (drivers/i2c/i2c-stub.c:223) Read of size 1 at addr ffff88800abcfd92 by task exploit/81 Call Trace: stub_xfer (drivers/i2c/i2c-stub.c:223) __i2c_smbus_xfer (drivers/i2c/i2c-core-smbus.c:593) i2c_smbus_xfer (drivers/i2c/i2c-core-smbus.c:536) i2cdev_ioctl_smbus (drivers/i2c/i2c-dev.c:391) i2cdev_ioctl (drivers/i2c/i2c-dev.c:478) __x64_sys_ioctl (fs/ioctl.c:583) do_syscall_64 (arch/x86/entry/syscall_64.c:94) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) The bug exists because i2c-stub implements .smbus_xfer directly, bypassing the I2C_SMBUS_BLOCK_MAX validation in i2c_smbus_xfer_emulated(). The I2C_SMBUS_BLOCK_DATA case in the same function correctly validates against I2C_SMBUS_BLOCK_MAX, but the I2C_SMBUS_I2C_BLOCK_DATA case does not. Fix by rejecting transfers with data->block[0] == 0 or data->block[0] > I2C_SMBUS_BLOCK_MAX with -EINVAL, consistent with both the I2C_SMBUS_BLOCK_DATA case in the same function and the I2C_SMBUS_I2C_BLOCK_DATA validation in i2c_smbus_xfer_emulated(). Fixes: 4710317891e4 ("i2c-stub: Implement I2C block support") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Reviewed-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-stub.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/i2c/i2c-stub.c b/drivers/i2c/i2c-stub.c index fbb0db41b10e16..04314e3ed24c95 100644 --- a/drivers/i2c/i2c-stub.c +++ b/drivers/i2c/i2c-stub.c @@ -214,6 +214,11 @@ static s32 stub_xfer(struct i2c_adapter *adap, u16 addr, unsigned short flags, * We ignore banks here, because banked chips don't use I2C * block transfers */ + if (data->block[0] == 0 || + data->block[0] > I2C_SMBUS_BLOCK_MAX) { + ret = -EINVAL; + break; + } if (data->block[0] > 256 - command) /* Avoid overrun */ data->block[0] = 256 - command; len = data->block[0]; From 359b626d362127451dbe8a687ac5c240f896ae2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Fri, 1 May 2026 14:45:14 -0300 Subject: [PATCH 274/972] ALSA: pcmtest: Return -EFAULT on pattern read copy failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pattern_write() reports -EFAULT when copy_from_user() fails, but pattern_read() converts copy_to_user() failures into a zero-length read. That makes a userspace buffer fault look like EOF instead of reporting the actual error. Return -EFAULT from pattern_read() when copying the pattern data to userspace fails, and update the file offset only after a successful copy. Fixes: 315a3d57c64c ("ALSA: Implement the new Virtual PCM Test Driver") Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260501-alsa-pcmtest-pattern-read-efault-v1-1-53e1e8c11dda@gmail.com Signed-off-by: Takashi Iwai --- sound/drivers/pcmtest.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/drivers/pcmtest.c b/sound/drivers/pcmtest.c index 5bfec4c7bf7147..7f93557b51eca2 100644 --- a/sound/drivers/pcmtest.c +++ b/sound/drivers/pcmtest.c @@ -679,9 +679,9 @@ static ssize_t pattern_read(struct file *file, char __user *u_buff, size_t len, return 0; if (copy_to_user(u_buff, patt_buf->buf + *off, to_read)) - to_read = 0; - else - *off += to_read; + return -EFAULT; + + *off += to_read; return to_read; } From 8acd2d7e0889ac62bc102bd7b648cd7bee04f902 Mon Sep 17 00:00:00 2001 From: Myeonghun Pak Date: Fri, 24 Apr 2026 20:25:18 +0900 Subject: [PATCH 275/972] drm/qxl: Fix missing KMS poll cleanup drm_kms_helper_poll_init() initializes the output polling work and enables polling for the DRM device. qxl enables polling before calling drm_dev_register(), but the drm_dev_register() failure path tears down the modeset and device state without disabling the polling helper. The remove path also unregisters and shuts down the DRM device without first disabling the polling helper. Add matching drm_kms_helper_poll_fini() calls in both paths so the delayed polling work is cancelled before qxl tears down the associated modeset/device state. Signed-off-by: Myeonghun Pak Reviewed-by: Thomas Zimmermann Fixes: 5ff91e442652 ("qxl: use drm helper hotplug support") Signed-off-by: Thomas Zimmermann Link: https://patch.msgid.link/20260424112543.57819-1-mhun512@gmail.com --- drivers/gpu/drm/qxl/qxl_drv.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/qxl/qxl_drv.c b/drivers/gpu/drm/qxl/qxl_drv.c index 2bbb1168a3ffa5..1e6a2392d7c6c3 100644 --- a/drivers/gpu/drm/qxl/qxl_drv.c +++ b/drivers/gpu/drm/qxl/qxl_drv.c @@ -118,12 +118,13 @@ qxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* Complete initialization. */ ret = drm_dev_register(&qdev->ddev, ent->driver_data); if (ret) - goto modeset_cleanup; + goto poll_fini; drm_client_setup(&qdev->ddev, NULL); return 0; -modeset_cleanup: +poll_fini: + drm_kms_helper_poll_fini(&qdev->ddev); qxl_modeset_fini(qdev); unload: qxl_device_fini(qdev); @@ -154,6 +155,7 @@ qxl_pci_remove(struct pci_dev *pdev) { struct drm_device *dev = pci_get_drvdata(pdev); + drm_kms_helper_poll_fini(dev); drm_dev_unregister(dev); drm_atomic_helper_shutdown(dev); if (pci_is_vga(pdev) && pdev->revision < 5) From c28c22c8cfbd43f2ad71a157324d9fbebc0d0f2e Mon Sep 17 00:00:00 2001 From: Francesco Lavra Date: Tue, 10 Feb 2026 18:35:45 +0100 Subject: [PATCH 276/972] drm/fb-helper: Fix clipping when damage area spans a single scanline When the damage area resulting from a dirty memory range spans a single scanline, the width of the rectangle is calculated dynamically because it may not coincide with the framebuffer width. If the dirty range ends exactly at the end of the scanline, the `bit_end` variable is incorrectly assigned a 0 value, which results in a bogus clip rectangle where the x2 coordinate is 0. This prevents the dirty scanline from being flushed to the hardware. Change the calculation of the `bit_end` value to fix the x2 coordinate value in the above edge case. Fixes: ded74cafeea9 ("drm/fb-helper: Clip damage area horizontally") Signed-off-by: Francesco Lavra Reviewed-by: Thomas Zimmermann Signed-off-by: Thomas Zimmermann Link: https://patch.msgid.link/20260210173545.733937-1-flavra@baylibre.com --- drivers/gpu/drm/drm_fb_helper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index a80a335f414801..1541fc8a9ac2d3 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -490,7 +490,7 @@ static void drm_fb_helper_memory_range_to_clip(struct fb_info *info, off_t off, * the number of horizontal pixels that need an update. */ off_t bit_off = (off % line_length) * 8; - off_t bit_end = (end % line_length) * 8; + off_t bit_end = bit_off + len * 8; x1 = bit_off / info->var.bits_per_pixel; x2 = DIV_ROUND_UP(bit_end, info->var.bits_per_pixel); From 24e0fd8b852062d5e8a740f7945eaa26818adce8 Mon Sep 17 00:00:00 2001 From: John Madieu Date: Fri, 1 May 2026 13:59:49 +0000 Subject: [PATCH 277/972] spi: imx: Fix precedence bug in spi_imx_dma_max_wml_find() The watermark search in spi_imx_dma_max_wml_find() reads: if (!dma_data->dma_len % (i * bytes_per_word)) break; The unary ! binds tighter than %, so this parses as: if ((!dma_data->dma_len) % (i * bytes_per_word)) break; !dma_data->dma_len is 0 or 1, and `0 % x == 0` for any x; `1 % x` is 0 unless x == 1. The condition is therefore false in every case except dma_len != 0 with i * bytes_per_word == 1, i.e. i == 1 and bytes_per_word == 1. The loop almost always falls through to its end, leaving i == 0, which the post-loop fallback rewrites to 1: if (i == 0) i = 1; So spi_imx->wml ends up at 1 for essentially every DMA transfer, defeating the entire purpose of the function. The DMA engine then requests service after every single FIFO word instead of using multi-word bursts, hurting throughput on every DMA-capable variant. Add the missing parentheses so the modulo is computed first, then negated: if (!(dma_data->dma_len % (i * bytes_per_word))) break; Fixes: faa8e404ad8e ("spi: imx: support dynamic burst length for ECSPI DMA mode") Signed-off-by: John Madieu Reviewed-by: Frank Li Link: https://patch.msgid.link/20260501135951.2416527-2-john.madieu@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-imx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index e5c907c45b8786..7ae8078c10efa0 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -1836,7 +1836,7 @@ static void spi_imx_dma_max_wml_find(struct spi_imx_data *spi_imx, unsigned int i; for (i = spi_imx->devtype_data->fifo_size / 2; i > 0; i--) { - if (!dma_data->dma_len % (i * bytes_per_word)) + if (!(dma_data->dma_len % (i * bytes_per_word))) break; } /* Use 1 as wml in case no available burst length got */ From f5b5548255040ec3bef05bcb1e9c9c3614dfa7db Mon Sep 17 00:00:00 2001 From: John Madieu Date: Fri, 1 May 2026 13:59:50 +0000 Subject: [PATCH 278/972] spi: imx: Fix UAF on package-1 prepare failure in spi_imx_dma_data_prepare() When transfer->len exceeds MX51_ECSPI_CTRL_MAX_BURST and is not a multiple of it, spi_imx_dma_data_prepare() splits the transfer into two DMA packages. If preparing the second package fails: ret = spi_imx_dma_tx_data_handle(spi_imx, &spi_imx->dma_data[1], transfer->tx_buf + spi_imx->dma_data[0].data_len, false); if (ret) { kfree(spi_imx->dma_data[0].dma_tx_buf); kfree(spi_imx->dma_data[0].dma_rx_buf); kfree(spi_imx->dma_data); } } return 0; the function frees the package-0 buffers and the dma_data array, then falls through to `return 0`, telling the caller the prepare succeeded. The caller then dereferences the freed dma_data array, producing a use-after-free. Return the error from the failure path so the caller takes its existing failure branch. Fixes: faa8e404ad8e ("spi: imx: support dynamic burst length for ECSPI DMA mode") Signed-off-by: John Madieu Reviewed-by: Frank Li Link: https://patch.msgid.link/20260501135951.2416527-3-john.madieu@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-imx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index 7ae8078c10efa0..4e3dbd01d6191e 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -1709,6 +1709,7 @@ static int spi_imx_dma_data_prepare(struct spi_imx_data *spi_imx, kfree(spi_imx->dma_data[0].dma_tx_buf); kfree(spi_imx->dma_data[0].dma_rx_buf); kfree(spi_imx->dma_data); + return ret; } } From 894e04b7116297a6529e0c4ed90e3eb160939805 Mon Sep 17 00:00:00 2001 From: John Madieu Date: Fri, 1 May 2026 13:59:51 +0000 Subject: [PATCH 279/972] spi: imx: Propagate prepare_transfer() error from spi_imx_setupxfer() spi_imx_setupxfer() calls the per-variant prepare_transfer() callback and returns 0 unconditionally: spi_imx->devtype_data->prepare_transfer(spi_imx, spi, t); return 0; mx51_ecspi_prepare_transfer() can return -EINVAL when the requested word_delay does not fit in MX51_ECSPI_PERIOD_MASK. The error is detected after a partial set of register writes (CTRL: BL, clkdiv, SMC), so the controller is left in a partially-configured state and the transfer is then submitted as if setup succeeded. Propagate the return value. The other variants' prepare_transfer callbacks all return 0, so this is a no-op for them. Fixes: a3bb4e663df3 ("spi: imx: support word delay") Signed-off-by: John Madieu Reviewed-by: Frank Li Link: https://patch.msgid.link/20260501135951.2416527-4-john.madieu@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-imx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index 4e3dbd01d6191e..480d1e8b281f19 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -1382,9 +1382,7 @@ static int spi_imx_setupxfer(struct spi_device *spi, spi_imx->target_burst = t->len; } - spi_imx->devtype_data->prepare_transfer(spi_imx, spi, t); - - return 0; + return spi_imx->devtype_data->prepare_transfer(spi_imx, spi, t); } static void spi_imx_sdma_exit(struct spi_imx_data *spi_imx) From 7672749e1496215e8683ce57cf323119033954cf Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Thu, 30 Apr 2026 11:10:18 +0100 Subject: [PATCH 280/972] spi: microchip-core-qspi: control built-in cs manually The coreQSPI IP supports only a single chip select, which is automagically operated by the hardware - set low when the transmit buffer first gets written to and set high when the number of bytes written to the TOTALBYTES field of the FRAMES register have been sent on the bus. Additional devices must use GPIOs for their chip selects. It was reported to me that if there are two devices attached to this QSPI controller that the in-built chip select is set low while linux tries to access the device attached to the GPIO. This went undetected as the boards that connected multiple devices to the SPI controller all exclusively used GPIOs for chip selects, not relying on the built-in chip select at all. It turns out that this was because the built-in chip select, when controlled automagically, is set low when active and high when inactive, thereby ruling out its use for active-high devices or devices that need to transmit with the chip select disabled. Modify the driver so that it controls chip select directly, retaining the behaviour for mem_ops of setting the chip select active for the entire duration of the transfer in the exec_op callback. For regular transfers, implement the set_cs callback for the core to use. As part of this, the existing setup callback, mchp_coreqspi_setup_op(), is removed. Modifying the CLKIDLE field is not safe to do during operation when there are multiple devices, so this code is removed entirely. Setting the MASTER and ENABLE fields is something that can be done once at probe, it doesn't need to be re-run for each device. Instead the new setup callback sets the built-in chip select to its inactive state for active-low devices, as the reset value of the chip select in software controlled mode is low. Fixes: 8f9cf02c88528 ("spi: microchip-core-qspi: Add regular transfers") Fixes: 8596124c4c1bc ("spi: microchip-core-qspi: Add support for microchip fpga qspi controllers") CC: stable@vger.kernel.org Signed-off-by: Conor Dooley Link: https://patch.msgid.link/20260430-hamstring-busload-f941d0347b5e@spud Signed-off-by: Mark Brown --- drivers/spi/spi-microchip-core-qspi.c | 79 ++++++++++++++++++++++----- 1 file changed, 64 insertions(+), 15 deletions(-) diff --git a/drivers/spi/spi-microchip-core-qspi.c b/drivers/spi/spi-microchip-core-qspi.c index eab059fb0bc2ce..ffa0f33a0ae099 100644 --- a/drivers/spi/spi-microchip-core-qspi.c +++ b/drivers/spi/spi-microchip-core-qspi.c @@ -74,6 +74,13 @@ #define STATUS_FLAGSX4 BIT(8) #define STATUS_MASK GENMASK(8, 0) +/* + * QSPI Direct Access register defines + */ +#define DIRECT_ACCESS_EN_SSEL BIT(0) +#define DIRECT_ACCESS_OP_SSEL BIT(1) +#define DIRECT_ACCESS_OP_SSEL_SHIFT 1 + #define BYTESUPPER_MASK GENMASK(31, 16) #define BYTESLOWER_MASK GENMASK(15, 0) @@ -158,6 +165,38 @@ static int mchp_coreqspi_set_mode(struct mchp_coreqspi *qspi, const struct spi_m return 0; } +static void mchp_coreqspi_set_cs(struct spi_device *spi, bool enable) +{ + struct mchp_coreqspi *qspi = spi_controller_get_devdata(spi->controller); + u32 val; + + val = readl(qspi->regs + REG_DIRECT_ACCESS); + + val &= ~DIRECT_ACCESS_OP_SSEL; + val |= !enable << DIRECT_ACCESS_OP_SSEL_SHIFT; + + writel(val, qspi->regs + REG_DIRECT_ACCESS); +} + +static int mchp_coreqspi_setup(struct spi_device *spi) +{ + struct mchp_coreqspi *qspi = spi_controller_get_devdata(spi->controller); + u32 val; + + /* + * Active low devices need to be specifically set to their inactive + * states during probe. + */ + if (spi->mode & SPI_CS_HIGH) + return 0; + + val = readl(qspi->regs + REG_DIRECT_ACCESS); + val |= DIRECT_ACCESS_OP_SSEL; + writel(val, qspi->regs + REG_DIRECT_ACCESS); + + return 0; +} + static inline void mchp_coreqspi_read_op(struct mchp_coreqspi *qspi) { u32 control, data; @@ -380,19 +419,6 @@ static int mchp_coreqspi_setup_clock(struct mchp_coreqspi *qspi, struct spi_devi return 0; } -static int mchp_coreqspi_setup_op(struct spi_device *spi_dev) -{ - struct spi_controller *ctlr = spi_dev->controller; - struct mchp_coreqspi *qspi = spi_controller_get_devdata(ctlr); - u32 control = readl_relaxed(qspi->regs + REG_CONTROL); - - control |= (CONTROL_MASTER | CONTROL_ENABLE); - control &= ~CONTROL_CLKIDLE; - writel_relaxed(control, qspi->regs + REG_CONTROL); - - return 0; -} - static inline void mchp_coreqspi_config_op(struct mchp_coreqspi *qspi, const struct spi_mem_op *op) { u32 idle_cycles = 0; @@ -483,6 +509,7 @@ static int mchp_coreqspi_exec_op(struct spi_mem *mem, const struct spi_mem_op *o reinit_completion(&qspi->data_completion); mchp_coreqspi_config_op(qspi, op); + mchp_coreqspi_set_cs(mem->spi, true); if (op->cmd.opcode) { qspi->txbuf = &opcode; qspi->rxbuf = NULL; @@ -523,6 +550,7 @@ static int mchp_coreqspi_exec_op(struct spi_mem *mem, const struct spi_mem_op *o err = -ETIMEDOUT; error: + mchp_coreqspi_set_cs(mem->spi, false); mutex_unlock(&qspi->op_lock); mchp_coreqspi_disable_ints(qspi); @@ -686,6 +714,7 @@ static int mchp_coreqspi_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct device_node *np = dev->of_node; int ret; + u32 num_cs, val; ctlr = devm_spi_alloc_host(&pdev->dev, sizeof(*qspi)); if (!ctlr) @@ -718,10 +747,18 @@ static int mchp_coreqspi_probe(struct platform_device *pdev) return ret; } + /* + * The IP core only has a single CS, any more have to be provided via + * gpios + */ + if (of_property_read_u32(pdev->dev.of_node, "num-cs", &num_cs)) + num_cs = 1; + + ctlr->num_chipselect = num_cs; + ctlr->bits_per_word_mask = SPI_BPW_MASK(8); ctlr->mem_ops = &mchp_coreqspi_mem_ops; ctlr->mem_caps = &mchp_coreqspi_mem_caps; - ctlr->setup = mchp_coreqspi_setup_op; ctlr->mode_bits = SPI_CPOL | SPI_CPHA | SPI_RX_DUAL | SPI_RX_QUAD | SPI_TX_DUAL | SPI_TX_QUAD; ctlr->dev.of_node = np; @@ -729,9 +766,21 @@ static int mchp_coreqspi_probe(struct platform_device *pdev) ctlr->prepare_message = mchp_coreqspi_prepare_message; ctlr->unprepare_message = mchp_coreqspi_unprepare_message; ctlr->transfer_one = mchp_coreqspi_transfer_one; - ctlr->num_chipselect = 2; + ctlr->setup = mchp_coreqspi_setup; + ctlr->set_cs = mchp_coreqspi_set_cs; ctlr->use_gpio_descriptors = true; + val = readl_relaxed(qspi->regs + REG_CONTROL); + val |= (CONTROL_MASTER | CONTROL_ENABLE); + writel_relaxed(val, qspi->regs + REG_CONTROL); + + /* + * Put cs into software controlled mode + */ + val = readl_relaxed(qspi->regs + REG_DIRECT_ACCESS); + val |= DIRECT_ACCESS_EN_SSEL; + writel(val, qspi->regs + REG_DIRECT_ACCESS); + ret = spi_register_controller(ctlr); if (ret) return dev_err_probe(&pdev->dev, ret, From eb56deaabf127e8985fc91fa6c97bf8a3b062844 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Thu, 30 Apr 2026 11:10:19 +0100 Subject: [PATCH 281/972] spi: microchip-core-qspi: don't attempt to transmit during emulated read-only dual/quad operations The core will deal with reads by creating clock cycles itself, there's no need to generate clock cycles by transmitting garbage data at the driver level. Further, transmitting garbage data just bricks the transfer since QSPI doesn't have a dedicated master-out line like MOSI in regular SPI. I'm not entirely sure if the transfer is bricked because of the garbage data being transmitted on the bus or because the core loses track of whether it is supposed to be sending or receiving data. Fixes: 8f9cf02c88528 ("spi: microchip-core-qspi: Add regular transfers") CC: stable@vger.kernel.org Signed-off-by: Conor Dooley Link: https://patch.msgid.link/20260430-freezing-saloon-95b1f3d9dad0@spud Signed-off-by: Mark Brown --- drivers/spi/spi-microchip-core-qspi.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-microchip-core-qspi.c b/drivers/spi/spi-microchip-core-qspi.c index ffa0f33a0ae099..70215a407b5a32 100644 --- a/drivers/spi/spi-microchip-core-qspi.c +++ b/drivers/spi/spi-microchip-core-qspi.c @@ -690,18 +690,28 @@ static int mchp_coreqspi_transfer_one(struct spi_controller *ctlr, struct spi_de struct spi_transfer *t) { struct mchp_coreqspi *qspi = spi_controller_get_devdata(ctlr); + bool dual_quad = false; qspi->tx_len = t->len; + if (t->tx_nbits == SPI_NBITS_QUAD || t->rx_nbits == SPI_NBITS_QUAD || + t->tx_nbits == SPI_NBITS_DUAL || + t->rx_nbits == SPI_NBITS_DUAL) + dual_quad = true; + if (t->tx_buf) qspi->txbuf = (u8 *)t->tx_buf; if (!t->rx_buf) { mchp_coreqspi_write_op(qspi); - } else { + } else if (!dual_quad) { qspi->rxbuf = (u8 *)t->rx_buf; qspi->rx_len = t->len; mchp_coreqspi_write_read_op(qspi); + } else { + qspi->rxbuf = (u8 *)t->rx_buf; + qspi->rx_len = t->len; + mchp_coreqspi_read_op(qspi); } return 0; From 0b2eb1f8473eddeff5317e521498329581432f89 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Thu, 30 Apr 2026 11:10:20 +0100 Subject: [PATCH 282/972] spi: microchip-core-qspi: remove some inline markings Remove inline markings from a number of functions that are called as part of mem ops callbacks. None of them are either particularly trivial or sensitive to overhead of a function call. Just let the compiler decide what to do with them. Signed-off-by: Conor Dooley Link: https://patch.msgid.link/20260430-serpent-stimulate-59fb860ef429@spud Signed-off-by: Mark Brown --- drivers/spi/spi-microchip-core-qspi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-microchip-core-qspi.c b/drivers/spi/spi-microchip-core-qspi.c index 70215a407b5a32..4dee0fea1df8dd 100644 --- a/drivers/spi/spi-microchip-core-qspi.c +++ b/drivers/spi/spi-microchip-core-qspi.c @@ -197,7 +197,7 @@ static int mchp_coreqspi_setup(struct spi_device *spi) return 0; } -static inline void mchp_coreqspi_read_op(struct mchp_coreqspi *qspi) +static void mchp_coreqspi_read_op(struct mchp_coreqspi *qspi) { u32 control, data; @@ -233,7 +233,7 @@ static inline void mchp_coreqspi_read_op(struct mchp_coreqspi *qspi) } } -static inline void mchp_coreqspi_write_op(struct mchp_coreqspi *qspi) +static void mchp_coreqspi_write_op(struct mchp_coreqspi *qspi) { u32 control, data; @@ -261,7 +261,7 @@ static inline void mchp_coreqspi_write_op(struct mchp_coreqspi *qspi) } } -static inline void mchp_coreqspi_write_read_op(struct mchp_coreqspi *qspi) +static void mchp_coreqspi_write_read_op(struct mchp_coreqspi *qspi) { u32 control, data; @@ -419,7 +419,7 @@ static int mchp_coreqspi_setup_clock(struct mchp_coreqspi *qspi, struct spi_devi return 0; } -static inline void mchp_coreqspi_config_op(struct mchp_coreqspi *qspi, const struct spi_mem_op *op) +static void mchp_coreqspi_config_op(struct mchp_coreqspi *qspi, const struct spi_mem_op *op) { u32 idle_cycles = 0; int total_bytes, cmd_bytes, frames, ctrl; From d581fc99d3b958cb6e363104e9aab57f36aee6f3 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Thu, 23 Apr 2026 13:56:47 +0100 Subject: [PATCH 283/972] mm/memfd_luo: reject memfds whose page count exceeds UINT_MAX memfd_luo_preserve_folios() declares max_folios as unsigned int and computes it from the inode size, then passes it to memfd_pin_folios() which itself caps max_folios at unsigned int. For files whose base-page count exceeds UINT_MAX (larger than 16 TiB with 4 KiB pages), the assignment truncates silently: only a prefix of the file gets pinned and preserved, while memfd_luo_preserve() still records the full inode size in ser->size. On retrieve the inode is restored to the full size but only the preserved prefix repopulates the page cache, so the tail comes back as holes and user data is silently lost across the live update. Reject such files at preserve time with -EFBIG rather than chunk the pin loop, which would also require enlarging the preserved folios array well beyond what is practical. Fixes: b3749f174d68 ("mm: memfd_luo: allow preserving memfd") Signed-off-by: David Carlier Reviewed-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Link: https://patch.msgid.link/20260423125648.152113-1-devnexen@gmail.com Signed-off-by: Pasha Tatashin --- mm/memfd_luo.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c index 35d1247281e047..94ae113f68f675 100644 --- a/mm/memfd_luo.c +++ b/mm/memfd_luo.c @@ -259,7 +259,7 @@ static int memfd_luo_preserve(struct liveupdate_file_op_args *args) struct inode *inode = file_inode(args->file); struct memfd_luo_folio_ser *folios_ser; struct memfd_luo_ser *ser; - u64 nr_folios; + u64 nr_folios, inode_size; int err = 0, seals; inode_lock(inode); @@ -285,7 +285,18 @@ static int memfd_luo_preserve(struct liveupdate_file_op_args *args) } ser->pos = args->file->f_pos; - ser->size = i_size_read(inode); + inode_size = i_size_read(inode); + + /* + * memfd_pin_folios() caps at UINT_MAX folios; refuse larger + * files to avoid silently preserving only a prefix. + */ + if (DIV_ROUND_UP_ULL(inode_size, PAGE_SIZE) > UINT_MAX) { + err = -EFBIG; + goto err_free_ser; + } + + ser->size = inode_size; ser->seals = seals; err = memfd_luo_preserve_folios(args->file, &ser->folios, From 7b0b68b2b95606e65594958686833e53423f58f2 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Thu, 23 Apr 2026 13:56:48 +0100 Subject: [PATCH 284/972] mm/memfd_luo: document preservation of file seals Commit 8a552d68a86e ("mm: memfd_luo: preserve file seals") started preserving file seals across live update and restoring them via memfd_add_seals() on retrieve, but the DOC header was not updated and still listed seals under "Non-Preserved Properties" as being unsealed on restore. Move the Seals entry to the "Preserved Properties" section and describe the actual behavior, including the MEMFD_LUO_ALL_SEALS restriction that both preserve and retrieve enforce. Fixes: 8a552d68a86e ("mm: memfd_luo: preserve file seals") Signed-off-by: David Carlier Reviewed-by: Pratyush Yadav Reviewed-by: Pasha Tatashin Link: https://patch.msgid.link/20260423125648.152113-2-devnexen@gmail.com Signed-off-by: Pasha Tatashin --- mm/memfd_luo.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c index 94ae113f68f675..59de210bee5f9e 100644 --- a/mm/memfd_luo.c +++ b/mm/memfd_luo.c @@ -50,6 +50,11 @@ * memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This property * is maintained. * + * Seals + * File seals set on the memfd are preserved and re-applied on restore. + * Only seals known to this LUO version (see ``MEMFD_LUO_ALL_SEALS``) may + * be present; preservation fails with ``-EOPNOTSUPP`` otherwise. + * * Non-Preserved Properties * ======================== * @@ -61,10 +66,6 @@ * A memfd can be created with the ``MFD_CLOEXEC`` flag that sets the * ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set * again after restore via ``fcntl()``. - * - * Seals - * File seals are not preserved. The file is unsealed on restore and if - * needed, must be sealed again via ``fcntl()``. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt From 05c5078de822148e7cb84968a8783ddfcb6c9ef1 Mon Sep 17 00:00:00 2001 From: Nicolas Escande Date: Wed, 22 Apr 2026 18:32:58 +0200 Subject: [PATCH 285/972] wifi: ath12k: fix leak in some ath12k_wmi_xxx() functions Some wmi functions were using plain 'return ath12k_wmi_cmd_send(...)' without explicitly handling the error code. This leads to leaking the skb in case of error. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00218-QCAHKSWPL_SILICONZ-1 Fixes: 66a9448b1b89 ("wifi: ath12k: implement hardware data filter") Fixes: 593174170919 ("wifi: ath12k: implement WoW enable and wakeup commands") Fixes: 4a3c212eee0e ("wifi: ath12k: add basic WoW functionalities") Fixes: 16f474d6d49d ("wifi: ath12k: add WoW net-detect functionality") Fixes: 1666108c74c4 ("wifi: ath12k: support ARP and NS offload") Fixes: aab4ae566fa1 ("wifi: ath12k: support GTK rekey offload") Fixes: 7af01e569529 ("wifi: ath12k: handle keepalive during WoWLAN suspend and resume") Signed-off-by: Nicolas Escande Reviewed-by: Baochen Qiang Reviewed-by: Vasanthakumar Thiagarajan Link: https://patch.msgid.link/20260422163258.3013872-1-nico.escande@gmail.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/wmi.c | 103 ++++++++++++++++++++++---- 1 file changed, 88 insertions(+), 15 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index 65a05a9520ffcb..75c87edd2a8a51 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -10251,7 +10251,7 @@ int ath12k_wmi_hw_data_filter_cmd(struct ath12k *ar, struct wmi_hw_data_filter_a { struct wmi_hw_data_filter_cmd *cmd; struct sk_buff *skb; - int len; + int ret, len; len = sizeof(*cmd); skb = ath12k_wmi_alloc_skb(ar->wmi->wmi_ab, len); @@ -10275,7 +10275,13 @@ int ath12k_wmi_hw_data_filter_cmd(struct ath12k *ar, struct wmi_hw_data_filter_a "wmi hw data filter enable %d filter_bitmap 0x%x\n", arg->enable, arg->hw_filter_bitmap); - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_HW_DATA_FILTER_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_HW_DATA_FILTER_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_HW_DATA_FILTER_CMDID\n"); + dev_kfree_skb(skb); + } + + return ret; } int ath12k_wmi_wow_host_wakeup_ind(struct ath12k *ar) @@ -10283,6 +10289,7 @@ int ath12k_wmi_wow_host_wakeup_ind(struct ath12k *ar) struct wmi_wow_host_wakeup_cmd *cmd; struct sk_buff *skb; size_t len; + int ret; len = sizeof(*cmd); skb = ath12k_wmi_alloc_skb(ar->wmi->wmi_ab, len); @@ -10295,14 +10302,20 @@ int ath12k_wmi_wow_host_wakeup_ind(struct ath12k *ar) ath12k_dbg(ar->ab, ATH12K_DBG_WMI, "wmi tlv wow host wakeup ind\n"); - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_HOSTWAKEUP_FROM_SLEEP_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_HOSTWAKEUP_FROM_SLEEP_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_WOW_HOSTWAKEUP_FROM_SLEEP_CMDID\n"); + dev_kfree_skb(skb); + } + + return ret; } int ath12k_wmi_wow_enable(struct ath12k *ar) { struct wmi_wow_enable_cmd *cmd; struct sk_buff *skb; - int len; + int ret, len; len = sizeof(*cmd); skb = ath12k_wmi_alloc_skb(ar->wmi->wmi_ab, len); @@ -10317,7 +10330,13 @@ int ath12k_wmi_wow_enable(struct ath12k *ar) cmd->pause_iface_config = cpu_to_le32(WOW_IFACE_PAUSE_ENABLED); ath12k_dbg(ar->ab, ATH12K_DBG_WMI, "wmi tlv wow enable\n"); - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_ENABLE_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_ENABLE_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_WOW_ENABLE_CMDID\n"); + dev_kfree_skb(skb); + } + + return ret; } int ath12k_wmi_wow_add_wakeup_event(struct ath12k *ar, u32 vdev_id, @@ -10327,6 +10346,7 @@ int ath12k_wmi_wow_add_wakeup_event(struct ath12k *ar, u32 vdev_id, struct wmi_wow_add_del_event_cmd *cmd; struct sk_buff *skb; size_t len; + int ret; len = sizeof(*cmd); skb = ath12k_wmi_alloc_skb(ar->wmi->wmi_ab, len); @@ -10343,7 +10363,13 @@ int ath12k_wmi_wow_add_wakeup_event(struct ath12k *ar, u32 vdev_id, ath12k_dbg(ar->ab, ATH12K_DBG_WMI, "wmi tlv wow add wakeup event %s enable %d vdev_id %d\n", wow_wakeup_event(event), enable, vdev_id); - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_ENABLE_DISABLE_WAKE_EVENT_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_ENABLE_DISABLE_WAKE_EVENT_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_WOW_ENABLE_DISABLE_WAKE_EVENT_CMDID\n"); + dev_kfree_skb(skb); + } + + return ret; } int ath12k_wmi_wow_add_pattern(struct ath12k *ar, u32 vdev_id, u32 pattern_id, @@ -10356,6 +10382,7 @@ int ath12k_wmi_wow_add_pattern(struct ath12k *ar, u32 vdev_id, u32 pattern_id, struct sk_buff *skb; void *ptr; size_t len; + int ret; len = sizeof(*cmd) + sizeof(*tlv) + /* array struct */ @@ -10435,7 +10462,13 @@ int ath12k_wmi_wow_add_pattern(struct ath12k *ar, u32 vdev_id, u32 pattern_id, ath12k_dbg_dump(ar->ab, ATH12K_DBG_WMI, NULL, "wow bitmask: ", bitmap->bitmaskbuf, pattern_len); - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_ADD_WAKE_PATTERN_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_ADD_WAKE_PATTERN_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_WOW_ADD_WAKE_PATTERN_CMDID\n"); + dev_kfree_skb(skb); + } + + return ret; } int ath12k_wmi_wow_del_pattern(struct ath12k *ar, u32 vdev_id, u32 pattern_id) @@ -10443,6 +10476,7 @@ int ath12k_wmi_wow_del_pattern(struct ath12k *ar, u32 vdev_id, u32 pattern_id) struct wmi_wow_del_pattern_cmd *cmd; struct sk_buff *skb; size_t len; + int ret; len = sizeof(*cmd); skb = ath12k_wmi_alloc_skb(ar->wmi->wmi_ab, len); @@ -10459,7 +10493,13 @@ int ath12k_wmi_wow_del_pattern(struct ath12k *ar, u32 vdev_id, u32 pattern_id) ath12k_dbg(ar->ab, ATH12K_DBG_WMI, "wmi tlv wow del pattern vdev_id %d pattern_id %d\n", vdev_id, pattern_id); - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_DEL_WAKE_PATTERN_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_DEL_WAKE_PATTERN_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_WOW_DEL_WAKE_PATTERN_CMDID\n"); + dev_kfree_skb(skb); + } + + return ret; } static struct sk_buff * @@ -10595,6 +10635,7 @@ int ath12k_wmi_wow_config_pno(struct ath12k *ar, u32 vdev_id, struct wmi_pno_scan_req_arg *pno_scan) { struct sk_buff *skb; + int ret; if (pno_scan->enable) skb = ath12k_wmi_op_gen_config_pno_start(ar, vdev_id, pno_scan); @@ -10604,7 +10645,13 @@ int ath12k_wmi_wow_config_pno(struct ath12k *ar, u32 vdev_id, if (IS_ERR_OR_NULL(skb)) return -ENOMEM; - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_NETWORK_LIST_OFFLOAD_CONFIG_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_NETWORK_LIST_OFFLOAD_CONFIG_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_NETWORK_LIST_OFFLOAD_CONFIG_CMDID\n"); + dev_kfree_skb(skb); + } + + return ret; } static void ath12k_wmi_fill_ns_offload(struct ath12k *ar, @@ -10717,6 +10764,7 @@ int ath12k_wmi_arp_ns_offload(struct ath12k *ar, void *buf_ptr; size_t len; u8 ns_cnt, ns_ext_tuples = 0; + int ret; ns_cnt = offload->ipv6_count; @@ -10752,7 +10800,13 @@ int ath12k_wmi_arp_ns_offload(struct ath12k *ar, if (ns_ext_tuples) ath12k_wmi_fill_ns_offload(ar, offload, &buf_ptr, enable, 1); - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_SET_ARP_NS_OFFLOAD_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_SET_ARP_NS_OFFLOAD_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_SET_ARP_NS_OFFLOAD_CMDID\n"); + dev_kfree_skb(skb); + } + + return ret; } int ath12k_wmi_gtk_rekey_offload(struct ath12k *ar, @@ -10762,7 +10816,7 @@ int ath12k_wmi_gtk_rekey_offload(struct ath12k *ar, struct wmi_gtk_rekey_offload_cmd *cmd; struct sk_buff *skb; __le64 replay_ctr; - int len; + int ret, len; len = sizeof(*cmd); skb = ath12k_wmi_alloc_skb(ar->wmi->wmi_ab, len); @@ -10789,7 +10843,13 @@ int ath12k_wmi_gtk_rekey_offload(struct ath12k *ar, ath12k_dbg(ar->ab, ATH12K_DBG_WMI, "offload gtk rekey vdev: %d %d\n", arvif->vdev_id, enable); - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_GTK_OFFLOAD_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_GTK_OFFLOAD_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_GTK_OFFLOAD_CMDID offload\n"); + dev_kfree_skb(skb); + } + + return ret; } int ath12k_wmi_gtk_rekey_getinfo(struct ath12k *ar, @@ -10797,7 +10857,7 @@ int ath12k_wmi_gtk_rekey_getinfo(struct ath12k *ar, { struct wmi_gtk_rekey_offload_cmd *cmd; struct sk_buff *skb; - int len; + int ret, len; len = sizeof(*cmd); skb = ath12k_wmi_alloc_skb(ar->wmi->wmi_ab, len); @@ -10811,7 +10871,13 @@ int ath12k_wmi_gtk_rekey_getinfo(struct ath12k *ar, ath12k_dbg(ar->ab, ATH12K_DBG_WMI, "get gtk rekey vdev_id: %d\n", arvif->vdev_id); - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_GTK_OFFLOAD_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_GTK_OFFLOAD_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_GTK_OFFLOAD_CMDID getinfo\n"); + dev_kfree_skb(skb); + } + + return ret; } int ath12k_wmi_sta_keepalive(struct ath12k *ar, @@ -10822,6 +10888,7 @@ int ath12k_wmi_sta_keepalive(struct ath12k *ar, struct wmi_sta_keepalive_cmd *cmd; struct sk_buff *skb; size_t len; + int ret; len = sizeof(*cmd) + sizeof(*arp); skb = ath12k_wmi_alloc_skb(wmi->wmi_ab, len); @@ -10849,7 +10916,13 @@ int ath12k_wmi_sta_keepalive(struct ath12k *ar, "wmi sta keepalive vdev %d enabled %d method %d interval %d\n", arg->vdev_id, arg->enabled, arg->method, arg->interval); - return ath12k_wmi_cmd_send(wmi, skb, WMI_STA_KEEPALIVE_CMDID); + ret = ath12k_wmi_cmd_send(wmi, skb, WMI_STA_KEEPALIVE_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_STA_KEEPALIVE_CMDID\n"); + dev_kfree_skb(skb); + } + + return ret; } int ath12k_wmi_mlo_setup(struct ath12k *ar, struct wmi_mlo_setup_arg *mlo_params) From 81594a12d5cecb3ab35b603a00037c7c3ee87ab2 Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Mon, 27 Apr 2026 16:00:11 +0530 Subject: [PATCH 286/972] wifi: ath12k: initialize RSSI dBm conversion event state Currently, the RSSI dBm conversion event handler leaves struct ath12k_wmi_rssi_dbm_conv_info_arg uninitialized on the stack before calling the TLV parser. If one of the optional sub-TLVs is absent, the corresponding *_present flag retains stack garbage and later gets read in ath12k_wmi_update_rssi_offsets(). With UBSAN enabled this triggers an invalid-load report for _Bool: UBSAN: invalid-load in drivers/net/wireless/ath/ath12k/wmi.c:9682:15 load of value 9 is not a valid value for type '_Bool' Call Trace: ath12k_wmi_rssi_dbm_conversion_params_info_event.cold+0x72/0x85 [ath12k] ath12k_wmi_op_rx+0x1871/0x2ab0 [ath12k] ath12k_htc_rx_completion_handler+0x44b/0x810 [ath12k] ath12k_ce_recv_process_cb+0x554/0x9f0 [ath12k] ath12k_ce_per_engine_service+0xbe/0xf0 [ath12k] ath12k_pci_ce_workqueue+0x69/0x120 [ath12k] Initialize the parsed event state to zero before passing it to the TLV parser so missing sub-TLVs correctly leave the presence flags false. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.4.1-00199-QCAHKSWPL_SILICONZ-1 Fixes: 0314ee81a91d ("wifi: ath12k: handle WMI event for real noise floor calculation") Signed-off-by: Rameshkumar Sundaram Reviewed-by: Baochen Qiang Link: https://patch.msgid.link/20260427103011.2983269-1-rameshkumar.sundaram@oss.qualcomm.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/wmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index 75c87edd2a8a51..b5e904a55aeabb 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -9778,7 +9778,7 @@ static void ath12k_wmi_rssi_dbm_conversion_params_info_event(struct ath12k_base *ab, struct sk_buff *skb) { - struct ath12k_wmi_rssi_dbm_conv_info_arg rssi_info; + struct ath12k_wmi_rssi_dbm_conv_info_arg rssi_info = {}; struct ath12k *ar; s32 noise_floor; u32 pdev_id; From 0e1308803d2c3fd365a6d21e6be355ec1e28eaaf Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Mon, 27 Apr 2026 13:51:41 +0800 Subject: [PATCH 287/972] wifi: ath12k: fix peer_id usage in normal RX path ath12k_dp_rx_deliver_msdu() currently uses hal_rx_desc_data::peer_id parsed from mpdu_start descriptor to do peer lookup. However In an A-MSDU aggregation scenario, hardware only populates mpdu_start descriptor for the first sub-msdu, but not the following ones. In that case peer_id could be invalid, leading to peer lookup failure: ath12k_wifi7_pci 0000:06:00.0: rx skb 00000000c391c041 len 1532 peer (null) 0 ucast sn 0 eht320 rate_idx 12 vht_nss 2 freq 6105 band 3 flag 0x40d1a fcs-err 0 mic-err 0 amsdu-more 0 As a result pubsta is NULL and parts of ieee80211_rx_status structure are left uninitialized, which may cause unexpected behavior. Fix it by switching the normal RX path to use ath12k_skb_rxcb::peer_id which is parsed from REO ring's rx_mpdu_desc and is always valid. hal_rx_desc_data::peer_id is still used in ath12k_wifi7_dp_rx_frag_h_mpdu(), which is safe since A-MSDU aggregation does not occur for fragmented frames. Similarly, ath12k_skb_rxcb::peer_id may be overwritten by hal_rx_desc_data::peer_id in ath12k_wifi7_dp_rx_h_mpdu(), which only handles non-aggregated multicast/broadcast traffic. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00302-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1.115823.3 Fixes: 11157e0910fd ("wifi: ath12k: Use ath12k_dp_peer in per packet Tx & Rx paths") Signed-off-by: Baochen Qiang Reviewed-by: Rameshkumar Sundaram Link: https://patch.msgid.link/20260427-ath12k-fix-peer-id-source-v1-1-b5f701fb8e88@oss.qualcomm.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/dp_rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath12k/dp_rx.c b/drivers/net/wireless/ath/ath12k/dp_rx.c index 25557dea582679..b108ccd0f63703 100644 --- a/drivers/net/wireless/ath/ath12k/dp_rx.c +++ b/drivers/net/wireless/ath/ath12k/dp_rx.c @@ -1340,7 +1340,7 @@ void ath12k_dp_rx_deliver_msdu(struct ath12k_pdev_dp *dp_pdev, struct napi_struc bool is_mcbc = rxcb->is_mcbc; bool is_eapol = rxcb->is_eapol; - peer = ath12k_dp_peer_find_by_peerid(dp_pdev, rx_info->peer_id); + peer = ath12k_dp_peer_find_by_peerid(dp_pdev, rxcb->peer_id); pubsta = peer ? peer->sta : NULL; From d748603f12baff112caa3ab7d39f50100f010dbd Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Tue, 9 Dec 2025 11:04:59 +0100 Subject: [PATCH 288/972] wifi: ath5k: do not access array OOB Vincent reports: > The ath5k driver seems to do an array-index-out-of-bounds access as > shown by the UBSAN kernel message: > UBSAN: array-index-out-of-bounds in drivers/net/wireless/ath/ath5k/base.c:1741:20 > index 4 is out of range for type 'ieee80211_tx_rate [4]' > ... > Call Trace: > > dump_stack_lvl+0x5d/0x80 > ubsan_epilogue+0x5/0x2b > __ubsan_handle_out_of_bounds.cold+0x46/0x4b > ath5k_tasklet_tx+0x4e0/0x560 [ath5k] > tasklet_action_common+0xb5/0x1c0 It is real. 'ts->ts_final_idx' can be 3 on 5212, so: info->status.rates[ts->ts_final_idx + 1].idx = -1; with the array defined as: struct ieee80211_tx_rate rates[IEEE80211_TX_MAX_RATES]; while the size is: #define IEEE80211_TX_MAX_RATES 4 is indeed bogus. Set this 'idx = -1' sentinel only if the array index is less than the array size. As mac80211 will not look at rates beyond the size (IEEE80211_TX_MAX_RATES). Note: The effect of the OOB write is negligible. It just overwrites the next member of info->status, i.e. ack_signal. Signed-off-by: Jiri Slaby (SUSE) Reported-by: Vincent Danjean Link: https://lore.kernel.org/all/aQYUkIaT87ccDCin@eldamar.lan Closes: https://bugs.debian.org/1119093 Fixes: 6d7b97b23e11 ("ath5k: fix tx status reporting issues") Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20251209100459.2253198-1-jirislaby@kernel.org Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath5k/base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath5k/base.c b/drivers/net/wireless/ath/ath5k/base.c index 05c9c07591fcb1..6ca31d4ea437bd 100644 --- a/drivers/net/wireless/ath/ath5k/base.c +++ b/drivers/net/wireless/ath/ath5k/base.c @@ -1738,7 +1738,8 @@ ath5k_tx_frame_completed(struct ath5k_hw *ah, struct sk_buff *skb, } info->status.rates[ts->ts_final_idx].count = ts->ts_final_retry; - info->status.rates[ts->ts_final_idx + 1].idx = -1; + if (ts->ts_final_idx + 1 < IEEE80211_TX_MAX_RATES) + info->status.rates[ts->ts_final_idx + 1].idx = -1; if (unlikely(ts->ts_status)) { ah->stats.ack_fail++; From 2a46a9356ba7b1bdd741c8b41e5374edcd960557 Mon Sep 17 00:00:00 2001 From: "Kory Maincent (TI)" Date: Tue, 28 Apr 2026 11:04:56 +0200 Subject: [PATCH 289/972] drm/bridge: tda998x: Use __be32 for audio port OF property pointer of_get_property() returns a pointer to big-endian (__be32) data, but port_data in tda998x_get_audio_ports() was declared as const u32 *, causing a sparse endianness type mismatch warning. Fix the declaration to use const __be32 *. Fixes: 7e567624dc5a4 ("drm/i2c: tda998x: Register ASoC hdmi-codec and add audio DT binding") Cc: stable@vger.kernel.org Signed-off-by: Kory Maincent (TI) Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20260428090457.121894-1-kory.maincent@bootlin.com Signed-off-by: Luca Ceresoli --- drivers/gpu/drm/bridge/tda998x_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/bridge/tda998x_drv.c b/drivers/gpu/drm/bridge/tda998x_drv.c index d9b388165de151..779b976f601c1f 100644 --- a/drivers/gpu/drm/bridge/tda998x_drv.c +++ b/drivers/gpu/drm/bridge/tda998x_drv.c @@ -1762,7 +1762,7 @@ static const struct drm_bridge_funcs tda998x_bridge_funcs = { static int tda998x_get_audio_ports(struct tda998x_priv *priv, struct device_node *np) { - const u32 *port_data; + const __be32 *port_data; u32 size; int i; From b5d0ad616ca8dd8c7b6b24dc13012e342278a085 Mon Sep 17 00:00:00 2001 From: "Kory Maincent (TI)" Date: Fri, 17 Apr 2026 17:54:45 +0200 Subject: [PATCH 290/972] drm/bridge: tda998x: Return NULL instead of 0 in tda998x_edid_read() tda998x_edid_read() returns a const struct drm_edid pointer, but when tda998x_edid_delay_wait() fails (process killed while waiting for the HPD timeout), the integer literal 0 is returned instead of NULL, triggering a sparse warning: "Using plain integer as NULL pointer" Replace 0 with NULL to fix the sparse warning. Fixes: c76a8be4feec ("drm/bridge: tda998x: Add support for DRM_BRIDGE_ATTACH_NO_CONNECTOR") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202604172257.Imo6GOH9-lkp@intel.com/ Signed-off-by: Kory Maincent (TI) Reviewed-by: Luca Ceresoli Link: https://patch.msgid.link/20260417155446.1068893-1-kory.maincent@bootlin.com Signed-off-by: Luca Ceresoli --- drivers/gpu/drm/bridge/tda998x_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/bridge/tda998x_drv.c b/drivers/gpu/drm/bridge/tda998x_drv.c index 779b976f601c1f..6c427bc75896bc 100644 --- a/drivers/gpu/drm/bridge/tda998x_drv.c +++ b/drivers/gpu/drm/bridge/tda998x_drv.c @@ -1293,7 +1293,7 @@ static const struct drm_edid *tda998x_edid_read(struct tda998x_priv *priv, * can't handle signals gracefully. */ if (tda998x_edid_delay_wait(priv)) - return 0; + return NULL; if (priv->rev == TDA19988) reg_clear(priv, REG_TX4, TX4_PD_RAM); From f80785888f7c0980a49545b87a80e3817c9ed7c6 Mon Sep 17 00:00:00 2001 From: Anton Swart Date: Sun, 3 May 2026 23:15:17 +0200 Subject: [PATCH 291/972] ALSA: usb-audio: Add quirk flags for AlphaTheta EUPHONIA The AlphaTheta EUPHONIA (VID 0x2b73, PID 0x0047) is a USB Audio Class 2 DJ mixer that requires implicit feedback for full-duplex operation. The capture endpoint (0x83 IN, interface 2) acts as the implicit feedback source for the playback endpoint (0x03 OUT, interface 1), and the device firmware does not send isochronous data on the capture endpoint unless the host is simultaneously sending data on the playback endpoint, i.e. playback must be started first. Without QUIRK_FLAG_PLAYBACK_FIRST the kernel waits for capture URBs before submitting playback URBs, creating a deadlock: the device waits for playback data and the host waits for capture data. Without QUIRK_FLAG_GENERIC_IMPLICIT_FB the kernel does not detect the implicit feedback relationship between the two interfaces. The same flag combination is already used for the Behringer UMC202HD, UMC204HD and UMC404HD (0x1397:0x0507/0x0508/0x0509), which exhibit the identical implicit-feedback topology. Tested on Raspberry Pi 5 with kernel 6.12.75; continuous full-duplex streaming at 96 kHz / 24-bit, zero XRUNs. Signed-off-by: Anton Swart Link: https://patch.msgid.link/20260503211517.14332-1-anton.swart.jhb@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index f77ff05dff0be9..b17bdae09bc342 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2460,6 +2460,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_GENERIC_IMPLICIT_FB), DEVICE_FLG(0x2b53, 0x0031, /* Fiero SC-01 (firmware v1.1.0) */ QUIRK_FLAG_GENERIC_IMPLICIT_FB), + DEVICE_FLG(0x2b73, 0x0047, /* AlphaTheta EUPHONIA */ + QUIRK_FLAG_PLAYBACK_FIRST | QUIRK_FLAG_GENERIC_IMPLICIT_FB), DEVICE_FLG(0x2d95, 0x8011, /* VIVO USB-C HEADSET */ QUIRK_FLAG_CTL_MSG_DELAY_1M), DEVICE_FLG(0x2d95, 0x8021, /* VIVO USB-C-XE710 HEADSET */ From 0749daa8eb5ab90334aaad3b0671efd7150d43b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Sun, 3 May 2026 21:55:52 -0300 Subject: [PATCH 292/972] ALSA: firewire-tascam: Do not drop unread control events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tscm_hwdep_read_queue() copies as many queued control events as fit in the userspace buffer. When the buffer is smaller than the current contiguous queue segment, length is rounded down to the number of bytes that can be copied. However, after copying that shortened length, the code advances pull_pos to the original tail_pos, marking the whole contiguous segment as consumed. Any events between the copied portion and tail_pos are lost. Limit tail_pos to the position after the entries actually copied before updating pull_pos. When the whole segment fits, this is equivalent to the old tail_pos update; when the buffer is smaller, the remaining events stay queued for the next read. Fixes: a8c0d13267a4 ("ALSA: firewire-tascam: notify events of change of state for userspace applications") Cc: stable@vger.kernel.org Suggested-by: Takashi Sakamoto Signed-off-by: Cássio Gabriel Reviewed-by: Takashi Sakamoto Co-developed-by: Takashi Sakamoto Signed-off-by: Takashi Sakamoto Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20260503-alsa-firewire-tascam-read-queue-v2-1-126c6efd7642@gmail.com --- sound/firewire/tascam/tascam-hwdep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/firewire/tascam/tascam-hwdep.c b/sound/firewire/tascam/tascam-hwdep.c index 867b4ea1096e13..6270263e7bf48b 100644 --- a/sound/firewire/tascam/tascam-hwdep.c +++ b/sound/firewire/tascam/tascam-hwdep.c @@ -73,6 +73,7 @@ static long tscm_hwdep_read_queue(struct snd_tscm *tscm, char __user *buf, length = rounddown(remained, sizeof(*entries)); if (length == 0) break; + tail_pos = head_pos + length / sizeof(*entries); spin_unlock_irq(&tscm->lock); if (copy_to_user(pos, &entries[head_pos], length)) From db4cd7d6c571c5ec6b52a6e80ec59ad99c4aa1d6 Mon Sep 17 00:00:00 2001 From: Rong Zhang Date: Mon, 4 May 2026 19:38:05 +0800 Subject: [PATCH 293/972] ALSA: usb-audio: Add quirk flags for JBL Pebbles JBL Pebbles is a pair of desktop speakers with UAC interface. Its Playback and Capture mixers use linear volume with val = 0/999/1 and 0/3996/4. Meanwhile, the reported sample rates are truncated to multiples of 0x100 (i.e., 44100 => 44032), resulting in noisy kmsg, as a warning message is printed each time a stream is opened. Add a quirk table entry matching VID/PID=0x05fc/0x0231 and applying linear volume and sample rate quirk flags, so that it can work properly. Also note that the volume control knob on device is an incremental encoder. It does nothing but sends KEY_VOLUMEUP and KEY_VOLUMEDOWN per rotation, controlling the UAC Playback volume mixer indirectly. Hence, the linear volume quirk flags also enable the volume control knob to function properly. Quirky device sample: usb 5-1.1: new full-speed USB device number 12 using xhci_hcd usb 5-1.1: New USB device found, idVendor=05fc, idProduct=0231, bcdDevice= 1.00 usb 5-1.1: New USB device strings: Mfr=1, Product=2, SerialNumber=3 usb 5-1.1: Product: JBL Pebbles usb 5-1.1: Manufacturer: Harman International Industries usb 5-1.1: SerialNumber: 1.0.0 usb-storage 5-1.1:1.0: USB Mass Storage device detected scsi host0: usb-storage 5-1.1:1.0 usb 5-1.1: Found last interface = 1 usb 5-1.1: 2:1: add audio endpoint 0x5 usb 5-1.1: Creating new data endpoint #5 usb 5-1.1: 2:1 Set sample rate 44100, clock 0 usb 5-1.1: current rate 44032 is different from the runtime rate 44100 usb 5-1.1: 3:1: add audio endpoint 0x84 usb 5-1.1: Creating new data endpoint #84 usb 5-1.1: 3:1 Set sample rate 44100, clock 0 usb 5-1.1: current rate 44032 is different from the runtime rate 44100 usb 5-1.1: [2] FU [PCM Playback Switch] ch = 1, val = 0/1/1 usb 5-1.1: Warning! Unlikely big volume step count (=999), linear volume or wrong cval->res? usb 5-1.1: [2] FU [PCM Playback Volume] ch = 2, val = 0/999/1 usb 5-1.1: [5] FU [Mic Capture Switch] ch = 1, val = 0/1/1 usb 5-1.1: Warning! Unlikely big volume step count (=999), linear volume or wrong cval->res? usb 5-1.1: [5] FU [Mic Capture Volume] ch = 2, val = 0/3996/4 input: Harman International Industries JBL Pebbles as /devices/pci0000:00/0000:00:08.3/0000:67:00.3/usb5/5-1/5-1.1/5-1.1:1.4/0003:05FC:0231.0018/input/input55 hid-generic 0003:05FC:0231.0018: input,hidraw2: USB HID v2.01 Device [Harman International Industries JBL Pebbles] on usb-0000:67:00.3-1.1/input4 Signed-off-by: Rong Zhang Link: https://patch.msgid.link/20260504-uac-jbl-pebbles-v1-1-c888d592a286@rong.moe Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index b17bdae09bc342..17983d9774f844 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2277,6 +2277,9 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_ALIGN_TRANSFER), DEVICE_FLG(0x05e1, 0x0480, /* Hauppauge Woodbury */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), + DEVICE_FLG(0x05fc, 0x0231, /* JBL Pebbles */ + QUIRK_FLAG_MIXER_PLAYBACK_LINEAR_VOL | QUIRK_FLAG_MIXER_CAPTURE_LINEAR_VOL | + QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x0624, 0x3d3f, /* AB13X USB Audio */ QUIRK_FLAG_FORCE_IFACE_RESET | QUIRK_FLAG_IFACE_DELAY), DEVICE_FLG(0x0644, 0x8043, /* TEAC UD-501/UD-501V2/UD-503/NT-503 */ From dc1e0172be54e742bccb28d5f14c0c395e28c098 Mon Sep 17 00:00:00 2001 From: Fernando Antunez Antonio Date: Mon, 4 May 2026 07:33:26 -0600 Subject: [PATCH 294/972] ALSA: hda/realtek: Fix mute and mic-mute LEDs for HP Envy X360 15-fh0xxx This enables the mute and mic-mute LEDs on the HP Envy X360 15-fh0xxx 2-in-1 laptops. The quirk 'ALC245_FIXUP_HP_ENVY_X360_15_FH0XXX' has been created and is now enabled for this device. This is my first patch, and I'm still getting to grips with the code, so there's probably a better way to implement this fix. I apologize for any inconvenience caused by the constant release of new versions of this patch. Signed-off-by: Fernando Antunez Antonio Link: https://patch.msgid.link/20260504-hpenvy-muteled-fix-v3-1-5567fd9b3d25@gmail.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index e061673d3c2eb7..a7525ed83e2be3 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -4137,6 +4137,7 @@ enum { ALC245_FIXUP_CS35L41_I2C_2_MUTE_LED, ALC236_FIXUP_HP_DMIC, ALC256_FIXUP_HONOR_MRB_XXX_M1020_AUDIO, + ALC245_FIXUP_HP_ENVY_X360_15_FH0XXX, }; /* A special fixup for Lenovo C940 and Yoga Duet 7; @@ -6693,6 +6694,12 @@ static const struct hda_fixup alc269_fixups[] = { { 0x1b, 0x90170110 }, { } } + }, + [ALC245_FIXUP_HP_ENVY_X360_15_FH0XXX] = { + .type = HDA_FIXUP_FUNC, + .v.func = cs35l41_fixup_i2c_two, + .chained = true, + .chain_id = ALC245_FIXUP_HP_X360_MUTE_LEDS } }; @@ -7115,7 +7122,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8be6, "HP Envy 16", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8be7, "HP Envy 17", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8be8, "HP Envy 17", ALC287_FIXUP_CS35L41_I2C_2), - SND_PCI_QUIRK(0x103c, 0x8be9, "HP Envy 15", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x103c, 0x8be9, "HP Envy x360 2-in-1 Laptop 15-fh0xxx", ALC245_FIXUP_HP_ENVY_X360_15_FH0XXX), SND_PCI_QUIRK(0x103c, 0x8bf0, "HP", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c15, "HP Spectre x360 2-in-1 Laptop 14-eu0xxx", ALC245_FIXUP_HP_SPECTRE_X360_EU0XXX), SND_PCI_QUIRK(0x103c, 0x8c16, "HP Spectre x360 2-in-1 Laptop 16-aa0xxx", ALC245_FIXUP_HP_SPECTRE_X360_16_AA0XXX), From f3c57c9c2a49a21d784b7c04a2c883bffc070659 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Mon, 4 May 2026 11:08:45 -0300 Subject: [PATCH 295/972] ALSA: usb-audio: midi2: Restart output URBs on resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit USB MIDI 2.0 suspend saves the endpoint running state, clears it and kills all endpoint URBs. Resume restores the running state, but only restarts input endpoints. For a running output endpoint, this leaves the endpoint marked running with an empty URB queue. Output transfer progress depends on either the rawmidi trigger path starting the queue or an output completion refilling it. After suspend there is no completion left, and output data that remains queued in the raw UMP or legacy rawmidi buffer can stay stalled until userspace happens to trigger the stream again. Restore the saved state with atomic accessors, keep input endpoints restarted as before, and restart output endpoints that were running before suspend. Clear the saved suspend state after restoring it. Fixes: ff49d1df79ae ("ALSA: usb-audio: USB MIDI 2.0 UMP support") Cc: stable@vger.kernel.org Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260504-usb-midi2-output-resume-v1-1-c089cc8ad3c6@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/midi2.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sound/usb/midi2.c b/sound/usb/midi2.c index 3546ba926cb310..2785600d23124a 100644 --- a/sound/usb/midi2.c +++ b/sound/usb/midi2.c @@ -227,7 +227,7 @@ static void kill_midi_urbs(struct snd_usb_midi2_endpoint *ep, bool suspending) if (!ep) return; if (suspending) - ep->suspended = ep->running; + atomic_set(&ep->suspended, atomic_read(&ep->running)); atomic_set(&ep->running, 0); for (i = 0; i < ep->num_urbs; i++) { if (!ep->urbs[i].urb) @@ -1188,10 +1188,11 @@ void snd_usb_midi_v2_suspend_all(struct snd_usb_audio *chip) static void resume_midi2_endpoint(struct snd_usb_midi2_endpoint *ep) { - ep->running = ep->suspended; - if (ep->direction == STR_IN) + atomic_set(&ep->running, atomic_read(&ep->suspended)); + atomic_set(&ep->suspended, 0); + + if (ep->direction == STR_IN || atomic_read(&ep->running)) submit_io_urbs(ep); - /* FIXME: does it all? */ } void snd_usb_midi_v2_resume_all(struct snd_usb_audio *chip) From 320e55722ca466a7d40dd69e1aea982cb6189006 Mon Sep 17 00:00:00 2001 From: Nicola Lunghi Date: Mon, 4 May 2026 16:45:20 +0200 Subject: [PATCH 296/972] ALSA: usb-audio: add clock quirk for Motu 1248 The Motu 1248 (and probably other older Motu AVB interfaces) take more than 2 seconds to switch clock. During the clock switching process the device return that the clock is not valid. This is similar to what already implemented for the Microbook II interface. Add the Motu 1248 usb id to the existing Motu quirk. Signed-off-by: Nicola Lunghi Link: https://patch.msgid.link/20260504144520.699522-2-nick83ola@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/clock.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/sound/usb/clock.c b/sound/usb/clock.c index 842ba5b801eae8..2e0c18e3528123 100644 --- a/sound/usb/clock.c +++ b/sound/usb/clock.c @@ -208,11 +208,18 @@ static bool uac_clock_source_is_valid_quirk(struct snd_usb_audio *chip, } /* - * MOTU MicroBook IIc - * Sample rate changes takes more than 2 seconds for this device. Clock - * validity request returns false during that period. + * Quirk for older MOTU AVB / hybrid interfaces + * + * These devices take more than 2 seconds to switch sample rate or + * clock source. During this period the clock validity request + * returns false, causing ALSA to fail prematurely. + * + * Affected models (all use vendor 0x07fd): + * - MicroBook IIc → 0x0004 + * - 1248, 624, 8A, UltraLite AVB, 8M, 16A, ... → 0x0005 */ - if (chip->usb_id == USB_ID(0x07fd, 0x0004)) { + if (chip->usb_id == USB_ID(0x07fd, 0x0004) || /* MicroBook IIc */ + chip->usb_id == USB_ID(0x07fd, 0x0005)) { /* 1248 / 624 / 8A / UltraLite AVB / ... */ count = 0; while ((!ret) && (count < 50)) { From 17e4c68ff35090d8cb743e3c82c09f92fda1ebda Mon Sep 17 00:00:00 2001 From: David Gow Date: Sat, 25 Apr 2026 11:41:53 +0800 Subject: [PATCH 297/972] kunit: config: Enable KUNIT_DEBUGFS by default The KUNIT_DEBUGFS option is currently enabled based on the value of KUNIT_ALL_TESTS, but it really doesn't have anything to do with the set of enabled tests, so just enable it by default anyway. In particular, this shouldn't be only visible if KUNIT_ALL_TESTS is set, which is quite confusing. Link: https://lore.kernel.org/r/20260425034155.53913-1-david@davidgow.net Fixes: beaed42c427d ("kunit: default KUNIT_* fragments to KUNIT_ALL_TESTS") Signed-off-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/kunit/Kconfig b/lib/kunit/Kconfig index 498cc51e493dc9..f80ca3aeedb05d 100644 --- a/lib/kunit/Kconfig +++ b/lib/kunit/Kconfig @@ -16,8 +16,8 @@ menuconfig KUNIT if KUNIT config KUNIT_DEBUGFS - bool "KUnit - Enable /sys/kernel/debug/kunit debugfs representation" if !KUNIT_ALL_TESTS - default KUNIT_ALL_TESTS + bool "KUnit - Enable /sys/kernel/debug/kunit debugfs representation" + default y help Enable debugfs representation for kunit. Currently this consists of /sys/kernel/debug/kunit//results files for each From 8f80b5b227ef9ea422080487715c841856339aed Mon Sep 17 00:00:00 2001 From: David Gow Date: Sat, 25 Apr 2026 11:41:54 +0800 Subject: [PATCH 298/972] kunit: config: KUNIT_DEBUGFS should depend on DEBUG_FS CONFIG_KUNIT_DEBUGFS is totally useless without debugfs, so it should depend on CONFIG_DEBUG_FS. Link: https://lore.kernel.org/r/20260425034155.53913-2-david@davidgow.net Fixes: e2219db280e3 ("kunit: add debugfs /sys/kernel/debug/kunit//results display") Signed-off-by: David Gow Signed-off-by: Shuah Khan --- lib/kunit/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/kunit/Kconfig b/lib/kunit/Kconfig index f80ca3aeedb05d..94ff8e4089bfbd 100644 --- a/lib/kunit/Kconfig +++ b/lib/kunit/Kconfig @@ -17,6 +17,7 @@ if KUNIT config KUNIT_DEBUGFS bool "KUnit - Enable /sys/kernel/debug/kunit debugfs representation" + depends on DEBUG_FS default y help Enable debugfs representation for kunit. Currently this consists From 93618edf753838a727dbff63c7c291dee22d656b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 1 May 2026 08:31:22 -1000 Subject: [PATCH 299/972] cgroup: Defer css percpu_ref kill on rmdir until cgroup is depopulated A chain of commits going back to v7.0 reworked rmdir to satisfy the controller invariant that a subsystem's ->css_offline() must not run while tasks are still doing kernel-side work in the cgroup. [1] d245698d727a ("cgroup: Defer task cgroup unlink until after the task is done switching out") [2] a72f73c4dd9b ("cgroup: Don't expose dead tasks in cgroup") [3] 1b164b876c36 ("cgroup: Wait for dying tasks to leave on rmdir") [4] 4c56a8ac6869 ("cgroup: Fix cgroup_drain_dying() testing the wrong condition") [5] 13e786b64bd3 ("cgroup: Increment nr_dying_subsys_* from rmdir context") [1] moved task cset unlink from do_exit() to finish_task_switch() so a task's cset link drops only after the task has fully stopped scheduling. That made tasks past exit_signals() linger on cset->tasks until their final context switch, which led to a series of problems as what userspace expected to see after rmdir diverged from what the kernel needs to wait for. [2]-[5] tried to bridge that divergence: [2] filtered the exiting tasks from cgroup.procs; [3] had rmdir(2) sleep in TASK_UNINTERRUPTIBLE for them; [4] fixed the wait's condition; [5] made nr_dying_subsys_* visible synchronously. The cgroup_drain_dying() wait in [3] turned out to be a dead end. When the rmdir caller is also the reaper of a zombie that pins a pidns teardown (e.g. host PID 1 systemd reaping orphan pids that were re-parented to it during the same teardown), rmdir blocks in TASK_UNINTERRUPTIBLE waiting for those pids to free, the pids can't free because PID 1 is the reaper and it's stuck in rmdir, and the system A-A deadlocks. No internal lock ordering breaks this; the wait itself is the bug. The css killing side that drove the original reorder, however, can be made cleanly asynchronous: ->css_offline() is already async, run from css_killed_work_fn() driven by percpu_ref_kill_and_confirm(). The fix is to make that chain start only after all tasks have left the cgroup. rmdir's user-visible side then returns as soon as cgroup.procs and friends are empty, while ->css_offline() still runs only after the cgroup is fully drained. Verified by the original reproducer (pidns teardown + zombie reaper, runs under vng) which hangs vanilla and succeeds here, and by per-commit deterministic repros for [2], [3], [4], [5] with a boot parameter that widens the post-exit_signals() window so each state is reliably reachable. Some stress tests on top of that. cgroup_apply_control_disable() has the same shape of pre-existing race: when a controller is disabled via subtree_control, kill_css() ran synchronously while tasks past exit_signals() could still be linked to the cgroup's csets, and ->css_offline() could fire before they drained. This patch preserves the existing synchronous behavior at that call site (kill_css_sync() + kill_css_finish() back-to-back) and a follow-up patch will defer kill_css_finish() there using a per-css trigger. This seems like the right approach and I don't see problems with it. The changes are somewhat invasive but not excessively so, so backporting to -stable should be okay. If something does turn out to be wrong, the fallback is to revert the entire chain ([1]-[5]) and rework in the development branch instead. v2: Pin cgrp across the deferred destroy work with explicit cgroup_get()/cgroup_put() around queue_work() and the work_fn. v1 wasn't actually broken (ordered cgroup_offline_wq + queue_work order in cgroup_task_dead() saved it) but the explicit ref removes the dependency on those non-obvious invariants. Also note the pre-existing cgroup_apply_control_disable() race in the description; a follow-up will defer kill_css_finish() there. Fixes: 1b164b876c36 ("cgroup: Wait for dying tasks to leave on rmdir") Cc: stable@vger.kernel.org # v7.0+ Reported-and-tested-by: Martin Pitt Link: https://lore.kernel.org/all/afHNg2VX2jy9bW7y@piware.de/ Link: https://lore.kernel.org/all/35e0670adb4abeab13da2c321582af9f@kernel.org/ Signed-off-by: Tejun Heo Acked-by: Sebastian Andrzej Siewior --- include/linux/cgroup-defs.h | 4 +- kernel/cgroup/cgroup.c | 250 +++++++++++++++++------------------- 2 files changed, 119 insertions(+), 135 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index f42563739d2e53..50a784da7a81aa 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -611,8 +611,8 @@ struct cgroup { /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; - /* used by cgroup_rmdir() to wait for dying tasks to leave */ - wait_queue_head_t dying_populated_waitq; + /* defers killing csses after removal until cgroup is depopulated */ + struct work_struct finish_destroy_work; /* used to schedule release agent */ struct work_struct release_agent_work; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c928dea9dea6bd..bd10a7e2f9c550 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -264,10 +264,12 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); +static void cgroup_finish_destroy(struct cgroup *cgrp); +static void kill_css_sync(struct cgroup_subsys_state *css); +static void kill_css_finish(struct cgroup_subsys_state *css); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); -static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); @@ -797,6 +799,16 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) if (was_populated == cgroup_is_populated(cgrp)) break; + /* + * Subtree just emptied below an offlined cgrp. Fire deferred + * destroy. The transition is one-shot. + */ + if (was_populated && !css_is_online(&cgrp->self)) { + cgroup_get(cgrp); + WARN_ON_ONCE(!queue_work(cgroup_offline_wq, + &cgrp->finish_destroy_work)); + } + cgroup1_check_for_release(cgrp); TRACE_CGROUP_PATH(notify_populated, cgrp, cgroup_is_populated(cgrp)); @@ -2039,6 +2051,16 @@ static int cgroup_reconfigure(struct fs_context *fc) return 0; } +static void cgroup_finish_destroy_work_fn(struct work_struct *work) +{ + struct cgroup *cgrp = container_of(work, struct cgroup, finish_destroy_work); + + cgroup_lock(); + cgroup_finish_destroy(cgrp); + cgroup_unlock(); + cgroup_put(cgrp); +} + static void init_cgroup_housekeeping(struct cgroup *cgrp) { struct cgroup_subsys *ss; @@ -2065,7 +2087,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) #endif init_waitqueue_head(&cgrp->offline_waitq); - init_waitqueue_head(&cgrp->dying_populated_waitq); + INIT_WORK(&cgrp->finish_destroy_work, cgroup_finish_destroy_work_fn); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } @@ -3375,7 +3397,8 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp) if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { - kill_css(css); + kill_css_sync(css); + kill_css_finish(css); } else if (!css_visible(css)) { css_clear_dir(css); if (ss->css_reset) @@ -5514,7 +5537,7 @@ static struct cftype cgroup_psi_files[] = { * css destruction is four-stage process. * * 1. Destruction starts. Killing of the percpu_ref is initiated. - * Implemented in kill_css(). + * Implemented in kill_css_finish(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs * and thus css_tryget_online() is guaranteed to fail, the css can be @@ -5993,7 +6016,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget_online() is now guaranteed to fail. Tell the subsystem to - * initiate destruction and put the css ref from kill_css(). + * initiate destruction and put the css ref from kill_css_finish(). */ static void css_killed_work_fn(struct work_struct *work) { @@ -6025,15 +6048,12 @@ static void css_killed_ref_fn(struct percpu_ref *ref) } /** - * kill_css - destroy a css - * @css: css to destroy + * kill_css_sync - synchronous half of css teardown + * @css: css being killed * - * This function initiates destruction of @css by removing cgroup interface - * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget_online() is guaranteed to fail and when - * the reference count reaches zero, @css will be released. + * See cgroup_destroy_locked(). */ -static void kill_css(struct cgroup_subsys_state *css) +static void kill_css_sync(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; @@ -6056,24 +6076,6 @@ static void kill_css(struct cgroup_subsys_state *css) */ css_clear_dir(css); - /* - * Killing would put the base ref, but we need to keep it alive - * until after ->css_offline(). - */ - css_get(css); - - /* - * cgroup core guarantees that, by the time ->css_offline() is - * invoked, no new css reference will be given out via - * css_tryget_online(). We can't simply call percpu_ref_kill() and - * proceed to offlining css's because percpu_ref_kill() doesn't - * guarantee that the ref is seen as killed on all CPUs on return. - * - * Use percpu_ref_kill_and_confirm() to get notifications as each - * css is confirmed to be seen as killed on all CPUs. - */ - percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); - css->cgroup->nr_dying_subsys[ss->id]++; /* * Parent css and cgroup cannot be freed until after the freeing @@ -6086,44 +6088,88 @@ static void kill_css(struct cgroup_subsys_state *css) } /** - * cgroup_destroy_locked - the first stage of cgroup destruction + * kill_css_finish - deferred half of css teardown + * @css: css being killed + * + * See cgroup_destroy_locked(). + */ +static void kill_css_finish(struct cgroup_subsys_state *css) +{ + lockdep_assert_held(&cgroup_mutex); + + /* + * Skip on re-entry: cgroup_apply_control_disable() may have killed @css + * earlier. cgroup_destroy_locked() can still walk it because + * offline_css() (which NULLs cgrp->subsys[ssid]) runs async. + */ + if (percpu_ref_is_dying(&css->refcnt)) + return; + + /* + * Killing would put the base ref, but we need to keep it alive until + * after ->css_offline(). + */ + css_get(css); + + /* + * cgroup core guarantees that, by the time ->css_offline() is invoked, + * no new css reference will be given out via css_tryget_online(). We + * can't simply call percpu_ref_kill() and proceed to offlining css's + * because percpu_ref_kill() doesn't guarantee that the ref is seen as + * killed on all CPUs on return. + * + * Use percpu_ref_kill_and_confirm() to get notifications as each css is + * confirmed to be seen as killed on all CPUs. + */ + percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); +} + +/** + * cgroup_destroy_locked - destroy @cgrp (called on rmdir) * @cgrp: cgroup to be destroyed * - * css's make use of percpu refcnts whose killing latency shouldn't be - * exposed to userland and are RCU protected. Also, cgroup core needs to - * guarantee that css_tryget_online() won't succeed by the time - * ->css_offline() is invoked. To satisfy all the requirements, - * destruction is implemented in the following two steps. - * - * s1. Verify @cgrp can be destroyed and mark it dying. Remove all - * userland visible parts and start killing the percpu refcnts of - * css's. Set up so that the next stage will be kicked off once all - * the percpu refcnts are confirmed to be killed. - * - * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the - * rest of destruction. Once all cgroup references are gone, the - * cgroup is RCU-freed. - * - * This function implements s1. After this step, @cgrp is gone as far as - * the userland is concerned and a new cgroup with the same name may be - * created. As cgroup doesn't care about the names internally, this - * doesn't cause any problem. + * Tear down @cgrp on behalf of rmdir. Constraints: + * + * - Userspace: rmdir must succeed when cgroup.procs and friends are empty. + * + * - Kernel: subsystem ->css_offline() must not run while any task in @cgrp's + * subtree is still doing kernel work. A task hidden from cgroup.procs (past + * exit_signals() with signal->live cleared) can still schedule, allocate, and + * consume resources until its final context switch. Dying descendants in the + * subtree can host such tasks too. + * + * - Kernel: css_tryget_online() must fail by the time ->css_offline() runs. + * + * The destruction runs in three parts: + * + * - This function: synchronous user-visible state teardown plus kill_css_sync() + * on each subsystem css. + * + * - cgroup_finish_destroy(): kicks the percpu_ref kill via kill_css_finish() on + * each subsystem css. Fires once @cgrp's subtree is fully drained, either + * inline here or from cgroup_update_populated(). + * + * - The percpu_ref kill chain: css_killed_ref_fn -> css_killed_work_fn -> + * ->css_offline() -> release/free. + * + * Return 0 on success, -EBUSY if a userspace-visible task or an online child + * remains. */ static int cgroup_destroy_locked(struct cgroup *cgrp) - __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; + struct css_task_iter it; + struct task_struct *task; int ssid, ret; lockdep_assert_held(&cgroup_mutex); - /* - * Only migration can raise populated from zero and we're already - * holding cgroup_mutex. - */ - if (cgroup_is_populated(cgrp)) + css_task_iter_start(&cgrp->self, 0, &it); + task = css_task_iter_next(&it); + css_task_iter_end(&it); + if (task) return -EBUSY; /* @@ -6147,9 +6193,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) link->cset->dead = true; spin_unlock_irq(&css_set_lock); - /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) - kill_css(css); + kill_css_sync(css); /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ css_clear_dir(&cgrp->self); @@ -6180,79 +6225,27 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); + if (!cgroup_is_populated(cgrp)) + cgroup_finish_destroy(cgrp); + return 0; }; /** - * cgroup_drain_dying - wait for dying tasks to leave before rmdir - * @cgrp: the cgroup being removed - * - * cgroup.procs and cgroup.threads use css_task_iter which filters out - * PF_EXITING tasks so that userspace doesn't see tasks that have already been - * reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the - * cgroup has non-empty css_sets - is only updated when dying tasks pass through - * cgroup_task_dead() in finish_task_switch(). This creates a window where - * cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir - * fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no - * tasks. - * - * This function aligns cgroup_has_tasks() with what userspace can observe. If - * cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are - * PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the - * window between PF_EXITING and cgroup_task_dead() is short, the wait is brief. + * cgroup_finish_destroy - deferred half of @cgrp destruction + * @cgrp: cgroup whose subtree just became empty * - * This function only concerns itself with this cgroup's own dying tasks. - * Whether the cgroup has children is cgroup_destroy_locked()'s problem. - * - * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we - * retry the full check from scratch. - * - * Must be called with cgroup_mutex held. + * See cgroup_destroy_locked() for the rationale. */ -static int cgroup_drain_dying(struct cgroup *cgrp) - __releases(&cgroup_mutex) __acquires(&cgroup_mutex) +static void cgroup_finish_destroy(struct cgroup *cgrp) { - struct css_task_iter it; - struct task_struct *task; - DEFINE_WAIT(wait); + struct cgroup_subsys_state *css; + int ssid; lockdep_assert_held(&cgroup_mutex); -retry: - if (!cgroup_has_tasks(cgrp)) - return 0; - - /* Same iterator as cgroup.threads - if any task is visible, it's busy */ - css_task_iter_start(&cgrp->self, 0, &it); - task = css_task_iter_next(&it); - css_task_iter_end(&it); - - if (task) - return -EBUSY; - /* - * All remaining tasks are PF_EXITING and will pass through - * cgroup_task_dead() shortly. Wait for a kick and retry. - * - * cgroup_has_tasks() can't transition from false to true while we're - * holding cgroup_mutex, but the true to false transition happens - * under css_set_lock (via cgroup_task_dead()). We must retest and - * prepare_to_wait() under css_set_lock. Otherwise, the transition - * can happen between our first test and prepare_to_wait(), and we - * sleep with no one to wake us. - */ - spin_lock_irq(&css_set_lock); - if (!cgroup_has_tasks(cgrp)) { - spin_unlock_irq(&css_set_lock); - return 0; - } - prepare_to_wait(&cgrp->dying_populated_waitq, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); - schedule(); - finish_wait(&cgrp->dying_populated_waitq, &wait); - mutex_lock(&cgroup_mutex); - goto retry; + for_each_css(css, ssid, cgrp) + kill_css_finish(css); } int cgroup_rmdir(struct kernfs_node *kn) @@ -6264,12 +6257,9 @@ int cgroup_rmdir(struct kernfs_node *kn) if (!cgrp) return 0; - ret = cgroup_drain_dying(cgrp); - if (!ret) { - ret = cgroup_destroy_locked(cgrp); - if (!ret) - TRACE_CGROUP_PATH(rmdir, cgrp); - } + ret = cgroup_destroy_locked(cgrp); + if (!ret) + TRACE_CGROUP_PATH(rmdir, cgrp); cgroup_kn_unlock(kn); return ret; @@ -7029,7 +7019,6 @@ void cgroup_task_exit(struct task_struct *tsk) static void do_cgroup_task_dead(struct task_struct *tsk) { - struct cgrp_cset_link *link; struct css_set *cset; unsigned long flags; @@ -7043,11 +7032,6 @@ static void do_cgroup_task_dead(struct task_struct *tsk) if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live)) list_add_tail(&tsk->cg_list, &cset->dying_tasks); - /* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */ - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) - if (waitqueue_active(&link->cgrp->dying_populated_waitq)) - wake_up(&link->cgrp->dying_populated_waitq); - if (dl_task(tsk)) dec_dl_tasks_cs(tsk); From 360f61bc29085d65a24dbaaf707c802553a239fd Mon Sep 17 00:00:00 2001 From: Peter Rosin Date: Thu, 30 Apr 2026 06:09:58 +0200 Subject: [PATCH 300/972] MAINTAINERS: Update mail for Peter Rosin I'm resigning from my position at Axentia. Signed-off-by: Peter Rosin Signed-off-by: Wolfram Sang --- .mailmap | 1 + MAINTAINERS | 24 +++++++++++------------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/.mailmap b/.mailmap index b78aa092b4bb85..eec4a740f7ca0b 100644 --- a/.mailmap +++ b/.mailmap @@ -682,6 +682,7 @@ Peter A Jonsson Peter Hilber Peter Oruba Peter Oruba +Peter Rosin Pierre-Louis Bossart Pratyush Anand Pratyush Yadav diff --git a/MAINTAINERS b/MAINTAINERS index 882214b0e7db53..1b0c453bca464d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4299,18 +4299,16 @@ F: Documentation/devicetree/bindings/leds/backlight/awinic,aw99706.yaml F: drivers/video/backlight/aw99706.c AXENTIA ARM DEVICES -M: Peter Rosin L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -S: Maintained +S: Orphan F: arch/arm/boot/dts/microchip/at91-linea.dtsi F: arch/arm/boot/dts/microchip/at91-natte.dtsi F: arch/arm/boot/dts/microchip/at91-nattis-2-natte-2.dts F: arch/arm/boot/dts/microchip/at91-tse850-3.dts AXENTIA ASOC DRIVERS -M: Peter Rosin L: linux-sound@vger.kernel.org -S: Maintained +S: Orphan F: Documentation/devicetree/bindings/sound/axentia,* F: sound/soc/atmel/tse850-pcm5142.c @@ -12046,7 +12044,7 @@ F: Documentation/i2c/busses/i2c-nvidia-gpu.rst F: drivers/i2c/busses/i2c-nvidia-gpu.c I2C MUXES -M: Peter Rosin +M: Peter Rosin L: linux-i2c@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/i2c/i2c-arb* @@ -12447,7 +12445,7 @@ F: drivers/iio/industrialio-backend.c F: include/linux/iio/backend.h IIO DIGITAL POTENTIOMETER DAC -M: Peter Rosin +M: Peter Rosin L: linux-iio@vger.kernel.org S: Maintained F: Documentation/ABI/testing/sysfs-bus-iio-dac-dpot-dac @@ -12455,7 +12453,7 @@ F: Documentation/devicetree/bindings/iio/dac/dpot-dac.yaml F: drivers/iio/dac/dpot-dac.c IIO ENVELOPE DETECTOR -M: Peter Rosin +M: Peter Rosin L: linux-iio@vger.kernel.org S: Maintained F: Documentation/ABI/testing/sysfs-bus-iio-adc-envelope-detector @@ -12471,7 +12469,7 @@ F: include/linux/iio/iio-gts-helper.h F: drivers/iio/test/iio-test-gts.c IIO MULTIPLEXER -M: Peter Rosin +M: Peter Rosin L: linux-iio@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/iio/multiplexer/io-channel-mux.yaml @@ -12502,7 +12500,7 @@ F: include/linux/iio/ F: tools/iio/ IIO UNIT CONVERTER -M: Peter Rosin +M: Peter Rosin L: linux-iio@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/iio/afe/current-sense-amplifier.yaml @@ -15718,7 +15716,7 @@ F: Documentation/devicetree/bindings/media/i2c/maxim,max96717.yaml F: drivers/media/i2c/max96717.c MAX9860 MONO AUDIO VOICE CODEC DRIVER -M: Peter Rosin +M: Peter Rosin L: linux-sound@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/sound/max9860.txt @@ -15933,7 +15931,7 @@ F: Documentation/devicetree/bindings/net/can/microchip,mcp251xfd.yaml F: drivers/net/can/spi/mcp251xfd/ MCP4018 AND MCP4531 MICROCHIP DIGITAL POTENTIOMETER DRIVERS -M: Peter Rosin +M: Peter Rosin L: linux-iio@vger.kernel.org S: Maintained F: Documentation/ABI/testing/sysfs-bus-iio-potentiometer-mcp4531 @@ -18238,7 +18236,7 @@ F: include/linux/mmc/ F: include/uapi/linux/mmc/ MULTIPLEXER SUBSYSTEM -M: Peter Rosin +M: Peter Rosin S: Odd Fixes F: Documentation/ABI/testing/sysfs-class-mux* F: Documentation/devicetree/bindings/mux/ @@ -19347,7 +19345,7 @@ F: include/dt-bindings/display/tda998x.h K: "nxp,tda998x" NXP TFA9879 DRIVER -M: Peter Rosin +M: Peter Rosin L: linux-sound@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/sound/trivial-codec.yaml From 60f21a2649308bbd84919ba6656d5ccd660953cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 27 Apr 2026 14:16:34 -1000 Subject: [PATCH 301/972] cgroup, sched_ext: Include exiting tasks in cgroup iter a72f73c4dd9b ("cgroup: Don't expose dead tasks in cgroup") made css_task_iter_advance() skip exiting tasks so cgroup.procs stays consistent with waitpid() visibility. Unfortunately, this broke scx_task_iter. scx_task_iter walks either scx_tasks (global) or a cgroup subtree via css_task_iter() and the two modes are expected to cover the same set of tasks. After the above change the cgroup-scoped mode silently skips tasks past exit_signals() that are still on scx_tasks. scx_sub_enable_workfn()'s abort path is one of the symptoms: an exiting SCX_TASK_SUB_INIT task can race past the cgroup iter leaking __scx_init_task() state. Other iterations share the same gap. Add CSS_TASK_ITER_WITH_DEAD to opt out of the skip and use it from scx_task_iter(). Fixes: b0e4c2f8a0f0 ("sched_ext: Implement cgroup subtree iteration for scx_task_iter") Reported-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 8 +++++--- kernel/sched/ext.c | 6 ++++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e52160e85af4b5..f6d037a30fd899 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -53,6 +53,7 @@ struct kernel_clone_args; enum css_task_iter_flags { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ + CSS_TASK_ITER_WITH_DEAD = (1U << 2), /* include exiting tasks */ CSS_TASK_ITER_SKIPPED = (1U << 16), /* internal flags */ }; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1f084ee71443e6..e51ce4cd37395f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5059,10 +5059,12 @@ static void css_task_iter_advance(struct css_task_iter *it) task = list_entry(it->task_pos, struct task_struct, cg_list); /* - * Hide tasks that are exiting but not yet removed. Keep zombie - * leaders with live threads visible. + * Hide tasks that are exiting but not yet removed by default. Keep + * zombie leaders with live threads visible. Usages that need to walk + * every existing task can opt out via CSS_TASK_ITER_WITH_DEAD. */ - if ((task->flags & PF_EXITING) && !atomic_read(&task->signal->live)) + if (!(it->flags & CSS_TASK_ITER_WITH_DEAD) && + (task->flags & PF_EXITING) && !atomic_read(&task->signal->live)) goto repeat; if (it->flags & CSS_TASK_ITER_PROCS) { diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 9483be03a4ca05..dc5d4787296bc6 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -766,7 +766,8 @@ static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); iter->cgrp = cgrp; iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self); - css_task_iter_start(iter->css_pos, 0, &iter->css_iter); + css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, + &iter->css_iter); return; } #endif @@ -866,7 +867,8 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) iter->css_pos = css_next_descendant_pre(iter->css_pos, &iter->cgrp->self); if (iter->css_pos) - css_task_iter_start(iter->css_pos, 0, &iter->css_iter); + css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, + &iter->css_iter); } return NULL; } From ff9eda4ea906b1f02fc260ddc42d2d9bd736a49c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 27 Apr 2026 14:16:35 -1000 Subject: [PATCH 302/972] sched_ext: Skip past-sched_ext_dead() tasks in scx_task_iter_next_locked() scx_task_iter's cgroup-scoped mode can return tasks whose sched_ext_dead() has already completed: cgroup_task_dead() removes from cset->tasks after sched_ext_dead() in finish_task_switch() and is irq-work deferred on PREEMPT_RT. The global mode is fine - sched_ext_dead() removes from scx_tasks via list_del_init() first. Callers (sub-sched enable prep/abort/apply, scx_sub_disable(), scx_fail_parent()) assume returned tasks are still on @sch and trip WARN_ON_ONCE() or operate on torn-down state otherwise. Set %SCX_TASK_OFF_TASKS in sched_ext_dead() under @p's rq lock and have scx_task_iter_next_locked() skip flagged tasks under the same lock. Setter and reader serialize on the per-task rq lock - no race. Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 1 + kernel/sched/ext.c | 33 +++++++++++++++++++++++++-------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 1a3af2ea2a794a..adb9a4de068ae8 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -101,6 +101,7 @@ enum scx_ent_flags { SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ SCX_TASK_IMMED = 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */ + SCX_TASK_OFF_TASKS = 1 << 6, /* removed from scx_tasks by sched_ext_dead() */ /* * Bits 8 and 9 are used to carry task state: diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index dc5d4787296bc6..3f0d8aeaed819b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -928,16 +928,27 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * * Test for idle_sched_class as only init_tasks are on it. */ - if (p->sched_class != &idle_sched_class) - break; - } - if (!p) - return NULL; + if (p->sched_class == &idle_sched_class) + continue; - iter->rq = task_rq_lock(p, &iter->rf); - iter->locked_task = p; + iter->rq = task_rq_lock(p, &iter->rf); + iter->locked_task = p; - return p; + /* + * cgroup_task_dead() removes the dead tasks from cset->tasks + * after sched_ext_dead() and cgroup iteration may see tasks + * which already finished sched_ext_dead(). %SCX_TASK_OFF_TASKS + * is set by sched_ext_dead() under @p's rq lock. Test it to + * avoid visiting tasks which are already dead from SCX POV. + */ + if (p->scx.flags & SCX_TASK_OFF_TASKS) { + __scx_task_iter_rq_unlock(iter); + continue; + } + + return p; + } + return NULL; } /** @@ -3850,6 +3861,11 @@ void sched_ext_dead(struct task_struct *p) /* * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY -> * ENABLED transitions can't race us. Disable ops for @p. + * + * %SCX_TASK_OFF_TASKS synchronizes against cgroup task iteration - see + * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup + * iteration is only used from sub-sched paths, which require root + * enabled. Root enable transitions every live task to at least READY. */ if (scx_get_task_state(p) != SCX_TASK_NONE) { struct rq_flags rf; @@ -3857,6 +3873,7 @@ void sched_ext_dead(struct task_struct *p) rq = task_rq_lock(p, &rf); scx_disable_and_exit_task(scx_task_sched(p), p); + p->scx.flags |= SCX_TASK_OFF_TASKS; task_rq_unlock(rq, p, &rf); } } From 84ae1840260fece9b6b70d3872b79384bbe5a90b Mon Sep 17 00:00:00 2001 From: Osama Abdelkader Date: Thu, 23 Apr 2026 22:06:19 +0200 Subject: [PATCH 303/972] drm/sti: remove bridge when sti_hda component_add fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use devm_drm_bridge_add() so the bridge is released if probe fails after registration, and drop the manual drm_bridge_remove() in remove(). Check the return value of devm_drm_bridge_add(). Signed-off-by: Osama Abdelkader Fixes: d28726efc637 ("drm/sti: hda: add bridge before attaching") Cc: stable@vger.kernel.org Reviewed-by: Luca Ceresoli Acked-by: Raphaël Gallais-Pou Link: https://patch.msgid.link/20260423200622.325076-1-osama.abdelkader@gmail.com Signed-off-by: Raphael Gallais-Pou --- drivers/gpu/drm/sti/sti_hda.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/sti/sti_hda.c b/drivers/gpu/drm/sti/sti_hda.c index b7397827889c94..360a88ca8f0c5a 100644 --- a/drivers/gpu/drm/sti/sti_hda.c +++ b/drivers/gpu/drm/sti/sti_hda.c @@ -741,6 +741,7 @@ static int sti_hda_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct sti_hda *hda; struct resource *res; + int ret; DRM_INFO("%s\n", __func__); @@ -779,7 +780,9 @@ static int sti_hda_probe(struct platform_device *pdev) return PTR_ERR(hda->clk_hddac); } - drm_bridge_add(&hda->bridge); + ret = devm_drm_bridge_add(dev, &hda->bridge); + if (ret) + return ret; platform_set_drvdata(pdev, hda); @@ -788,10 +791,7 @@ static int sti_hda_probe(struct platform_device *pdev) static void sti_hda_remove(struct platform_device *pdev) { - struct sti_hda *hda = platform_get_drvdata(pdev); - component_del(&pdev->dev, &sti_hda_ops); - drm_bridge_remove(&hda->bridge); } static const struct of_device_id hda_of_match[] = { From b34c82777a2c0648ee053595f4b290fd5249b093 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Thu, 30 Apr 2026 10:27:47 +0100 Subject: [PATCH 304/972] sched_ext: idle: Recheck prev_cpu after narrowing allowed mask scx_select_cpu_dfl() narrows @allowed to @cpus_allowed & @p->cpus_ptr when the BPF caller supplies a @cpus_allowed that differs from @p->cpus_ptr and @p doesn't have full affinity. However, @is_prev_allowed was computed against the original (wider) @cpus_allowed, so the prev_cpu fast paths could pick a @prev_cpu that is in @cpus_allowed but not in @p->cpus_ptr, violating the intended invariant that the returned CPU is always usable by @p. The kernel masks this via the SCX_EV_SELECT_CPU_FALLBACK fallback, but the behavior contradicts the documented contract. Move the @is_prev_allowed evaluation past the narrowing block so it tests against the final @allowed mask. Fixes: ee9a4e92799d ("sched_ext: idle: Properly handle invalid prev_cpu during idle selection") Cc: stable@vger.kernel.org # v6.16+ Assisted-by: Claude Signed-off-by: David Carlier Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext_idle.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 7468560a6d8041..6e1980763270df 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -465,12 +465,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, preempt_disable(); - /* - * Check whether @prev_cpu is still within the allowed set. If not, - * we can still try selecting a nearby CPU. - */ - is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed); - /* * Determine the subset of CPUs usable by @p within @cpus_allowed. */ @@ -487,6 +481,12 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, } } + /* + * Check whether @prev_cpu is still within the allowed set. If not, + * we can still try selecting a nearby CPU. + */ + is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed); + /* * This is necessary to protect llc_cpus. */ From d8769544bde51b0ac980d10f8fe9f9fed6c95995 Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Thu, 30 Apr 2026 13:11:42 -0700 Subject: [PATCH 305/972] docs: cgroup-v1: Update charge-commit section Commit 1d8f136a421f ("memcg/hugetlb: remove memcg hugetlb try-commit-cancel protocol") removed mem_cgroup_commit_charge() and mem_cgroup_cancel_charge(), but the docs still refer to those functions. There is no longer any charge cancellation. Update the docs to match the code. Signed-off-by: T.J. Mercier Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v1/memcg_test.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memcg_test.rst b/Documentation/admin-guide/cgroup-v1/memcg_test.rst index 9f8e27355cba54..7c7cd457cf6952 100644 --- a/Documentation/admin-guide/cgroup-v1/memcg_test.rst +++ b/Documentation/admin-guide/cgroup-v1/memcg_test.rst @@ -47,21 +47,19 @@ Please note that implementation details can be changed. Called when swp_entry's refcnt goes down to 0. A charge against swap disappears. -3. charge-commit-cancel +3. charge-commit ======================= Memcg pages are charged in two steps: - mem_cgroup_try_charge() - - mem_cgroup_commit_charge() or mem_cgroup_cancel_charge() + - commit_charge() At try_charge(), there are no flags to say "this page is charged". at this point, usage += PAGE_SIZE. At commit(), the page is associated with the memcg. - At cancel(), simply usage -= PAGE_SIZE. - Under below explanation, we assume CONFIG_SWAP=y. 4. Anonymous From a200cdbf95932631ec338d08a6e9e31b34c4e8a6 Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Mon, 27 Apr 2026 12:00:11 +0800 Subject: [PATCH 306/972] ovpn: reset MAC header before passing skb up After decapsulating a packet, the skb->mac_header still points to the outer transport header. Fix this by calling skb_reset_mac_header() in ovpn_netdev_write() to ensure the MAC header points to the beginning of the inner IP/network packet, as expected by the rest of the stack. Reported-by: Minqiang Chen Fixes: 8534731dbf2d ("ovpn: implement packet processing") Signed-off-by: Qingfang Deng Signed-off-by: Antonio Quartulli --- drivers/net/ovpn/io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c index db43a1f8a07a27..d92bb87be2b2e1 100644 --- a/drivers/net/ovpn/io.c +++ b/drivers/net/ovpn/io.c @@ -85,6 +85,7 @@ static void ovpn_netdev_write(struct ovpn_peer *peer, struct sk_buff *skb) skb_scrub_packet(skb, true); /* network header reset in ovpn_decrypt_post() */ + skb_reset_mac_header(skb); skb_reset_transport_header(skb); skb_reset_inner_headers(skb); From c539cb30f93f119566f2ae9d016cce11f188d780 Mon Sep 17 00:00:00 2001 From: Ralf Lici Date: Wed, 25 Mar 2026 17:49:18 +0100 Subject: [PATCH 307/972] ovpn: ensure packet delivery happens with BH disabled ovpn injects decrypted packets into the netdev RX path through ovpn_netdev_write() which invokes gro_cells_receive() and dev_dstats_rx_add(). ovpn_netdev_write() is normally called in softirq context, however, in case of TCP connections it may also be invoked process context. When this happens gro_cells_receive() will throw a warning: [ 230.183747][ T12] WARNING: net/core/gro_cells.c:30 at gro_cells_receive+0x708/0xaa0, CPU#1: kworker/u16:0/12 and lockdep will also report a potential inconsistent lock state: WARNING: inconsistent lock state 7.0.0-rc4+ #246 Tainted: G W -------------------------------- inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. because attempts to acquire gro_cells->bh_lock by both contexts may lead to a deadlock. At the same time, dev_dstats_rx_add() does not expect to race with a softirq (which may happen when invoked in process context), because the latter may access its per-cpu state and corrupt it. Fix all this by invoking local_bh_disable/enable() around gro_cells_receive() and dev_dstats_rx_add() to ensure that bottom halves are always disabled before calling both of them. Fixes: 11851cbd60ea ("ovpn: implement TCP transport") Signed-off-by: Ralf Lici Signed-off-by: Antonio Quartulli --- drivers/net/ovpn/io.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c index d92bb87be2b2e1..22c555dd962ec7 100644 --- a/drivers/net/ovpn/io.c +++ b/drivers/net/ovpn/io.c @@ -91,12 +91,18 @@ static void ovpn_netdev_write(struct ovpn_peer *peer, struct sk_buff *skb) /* cause packet to be "received" by the interface */ pkt_len = skb->len; + /* we may get here in process context in case of TCP connections, + * therefore we have to disable BHs to ensure gro_cells_receive() + * and dev_dstats_rx_add() do not get corrupted or enter deadlock + */ + local_bh_disable(); ret = gro_cells_receive(&peer->ovpn->gro_cells, skb); if (likely(ret == NET_RX_SUCCESS)) { /* update RX stats with the size of decrypted packet */ ovpn_peer_stats_increment_rx(&peer->vpn_stats, pkt_len); dev_dstats_rx_add(peer->ovpn->dev, pkt_len); } + local_bh_enable(); } void ovpn_decrypt_post(void *data, int ret) From 201ba706318d460a2ea660e3652610be62532a70 Mon Sep 17 00:00:00 2001 From: Ralf Lici Date: Wed, 29 Apr 2026 10:00:16 +0200 Subject: [PATCH 308/972] selftests: ovpn: reduce ping count in test.sh The second stage of test.sh ("run baseline data traffic") performs a basic connectivity check with ping -qfc 500 -w 3. On slower CI instances this is too strict for TCP: the RTT is high enough that 500 echo requests do not reliably complete within 3 seconds, so the stage flakes and the test fails even though the ovpn setup is healthy. Reduce the packet count to 100 for both the plain and 3000-byte pings in that stage. This still verifies peer setup, key exchange, routing, and data-path traffic, without making the basic connectivity check depend on timing out under load. Fixes: 959bc330a439 ("testing/selftests: add test tool and scripts for ovpn module") Signed-off-by: Ralf Lici Signed-off-by: Antonio Quartulli --- tools/testing/selftests/net/ovpn/test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/ovpn/test.sh b/tools/testing/selftests/net/ovpn/test.sh index b50dbe45a4d00a..c06e3135fbef8e 100755 --- a/tools/testing/selftests/net/ovpn/test.sh +++ b/tools/testing/selftests/net/ovpn/test.sh @@ -98,10 +98,10 @@ ovpn_run_basic_traffic() { sleep 0.3 ovpn_cmd_ok "send baseline traffic to peer ${p}" \ ip netns exec ovpn_peer0 \ - ping -qfc 500 -w 3 5.5.5.$((p + 1)) + ping -qfc 100 -w 3 5.5.5.$((p + 1)) ovpn_cmd_ok "send large-payload traffic to peer ${p}" \ ip netns exec ovpn_peer0 \ - ping -qfc 500 -s 3000 -w 3 5.5.5.$((p + 1)) + ping -qfc 100 -s 3000 -w 3 5.5.5.$((p + 1)) wait "${tcpdump_pid1}" || return 1 wait "${tcpdump_pid2}" || return 1 From afbd961305eb483515650ccfcb7743608e7add78 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 30 Apr 2026 10:44:13 +0300 Subject: [PATCH 309/972] ipvs: fixes for the new ip_vs_status info Sashiko reports some problems for the recently added /proc/net/ip_vs_status: * ip_vs_status_show() as a table reader may run long after the conn_tab and svc_table table are released. While ip_vs_conn_flush() properly changes the conn_tab_changes counter when conn_tab is removed, ip_vs_del_service() and ip_vs_flush() were missing such change for the svc_table_changes counter. As result, readers like ip_vs_dst_event() and ip_vs_status_show() may continue to use a freed table after a cond_resched_rcu() call. * While counting the buckets in ip_vs_status_show() make sure we traverse only the needed number of entries in the chain. This also prevents possible overflow of the 'count' variable. * Add check for 'loops' to prevent infinite loops while restarting the traversal on table change. * While IP_VS_CONN_TAB_MAX_BITS is 20 on 32-bit platforms and there is no risk to overflow when multiplying the number of conn_tab buckets to 100, prefer the div_u64() helper to make the following dividing safer. * Use 0440 permissions for ip_vs_status to restrict the info only to root due to the exported information for hash distribution. Link: https://sashiko.dev/#/patchset/20260410112352.23599-1-fw%40strlen.de Fixes: 9a9ccef907a7 ("ipvs: add ip_vs_status info") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 51 ++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 6632daa87ded1c..27e50afe9a545d 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2032,6 +2032,9 @@ static int ip_vs_del_service(struct ip_vs_service *svc) cancel_delayed_work_sync(&ipvs->svc_resize_work); if (t) { rcu_assign_pointer(ipvs->svc_table, NULL); + /* Inform readers that table is removed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->svc_table_changes); while (1) { p = rcu_dereference_protected(t->new_tbl, 1); call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); @@ -2078,6 +2081,9 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) t = rcu_dereference_protected(ipvs->svc_table, 1); if (t) { rcu_assign_pointer(ipvs->svc_table, NULL); + /* Inform readers that table is removed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->svc_table_changes); while (1) { p = rcu_dereference_protected(t->new_tbl, 1); call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); @@ -3004,7 +3010,8 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) int old_gen, new_gen; u32 counts[8]; u32 bucket; - int count; + u32 count; + int loops; u32 sum1; u32 sum; int i; @@ -3020,6 +3027,7 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) if (!atomic_read(&ipvs->conn_count)) goto after_conns; old_gen = atomic_read(&ipvs->conn_tab_changes); + loops = 0; repeat_conn: smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ @@ -3032,8 +3040,11 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) resched_score++; ip_vs_rht_walk_bucket_rcu(t, bucket, head) { count = 0; - hlist_bl_for_each_entry_rcu(hn, e, head, node) + hlist_bl_for_each_entry_rcu(hn, e, head, node) { count++; + if (count >= ARRAY_SIZE(counts) - 1) + break; + } } resched_score += count; if (resched_score >= 100) { @@ -3042,37 +3053,41 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) new_gen = atomic_read(&ipvs->conn_tab_changes); /* New table installed ? */ if (old_gen != new_gen) { + /* Too many changes? */ + if (++loops >= 5) + goto after_conns; old_gen = new_gen; goto repeat_conn; } } - counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++; + counts[count]++; } } for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) sum += counts[i]; sum1 = sum - counts[0]; - seq_printf(seq, "Conn buckets empty:\t%u (%lu%%)\n", - counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U)); + seq_printf(seq, "Conn buckets empty:\t%u (%llu%%)\n", + counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U))); for (i = 1; i < ARRAY_SIZE(counts); i++) { if (!counts[i]) continue; - seq_printf(seq, "Conn buckets len-%d:\t%u (%lu%%)\n", + seq_printf(seq, "Conn buckets len-%d:\t%u (%llu%%)\n", i, counts[i], - (unsigned long)counts[i] * 100 / max(sum1, 1U)); + div_u64((u64)counts[i] * 100U, max(sum1, 1U))); } after_conns: t = rcu_dereference(ipvs->svc_table); count = ip_vs_get_num_services(ipvs); - seq_printf(seq, "Services:\t%d\n", count); + seq_printf(seq, "Services:\t%u\n", count); seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n", t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0); if (!count) goto after_svc; old_gen = atomic_read(&ipvs->svc_table_changes); + loops = 0; repeat_svc: smp_rmb(); /* ipvs->svc_table and svc_table_changes */ @@ -3086,8 +3101,11 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) ip_vs_rht_walk_bucket_rcu(t, bucket, head) { count = 0; hlist_bl_for_each_entry_rcu(svc, e, head, - s_list) + s_list) { count++; + if (count >= ARRAY_SIZE(counts) - 1) + break; + } } resched_score += count; if (resched_score >= 100) { @@ -3096,24 +3114,27 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) new_gen = atomic_read(&ipvs->svc_table_changes); /* New table installed ? */ if (old_gen != new_gen) { + /* Too many changes? */ + if (++loops >= 5) + goto after_svc; old_gen = new_gen; goto repeat_svc; } } - counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++; + counts[count]++; } } for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) sum += counts[i]; sum1 = sum - counts[0]; - seq_printf(seq, "Service buckets empty:\t%u (%lu%%)\n", - counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U)); + seq_printf(seq, "Service buckets empty:\t%u (%llu%%)\n", + counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U))); for (i = 1; i < ARRAY_SIZE(counts); i++) { if (!counts[i]) continue; - seq_printf(seq, "Service buckets len-%d:\t%u (%lu%%)\n", + seq_printf(seq, "Service buckets len-%d:\t%u (%llu%%)\n", i, counts[i], - (unsigned long)counts[i] * 100 / max(sum1, 1U)); + div_u64((u64)counts[i] * 100U, max(sum1, 1U))); } after_svc: @@ -5039,7 +5060,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) ipvs->net->proc_net, ip_vs_stats_percpu_show, NULL)) goto err_percpu; - if (!proc_create_net_single("ip_vs_status", 0, ipvs->net->proc_net, + if (!proc_create_net_single("ip_vs_status", 0440, ipvs->net->proc_net, ip_vs_status_show, NULL)) goto err_status; #endif From f2da9a96abb4b7a64626e931cedd85f05d5498ca Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 30 Apr 2026 10:44:14 +0300 Subject: [PATCH 310/972] ipvs: fix races around the conn_lfactor and svc_lfactor sysctl vars Sashiko warns that the new sysctls vars can be changed after the hash tables are destroyed and their respective resizing works canceled, leading to mod_delayed_work() being called for canceled works. Solve this in different ways. conn_tab can be present even without services and is destroyed only on netns exit, so use disable_delayed_work_sync() to disable the work instead of adding more synchronization mechanisms. As for the svc_table, it is destroyed when the services are deleted, so we must be sure that netns exit is not called yet (the check for 'enable') and the work is not canceled by checking all under same mutex lock. Also, use WRITE_ONCE when updating the sysctl vars as we already read them with READ_ONCE. Link: https://sashiko.dev/#/patchset/20260410112352.23599-1-fw%40strlen.de Fixes: 8d7de5477e47 ("ipvs: add conn_lfactor and svc_lfactor sysctl vars") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_conn.c | 2 +- net/netfilter/ipvs/ip_vs_ctl.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 2082bfb2d93cdf..84a4921a7865a9 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1835,7 +1835,7 @@ static void ip_vs_conn_flush(struct netns_ipvs *ipvs) if (!rcu_dereference_protected(ipvs->conn_tab, 1)) return; - cancel_delayed_work_sync(&ipvs->conn_resize_work); + disable_delayed_work_sync(&ipvs->conn_resize_work); if (!atomic_read(&ipvs->conn_count)) goto unreg; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 27e50afe9a545d..caec516856e953 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2469,7 +2469,7 @@ static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write, if (val < -8 || val > 8) { ret = -EINVAL; } else { - *valp = val; + WRITE_ONCE(*valp, val); if (rcu_access_pointer(ipvs->conn_tab)) mod_delayed_work(system_unbound_wq, &ipvs->conn_resize_work, 0); @@ -2496,10 +2496,16 @@ static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write, if (val < -8 || val > 8) { ret = -EINVAL; } else { - *valp = val; - if (rcu_access_pointer(ipvs->svc_table)) + mutex_lock(&ipvs->service_mutex); + WRITE_ONCE(*valp, val); + /* Make sure the services are present */ + if (rcu_access_pointer(ipvs->svc_table) && + READ_ONCE(ipvs->enable) && + !test_bit(IP_VS_WORK_SVC_NORESIZE, + &ipvs->work_flags)) mod_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 0); + mutex_unlock(&ipvs->service_mutex); } } return ret; From d493d9de1c21313cf62be0f6e1a4d48385fa7beb Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 30 Apr 2026 10:44:15 +0300 Subject: [PATCH 311/972] ipvs: fix the spin_lock usage for RT build syzbot reports for sleeping function called from invalid context [1]. The recently added code for resizable hash tables uses hlist_bl bit locks in combination with spin_lock for the connection fields (cp->lock). Fix the following problems: * avoid using spin_lock(&cp->lock) under locked bit lock because it sleeps on PREEMPT_RT * as the recent changes call ip_vs_conn_hash() only for newly allocated connection, the spin_lock can be removed there because the connection is still not linked to table and does not need cp->lock protection. * the lock can be removed also from ip_vs_conn_unlink() where we are the last connection user. * the last place that is fixed is ip_vs_conn_fill_cport() where now the cp->lock is locked before the other locks to ensure other packets do not modify the cp->flags in non-atomic way. Here we make sure cport and flags are changed only once if two or more packets race to fill the cport. Also, we fill cport early, so that if we race with resizing there will be valid cport key for the hashing. Add a warning if too many hash table changes occur for our RCU read-side critical section which is error condition but minor because the connection still can expire gracefully. Still, restore the cport to 0 to allow retransmitted packet to properly fill the cport. Problems reported by Sashiko. [1]: BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 16, name: ktimers/0 preempt_count: 2, expected: 0 RCU nest depth: 3, expected: 3 8 locks held by ktimers/0/16: #0: ffffffff8de5f260 (local_bh){.+.+}-{1:3}, at: __local_bh_disable_ip+0x3c/0x420 kernel/softirq.c:163 #1: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: __local_bh_disable_ip+0x3c/0x420 kernel/softirq.c:163 #2: ffff8880b8826360 (&base->expiry_lock){+...}-{3:3}, at: spin_lock include/linux/spinlock_rt.h:45 [inline] #2: ffff8880b8826360 (&base->expiry_lock){+...}-{3:3}, at: timer_base_lock_expiry kernel/time/timer.c:1502 [inline] #2: ffff8880b8826360 (&base->expiry_lock){+...}-{3:3}, at: __run_timer_base+0x120/0x9f0 kernel/time/timer.c:2384 #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire include/linux/rcupdate.h:300 [inline] #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline] #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: __rt_spin_lock kernel/locking/spinlock_rt.c:50 [inline] #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rt_spin_lock+0x1e0/0x400 kernel/locking/spinlock_rt.c:57 #4: ffffc90000157a80 ((&cp->timer)){+...}-{0:0}, at: call_timer_fn+0xd4/0x5e0 kernel/time/timer.c:1745 #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire include/linux/rcupdate.h:300 [inline] #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline] #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: ip_vs_conn_unlink net/netfilter/ipvs/ip_vs_conn.c:315 [inline] #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: ip_vs_conn_expire+0x257/0x2390 net/netfilter/ipvs/ip_vs_conn.c:1260 #6: ffffffff8de5f260 (local_bh){.+.+}-{1:3}, at: __local_bh_disable_ip+0x3c/0x420 kernel/softirq.c:163 #7: ffff888068d4c3f0 (&cp->lock#2){+...}-{3:3}, at: spin_lock include/linux/spinlock_rt.h:45 [inline] #7: ffff888068d4c3f0 (&cp->lock#2){+...}-{3:3}, at: ip_vs_conn_unlink net/netfilter/ipvs/ip_vs_conn.c:324 [inline] #7: ffff888068d4c3f0 (&cp->lock#2){+...}-{3:3}, at: ip_vs_conn_expire+0xd4a/0x2390 net/netfilter/ipvs/ip_vs_conn.c:1260 Preemption disabled at: [] bit_spin_lock include/linux/bit_spinlock.h:38 [inline] [] hlist_bl_lock+0x18/0x110 include/linux/list_bl.h:149 CPU: 0 UID: 0 PID: 16 Comm: ktimers/0 Tainted: G W L syzkaller #0 PREEMPT_{RT,(full)} Tainted: [W]=WARN, [L]=SOFTLOCKUP Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/18/2026 Call Trace: dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120 __might_resched+0x329/0x480 kernel/sched/core.c:9162 __rt_spin_lock kernel/locking/spinlock_rt.c:48 [inline] rt_spin_lock+0xc2/0x400 kernel/locking/spinlock_rt.c:57 spin_lock include/linux/spinlock_rt.h:45 [inline] ip_vs_conn_unlink net/netfilter/ipvs/ip_vs_conn.c:324 [inline] ip_vs_conn_expire+0xd4a/0x2390 net/netfilter/ipvs/ip_vs_conn.c:1260 call_timer_fn+0x192/0x5e0 kernel/time/timer.c:1748 expire_timers kernel/time/timer.c:1799 [inline] __run_timers kernel/time/timer.c:2374 [inline] __run_timer_base+0x6a3/0x9f0 kernel/time/timer.c:2386 run_timer_base kernel/time/timer.c:2395 [inline] run_timer_softirq+0xb7/0x170 kernel/time/timer.c:2405 handle_softirqs+0x1de/0x6d0 kernel/softirq.c:622 __do_softirq kernel/softirq.c:656 [inline] run_ktimerd+0x69/0x100 kernel/softirq.c:1151 smpboot_thread_fn+0x541/0xa50 kernel/smpboot.c:160 kthread+0x388/0x470 kernel/kthread.c:436 ret_from_fork+0x514/0xb70 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 Reported-by: syzbot+504e778ddaecd36fdd17@syzkaller.appspotmail.com Link: https://sashiko.dev/#/patchset/20260415200216.79699-1-ja%40ssi.bg Link: https://sashiko.dev/#/patchset/20260420165539.85174-4-ja%40ssi.bg Link: https://sashiko.dev/#/patchset/20260422135823.50489-4-ja%40ssi.bg Fixes: 2fa7cc9c7025 ("ipvs: switch to per-net connection table") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_conn.c | 74 ++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 84a4921a7865a9..9ea6b4fa78bf00 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -267,27 +267,20 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) hash_key2 = hash_key; use2 = false; } + conn_tab_lock(t, cp, hash_key, hash_key2, use2, true /* new_hash */, &head, &head2); - spin_lock(&cp->lock); - - if (!(cp->flags & IP_VS_CONN_F_HASHED)) { - cp->flags |= IP_VS_CONN_F_HASHED; - WRITE_ONCE(cp->hn0.hash_key, hash_key); - WRITE_ONCE(cp->hn1.hash_key, hash_key2); - refcount_inc(&cp->refcnt); - hlist_bl_add_head_rcu(&cp->hn0.node, head); - if (use2) - hlist_bl_add_head_rcu(&cp->hn1.node, head2); - ret = 1; - } else { - pr_err("%s(): request for already hashed, called from %pS\n", - __func__, __builtin_return_address(0)); - ret = 0; - } - spin_unlock(&cp->lock); + cp->flags |= IP_VS_CONN_F_HASHED; + WRITE_ONCE(cp->hn0.hash_key, hash_key); + WRITE_ONCE(cp->hn1.hash_key, hash_key2); + refcount_inc(&cp->refcnt); + hlist_bl_add_head_rcu(&cp->hn0.node, head); + if (use2) + hlist_bl_add_head_rcu(&cp->hn1.node, head2); + conn_tab_unlock(head, head2); + ret = 1; /* Schedule resizing if load increases */ if (atomic_read(&ipvs->conn_count) > t->u_thresh && @@ -321,7 +314,6 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) conn_tab_lock(t, cp, hash_key, hash_key2, use2, false /* new_hash */, &head, &head2); - spin_lock(&cp->lock); if (cp->flags & IP_VS_CONN_F_HASHED) { /* Decrease refcnt and unlink conn only if we are last user */ @@ -334,7 +326,6 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) } } - spin_unlock(&cp->lock); conn_tab_unlock(head, head2); rcu_read_unlock(); @@ -637,6 +628,7 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) struct ip_vs_conn_hnode *hn; u32 hash_key, hash_key_new; struct ip_vs_conn_param p; + bool by_me = false; int ntbl; int dir; @@ -664,8 +656,16 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) t = rcu_dereference(t->new_tbl); ntbl++; /* We are lost? */ - if (ntbl >= 2) + if (ntbl >= 2) { + spin_lock_bh(&cp->lock); + if (cp->flags & IP_VS_CONN_F_NO_CPORT && by_me) + cp->cport = 0; + /* hn1 will be rehashed on next packet */ + spin_unlock_bh(&cp->lock); + IP_VS_ERR_RL("%s(): Too many ht changes for dir %d\n", + __func__, dir); return; + } } /* Rehashing during resize? Use the recent table for adds */ @@ -683,10 +683,13 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) if (head > head2 && t == t2) swap(head, head2); + /* Protect the cp->flags modification */ + spin_lock_bh(&cp->lock); + /* Lock seqcount only for the old bucket, even if we are on new table * because it affects the del operation, not the adding. */ - spin_lock_bh(&t->lock[hash_key & t->lock_mask].l); + spin_lock(&t->lock[hash_key & t->lock_mask].l); preempt_disable_nested(); write_seqcount_begin(&t->seqc[hash_key & t->seqc_mask]); @@ -704,14 +707,23 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) hlist_bl_unlock(head); write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); preempt_enable_nested(); - spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); + spin_unlock(&t->lock[hash_key & t->lock_mask].l); + spin_unlock_bh(&cp->lock); hash_key = hash_key_new; goto retry; } - spin_lock(&cp->lock); - if ((cp->flags & IP_VS_CONN_F_NO_CPORT) && - (cp->flags & IP_VS_CONN_F_HASHED)) { + /* Fill cport once, even if multiple packets try to do it */ + if (cp->flags & IP_VS_CONN_F_NO_CPORT && (!cp->cport || by_me)) { + /* If we race with resizing make sure cport is set for dir 1 */ + if (!cp->cport) { + cp->cport = cport; + by_me = true; + } + if (!dir) { + atomic_dec(&ipvs->no_cport_conns[af_id]); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + } /* We do not recalc hash_key_r under lock, we assume the * parameters in cp do not change, i.e. cport is * the only possible change. @@ -726,21 +738,17 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) hlist_bl_del_rcu(&hn->node); hlist_bl_add_head_rcu(&hn->node, head_new); } - if (!dir) { - atomic_dec(&ipvs->no_cport_conns[af_id]); - cp->flags &= ~IP_VS_CONN_F_NO_CPORT; - cp->cport = cport; - } } - spin_unlock(&cp->lock); if (head != head2) hlist_bl_unlock(head2); hlist_bl_unlock(head); write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); preempt_enable_nested(); - spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); - if (dir--) + spin_unlock(&t->lock[hash_key & t->lock_mask].l); + + spin_unlock_bh(&cp->lock); + if (dir-- && by_me) goto next_dir; } From fbe1e01e818ee6db86ff947599bf0bea96de7e71 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 30 Apr 2026 10:44:16 +0300 Subject: [PATCH 312/972] ipvs: do not leak dest after get from dest trash Sashiko warns about leaked dest if ip_vs_start_estimator() fails in ip_vs_add_dest(). Add ip_vs_trash_put_dest() to put back the dest into dest trash. Link: https://sashiko.dev/#/patchset/20260428175725.72050-1-ja%40ssi.bg Fixes: 705dd3444081 ("ipvs: use kthreads for stats estimation") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 37 ++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index caec516856e953..d81077c2457a8b 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1102,6 +1102,24 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af, return dest; } +/* Put destination in trash */ +static void ip_vs_trash_put_dest(struct netns_ipvs *ipvs, + struct ip_vs_dest *dest, unsigned long istart, + bool cleanup) +{ + spin_lock_bh(&ipvs->dest_trash_lock); + IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), + refcount_read(&dest->refcnt)); + if (list_empty(&ipvs->dest_trash) && !cleanup) + mod_timer(&ipvs->dest_trash_timer, + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); + /* dest lives in trash with reference */ + list_add(&dest->t_list, &ipvs->dest_trash); + dest->idle_start = istart; + spin_unlock_bh(&ipvs->dest_trash_lock); +} + static void ip_vs_dest_rcu_free(struct rcu_head *head) { struct ip_vs_dest *dest; @@ -1461,9 +1479,12 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) ntohs(dest->vport)); ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); + /* On error put back dest into the trash */ if (ret < 0) - return ret; - __ip_vs_update_dest(svc, dest, udest, 1); + ip_vs_trash_put_dest(svc->ipvs, dest, dest->idle_start, + false); + else + __ip_vs_update_dest(svc, dest, udest, 1); } else { /* * Allocate and initialize the dest structure @@ -1533,17 +1554,7 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, */ ip_vs_rs_unhash(dest); - spin_lock_bh(&ipvs->dest_trash_lock); - IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", - IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), - refcount_read(&dest->refcnt)); - if (list_empty(&ipvs->dest_trash) && !cleanup) - mod_timer(&ipvs->dest_trash_timer, - jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); - /* dest lives in trash with reference */ - list_add(&dest->t_list, &ipvs->dest_trash); - dest->idle_start = 0; - spin_unlock_bh(&ipvs->dest_trash_lock); + ip_vs_trash_put_dest(ipvs, dest, 0, cleanup); /* Queue up delayed work to expire all no destination connections. * No-op when CONFIG_SYSCTL is disabled. From 2fd109238925d53c44ea409df0558844af7877b8 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 30 Apr 2026 10:44:17 +0300 Subject: [PATCH 313/972] ipvs: fix races around est_mutex and est_cpulist Sashiko reports for races and possible crash around the usage of est_cpulist_valid and sysctl_est_cpulist. The problem is that we do not lock est_mutex in some places which can lead to wrong write ordering and as result problems when calling cpumask_weight() and cpumask_empty(). Fix them by moving the est_max_threads read/write under locked est_mutex. Do the same for one ip_vs_est_reload_start() call to protect the cpumask_empty() usage of sysctl_est_cpulist. To remove the chance of deadlock while stopping the estimation kthreads, keep the data structure for kthread 0 even after last estimator is removed and do not hold mutexes while stopping this task. Now we will use a new flag 'needed' to know when kthread 0 should run. The kthreads above 0 do not use mutexes, so stop them under est_mutex because their kthread data still can be destroyed if they do not serve estimators. Now all kthreads will be started by the est_reload_work to properly serialize the stop/start for kthread 0. Reduce the use of service_mutex in ip_vs_est_calc_phase() because under est_mutex we can safely walk est_kt_arr to stop the kthreads above slot 0. As ip_vs_stop_estimator() for tot_stats should be called under service_mutex, do it early in the netns exit path in ip_vs_flush() to avoid locking the mutex again later. It still should be called in ip_vs_control_net_cleanup_sysctl() when we are called during netns init error. Use -2 for ktid as indicator if estimator was already stopped. Finally, fix use-after-free for kd->est_row in ip_vs_est_calc_phase(). est->ktrow should simply switch to a delay value while estimator is linked to est_temp_list. Link: https://sashiko.dev/#/patchset/20260331165015.2777765-1-longman%40redhat.com Link: https://sashiko.dev/#/patchset/20260420171308.87192-1-ja%40ssi.bg Link: https://sashiko.dev/#/patchset/20260422125123.40658-1-ja%40ssi.bg Link: https://sashiko.dev/#/patchset/20260424175858.54752-1-ja%40ssi.bg Link: https://sashiko.dev/#/patchset/20260425103918.7447-1-ja%40ssi.bg Fixes: f0be83d54217 ("ipvs: add est_cpulist and est_nice sysctl vars") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 11 ++++- net/netfilter/ipvs/ip_vs_ctl.c | 51 +++++++++++++++++---- net/netfilter/ipvs/ip_vs_est.c | 83 ++++++++++++++++++++-------------- 3 files changed, 100 insertions(+), 45 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 72d325c813132d..d28ad8a0541fe5 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -491,6 +491,7 @@ struct ip_vs_est_kt_data { DECLARE_BITMAP(avail, IPVS_EST_NTICKS); /* tick has space for ests */ unsigned long est_timer; /* estimation timer (jiffies) */ struct ip_vs_stats *calc_stats; /* Used for calculation */ + int needed; /* task is needed */ int tick_len[IPVS_EST_NTICKS]; /* est count */ int id; /* ktid per netns */ int chain_max; /* max ests per tick chain */ @@ -1884,11 +1885,19 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_zero_estimator(struct ip_vs_stats *stats); void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats); -void ip_vs_est_reload_start(struct netns_ipvs *ipvs); +void ip_vs_est_reload_start(struct netns_ipvs *ipvs, bool restart); int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd); void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); +static inline void ip_vs_stop_estimator_tot_stats(struct netns_ipvs *ipvs) +{ +#ifdef CONFIG_SYSCTL + ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); + ipvs->tot_stats->s.est.ktid = -2; +#endif +} + static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) { #ifdef CONFIG_SYSCTL diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index d81077c2457a8b..5c9f8e0e238f7a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -261,12 +261,28 @@ static void est_reload_work_handler(struct work_struct *work) if (!kd) continue; /* New config ? Stop kthread tasks */ - if (genid != genid_done) - ip_vs_est_kthread_stop(kd); + if (genid != genid_done) { + if (!id) { + /* Only we can stop kt 0 but not under mutex */ + mutex_unlock(&ipvs->est_mutex); + ip_vs_est_kthread_stop(kd); + mutex_lock(&ipvs->est_mutex); + if (!READ_ONCE(ipvs->enable)) + goto unlock; + /* kd for kt 0 is never destroyed */ + } else { + ip_vs_est_kthread_stop(kd); + } + } if (!kd->task && !ip_vs_est_stopped(ipvs)) { + bool start; + /* Do not start kthreads above 0 in calc phase */ - if ((!id || !ipvs->est_calc_phase) && - ip_vs_est_kthread_start(ipvs, kd) < 0) + if (id) + start = !ipvs->est_calc_phase; + else + start = kd->needed; + if (start && ip_vs_est_kthread_start(ipvs, kd) < 0) repeat = true; } } @@ -1823,11 +1839,16 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, *svc_p = svc; if (!READ_ONCE(ipvs->enable)) { + mutex_lock(&ipvs->est_mutex); + /* Now there is a service - full throttle */ WRITE_ONCE(ipvs->enable, 1); + ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); + /* Start estimation for first time */ - ip_vs_est_reload_start(ipvs); + ip_vs_est_reload_start(ipvs, true); + mutex_unlock(&ipvs->est_mutex); } return 0; @@ -2103,6 +2124,11 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) t = p; } } + /* Stop the tot_stats estimator early under service_mutex + * to avoid locking it again later. + */ + if (cleanup) + ip_vs_stop_estimator_tot_stats(ipvs); return 0; } @@ -2348,7 +2374,7 @@ static int ipvs_proc_est_cpumask_set(const struct ctl_table *table, /* est_max_threads may depend on cpulist size */ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); ipvs->est_calc_phase = 1; - ip_vs_est_reload_start(ipvs); + ip_vs_est_reload_start(ipvs, true); unlock: mutex_unlock(&ipvs->est_mutex); @@ -2428,7 +2454,7 @@ static int ipvs_proc_est_nice(const struct ctl_table *table, int write, mutex_lock(&ipvs->est_mutex); if (*valp != val) { *valp = val; - ip_vs_est_reload_start(ipvs); + ip_vs_est_reload_start(ipvs, true); } mutex_unlock(&ipvs->est_mutex); } @@ -2455,7 +2481,7 @@ static int ipvs_proc_run_estimation(const struct ctl_table *table, int write, mutex_lock(&ipvs->est_mutex); if (*valp != val) { *valp = val; - ip_vs_est_reload_start(ipvs); + ip_vs_est_reload_start(ipvs, true); } mutex_unlock(&ipvs->est_mutex); } @@ -5005,7 +5031,14 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); - ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); + if (ipvs->tot_stats->s.est.ktid != -2) { + /* Not stopped yet? This happens only on netns init error and + * we even do not need to lock the service_mutex for this case. + */ + mutex_lock(&ipvs->service_mutex); + ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); + mutex_unlock(&ipvs->service_mutex); + } if (ipvs->est_cpulist_valid) free_cpumask_var(ipvs->sysctl_est_cpulist); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 433ba3cab58c2c..ab09f518295122 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -68,6 +68,11 @@ and the limit of estimators per kthread - est_add_ktid: ktid where to add new ests, can point to empty slot where we should add kt data + - data protected by service_mutex: est_temp_list, est_add_ktid, + est_kt_count(R/W), est_kt_arr(R/W), est_genid_done, kd->needed(R/W) + - data protected by est_mutex: est_genid, est_max_threads, sysctl_est_cpulist, + est_cpulist_valid, sysctl_est_nice, est_stopped, sysctl_run_estimation, + est_kt_count(R), est_kt_arr(R), kd->needed(R), kd->task (id > 0) */ static struct lock_class_key __ipvs_est_key; @@ -227,14 +232,17 @@ static int ip_vs_estimation_kthread(void *data) } /* Schedule stop/start for kthread tasks */ -void ip_vs_est_reload_start(struct netns_ipvs *ipvs) +void ip_vs_est_reload_start(struct netns_ipvs *ipvs, bool restart) { + lockdep_assert_held(&ipvs->est_mutex); + /* Ignore reloads before first service is added */ if (!READ_ONCE(ipvs->enable)) return; ip_vs_est_stopped_recalc(ipvs); - /* Bump the kthread configuration genid */ - atomic_inc(&ipvs->est_genid); + /* Bump the kthread configuration genid if stopping is requested */ + if (restart) + atomic_inc(&ipvs->est_genid); queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0); } @@ -304,12 +312,17 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) void *arr = NULL; int i; - if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && - READ_ONCE(ipvs->enable) && ipvs->est_max_threads) - return -EINVAL; - mutex_lock(&ipvs->est_mutex); + /* Allow kt 0 data to be created before the services are added + * and limit the kthreads when services are present. + */ + if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && + READ_ONCE(ipvs->enable) && ipvs->est_max_threads) { + ret = -EINVAL; + goto out; + } + for (i = 0; i < id; i++) { if (!ipvs->est_kt_arr[i]) break; @@ -333,6 +346,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) kd->est_timer = jiffies; kd->id = id; ip_vs_est_set_params(ipvs, kd); + kd->needed = 1; /* Pre-allocate stats used in calc phase */ if (!id && !kd->calc_stats) { @@ -341,12 +355,8 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) goto out; } - /* Start kthread tasks only when services are present */ - if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) { - ret = ip_vs_est_kthread_start(ipvs, kd); - if (ret < 0) - goto out; - } + /* Request kthread to be started */ + ip_vs_est_reload_start(ipvs, false); if (arr) ipvs->est_kt_count++; @@ -482,12 +492,11 @@ static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs, /* Start estimation for stats */ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { + struct ip_vs_est_kt_data *kd = ipvs->est_kt_count > 0 ? + ipvs->est_kt_arr[0] : NULL; struct ip_vs_estimator *est = &stats->est; int ret; - if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable)) - ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); - est->ktid = -1; est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */ @@ -496,8 +505,15 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) * will not allocate much memory, just for kt 0. */ ret = 0; - if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0]) + if (!kd) { ret = ip_vs_est_add_kthread(ipvs); + } else if (!kd->needed) { + mutex_lock(&ipvs->est_mutex); + /* We have job for the kt 0 task */ + kd->needed = 1; + ip_vs_est_reload_start(ipvs, true); + mutex_unlock(&ipvs->est_mutex); + } if (ret >= 0) hlist_add_head(&est->list, &ipvs->est_temp_list); else @@ -578,16 +594,14 @@ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) } end_kt0: - /* kt 0 is freed after all other kthreads and chains are empty */ + /* kt 0 task is stopped after all other kt slots and chains are empty */ if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) { kd = ipvs->est_kt_arr[0]; - if (!kd || !kd->est_count) { + if (kd && !kd->est_count) { mutex_lock(&ipvs->est_mutex); - if (kd) { - ip_vs_est_kthread_destroy(kd); - ipvs->est_kt_arr[0] = NULL; - } - ipvs->est_kt_count--; + /* Keep the kt0 data but request kthread_stop */ + kd->needed = 0; + ip_vs_est_reload_start(ipvs, true); mutex_unlock(&ipvs->est_mutex); ipvs->est_add_ktid = 0; } @@ -647,9 +661,9 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) u64 val; INIT_HLIST_HEAD(&chain); - mutex_lock(&ipvs->service_mutex); + mutex_lock(&ipvs->est_mutex); kd = ipvs->est_kt_arr[0]; - mutex_unlock(&ipvs->service_mutex); + mutex_unlock(&ipvs->est_mutex); s = kd ? kd->calc_stats : NULL; if (!s) goto out; @@ -748,16 +762,16 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) if (!ip_vs_est_calc_limits(ipvs, &chain_max)) return; - mutex_lock(&ipvs->service_mutex); - /* Stop all other tasks, so that we can immediately move the * estimators to est_temp_list without RCU grace period */ mutex_lock(&ipvs->est_mutex); for (id = 1; id < ipvs->est_kt_count; id++) { /* netns clean up started, abort */ - if (!READ_ONCE(ipvs->enable)) - goto unlock2; + if (kthread_should_stop() || !READ_ONCE(ipvs->enable)) { + mutex_unlock(&ipvs->est_mutex); + return; + } kd = ipvs->est_kt_arr[id]; if (!kd) continue; @@ -765,9 +779,11 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) } mutex_unlock(&ipvs->est_mutex); + mutex_lock(&ipvs->service_mutex); + /* Move all estimators to est_temp_list but carefully, * all estimators and kthread data can be released while - * we reschedule. Even for kthread 0. + * we reschedule. */ step = 0; @@ -849,9 +865,7 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) ip_vs_stop_estimator(ipvs, stats); /* Tasks are stopped, move without RCU grace period */ est->ktid = -1; - est->ktrow = row - kd->est_row; - if (est->ktrow < 0) - est->ktrow += IPVS_EST_NTICKS; + est->ktrow = delay; hlist_add_head(&est->list, &ipvs->est_temp_list); /* kd freed ? */ if (last) @@ -889,7 +903,6 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) if (genid == atomic_read(&ipvs->est_genid)) ipvs->est_calc_phase = 0; -unlock2: mutex_unlock(&ipvs->est_mutex); unlock: From 4ee52b7021a7cb9356f8b9aff5631c68512a9e1b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 30 Apr 2026 10:44:18 +0300 Subject: [PATCH 314/972] ipvs: fix shift-out-of-bounds in ip_vs_rht_desired_size Calling roundup_pow_of_two() with 0 has undefined result: UBSAN: shift-out-of-bounds in ./include/linux/log2.h:57:13 shift exponent 64 is too large for 64-bit type 'unsigned long' CPU: 1 UID: 0 PID: 77 Comm: kworker/u8:4 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/18/2026 Workqueue: events_unbound conn_resize_work_handler Call Trace: dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120 ubsan_epilogue+0xa/0x30 lib/ubsan.c:233 __ubsan_handle_shift_out_of_bounds+0x385/0x410 lib/ubsan.c:494 __roundup_pow_of_two include/linux/log2.h:57 [inline] ip_vs_rht_desired_size+0x2cf/0x410 net/netfilter/ipvs/ip_vs_core.c:240 ip_vs_conn_desired_size net/netfilter/ipvs/ip_vs_conn.c:765 [inline] conn_resize_work_handler+0x1b6/0x14c0 net/netfilter/ipvs/ip_vs_conn.c:822 process_one_work kernel/workqueue.c:3302 [inline] process_scheduled_works+0xb5d/0x1860 kernel/workqueue.c:3385 worker_thread+0xa53/0xfc0 kernel/workqueue.c:3466 kthread+0x388/0x470 kernel/kthread.c:436 ret_from_fork+0x514/0xb70 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 Reported-by: syzbot+217f1db9c791e27fe54a@syzkaller.appspotmail.com Fixes: b655388111cf ("ipvs: add resizable hash tables") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index f5b7a204729134..d40b404c1bf646 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -237,7 +237,7 @@ int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n, { if (!t) return 1 << min_bits; - n = roundup_pow_of_two(n); + n = n > 0 ? roundup_pow_of_two(n) : 1; if (lfactor < 0) { int factor = min(-lfactor, max_bits); From aa6065206987278291c09d0c6aebed687114c925 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 30 Apr 2026 10:44:19 +0300 Subject: [PATCH 315/972] ipvs: Guard access of HK_TYPE_KTHREAD cpumask with RCU The ip_vs_ctl.c file and the associated ip_vs.h file are the only places in the kernel where HK_TYPE_KTHREAD cpumask is being retrieved and used. Now that HK_TYPE_KTHREAD/HK_TYPE_DOMAIN cpumask can be changed at run time. We need to use RCU to guard access to this cpumask to avoid a potential UAF problem as the returned cpumask may be freed before it is being used. We can replace HK_TYPE_KTHREAD by HK_TYPE_DOMAIN as they are aliases of each other, but keeping the HK_TYPE_KTHREAD name can highlight the fact that it is the kthread initiated by ipvs that is being controlled. Fixes: 03ff73510169 ("cpuset: Update HK_TYPE_DOMAIN cpumask from cpuset") Signed-off-by: Waiman Long Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 20 ++++++++++++++++---- net/netfilter/ipvs/ip_vs_ctl.c | 13 ++++++++----- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index d28ad8a0541fe5..02762ce73a0ce4 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1412,7 +1412,7 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) return ipvs->sysctl_run_estimation; } -static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +static inline const struct cpumask *__sysctl_est_cpulist(struct netns_ipvs *ipvs) { if (ipvs->est_cpulist_valid) return ipvs->sysctl_est_cpulist; @@ -1530,7 +1530,7 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) return 1; } -static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +static inline const struct cpumask *__sysctl_est_cpulist(struct netns_ipvs *ipvs) { return housekeeping_cpumask(HK_TYPE_KTHREAD); } @@ -1565,6 +1565,18 @@ static inline int sysctl_svc_lfactor(struct netns_ipvs *ipvs) return READ_ONCE(ipvs->sysctl_svc_lfactor); } +static inline bool sysctl_est_cpulist_empty(struct netns_ipvs *ipvs) +{ + guard(rcu)(); + return cpumask_empty(__sysctl_est_cpulist(ipvs)); +} + +static inline unsigned int sysctl_est_cpulist_weight(struct netns_ipvs *ipvs) +{ + guard(rcu)(); + return cpumask_weight(__sysctl_est_cpulist(ipvs)); +} + /* IPVS core functions * (from ip_vs_core.c) */ @@ -1904,7 +1916,7 @@ static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) /* Stop tasks while cpulist is empty or if disabled with flag */ ipvs->est_stopped = !sysctl_run_estimation(ipvs) || (ipvs->est_cpulist_valid && - cpumask_empty(sysctl_est_cpulist(ipvs))); + sysctl_est_cpulist_empty(ipvs)); #endif } @@ -1920,7 +1932,7 @@ static inline bool ip_vs_est_stopped(struct netns_ipvs *ipvs) static inline int ip_vs_est_max_threads(struct netns_ipvs *ipvs) { unsigned int limit = IPVS_EST_CPU_KTHREADS * - cpumask_weight(sysctl_est_cpulist(ipvs)); + sysctl_est_cpulist_weight(ipvs); return max(1U, limit); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 5c9f8e0e238f7a..c7c7f6a7a9f6df 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2394,11 +2394,14 @@ static int ipvs_proc_est_cpumask_get(const struct ctl_table *table, mutex_lock(&ipvs->est_mutex); - if (ipvs->est_cpulist_valid) - mask = *valp; - else - mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); - ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); + /* HK_TYPE_KTHREAD cpumask needs RCU protection */ + scoped_guard(rcu) { + if (ipvs->est_cpulist_valid) + mask = *valp; + else + mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); + ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); + } mutex_unlock(&ipvs->est_mutex); From 8f78b749f3da0f43990490b4c1193b5ede3eec0a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 30 Apr 2026 10:44:20 +0300 Subject: [PATCH 316/972] sched/isolation: Make HK_TYPE_KTHREAD an alias of HK_TYPE_DOMAIN Since commit 041ee6f3727a ("kthread: Rely on HK_TYPE_DOMAIN for preferred affinity management"), kthreads default to use the HK_TYPE_DOMAIN cpumask. IOW, it is no longer affected by the setting of the nohz_full boot kernel parameter. That means HK_TYPE_KTHREAD should now be an alias of HK_TYPE_DOMAIN instead of HK_TYPE_KERNEL_NOISE to correctly reflect the current kthread behavior. Make the change as HK_TYPE_KTHREAD is still being used in some networking code. Fixes: 041ee6f3727a ("kthread: Rely on HK_TYPE_DOMAIN for preferred affinity management") Signed-off-by: Waiman Long Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/linux/sched/isolation.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index dc3975ff1b2e1a..cf0fd03dd7a24e 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -20,6 +20,11 @@ enum hk_type { HK_TYPE_KERNEL_NOISE, HK_TYPE_MAX, + /* + * HK_TYPE_KTHREAD is now an alias of HK_TYPE_DOMAIN + */ + HK_TYPE_KTHREAD = HK_TYPE_DOMAIN, + /* * The following housekeeping types are only set by the nohz_full * boot commandline option. So they can share the same value. @@ -29,7 +34,6 @@ enum hk_type { HK_TYPE_RCU = HK_TYPE_KERNEL_NOISE, HK_TYPE_MISC = HK_TYPE_KERNEL_NOISE, HK_TYPE_WQ = HK_TYPE_KERNEL_NOISE, - HK_TYPE_KTHREAD = HK_TYPE_KERNEL_NOISE }; #ifdef CONFIG_CPU_ISOLATION From d82ba05263c69fa2437fe93e4e561cc40f4c03af Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 1 May 2026 07:39:41 +0000 Subject: [PATCH 317/972] af_unix: Set gc_in_progress to true in unix_gc(). Igor Ushakov reported that unix_gc() could run with gc_in_progress being false if the work is scheduled while running: Thread 1 Thread 2 Thread 3 -------- -------- -------- unix_schedule_gc() unix_schedule_gc() `- if (!gc_in_progress) `- if (!gc_in_progress) |- gc_in_progress = true | `- queue_work() | unix_gc() <----------------/ | | |- gc_in_progress = true ... `- queue_work() | | `- gc_in_progress = false | | unix_gc() <---------------------------------------------' | ... /* gc_in_progress == false */ | `- gc_in_progress = false unix_peek_fpl() relies on gc_in_progress not to confuse GC by MSG_PEEK. Let's set gc_in_progress to true in unix_gc(). Fixes: 8b90a9f819dc ("af_unix: Run GC on only one CPU.") Reported-by: Igor Ushakov Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260501073945.1884564-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/unix/garbage.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index a7967a34582737..0783555e252660 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -607,6 +607,8 @@ static void unix_gc(struct work_struct *work) struct sk_buff_head hitlist; struct sk_buff *skb; + WRITE_ONCE(gc_in_progress, true); + spin_lock(&unix_gc_lock); if (unix_graph_state == UNIX_GRAPH_NOT_CYCLIC) { @@ -649,10 +651,8 @@ void unix_schedule_gc(struct user_struct *user) READ_ONCE(user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) return; - if (!READ_ONCE(gc_in_progress)) { - WRITE_ONCE(gc_in_progress, true); + if (!READ_ONCE(gc_in_progress)) queue_work(system_dfl_wq, &unix_gc_work); - } if (user && READ_ONCE(unix_graph_cyclic_sccs)) flush_work(&unix_gc_work); From 76b93a8107574006b25495664304ea9237494d70 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 1 May 2026 02:58:41 -0700 Subject: [PATCH 318/972] netpoll: pass buffer size to egress_dev() to avoid MAC truncation egress_dev() formats np->dev_mac via snprintf() but receives buf as a bare char *, so it cannot derive the buffer size from the pointer. The size argument was hardcoded to MAC_ADDR_STR_LEN (3 * ETH_ALEN - 1 = 17), which is silly wrong in two ways: 1) misleading kernel log output on the MAC-selected target path (np->dev_name[0] == '\0'); for example "aa:bb:cc:dd:ee:ff doesn't exist, aborting" was logged as "aa:bb:cc:dd:ee:f doesn't exist, aborting". 2) the second argument of snprintf is the size of the buffer, not the size of what you want to write. Add a bufsz parameter to egress_dev() and pass sizeof(buf) from each caller, matching the standard snprintf() idiom and removing the hardcoded size from the helper. Every caller already declares "char buf[MAC_ADDR_STR_LEN + 1]" so the formatted MAC continues to fit. Tested by booting with netconsole=6665@/aa:bb:cc:dd:ee:ff,6666@10.0.0.1/00:11:22:33:44:55 on a kernel without a matching device. Pre-fix dmesg shows "aa:bb:cc:dd:ee:f doesn't exist, aborting"; post-fix shows the full "aa:bb:cc:dd:ee:ff doesn't exist, aborting". Fixes: f8a10bed32f5 ("netconsole: allow selection of egress interface via MAC address") Cc: stable@vger.kernel.org Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20260501-netpoll_snprintf_fix-v1-1-84b0566e6597@debian.org Signed-off-by: Jakub Kicinski --- net/core/netpoll.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 4381e0fc25bf47..84faace50ac281 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -608,14 +608,16 @@ EXPORT_SYMBOL_GPL(__netpoll_setup); /* * Returns a pointer to a string representation of the identifier used * to select the egress interface for the given netpoll instance. buf - * must be a buffer of length at least MAC_ADDR_STR_LEN + 1. + * is used to format np->dev_mac when np->dev_name is empty; bufsz must + * be at least MAC_ADDR_STR_LEN + 1 to fit the formatted MAC address + * and its NUL terminator. */ -static char *egress_dev(struct netpoll *np, char *buf) +static char *egress_dev(struct netpoll *np, char *buf, size_t bufsz) { if (np->dev_name[0]) return np->dev_name; - snprintf(buf, MAC_ADDR_STR_LEN, "%pM", np->dev_mac); + snprintf(buf, bufsz, "%pM", np->dev_mac); return buf; } @@ -645,7 +647,7 @@ static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev) if (!IS_ENABLED(CONFIG_IPV6)) { np_err(np, "IPv6 is not supported %s, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); return -EINVAL; } @@ -667,7 +669,7 @@ static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev) } if (err) { np_err(np, "no IPv6 address for %s, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); return err; } @@ -687,14 +689,14 @@ static int netpoll_take_ipv4(struct netpoll *np, struct net_device *ndev) in_dev = __in_dev_get_rtnl(ndev); if (!in_dev) { np_err(np, "no IP address for %s, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); return -EDESTADDRREQ; } ifa = rtnl_dereference(in_dev->ifa_list); if (!ifa) { np_err(np, "no IP address for %s, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); return -EDESTADDRREQ; } @@ -736,7 +738,8 @@ int netpoll_setup(struct netpoll *np) ndev = dev_getbyhwaddr(net, ARPHRD_ETHER, np->dev_mac); if (!ndev) { - np_err(np, "%s doesn't exist, aborting\n", egress_dev(np, buf)); + np_err(np, "%s doesn't exist, aborting\n", + egress_dev(np, buf, sizeof(buf))); err = -ENODEV; goto unlock; } @@ -744,14 +747,14 @@ int netpoll_setup(struct netpoll *np) if (netdev_master_upper_dev_get(ndev)) { np_err(np, "%s is a slave device, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); err = -EBUSY; goto put; } if (!netif_running(ndev)) { np_info(np, "device %s not up yet, forcing it\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); err = dev_open(ndev, NULL); if (err) { From 36bdc0e815b4e8a05b9028d8ef8a25e1ead35cc1 Mon Sep 17 00:00:00 2001 From: Markus Baier Date: Fri, 1 May 2026 18:39:41 +0200 Subject: [PATCH 319/972] net: usb: asix: ax88772: re-add usbnet_link_change() in phylink callbacks Commit e0bffe3e6894 ("net: asix: ax88772: migrate to phylink") replaced the asix_adjust_link() PHY callback with phylink's mac_link_up() and mac_link_down() handlers, but did not carry over the usbnet_link_change() notification that commit 805206e66fab ("net: asix: fix "can't send until first packet is send" issue") had added. As a result, the original symptom returns: when the link comes up, usbnet is never notified, so the RX URB submission stays dormant until some other event (e.g. a transmitted packet triggering the status endpoint interrupt) wakes it up. This is reproducible with the Apple A1277 USB Ethernet Adapter (05ac:1402, AX88772A based) on a Banana Pro using a static IPv4 configuration. After bringing the interface up, no incoming packets are received until the first outgoing frame triggers usbnet's RX path. Restore the link change notification, gated on a carrier transition so the call remains idempotent if the status endpoint also reports the change later. Fixes: e0bffe3e6894 ("net: asix: ax88772: migrate to phylink") Signed-off-by: Markus Baier Tested-by: Oleksij Rempel Link: https://patch.msgid.link/20260501163941.107668-1-Markus.Baier@soslab.tu-darmstadt.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/asix_devices.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c index df0bcfedddbc36..293ef80c4e302e 100644 --- a/drivers/net/usb/asix_devices.c +++ b/drivers/net/usb/asix_devices.c @@ -756,6 +756,7 @@ static void ax88772_mac_link_down(struct phylink_config *config, struct usbnet *dev = netdev_priv(to_net_dev(config->dev)); asix_write_medium_mode(dev, 0, 0); + usbnet_link_change(dev, false, false); } static void ax88772_mac_link_up(struct phylink_config *config, @@ -786,6 +787,7 @@ static void ax88772_mac_link_up(struct phylink_config *config, m |= AX_MEDIUM_RFC; asix_write_medium_mode(dev, m, 0); + usbnet_link_change(dev, true, false); } static const struct phylink_mac_ops ax88772_phylink_mac_ops = { From 059b7dbd20a6f0c539a45ddff1573cb8946685b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2026 12:26:52 +0000 Subject: [PATCH 320/972] vsock/virtio: fix potential unbounded skb queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit virtio_transport_inc_rx_pkt() checks vvs->rx_bytes + len > vvs->buf_alloc. virtio_transport_recv_enqueue() skips coalescing for packets with VIRTIO_VSOCK_SEQ_EOM. If fed with packets with len == 0 and VIRTIO_VSOCK_SEQ_EOM, a very large number of packets can be queued because vvs->rx_bytes stays at 0. Fix this by estimating the skb metadata size: (Number of skbs in the queue) * SKB_TRUESIZE(0) Fixes: 077706165717 ("virtio/vsock: don't use skbuff state to account credit") Signed-off-by: Eric Dumazet Cc: Arseniy Krasnov Cc: Stefan Hajnoczi Cc: Stefano Garzarella Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Xuan Zhuo Cc: "Eugenio Pérez" Cc: virtualization@lists.linux.dev Link: https://patch.msgid.link/20260430122653.554058-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/virtio_transport_common.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 416d533f493d7b..9b8014516f4fb1 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -447,7 +447,9 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, u32 len) { - if (vvs->buf_used + len > vvs->buf_alloc) + u64 skb_overhead = (skb_queue_len(&vvs->rx_queue) + 1) * SKB_TRUESIZE(0); + + if (skb_overhead + vvs->buf_used + len > vvs->buf_alloc) return false; vvs->rx_bytes += len; From c4a99a921949cddc590b22bb14eeb23dffcc3ba6 Mon Sep 17 00:00:00 2001 From: Shardul Bankar Date: Fri, 1 May 2026 21:35:34 +0200 Subject: [PATCH 321/972] mptcp: use MPJoinSynAckHMacFailure for SynAck HMAC failure In subflow_finish_connect(), HMAC validation of the server's HMAC in SYN/ACK + MP_JOIN increments MPTCP_MIB_JOINACKMAC ("HMAC was wrong on ACK + MP_JOIN") on failure. The function processes the SYN/ACK, not the ACK; the matching MPTCP_MIB_JOINSYNACKMAC counter ("HMAC was wrong on SYN/ACK + MP_JOIN") exists but is not incremented anywhere in the tree. The mirror site on the server, subflow_syn_recv_sock(), already uses JOINACKMAC correctly for ACK HMAC failure. Use JOINSYNACKMAC at the SYN/ACK validation site so each counter reflects the packet whose HMAC actually failed. Suggested-by: Matthieu Baerts (NGI0) Fixes: fc518953bc9c ("mptcp: add and use MIB counter infrastructure") Cc: stable@vger.kernel.org Signed-off-by: Shardul Bankar Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260501-net-mptcp-misc-fixes-7-1-rc3-v1-1-b70118df778e@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index e2cb9d23e4a0b2..bda6862264ca4a 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -581,7 +581,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->backup); if (!subflow_thmac_valid(subflow)) { - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKMAC); subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } From a6da02d4c00fdda2417e42ad2b762a9209e6cc49 Mon Sep 17 00:00:00 2001 From: Shardul Bankar Date: Fri, 1 May 2026 21:35:35 +0200 Subject: [PATCH 322/972] mptcp: use MPTCP_RST_EMPTCP for ACK HMAC validation failure When HMAC validation fails on a received ACK + MP_JOIN in subflow_syn_recv_sock(), the subflow is reset with reason MPTCP_RST_EPROHIBIT ("Administratively prohibited"). This is incorrect: HMAC validation failure is an MPTCP protocol-level error, not an administrative policy denial. The mirror site on the client, in subflow_finish_connect(), already uses MPTCP_RST_EMPTCP ("MPTCP-specific error") for the same kind of HMAC failure on the SYN/ACK + MP_JOIN. Use the same reason on the server side for symmetry and accuracy. Suggested-by: Matthieu Baerts (NGI0) Fixes: 443041deb5ef ("mptcp: fix NULL pointer in can_accept_new_subflow") Cc: stable@vger.kernel.org Signed-off-by: Shardul Bankar Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260501-net-mptcp-misc-fixes-7-1-rc3-v1-2-b70118df778e@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index bda6862264ca4a..d562e149606f60 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -908,7 +908,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, if (!subflow_hmac_valid(subflow_req, &mp_opt)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); - subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); + subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); goto dispose_child; } From 6254a16d6f0c672e3809ca5d7c9a28a55d71f764 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 1 May 2026 21:35:36 +0200 Subject: [PATCH 323/972] mptcp: fix rx timestamp corruption on fastopen The skb cb offset containing the timestamp presence flag is cleared before loading such information. Cache such value before MPTCP CB initialization. Fixes: 36b122baf6a8 ("mptcp: add subflow_v(4,6)_send_synack()") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260501-net-mptcp-misc-fixes-7-1-rc3-v1-3-b70118df778e@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/fastopen.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c index 82ec15bcfd7f56..082c46c0f50ee7 100644 --- a/net/mptcp/fastopen.c +++ b/net/mptcp/fastopen.c @@ -12,6 +12,7 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf struct sock *sk, *ssk; struct sk_buff *skb; struct tcp_sock *tp; + bool has_rxtstamp; /* on early fallback the subflow context is deleted by * subflow_syn_recv_sock() @@ -40,12 +41,13 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf */ tp->copied_seq += skb->len; subflow->ssn_offset += skb->len; + has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; /* Only the sequence delta is relevant */ MPTCP_SKB_CB(skb)->map_seq = -skb->len; MPTCP_SKB_CB(skb)->end_seq = 0; MPTCP_SKB_CB(skb)->offset = 0; - MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; + MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; MPTCP_SKB_CB(skb)->cant_coalesce = 1; mptcp_data_lock(sk); From 70ece9d7021c54cf40c72b31b066e9088f5f75f5 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 1 May 2026 21:35:37 +0200 Subject: [PATCH 324/972] mptcp: sockopt: increase seq in mptcp_setsockopt_all_sf mptcp_setsockopt_all_sf() was missing a call to sockopt_seq_inc(). This is required not to cause missing synchronization for newer subflows created later on. This helper is called each time a socket option is set on subflows, and future ones will need to inherit this option after their creation. Fixes: 51c5fd09e1b4 ("mptcp: add TCP_MAXSEG sockopt support") Cc: stable@vger.kernel.org Suggested-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260501-net-mptcp-misc-fixes-7-1-rc3-v1-4-b70118df778e@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/sockopt.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 0efe40be2fde07..1cf608e7357bda 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -812,6 +812,10 @@ static int mptcp_setsockopt_all_sf(struct mptcp_sock *msk, int level, if (ret) break; } + + if (!ret) + sockopt_seq_inc(msk); + return ret; } From ac0841d7d202073415c808bda7848502163b87dd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 2 May 2026 12:41:02 +0000 Subject: [PATCH 325/972] net: prevent possible UAF in rtnl_prop_list_size() I was mistaken by synchronize_rcu() [1] call in netdev_name_node_alt_destroy(), giving a false sense of RCU safety at delete times. We have to use list_del_rcu() to not confuse potential readers in rtnl_prop_list_size(). [1] This synchronize_rcu() call was later removed in commit 723de3ebef03 ("net: free altname using an RCU callback"). Fixes: 9f30831390ed ("net: add rcu safety to rtnl_prop_list_size()") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260502124102.499204-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 06c195906231a3..8bfa8313ef62ed 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -371,7 +371,7 @@ static void netdev_name_node_alt_free(struct rcu_head *head) static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) { netdev_name_node_del(name_node); - list_del(&name_node->list); + list_del_rcu(&name_node->list); call_rcu(&name_node->rcu, netdev_name_node_alt_free); } From 30cb24f97d44f6b81c14b85c5323de62eef1fb7f Mon Sep 17 00:00:00 2001 From: David Carlier Date: Sat, 2 May 2026 15:19:45 +0100 Subject: [PATCH 326/972] psp: strip variable-length PSP header in psp_dev_rcv() psp_dev_rcv() unconditionally removes a fixed PSP_ENCAP_HLEN, even when psph->hdrlen indicates that the PSP header carries optional fields. A frame whose PSP header advertises a non-zero VC or any extension would therefore be silently mis-decapsulated: option bytes would spill into the inner packet head and downstream parsing would fail on a corrupted skb. Compute the full PSP header length from psph->hdrlen, pull the optional bytes into the linear region, and strip the whole header when decapsulating. Optional fields (VC, ...) are still ignored, just discarded with the rest of the header instead of leaking. crypt_offset and the VIRT flag are intentionally not validated here - callers know their device's PSP implementation and can decide. Both in-tree callers gate on hardware-validated PSP, so this is a correctness fix rather than a reachable corruption path under current configurations. Fixes: 0eddb8023cee ("psp: provide decapsulation and receive helper for drivers") Reviewed-by: Willem de Bruijn Reviewed-by: Daniel Zahka Cc: stable@vger.kernel.org Signed-off-by: David Carlier Link: https://patch.msgid.link/20260502141945.14484-1-devnexen@gmail.com Signed-off-by: Jakub Kicinski --- net/psp/psp_main.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index 9508b6c380038b..e45549f08eef8a 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -263,15 +263,16 @@ EXPORT_SYMBOL(psp_dev_encapsulate); /* Receive handler for PSP packets. * - * Presently it accepts only already-authenticated packets and does not - * support optional fields, such as virtualization cookies. The caller should - * ensure that skb->data is pointing to the mac header, and that skb->mac_len - * is set. This function does not currently adjust skb->csum (CHECKSUM_COMPLETE - * is not supported). + * Accepts only already-authenticated packets. The full PSP header is + * stripped according to psph->hdrlen; any optional fields it advertises + * (virtualization cookies, etc.) are ignored and discarded along with the + * rest of the header. The caller should ensure that skb->data is pointing + * to the mac header, and that skb->mac_len is set. This function does not + * currently adjust skb->csum (CHECKSUM_COMPLETE is not supported). */ int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv) { - int l2_hlen = 0, l3_hlen, encap; + int l2_hlen = 0, l3_hlen, encap, psp_hlen; struct psp_skb_ext *pse; struct psphdr *psph; struct ethhdr *eth; @@ -312,18 +313,36 @@ int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv) if (unlikely(uh->dest != htons(PSP_DEFAULT_UDP_PORT))) return -EINVAL; - pse = skb_ext_add(skb, SKB_EXT_PSP); - if (!pse) + psph = (struct psphdr *)(skb->data + l2_hlen + l3_hlen + + sizeof(struct udphdr)); + + /* Strip the full PSP header per psph->hdrlen; VC/options are pulled + * into the linear region only so they can be discarded with the + * rest of the header. + */ + psp_hlen = (psph->hdrlen + 1) * 8; + + if (unlikely(psp_hlen < sizeof(struct psphdr))) + return -EINVAL; + + if (psp_hlen > sizeof(struct psphdr) && + !pskb_may_pull(skb, l2_hlen + l3_hlen + + sizeof(struct udphdr) + psp_hlen)) return -EINVAL; psph = (struct psphdr *)(skb->data + l2_hlen + l3_hlen + sizeof(struct udphdr)); + + pse = skb_ext_add(skb, SKB_EXT_PSP); + if (!pse) + return -EINVAL; + pse->spi = psph->spi; pse->dev_id = dev_id; pse->generation = generation; pse->version = FIELD_GET(PSPHDR_VERFL_VERSION, psph->verfl); - encap = PSP_ENCAP_HLEN; + encap = sizeof(struct udphdr) + psp_hlen; encap += strip_icv ? PSP_TRL_SIZE : 0; if (proto == htons(ETH_P_IP)) { @@ -340,8 +359,9 @@ int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv) ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) - encap); } - memmove(skb->data + PSP_ENCAP_HLEN, skb->data, l2_hlen + l3_hlen); - skb_pull(skb, PSP_ENCAP_HLEN); + memmove(skb->data + sizeof(struct udphdr) + psp_hlen, + skb->data, l2_hlen + l3_hlen); + skb_pull(skb, sizeof(struct udphdr) + psp_hlen); if (strip_icv) pskb_trim(skb, skb->len - PSP_TRL_SIZE); From a6039776c7994dd0b9a4acce23a3f897d1688cbf Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 2 May 2026 18:07:47 +0000 Subject: [PATCH 327/972] ipmr: Add __rcu to netns_ipv4.mrt. kernel test robot reported this Sparse warning: $ make C=1 net/ipv4/ipmr.o net/ipv4/ipmr.c:312:24: error: incompatible types in comparison expression (different address spaces): net/ipv4/ipmr.c:312:24: struct mr_table [noderef] __rcu * net/ipv4/ipmr.c:312:24: struct mr_table * Let's add __rcu annotation to netns_ipv4.mrt. Fixes: b3b6babf4751 ("ipmr: Free mr_table after RCU grace period.") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202605030032.glNApko7-lkp@intel.com/ Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260502180755.359554-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 80ccd4dda8e0fc..6e27c56514df57 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -275,7 +275,7 @@ struct netns_ipv4 { #ifdef CONFIG_IP_MROUTE #ifndef CONFIG_IP_MROUTE_MULTIPLE_TABLES - struct mr_table *mrt; + struct mr_table __rcu *mrt; #else struct list_head mr_tables; struct fib_rules_ops *mr_rules_ops; From 07d99587396024932e02474c3a5bede71d108454 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Sat, 2 May 2026 11:55:02 +0100 Subject: [PATCH 328/972] net: dsa: mt7530: fix .get_stats64 sleeping in atomic context The .get_stats64 callback runs in atomic context, but on MDIO-connected switches every register read acquires the MDIO bus mutex, which can sleep: [ 12.645973] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:609 [ 12.654442] in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 759, name: grep [ 12.663377] preempt_count: 0, expected: 0 [ 12.667410] RCU nest depth: 1, expected: 0 [ 12.671511] INFO: lockdep is turned off. [ 12.675441] CPU: 0 UID: 0 PID: 759 Comm: grep Tainted: G S W 7.0.0+ #0 PREEMPT [ 12.675453] Tainted: [S]=CPU_OUT_OF_SPEC, [W]=WARN [ 12.675456] Hardware name: Bananapi BPI-R64 (DT) [ 12.675459] Call trace: [ 12.675462] show_stack+0x14/0x1c (C) [ 12.675477] dump_stack_lvl+0x68/0x8c [ 12.675487] dump_stack+0x14/0x1c [ 12.675495] __might_resched+0x14c/0x220 [ 12.675504] __might_sleep+0x44/0x80 [ 12.675511] __mutex_lock+0x50/0xb10 [ 12.675523] mutex_lock_nested+0x20/0x30 [ 12.675532] mt7530_get_stats64+0x40/0x2ac [ 12.675542] dsa_user_get_stats64+0x2c/0x40 [ 12.675553] dev_get_stats+0x44/0x1e0 [ 12.675564] dev_seq_printf_stats+0x24/0xe0 [ 12.675575] dev_seq_show+0x14/0x3c [ 12.675583] seq_read_iter+0x37c/0x480 [ 12.675595] seq_read+0xd0/0xec [ 12.675605] proc_reg_read+0x94/0xe4 [ 12.675615] vfs_read+0x98/0x29c [ 12.675625] ksys_read+0x54/0xdc [ 12.675633] __arm64_sys_read+0x18/0x20 [ 12.675642] invoke_syscall.constprop.0+0x54/0xec [ 12.675653] do_el0_svc+0x3c/0xb4 [ 12.675662] el0_svc+0x38/0x200 [ 12.675670] el0t_64_sync_handler+0x98/0xdc [ 12.675679] el0t_64_sync+0x158/0x15c For MDIO-connected switches, poll MIB counters asynchronously using a delayed workqueue every second and let .get_stats64 return the cached values under a spinlock. A mod_delayed_work() call on each read triggers an immediate refresh so counters stay responsive when queried more frequently. MMIO-connected switches (MT7988, EN7581, AN7583) are not affected because their regmap does not sleep, so they continue to read MIB counters directly in .get_stats64. Fixes: 88c810f35ed5 ("net: dsa: mt7530: implement .get_stats64") Signed-off-by: Daniel Golle Acked-by: Chester A. Unal Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/6940b913da2c29156f0feff74b678d3c526ee84c.1777719253.git.daniel@makrotopia.org Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mt7530.c | 75 ++++++++++++++++++++++++++++++++++++++-- drivers/net/dsa/mt7530.h | 8 +++++ 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index b9423389c2ef0b..44d670904ad893 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -25,6 +25,9 @@ #include "mt7530.h" +#define MT7530_STATS_POLL_INTERVAL (1 * HZ) +#define MT7530_STATS_RATE_LIMIT (HZ / 10) + static struct mt753x_pcs *pcs_to_mt753x_pcs(struct phylink_pcs *pcs) { return container_of(pcs, struct mt753x_pcs, pcs); @@ -906,10 +909,9 @@ static void mt7530_get_rmon_stats(struct dsa_switch *ds, int port, *ranges = mt7530_rmon_ranges; } -static void mt7530_get_stats64(struct dsa_switch *ds, int port, - struct rtnl_link_stats64 *storage) +static void mt7530_read_port_stats64(struct mt7530_priv *priv, int port, + struct rtnl_link_stats64 *storage) { - struct mt7530_priv *priv = ds->priv; uint64_t data; /* MIB counter doesn't provide a FramesTransmittedOK but instead @@ -951,6 +953,54 @@ static void mt7530_get_stats64(struct dsa_switch *ds, int port, &storage->rx_crc_errors); } +static void mt7530_stats_refresh(struct mt7530_priv *priv) +{ + struct rtnl_link_stats64 stats = {}; + struct dsa_port *dp; + int port; + + dsa_switch_for_each_user_port(dp, priv->ds) { + port = dp->index; + + mt7530_read_port_stats64(priv, port, &stats); + + spin_lock_bh(&priv->stats_lock); + priv->ports[port].stats = stats; + priv->stats_last = jiffies; + spin_unlock_bh(&priv->stats_lock); + } +} + +static void mt7530_stats_poll(struct work_struct *work) +{ + struct mt7530_priv *priv = container_of(work, struct mt7530_priv, + stats_work.work); + + mt7530_stats_refresh(priv); + schedule_delayed_work(&priv->stats_work, + MT7530_STATS_POLL_INTERVAL); +} + +static void mt7530_get_stats64(struct dsa_switch *ds, int port, + struct rtnl_link_stats64 *storage) +{ + struct mt7530_priv *priv = ds->priv; + bool refresh; + + if (priv->bus) { + spin_lock_bh(&priv->stats_lock); + *storage = priv->ports[port].stats; + refresh = time_after(jiffies, priv->stats_last + + MT7530_STATS_RATE_LIMIT); + spin_unlock_bh(&priv->stats_lock); + if (refresh) + mod_delayed_work(system_percpu_wq, + &priv->stats_work, 0); + } else { + mt7530_read_port_stats64(priv, port, storage); + } +} + static void mt7530_get_eth_ctrl_stats(struct dsa_switch *ds, int port, struct ethtool_eth_ctrl_stats *ctrl_stats) { @@ -3137,9 +3187,24 @@ mt753x_setup(struct dsa_switch *ds) if (ret && priv->irq_domain) mt7530_free_mdio_irq(priv); + if (!ret && priv->bus) { + mt7530_stats_refresh(priv); + schedule_delayed_work(&priv->stats_work, + MT7530_STATS_POLL_INTERVAL); + } + return ret; } +static void +mt753x_teardown(struct dsa_switch *ds) +{ + struct mt7530_priv *priv = ds->priv; + + if (priv->bus) + cancel_delayed_work_sync(&priv->stats_work); +} + static int mt753x_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e) { @@ -3257,6 +3322,7 @@ static int mt7988_setup(struct dsa_switch *ds) static const struct dsa_switch_ops mt7530_switch_ops = { .get_tag_protocol = mtk_get_tag_protocol, .setup = mt753x_setup, + .teardown = mt753x_teardown, .preferred_default_local_cpu_port = mt753x_preferred_default_local_cpu_port, .get_strings = mt7530_get_strings, .get_ethtool_stats = mt7530_get_ethtool_stats, @@ -3395,6 +3461,9 @@ mt7530_probe_common(struct mt7530_priv *priv) priv->ds->ops = &mt7530_switch_ops; priv->ds->phylink_mac_ops = &mt753x_phylink_mac_ops; mutex_init(&priv->reg_mutex); + spin_lock_init(&priv->stats_lock); + INIT_DELAYED_WORK(&priv->stats_work, mt7530_stats_poll); + dev_set_drvdata(dev, priv); return 0; diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index 3e0090bed298de..dd33b0df3419e3 100644 --- a/drivers/net/dsa/mt7530.h +++ b/drivers/net/dsa/mt7530.h @@ -796,6 +796,7 @@ struct mt7530_fdb { * @pvid: The VLAN specified is to be considered a PVID at ingress. Any * untagged frames will be assigned to the related VLAN. * @sgmii_pcs: Pointer to PCS instance for SerDes ports + * @stats: Cached port statistics for MDIO-connected switches */ struct mt7530_port { bool enable; @@ -803,6 +804,7 @@ struct mt7530_port { u32 pm; u16 pvid; struct phylink_pcs *sgmii_pcs; + struct rtnl_link_stats64 stats; }; /* Port 5 mode definitions of the MT7530 switch */ @@ -875,6 +877,9 @@ struct mt753x_info { * @create_sgmii: Pointer to function creating SGMII PCS instance(s) * @active_cpu_ports: Holding the active CPU ports * @mdiodev: The pointer to the MDIO device structure + * @stats_lock: Protects cached per-port stats from concurrent access + * @stats_work: Delayed work for polling MIB counters on MDIO switches + * @stats_last: Jiffies timestamp of last MIB counter poll */ struct mt7530_priv { struct device *dev; @@ -900,6 +905,9 @@ struct mt7530_priv { int (*create_sgmii)(struct mt7530_priv *priv); u8 active_cpu_ports; struct mdio_device *mdiodev; + spinlock_t stats_lock; /* protects cached stats counters */ + struct delayed_work stats_work; + unsigned long stats_last; }; struct mt7530_hw_vlan_entry { From f4c50a4034e62ab75f1d5cdd191dd5f9c77fdff4 Mon Sep 17 00:00:00 2001 From: Kuan-Ting Chen Date: Mon, 4 May 2026 23:27:12 +0800 Subject: [PATCH 329/972] xfrm: esp: avoid in-place decrypt on shared skb frags MSG_SPLICE_PAGES can attach pages from a pipe directly to an skb. TCP marks such skbs with SKBFL_SHARED_FRAG after skb_splice_from_iter(), so later paths that may modify packet data can first make a private copy. The IPv4/IPv6 datagram append paths did not set this flag when splicing pages into UDP skbs. That leaves an ESP-in-UDP packet made from shared pipe pages looking like an ordinary uncloned nonlinear skb. ESP input then takes the no-COW fast path for uncloned skbs without a frag_list and decrypts in place over data that is not owned privately by the skb. Mark IPv4/IPv6 datagram splice frags with SKBFL_SHARED_FRAG, matching TCP. Also make ESP input fall back to skb_cow_data() when the flag is present, so ESP does not decrypt externally backed frags in place. Private nonlinear skb frags still use the existing fast path. This intentionally does not change ESP output. In esp_output_head(), the path that appends the ESP trailer to existing skb tailroom without calling skb_cow_data() is not reachable for nonlinear skbs: skb_tailroom() returns zero when skb->data_len is nonzero, while ESP tailen is positive. Thus ESP output will either use the separate destination-frag path or fall back to skb_cow_data(). Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Fixes: 7da0dde68486 ("ip, udp: Support MSG_SPLICE_PAGES") Fixes: 6d8192bd69bb ("ip6, udp6: Support MSG_SPLICE_PAGES") Reported-by: Hyunwoo Kim Reported-by: Kuan-Ting Chen Tested-by: Hyunwoo Kim Cc: stable@vger.kernel.org Signed-off-by: Kuan-Ting Chen Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 3 ++- net/ipv4/ip_output.c | 2 ++ net/ipv6/esp6.c | 3 ++- net/ipv6/ip6_output.c | 2 ++ 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 6dfc0bcdef6542..6a5febbdbee493 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -873,7 +873,8 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) nfrags = 1; goto skip_cow; - } else if (!skb_has_frag_list(skb)) { + } else if (!skb_has_frag_list(skb) && + !skb_has_shared_frag(skb)) { nfrags = skb_shinfo(skb)->nr_frags; nfrags++; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e4790cc7b5c2ec..5bcd73cbdb41c0 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1233,6 +1233,8 @@ static int __ip_append_data(struct sock *sk, if (err < 0) goto error; copy = err; + if (!(flags & MSG_NO_SHARED_FRAGS)) + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; wmem_alloc_delta += copy; } else if (!zc) { int i = skb_shinfo(skb)->nr_frags; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 9f75313734f8cd..9c06c5a1419dc4 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -915,7 +915,8 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) nfrags = 1; goto skip_cow; - } else if (!skb_has_frag_list(skb)) { + } else if (!skb_has_frag_list(skb) && + !skb_has_shared_frag(skb)) { nfrags = skb_shinfo(skb)->nr_frags; nfrags++; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7e92909ab5be3f..1f2a33fbed6e63 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1794,6 +1794,8 @@ static int __ip6_append_data(struct sock *sk, if (err < 0) goto error; copy = err; + if (!(flags & MSG_NO_SHARED_FRAGS)) + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; wmem_alloc_delta += copy; } else if (!zc) { int i = skb_shinfo(skb)->nr_frags; From aab3d205a086233c612fee86009265451793e0c2 Mon Sep 17 00:00:00 2001 From: Juha-Pekka Heikkila Date: Mon, 27 Apr 2026 19:57:15 +0300 Subject: [PATCH 330/972] drm/i915/display: enable ccs modifiers on dg2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since Xe driver aux ccs enablement dg2 ccs modifiers have been disabled on i915 driver. Here allow dg2 to use ccs again for framebuffers. Fixes: 6a99e91a6ca8 ("drm/i915/display: Detect AuxCCS support via display parent interface") Signed-off-by: Juha-Pekka Heikkila Reviewed-by: Ville Syrjälä Signed-off-by: Mika Kahola Link: https://patch.msgid.link/20260427165715.864721-1-juhapekka.heikkila@gmail.com (cherry picked from commit aee13ba1448213975f36942ba5d1ce693eb5c002) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_driver.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c index 385a634c3ed004..d9be7a5a239c16 100644 --- a/drivers/gpu/drm/i915/i915_driver.c +++ b/drivers/gpu/drm/i915/i915_driver.c @@ -750,9 +750,8 @@ static bool has_auxccs(struct drm_device *drm) { struct drm_i915_private *i915 = to_i915(drm); - return IS_GRAPHICS_VER(i915, 9, 12) || - IS_ALDERLAKE_P(i915) || - IS_METEORLAKE(i915); + return IS_GRAPHICS_VER(i915, 9, 12) && + !HAS_FLAT_CCS(i915); } static bool has_fenced_regions(struct drm_device *drm) From 2c340aab5485ebe9e33c01437dd4815ef33c8df5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 1 May 2026 09:16:38 +0200 Subject: [PATCH 331/972] x86/efi: Restore IRQ state in EFI page fault handler The kernel's softirq API does not permit re-enabling softirqs while IRQs are disabled. The reason for this is that local_bh_enable() will not only re-enable delivery of softirqs over the back of IRQs, it will also handle any pending softirqs immediately, regardless of whether IRQs are enabled at that point. For this reason, commit d02198550423 ("x86/fpu: Improve crypto performance by making kernel-mode FPU reliably usable in softirqs") disables softirqs only when IRQs are enabled, as it is not permitted otherwise, but also unnecessary, given that asynchronous softirq delivery never happens to begin with while IRQs are disabled. However, this does mean that entering a kernel mode FPU section with IRQs enabled and leaving it with IRQs disabled leads to problems, as identified by Sashiko [0]: the EFI page fault handler is called from page_fault_oops() with IRQs disabled, and thus ends the kernel mode FPU section with IRQs disabled as well, regardless of whether IRQs were enabled when it was started. This may result in schedule() being called with a non-zero preempt_count, causing a BUG(). So take care to re-enable IRQs when handling any EFI page faults if they were taken with IRQs enabled. [0] https://sashiko.dev/#/patchset/20260430074107.27051-1-ivan.hu%40canonical.com Cc: Eric Biggers Cc: Ivan Hu Cc: x86@kernel.org Cc: Fixes: d02198550423 ("x86/fpu: Improve crypto performance by making kernel-mode FPU reliably usable in softirqs") Reviewed-by: Eric Biggers Signed-off-by: Ard Biesheuvel --- arch/x86/include/asm/efi.h | 3 ++- arch/x86/mm/fault.c | 2 +- arch/x86/platform/efi/quirks.c | 11 ++++++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index dc8fe1361c18eb..be58b7f5c80634 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -137,7 +137,8 @@ extern void __init efi_dump_pagetable(void); extern void __init efi_apply_memmap_quirks(void); extern int __init efi_reuse_config(u64 tables, int nr_tables); extern void efi_delete_dummy_variable(void); -extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr); +extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr, + const struct pt_regs *regs); extern void efi_unmap_boot_services(void); void arch_efi_call_virt_setup(void); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f0e77e08448207..63de8e8684f237 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -686,7 +686,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code, * avoid hanging the system. */ if (IS_ENABLED(CONFIG_EFI)) - efi_crash_gracefully_on_page_fault(address); + efi_crash_gracefully_on_page_fault(address, regs); /* Only not-present faults should be handled by KFENCE. */ if (!(error_code & X86_PF_PROT) && diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index c8f5e094ed9d74..90a065fcb1fab2 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -761,7 +761,8 @@ int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff, * @return: Returns, if the page fault is not handled. This function * will never return if the page fault is handled successfully. */ -void efi_crash_gracefully_on_page_fault(unsigned long phys_addr) +void efi_crash_gracefully_on_page_fault(unsigned long phys_addr, + const struct pt_regs *regs) { if (!IS_ENABLED(CONFIG_X86_64)) return; @@ -810,6 +811,14 @@ void efi_crash_gracefully_on_page_fault(unsigned long phys_addr) return; } + /* + * The API does not permit entering a kernel mode FPU section with + * interrupts enabled and leaving it with interrupts disabled. So + * re-enable interrupts now if they were enabled when the page fault + * occurred. + */ + local_irq_restore(regs->flags); + /* * Before calling EFI Runtime Service, the kernel has switched the * calling process to efi_mm. Hence, switch back to task_mm. From 212ec34e4e726e8cd4af7bea4740db24de8a9dab Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 4 May 2026 08:34:32 -0600 Subject: [PATCH 332/972] block: only read from sqe on initial invocation of blkdev_uring_cmd() This passthrough helper currently only supports discards. Part of that command is the start and length, which is read from the SQE. It does so on every invocation, where it really should just make it stable on the first invocation. This avoids needing to copy the SQE upfront, as we only really need those two 8b values stored in our per-req payload. Cc: stable@vger.kernel.org # 6.17+ Signed-off-by: Jens Axboe --- block/ioctl.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index fc3be0549aa754..ab2c9ed7994683 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -857,6 +857,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) #endif struct blk_iou_cmd { + u64 start; + u64 len; int res; bool nowait; }; @@ -946,23 +948,27 @@ int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct block_device *bdev = I_BDEV(cmd->file->f_mapping->host); struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); - const struct io_uring_sqe *sqe = cmd->sqe; u32 cmd_op = cmd->cmd_op; - uint64_t start, len; - if (unlikely(sqe->ioprio || sqe->__pad1 || sqe->len || - sqe->rw_flags || sqe->file_index)) - return -EINVAL; + /* Read what we need from the SQE on the first issue */ + if (!(issue_flags & IORING_URING_CMD_REISSUE)) { + const struct io_uring_sqe *sqe = cmd->sqe; + + if (unlikely(sqe->ioprio || sqe->__pad1 || sqe->len || + sqe->rw_flags || sqe->file_index)) + return -EINVAL; + + bic->start = READ_ONCE(sqe->addr); + bic->len = READ_ONCE(sqe->addr3); + } bic->res = 0; bic->nowait = issue_flags & IO_URING_F_NONBLOCK; - start = READ_ONCE(sqe->addr); - len = READ_ONCE(sqe->addr3); - switch (cmd_op) { case BLOCK_URING_CMD_DISCARD: - return blkdev_cmd_discard(cmd, bdev, start, len, bic->nowait); + return blkdev_cmd_discard(cmd, bdev, bic->start, bic->len, + bic->nowait); } return -EINVAL; } From 09ae540e1d5c02210795911bf5459282d7af04e9 Mon Sep 17 00:00:00 2001 From: Mikhail Gavrilov Date: Thu, 23 Apr 2026 02:33:49 +0500 Subject: [PATCH 333/972] rhashtable: drop ht->mutex in rhashtable_free_and_destroy() rhashtable_free_and_destroy() is a single-shot teardown routine: cancel_work_sync() has already quiesced the deferred rehash worker, and the function's documented contract requires the caller to guarantee no other concurrent access to the rhashtable. Under those conditions ht->mutex is not protecting anything -- taking it is a leftover from the original teardown path. That leftover is actively harmful: it closes a circular lock-class dependency with fs_reclaim. The deferred rehash worker takes ht->mutex and then allocates GFP_KERNEL memory in bucket_table_alloc(), establishing &ht->mutex -> fs_reclaim After commit b32c4a213698 ("xattr: add rhashtable-based simple_xattr infrastructure") introduced simple_xattr_ht_free(), which calls rhashtable_free_and_destroy(), the simple_xattrs teardown became reachable from evict() under the dcache shrinker. The subsequent per-subsystem adaptations made the reverse edge concrete in three independent code paths: * commit 52b364fed6e1 ("shmem: adapt to rhashtable-based simple_xattrs with lazy allocation") * commit 5bd97f5c5f24 ("kernfs: adapt to rhashtable-based simple_xattrs with lazy allocation") * commit 50704c391fbf ("pidfs: adapt to rhashtable-based simple_xattrs") Any of the three closes the cycle fs_reclaim -> &ht->mutex which lockdep reports as follows. This particular splat was observed organically on a workstation kernel built from vfs-7.1-rc1.xattr at ~35h uptime under normal mixed workload, with CONFIG_PROVE_LOCKING=y. The path happens to go through kernfs: WARNING: possible circular locking dependency detected 7.0.0-faeab166167f-with-fixes-v1+ #191 Tainted: G U kswapd0/243 is trying to acquire lock: ffff8882e475c0f8 (&ht->mutex){+.+.}-{4:4}, at: rhashtable_free_and_destroy+0x36/0x740 but task is already holding lock: ffffffffa8ad1d00 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat+0x995/0x1600 the existing dependency chain (in reverse order) is: -> #1 (fs_reclaim){+.+.}-{0:0}: __lock_acquire+0x506/0xbf0 lock_acquire.part.0+0xc7/0x280 fs_reclaim_acquire+0xd9/0x130 __kvmalloc_node_noprof+0xcd/0xb40 bucket_table_alloc.isra.0+0x5a/0x440 rhashtable_rehash_alloc+0x4e/0xd0 rht_deferred_worker+0x14b/0x440 process_one_work+0x8fd/0x16a0 worker_thread+0x601/0xff0 kthread+0x36b/0x470 ret_from_fork+0x5bf/0x910 ret_from_fork_asm+0x1a/0x30 -> #0 (&ht->mutex){+.+.}-{4:4}: check_prev_add+0xdb/0xce0 validate_chain+0x554/0x780 __lock_acquire+0x506/0xbf0 lock_acquire.part.0+0xc7/0x280 __mutex_lock+0x1b2/0x2550 rhashtable_free_and_destroy+0x36/0x740 kernfs_put.part.0+0x119/0x570 evict+0x3b6/0x9c0 __dentry_kill+0x181/0x540 shrink_dentry_list+0x135/0x440 prune_dcache_sb+0xdb/0x150 super_cache_scan+0x2ff/0x520 do_shrink_slab+0x35a/0xee0 shrink_slab_memcg+0x457/0x950 shrink_slab+0x43b/0x550 shrink_one+0x31a/0x6f0 shrink_many+0x31e/0xc80 shrink_node+0xeb3/0x14a0 balance_pgdat+0x8ed/0x1600 kswapd+0x2f3/0x530 kthread+0x36b/0x470 ret_from_fork+0x5bf/0x910 ret_from_fork_asm+0x1a/0x30 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(&ht->mutex); lock(fs_reclaim); lock(&ht->mutex); Note that lockdep tracks lock classes, not instances: the two &ht->mutex sites are on different rhashtable objects (the deferred worker was triggered by some unrelated rhashtable growth), but because rhashtable_init() uses a single static lockdep key for all rhashtables, this is a real class-level cycle. Once reported, lockdep disables itself for the remainder of the boot, masking any subsequent locking bugs. Drop the mutex. After cancel_work_sync() the rehash worker is quiesced and, per this function's contract, no other concurrent access is possible; the tables are therefore owned exclusively by this function and can be walked without any lock held. Switch the table walks from rht_dereference() (which requires ht->mutex to be held under CONFIG_PROVE_RCU) to rcu_dereference_raw(), which has no lockdep annotation. rht_ptr_exclusive() already uses rcu_dereference_protected(p, 1) and needs no change. This is the only place in lib/rhashtable.c where &ht->mutex is acquired from a path reachable under fs_reclaim; the deferred worker is the only other site and it is the forward edge. Removing the acquisition here therefore eliminates the class cycle for all three subsystems that use simple_xattrs, not just the one in the splat above. No locking-semantics change is introduced for correct users; incorrect users would already be racing with rehash worker completion regardless of the mutex. Synthetic reproduction of the splat within a few-minute window was unsuccessful across several attempts (tmpfs and kernfs zombies via cgroupfs with open-fd-through-rmdir, with and without swap, up to ~60k reclaim-path executions of simple_xattr_ht_free() in a single run), consistent with the rare coincidence-of-edges profile of the bug: the forward edge is already registered in /proc/lockdep on any idle system via rht_deferred_worker, but the reverse edge requires evict() to complete kernfs_put()'s final release inside the fs_reclaim critical section, which in my attempts was ordered against rather than interleaved with the worker. Fixes: b32c4a213698 ("xattr: add rhashtable-based simple_xattr infrastructure") Signed-off-by: Mikhail Gavrilov Signed-off-by: Herbert Xu --- lib/rhashtable.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 7a67ef5b67b666..426d4e381f1364 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -1166,6 +1166,11 @@ static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, * This function will eventually sleep to wait for an async resize * to complete. The caller is responsible that no further write operations * occurs in parallel. + * + * After cancel_work_sync() has returned, the deferred rehash worker is + * quiesced and, per the contract above, no other concurrent access to the + * rhashtable is possible. The tables are therefore owned exclusively by + * this function and can be walked without ht->mutex held. */ void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), @@ -1177,8 +1182,15 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, irq_work_sync(&ht->run_irq_work); cancel_work_sync(&ht->run_work); - mutex_lock(&ht->mutex); - tbl = rht_dereference(ht->tbl, ht); + /* + * Do NOT take ht->mutex here. The rehash worker establishes + * ht->mutex -> fs_reclaim via GFP_KERNEL bucket allocation under + * the mutex; callers on the reclaim path (e.g. simple_xattr_ht_free() + * from evict() under the dcache shrinker for shmem/kernfs/pidfs + * inodes) would otherwise close a circular dependency + * fs_reclaim -> ht->mutex. + */ + tbl = rcu_dereference_raw(ht->tbl); restart: if (free_fn) { for (i = 0; i < tbl->size; i++) { @@ -1187,22 +1199,21 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, cond_resched(); for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)), next = !rht_is_a_nulls(pos) ? - rht_dereference(pos->next, ht) : NULL; + rcu_dereference_raw(pos->next) : NULL; !rht_is_a_nulls(pos); pos = next, next = !rht_is_a_nulls(pos) ? - rht_dereference(pos->next, ht) : NULL) + rcu_dereference_raw(pos->next) : NULL) rhashtable_free_one(ht, pos, free_fn, arg); } } - next_tbl = rht_dereference(tbl->future_tbl, ht); + next_tbl = rcu_dereference_raw(tbl->future_tbl); bucket_table_free(tbl); if (next_tbl) { tbl = next_tbl; goto restart; } - mutex_unlock(&ht->mutex); } EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy); From dad0d91cc2c3e6b6fb285ccfe7ddf71525797198 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 28 Apr 2026 18:14:18 +0200 Subject: [PATCH 334/972] mm/slab: Add kvfree_atomic() helper kvmalloc() now supports non-sleeping GFP flags, including the vmalloc fallback path. This means it may return vmalloc memory even for GFP_ATOMIC and GFP_NOWAIT allocations. Freeing such memory with kvfree() may then end up calling vfree(), which is not safe for non-sleeping contexts. Introduce kvfree_atomic() helper for such cases. It mirrors kvfree(), but uses vfree_atomic() for vmalloced memory. Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Vlastimil Babka (SUSE) Acked-by: Harry Yoo (Oracle) Signed-off-by: Herbert Xu --- include/linux/slab.h | 3 +++ mm/slub.c | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/include/linux/slab.h b/include/linux/slab.h index 15a60b501b95b6..2b5ab488e96b07 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -1234,6 +1234,9 @@ void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long alig extern void kvfree(const void *addr); DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T)) +extern void kvfree_atomic(const void *addr); +DEFINE_FREE(kvfree_atomic, void *, if (!IS_ERR_OR_NULL(_T)) kvfree_atomic(_T)) + extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); diff --git a/mm/slub.c b/mm/slub.c index 0baa906f39ab84..8f900453672968 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6882,6 +6882,22 @@ void kvfree(const void *addr) } EXPORT_SYMBOL(kvfree); +/** + * kvfree_atomic() - Free memory. + * @addr: Pointer to allocated memory. + * + * Same as kvfree(), but uses vfree_atomic() for vmalloc + * backed memory. Must not be called from NMI context. + */ +void kvfree_atomic(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree_atomic(addr); + else + kfree(addr); +} +EXPORT_SYMBOL(kvfree_atomic); + /** * kvfree_sensitive - Free a data object containing sensitive information. * @addr: address of the data object to be freed. From d1fa83ecac31093a550534a79a33bc7f4ba8fc10 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 28 Apr 2026 18:14:19 +0200 Subject: [PATCH 335/972] rhashtable: Add bucket_table_free_atomic() helper rhashtable_insert_rehash() allocates a new bucket table with GFP_ATOMIC, as it is called from an RCU read-side critical section. If rhashtable_rehash_attach() then fails, the new table is freed via kvfree(). This is unsafe, since kvfree() may fall back to vfree() for vmalloc-backed allocations, which can sleep and trigger: BUG: sleeping function called from invalid context Add bucket_table_free_atomic(), which uses kvfree_atomic() so the table can be freed safely from non-sleeping context. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Herbert Xu --- lib/rhashtable.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 426d4e381f1364..04b3a808fca9f2 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -114,6 +114,14 @@ static void bucket_table_free(const struct bucket_table *tbl) kvfree(tbl); } +static void bucket_table_free_atomic(const struct bucket_table *tbl) +{ + if (tbl->nest) + nested_bucket_table_free(tbl); + + kvfree_atomic(tbl); +} + static void bucket_table_free_rcu(struct rcu_head *head) { bucket_table_free(container_of(head, struct bucket_table, rcu)); @@ -496,7 +504,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht, err = rhashtable_rehash_attach(ht, tbl, new_tbl); if (err) { - bucket_table_free(new_tbl); + bucket_table_free_atomic(new_tbl); if (err == -EEXIST) err = 0; } else From 5f8719945244dd65b5fa06195f4600db62581610 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 5 May 2026 10:06:53 +0200 Subject: [PATCH 336/972] x86/xen: Fix a potential problem in xen_e820_resolve_conflicts() When fixing a conflict in xen_e820_resolve_conflicts(), the loop over the E820 map entries needs to be restarted, as the E820 map will have been modified by the fix. Otherwise entries might be skipped by accident. Fixes: be35d91c8880 ("xen: tolerate ACPI NVS memory overlapping with Xen allocated memory") Signed-off-by: Juergen Gross Signed-off-by: Ingo Molnar Cc: xen-devel@lists.xenproject.org Link: https://patch.msgid.link/20260505080653.197775-1-jgross@suse.com --- arch/x86/xen/setup.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ac8021c3a997e0..bb95a05259b895 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -695,17 +695,22 @@ static void __init xen_e820_resolve_conflicts(phys_addr_t start, return; end = start + size; - entry = xen_e820_table.entries; + mapcnt = 0; - for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) { + while (mapcnt < xen_e820_table.nr_entries) { + entry = xen_e820_table.entries + mapcnt; if (entry->addr >= end) return; if (entry->addr + entry->size > start && - entry->type == E820_TYPE_NVS) + entry->type == E820_TYPE_NVS) { xen_e820_swap_entry_with_ram(entry); + /* E820 map has been changed, restart loop! */ + mapcnt = 0; + continue; + } - entry++; + mapcnt++; } } From 3780c41460a9ad6d5d4c09a416765c6cc285033b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Thu, 2 Apr 2026 16:32:35 -0300 Subject: [PATCH 337/972] drm/etnaviv: Fix armed job not being pushed to the DRM scheduler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When xa_alloc_cyclic() failed in etnaviv_sched_push_job(), the error path skipped drm_sched_entity_push_job(). This is a violation of the DRM scheduler contract, as once a job has been armed with drm_sched_job_arm(), it must be pushed with drm_sched_entity_push_job(). From the DRM scheduler documentation, """ drm_sched_job_arm() is a point of no return since it initializes the fences and their sequence number etc. Once that function has been called, you *must* submit it with drm_sched_entity_push_job() and cannot simply abort it by calling drm_sched_job_cleanup(). """ Fix this by splitting the fence ID allocation into two phases: first, alloc an xarray slot before arming the job (which can fail), then fill in the actual fence with xa_store() after arming. This way, allocation failures are handled before the job is armed, and once armed, the job is always pushed to the scheduler. This also fixes a double call to drm_sched_job_cleanup(), as both etnaviv_sched_push_job() and its caller would call it on failure. Fixes: 764be12345c3 ("drm/etnaviv: convert user fence tracking to XArray") Signed-off-by: Maíra Canal Link: https://patch.msgid.link/20260402193424.2023318-1-mcanal@igalia.com Signed-off-by: Christian Gmeiner --- drivers/gpu/drm/etnaviv/etnaviv_sched.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c index df4232d7e135d1..3cc50d697c8917 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c @@ -116,16 +116,18 @@ int etnaviv_sched_push_job(struct etnaviv_gem_submit *submit) */ mutex_lock(&gpu->sched_lock); + ret = xa_alloc_cyclic(&gpu->user_fences, &submit->out_fence_id, + NULL, xa_limit_32b, &gpu->next_user_fence, + GFP_KERNEL); + if (ret < 0) + goto out_unlock; + drm_sched_job_arm(&submit->sched_job); submit->out_fence = dma_fence_get(&submit->sched_job.s_fence->finished); - ret = xa_alloc_cyclic(&gpu->user_fences, &submit->out_fence_id, - submit->out_fence, xa_limit_32b, - &gpu->next_user_fence, GFP_KERNEL); - if (ret < 0) { - drm_sched_job_cleanup(&submit->sched_job); - goto out_unlock; - } + + xa_store(&gpu->user_fences, submit->out_fence_id, + submit->out_fence, GFP_KERNEL); /* the scheduler holds on to the job now */ kref_get(&submit->refcount); From 4a142520d166f91627f27a7017525a228137c808 Mon Sep 17 00:00:00 2001 From: Jakov Novak Date: Mon, 4 May 2026 18:23:57 +0200 Subject: [PATCH 338/972] wifi: libertas: notify firmware load wait on disconnect Currently, when the firmware is not fully loaded and if_usb_disconnect is called, if_usb_prog_firmware gets stuck waiting for cardp->surprise_removed or cardp->fwdnldover while lbs_remove_card also waits for the firmware loading to be completed, which never happens. This caused the reported syzbot bug. To address this, the wake_up function call can be added in the if_usb_disconnect function which notifies the if_usb_prog_firmware thread and resolves the firmware loading. Fixes: 954ee164f4f4 ("[PATCH] libertas: reorganize and simplify init sequence") Reported-and-tested-by: syzbot+c99d17aa44dbdba16ad2@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c99d17aa44dbdba16ad2 Signed-off-by: Jakov Novak Link: https://patch.msgid.link/20260504162356.17250-2-jakovnovak30@gmail.com [fix subject] Signed-off-by: Johannes Berg --- drivers/net/wireless/marvell/libertas/if_usb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/marvell/libertas/if_usb.c b/drivers/net/wireless/marvell/libertas/if_usb.c index a00d53350fa971..5cc0c5cac25745 100644 --- a/drivers/net/wireless/marvell/libertas/if_usb.c +++ b/drivers/net/wireless/marvell/libertas/if_usb.c @@ -310,6 +310,7 @@ static void if_usb_disconnect(struct usb_interface *intf) struct lbs_private *priv = cardp->priv; cardp->surprise_removed = 1; + wake_up(&cardp->fw_wq); if (priv) { lbs_stop_card(priv); From e9e334f8063a991b4f648b8dbb8dac44cf810540 Mon Sep 17 00:00:00 2001 From: Dipayaan Roy Date: Wed, 29 Apr 2026 20:57:52 -0700 Subject: [PATCH 339/972] net: mana: check xdp_rxq registration before unreg in mana_destroy_rxq() When mana_create_rxq() fails at mana_create_wq_obj() or any step before xdp_rxq_info_reg() is called, the error path jumps to `out:` which calls mana_destroy_rxq(). mana_destroy_rxq() unconditionally calls xdp_rxq_info_unreg() on xilinx xdp_rxq that was never registered, triggering a WARN_ON in net/core/xdp.c: mana 7870:00:00.0: HWC: Failed hw_channel req: 0xc000009a mana 7870:00:00.0 eth7: Failed to create RXQ: err = -71 Driver BUG WARNING: CPU: 442 PID: 491615 at ../net/core/xdp.c:150 xdp_rxq_info_unreg+0x44/0x70 Modules linked in: tcp_bbr xsk_diag udp_diag raw_diag unix_diag af_packet_diag netlink_diag nf_tables nfnetlink tcp_diag inet_diag binfmt_misc rpcsec_gss_krb5 nfsv3 nfs_acl auth_rpcgss nfsv4 dns_resolver nfs lockd ext4 grace crc16 iscsi_tcp mbcache fscache libiscsi_tcp jbd2 netfs rpcrdma af_packet sunrpc rdma_ucm ib_iser rdma_cm iw_cm iscsi_ibft ib_cm iscsi_boot_sysfs libiscsi rfkill scsi_transport_iscsi mana_ib ib_uverbs ib_core mana hyperv_drm(X) drm_shmem_helper intel_rapl_msr drm_kms_helper intel_rapl_common syscopyarea nls_iso8859_1 sysfillrect intel_uncore_frequency_common nls_cp437 vfat fat nfit sysimgblt libnvdimm hv_netvsc(X) hv_utils(X) fb_sys_fops hv_balloon(X) joydev fuse drm dm_mod configfs ip_tables x_tables xfs libcrc32c sd_mod nvme nvme_core nvme_common t10_pi crc64_rocksoft_generic crc64_rocksoft crc64 hid_generic serio_raw pci_hyperv(X) hv_storvsc(X) scsi_transport_fc hyperv_keyboard(X) hid_hyperv(X) pci_hyperv_intf(X) crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel crypto_simd cryptd hv_vmbus(X) softdog sg scsi_mod efivarfs Supported: Yes, External CPU: 442 PID: 491615 Comm: ethtool Kdump: loaded Tainted: G X 5.14.21-150500.55.136-default #1 SLE15-SP5 a627be1b53abbfd64ad16b2685e4308c52847f42 Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 07/25/2025 RIP: 0010:xdp_rxq_info_unreg+0x44/0x70 Code: e8 91 fe ff ff c7 43 0c 02 00 00 00 48 c7 03 00 00 00 00 5b c3 cc cc cc cc e9 58 3a 1c 00 48 c7 c7 f6 5f 19 97 e8 5c a4 7e ff <0f> 0b 83 7b 0c 01 74 ca 48 c7 c7 d9 5f 19 97 e8 48 a4 7e ff 0f 0b RSP: 0018:ff3df6c8f7207818 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ff30d89f94808a80 RCX: 0000000000000027 RDX: 0000000000000000 RSI: 0000000000000002 RDI: ff30d94bdcca2908 RBP: 0000000000080000 R08: ffffffff98ed11a0 R09: ff3df6c8f72077a0 R10: dead000000000100 R11: 000000000000000a R12: 0000000000000000 R13: 0000000000002000 R14: 0000000000040000 R15: ff30d89f94800000 FS: 00007fe6d8432b80(0000) GS:ff30d94bdcc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fe6d81a89b1 CR3: 00000b3b6d578001 CR4: 0000000000371ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 Call Trace: mana_destroy_rxq+0x5b/0x2f0 [mana 267acf7006bcb696095bba4d810643d1db3b9e94] mana_create_rxq.isra.55+0x3db/0x720 [mana 267acf7006bcb696095bba4d810643d1db3b9e94] ? simple_lookup+0x36/0x50 ? current_time+0x42/0x80 ? __d_free_external+0x30/0x30 mana_alloc_queues+0x32a/0x470 [mana 267acf7006bcb696095bba4d810643d1db3b9e94] ? _raw_spin_unlock+0xa/0x30 ? d_instantiate.part.29+0x2e/0x40 ? _raw_spin_unlock+0xa/0x30 ? debugfs_create_dir+0xe4/0x140 mana_attach+0x5c/0xf0 [mana 267acf7006bcb696095bba4d810643d1db3b9e94] mana_set_ringparam+0xd5/0x1a0 [mana 267acf7006bcb696095bba4d810643d1db3b9e94] ethnl_set_rings+0x292/0x320 genl_family_rcv_msg_doit.isra.15+0x11b/0x150 genl_rcv_msg+0xe3/0x1e0 ? rings_prepare_data+0x80/0x80 ? genl_family_rcv_msg_doit.isra.15+0x150/0x150 netlink_rcv_skb+0x50/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x1b6/0x280 netlink_sendmsg+0x365/0x4d0 sock_sendmsg+0x5f/0x70 __sys_sendto+0x112/0x140 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x5b/0x80 ? handle_mm_fault+0xd7/0x290 ? do_user_addr_fault+0x2d8/0x740 ? exc_page_fault+0x67/0x150 entry_SYSCALL_64_after_hwframe+0x6b/0xd5 RIP: 0033:0x7fe6d8122f06 Code: 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 11 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 72 f3 c3 41 57 41 56 4d 89 c7 41 55 41 54 41 RSP: 002b:00007fff2b66b068 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 000055771123d2a0 RCX: 00007fe6d8122f06 RDX: 0000000000000034 RSI: 000055771123d3b0 RDI: 0000000000000003 RBP: 00007fff2b66b100 R08: 00007fe6d8203360 R09: 000000000000000c R10: 0000000000000000 R11: 0000000000000246 R12: 000055771123d350 R13: 000055771123d340 R14: 0000000000000000 R15: 00007fff2b66b2b0 Guard the xdp_rxq_info_unreg() call with xdp_rxq_info_is_reg() so that mana_destroy_rxq() is safe to call regardless of how far initialization progressed. Fixes: ed5356b53f07 ("net: mana: Add XDP support") Reviewed-by: Haiyang Zhang Signed-off-by: Dipayaan Roy Link: https://patch.msgid.link/20260430035935.1859220-2-dipayanroy@linux.microsoft.com Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/mana_en.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index a654b3699c4c51..dfb4ba9f7664c1 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -2520,7 +2520,9 @@ static void mana_destroy_rxq(struct mana_port_context *apc, napi_disable_locked(napi); netif_napi_del_locked(napi); } - xdp_rxq_info_unreg(&rxq->xdp_rxq); + + if (xdp_rxq_info_is_reg(&rxq->xdp_rxq)) + xdp_rxq_info_unreg(&rxq->xdp_rxq); mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj); From 2a1c691182823a5c149d502ac153e249ee697b4a Mon Sep 17 00:00:00 2001 From: Dipayaan Roy Date: Wed, 29 Apr 2026 20:57:53 -0700 Subject: [PATCH 340/972] net: mana: Skip WQ object destruction for uninitialized RXQ In mana_destroy_rxq(), mana_destroy_wq_obj() is called unconditionally even when the WQ object was never created (rxobj is still INVALID_MANA_HANDLE). When mana_create_rxq() fails before mana_create_wq_obj() succeeds, the error path calls mana_destroy_rxq() which sends a bogus destroy command to the hardware: mana 7870:00:00.0: HWC: Failed hw_channel req: 0x1d mana 7870:00:00.0: Failed to send mana message: -71, 0x1d mana 7870:00:00.0 eth7: Failed to destroy WQ object: -71 Guard mana_destroy_wq_obj() with an INVALID_MANA_HANDLE check so that mana_destroy_rxq() is safe to call at any stage of RXQ initialization. Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)") Reviewed-by: Haiyang Zhang Signed-off-by: Dipayaan Roy Link: https://patch.msgid.link/20260430035935.1859220-3-dipayanroy@linux.microsoft.com Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/mana_en.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index dfb4ba9f7664c1..f2a6ea162dc319 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -2524,7 +2524,8 @@ static void mana_destroy_rxq(struct mana_port_context *apc, if (xdp_rxq_info_is_reg(&rxq->xdp_rxq)) xdp_rxq_info_unreg(&rxq->xdp_rxq); - mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj); + if (rxq->rxobj != INVALID_MANA_HANDLE) + mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj); mana_deinit_cq(apc, &rxq->rx_cq); From 3985c9a56da49af8b2e45cb1fa55c03c89b1d471 Mon Sep 17 00:00:00 2001 From: Dipayaan Roy Date: Wed, 29 Apr 2026 20:57:54 -0700 Subject: [PATCH 341/972] net: mana: remove double CQ cleanup in mana_create_rxq error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In mana_create_rxq(), the error cleanup path calls mana_destroy_rxq() followed by mana_deinit_cq(). This is incorrect for two reasons: 1. mana_destroy_rxq() already calls mana_deinit_cq() internally, so the CQ's GDMA queue is destroyed twice. 2. mana_destroy_rxq() frees the rxq via kfree(rxq) before returning. The subsequent mana_deinit_cq(apc, cq) then operates on freed memory since cq points to &rxq->rx_cq, which is embedded in the already-freed rxq structure — a use-after-free. Remove the redundant mana_deinit_cq() call from the error path since mana_destroy_rxq() already handles CQ cleanup. mana_deinit_cq() is itself safe for an uninitialized CQ as it checks for a NULL gdma_cq before proceeding. Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)") Reviewed-by: Haiyang Zhang Signed-off-by: Dipayaan Roy Reviewed-by: Aditya Garg Link: https://patch.msgid.link/20260430035935.1859220-4-dipayanroy@linux.microsoft.com Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/mana_en.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index f2a6ea162dc319..9afc786b297a8d 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -2799,9 +2799,6 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc, mana_destroy_rxq(apc, rxq, false); - if (cq) - mana_deinit_cq(apc, cq); - return NULL; } From c69df06e4e26e50611190ce04eab92c5cc261b61 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Mar 2026 12:28:21 +0100 Subject: [PATCH 342/972] perf/core: Fix deadlock in perf_mmap() failure path Ian noted that commit 77de62ad3de3 ("perf/core: Fix refcount bug and potential UAF in perf_mmap") would cause a deadlock due to event->mmap_mutex recursion. This happens because we're now calling perf_mmap_close() under mmap_mutex, while that function itself can also take mmap_mutex. Solve this by noting that perf_mmap_close() is far more complicated than we need at this particular point, since it deals with scenarios that cannot happen in this particular case. Replace the call to perf_mmap_close() with a very narrow undo for the case of first-exposure. If this is not the first mmap(), there is no race and it is fine to drop the lock and call perf_mmap_close() to handle to more complicated scenarios. Note: move the rb->mmap_user (namespace) handling into the rb init/free code such that it does not complicate the mmap handling. Fixes: 77de62ad3de3 ("perf/core: Fix refcount bug and potential UAF in perf_mmap") Reported-by: Ian Rogers Closes: https://patch.msgid.link/CAP-5%3DfVJyVMZw%3DDqP53Kxg58nUmJ_0bxoaeOKAbC03BVc11HaA%40mail.gmail.com Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260326112821.GK3738786@noisy.programming.kicks-ass.net --- kernel/events/core.c | 70 +++++++++++++++++++++++++++++-------- kernel/events/internal.h | 1 + kernel/events/ring_buffer.c | 2 ++ 3 files changed, 58 insertions(+), 15 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 6d1f8bad7e1c52..7935d5663944ee 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7006,6 +7006,7 @@ static void perf_mmap_open(struct vm_area_struct *vma) } static void perf_pmu_output_stop(struct perf_event *event); +static void perf_mmap_unaccount(struct vm_area_struct *vma, struct perf_buffer *rb); /* * A buffer can be mmap()ed multiple times; either directly through the same @@ -7021,8 +7022,6 @@ static void perf_mmap_close(struct vm_area_struct *vma) mapped_f unmapped = get_mapped(event, event_unmapped); struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; - int mmap_locked = rb->mmap_locked; - unsigned long size = perf_data_size(rb); bool detach_rest = false; /* FIXIES vs perf_pmu_unregister() */ @@ -7117,11 +7116,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) * Aside from that, this buffer is 'fully' detached and unmapped, * undo the VM accounting. */ - - atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked, - &mmap_user->locked_vm); - atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); - free_uid(mmap_user); + perf_mmap_unaccount(vma, rb); out_put: ring_buffer_put(rb); /* could be last */ @@ -7261,6 +7256,15 @@ static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long atomic64_add(extra, &vma->vm_mm->pinned_vm); } +static void perf_mmap_unaccount(struct vm_area_struct *vma, struct perf_buffer *rb) +{ + struct user_struct *user = rb->mmap_user; + + atomic_long_sub((perf_data_size(rb) >> PAGE_SHIFT) + 1 - rb->mmap_locked, + &user->locked_vm); + atomic64_sub(rb->mmap_locked, &vma->vm_mm->pinned_vm); +} + static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, unsigned long nr_pages) { @@ -7323,8 +7327,6 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, if (!rb) return -ENOMEM; - refcount_set(&rb->mmap_count, 1); - rb->mmap_user = get_current_user(); rb->mmap_locked = extra; ring_buffer_attach(event, rb); @@ -7474,16 +7476,54 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) mapped(event, vma->vm_mm); /* - * Try to map it into the page table. On fail, invoke - * perf_mmap_close() to undo the above, as the callsite expects - * full cleanup in this case and therefore does not invoke - * vmops::close(). + * Try to map it into the page table. On fail undo the above, + * as the callsite expects full cleanup in this case and + * therefore does not invoke vmops::close(). */ ret = map_range(event->rb, vma); - if (ret) - perf_mmap_close(vma); + if (likely(!ret)) + return 0; + + /* Error path */ + + /* + * If this is the first mmap(), then event->mmap_count should + * be stable at 1. It is only modified by: + * perf_mmap_{open,close}() and perf_mmap(). + * + * The former are not possible because this mmap() hasn't been + * successful yet, and the latter is serialized by + * event->mmap_mutex which we still hold (note that mmap_lock + * is not strictly sufficient here, because the event fd can + * be passed to another process through trivial means like + * fork(), leading to concurrent mmap() from different mm). + * + * Make sure to remove event->rb before releasing + * event->mmap_mutex, such that any concurrent mmap() will not + * attempt use this failed buffer. + */ + if (refcount_read(&event->mmap_count) == 1) { + /* + * Minimal perf_mmap_close(); there can't be AUX or + * other events on account of this being the first. + */ + mapped = get_mapped(event, event_unmapped); + if (mapped) + mapped(event, vma->vm_mm); + perf_mmap_unaccount(vma, event->rb); + ring_buffer_attach(event, NULL); /* drops last rb->refcount */ + refcount_set(&event->mmap_count, 0); + return ret; + } + + /* + * Otherwise this is an already existing buffer, and there is + * no race vs first exposure, so fall-through and call + * perf_mmap_close(). + */ } + perf_mmap_close(vma); return ret; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index d9cc5708309184..c03c4f2eea5718 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -67,6 +67,7 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head) struct perf_buffer *rb; rb = container_of(rcu_head, struct perf_buffer, rcu_head); + free_uid(rb->mmap_user); rb_free(rb); } diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 3e7de266141729..9fe92161715e09 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -340,6 +340,8 @@ ring_buffer_init(struct perf_buffer *rb, long watermark, int flags) rb->paused = 1; mutex_init(&rb->aux_mutex); + rb->mmap_user = get_current_user(); + refcount_set(&rb->mmap_count, 1); } void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags) From 5ad732a56be46aabf158c16aa0c095291727aaef Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Thu, 30 Apr 2026 08:25:54 +0800 Subject: [PATCH 343/972] perf/x86/intel: Improve validation and configuration of ACR masks Currently there are several issues on the user space ACR mask validation and configuration. - The validation for user space ACR mask (attr.config2) is incomplete, e.g., the ACR mask could include the index which belongs to another ACR events group, but it's not validated. - An early return on an invalid ACR mask caused all subsequent ACR groups to be skipped. - The stale hardware ACR mask (hw.config1) is not cleared before setting new hardware ACR mask. The following changes address all of the above issues. - Figure out the event index group of an ACR group. Any bits in the user-space mask not present in the index group are now dropped. - Instead of an early return on invalid bits, drop only the invalid portions and continue iterating through all ACR events to ensure full configuration. - Explicitly clear the stale hardware ACR mask for each event prior to writing the new configuration. Besides, a non-leader event member of ACR group could be disabled in theory. This could cause bit-shifting errors in the acr_mask of remaining group members. But since ACR sampling requires all events to be active, this should not be a big concern in real use case. Add a "FIXME" comment to notice this risk. Fixes: ec980e4facef ("perf/x86/intel: Support auto counter reload") Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260430002558.712334-2-dapeng1.mi@linux.intel.com --- arch/x86/events/intel/core.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index d9488ade0f8ec4..f8deb67b3c5113 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3332,23 +3332,41 @@ static void intel_pmu_enable_event(struct perf_event *event) static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc) { struct perf_event *event, *leader; - int i, j, idx; + int i, j, k, bit, idx; + /* + * FIXME: ACR mask parsing relies on cpuc->event_list[] (active events only). + * Disabling an ACR event causes bit-shifting errors in the acr_mask of + * remaining group members. As ACR sampling requires all events to be active, + * this limitation is acceptable for now. Revisit if independent event toggling + * is required. + */ for (i = 0; i < cpuc->n_events; i++) { leader = cpuc->event_list[i]; if (!is_acr_event_group(leader)) continue; - /* The ACR events must be contiguous. */ + /* Find the last event of the ACR group. */ for (j = i; j < cpuc->n_events; j++) { event = cpuc->event_list[j]; if (event->group_leader != leader->group_leader) break; - for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) { - if (i + idx >= cpuc->n_events || - !is_acr_event_group(cpuc->event_list[i + idx])) - return; - __set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1); + } + + /* + * Translate the user-space ACR mask (attr.config2) into the physical + * counter bitmask (hw.config1) for each ACR event in the group. + * NOTE: ACR event contiguity is guaranteed by intel_pmu_hw_config(). + */ + for (k = i; k < j; k++) { + event = cpuc->event_list[k]; + event->hw.config1 = 0; + for_each_set_bit(bit, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) { + idx = i + bit; + /* Event index of ACR group must locate in [i, j). */ + if (idx >= j || !is_acr_event_group(cpuc->event_list[idx])) + continue; + __set_bit(cpuc->assign[idx], (unsigned long *)&event->hw.config1); } } i = j - 1; From 8ba0b706a485b1e607594cf4210786d517ad1611 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Thu, 30 Apr 2026 08:25:55 +0800 Subject: [PATCH 344/972] perf/x86/intel: Always reprogram ACR events to prevent stale masks Members of an ACR group are logically linked via a bitmask of their hardware counter indices. If some members of the group are assigned new hardware counters during rescheduling, even events that keep their original counter index must be updated with a new mask. Without this, an event will continue to use a stale acr_mask that references the old indices of its group peers. Ensure all ACR events are reprogrammed during the scheduling path to maintain consistency across the group. Fixes: ec980e4facef ("perf/x86/intel: Support auto counter reload") Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260430002558.712334-3-dapeng1.mi@linux.intel.com --- arch/x86/events/core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 810ab21ffd9913..4b9e105309c6a9 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1294,13 +1294,16 @@ int x86_perf_rdpmc_index(struct perf_event *event) return event->hw.event_base_rdpmc; } -static inline int match_prev_assignment(struct hw_perf_event *hwc, +static inline int match_prev_assignment(struct perf_event *event, struct cpu_hw_events *cpuc, int i) { + struct hw_perf_event *hwc = &event->hw; + return hwc->idx == cpuc->assign[i] && - hwc->last_cpu == smp_processor_id() && - hwc->last_tag == cpuc->tags[i]; + hwc->last_cpu == smp_processor_id() && + hwc->last_tag == cpuc->tags[i] && + !is_acr_event_group(event); } static void x86_pmu_start(struct perf_event *event, int flags); @@ -1346,7 +1349,7 @@ static void x86_pmu_enable(struct pmu *pmu) * - no other event has used the counter since */ if (hwc->idx == -1 || - match_prev_assignment(hwc, cpuc, i)) + match_prev_assignment(event, cpuc, i)) continue; /* @@ -1367,7 +1370,7 @@ static void x86_pmu_enable(struct pmu *pmu) event = cpuc->event_list[i]; hwc = &event->hw; - if (!match_prev_assignment(hwc, cpuc, i)) + if (!match_prev_assignment(event, cpuc, i)) x86_assign_hw_event(event, cpuc, i); else if (i < n_running) continue; From 1271aeccc307066315b2d3b0d5af2510e27018b5 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Thu, 30 Apr 2026 08:25:56 +0800 Subject: [PATCH 345/972] perf/x86/intel: Disable PMI for self-reloaded ACR events On platforms with Auto Counter Reload (ACR) support, such as NVL, a "NMI received for unknown reason 30" warning is observed when running multiple events in a group with ACR enabled: $ perf record -e '{instructions/period=20000,acr_mask=0x2/u,\ cycles/period=40000,acr_mask=0x3/u}' ./test The warning occurs because the Performance Monitoring Interrupt (PMI) is enabled for the self-reloaded event (the cycles event in this case). According to the Intel SDM, the overflow bit (IA32_PERF_GLOBAL_STATUS.PMCn_OVF) is never set for self-reloaded events. Since the bit is not set, the perf NMI handler cannot identify the source of the interrupt, leading to the "unknown reason" message. Furthermore, enabling PMI for self-reloaded events is unnecessary and can lead to extraneous records that pollute the user's requested data. Disable the interrupt bit for all events configured with ACR self-reload. Fixes: ec980e4facef ("perf/x86/intel: Support auto counter reload") Reported-by: Andi Kleen Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260430002558.712334-4-dapeng1.mi@linux.intel.com --- arch/x86/events/intel/core.c | 17 +++++++++++++---- arch/x86/events/perf_event.h | 10 ++++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index f8deb67b3c5113..ead6d95cec6a01 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3118,11 +3118,11 @@ static void intel_pmu_enable_fixed(struct perf_event *event) intel_set_masks(event, idx); /* - * Enable IRQ generation (0x8), if not PEBS, - * and enable ring-3 counting (0x2) and ring-0 counting (0x1) - * if requested: + * Enable IRQ generation (0x8), if not PEBS or self-reloaded + * ACR event, and enable ring-3 counting (0x2) and ring-0 + * counting (0x1) if requested: */ - if (!event->attr.precise_ip) + if (!event->attr.precise_ip && !is_acr_self_reload_event(event)) bits |= INTEL_FIXED_0_ENABLE_PMI; if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) bits |= INTEL_FIXED_0_USER; @@ -3306,6 +3306,15 @@ static void intel_pmu_enable_event(struct perf_event *event) intel_set_masks(event, idx); static_call_cond(intel_pmu_enable_acr_event)(event); static_call_cond(intel_pmu_enable_event_ext)(event); + /* + * For self-reloaded ACR event, don't enable PMI since + * HW won't set overflow bit in GLOBAL_STATUS. Otherwise, + * the PMI would be recognized as a suspicious NMI. + */ + if (is_acr_self_reload_event(event)) + hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; + else if (!event->attr.precise_ip) + hwc->config |= ARCH_PERFMON_EVENTSEL_INT; __x86_pmu_enable_event(hwc, enable_mask); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index fad87d3c8b2caa..524668dcf4cc10 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -137,6 +137,16 @@ static inline bool is_acr_event_group(struct perf_event *event) return check_leader_group(event->group_leader, PERF_X86_EVENT_ACR); } +static inline bool is_acr_self_reload_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < 0) + return false; + + return test_bit(hwc->idx, (unsigned long *)&hwc->config1); +} + struct amd_nb { int nb_id; /* NorthBridge id */ int refcnt; /* reference count */ From aa4384bc8f4360167f3c3d5322121fe892289ea2 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Thu, 30 Apr 2026 08:25:57 +0800 Subject: [PATCH 346/972] perf/x86/intel: Enable auto counter reload for DMR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Panther cove µarch starts to support auto counter reload (ACR), but the static_call intel_pmu_enable_acr_event() is not updated for the Panther Cove µarch used by DMR. It leads to the auto counter reload is not really enabled on DMR. Update static_call intel_pmu_enable_acr_event() in intel_pmu_init_pnc(). Fixes: d345b6bb8860 ("perf/x86/intel: Add core PMU support for DMR") Signed-off-by: Dapeng Mi Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260430002558.712334-5-dapeng1.mi@linux.intel.com --- arch/x86/events/intel/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index ead6d95cec6a01..dd1e3aa75ee9b7 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -7531,6 +7531,7 @@ static __always_inline void intel_pmu_init_pnc(struct pmu *pmu) hybrid(pmu, event_constraints) = intel_pnc_event_constraints; hybrid(pmu, pebs_constraints) = intel_pnc_pebs_event_constraints; hybrid(pmu, extra_regs) = intel_pnc_extra_regs; + static_call_update(intel_pmu_enable_acr_event, intel_pmu_enable_acr); } static __always_inline void intel_pmu_init_skt(struct pmu *pmu) From 50987d4e6c55929aa2d4d3976e74ccbae22d5017 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Fri, 27 Mar 2026 10:17:28 +0800 Subject: [PATCH 347/972] drm/panel: himax-hx83121a: Fix incorrect error check for devm_drm_panel_alloc() Check devm_drm_panel_alloc() return value for ERR_PTR instead of NULL. devm_drm_panel_alloc() returns an ERR_PTR on failure, never NULL. Using a NULL check skips the error path and may cause a NULL pointer dereference. Fixes: a7c61963b727 ("drm/panel: Add Himax HX83121A panel driver") Signed-off-by: Chen Ni Reviewed-by: Pengyu Luo Signed-off-by: Neil Armstrong Link: https://patch.msgid.link/20260327021728.647182-1-nichen@iscas.ac.cn --- drivers/gpu/drm/panel/panel-himax-hx83121a.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-himax-hx83121a.c b/drivers/gpu/drm/panel/panel-himax-hx83121a.c index ebe643ba41844a..bed79aa06f46a6 100644 --- a/drivers/gpu/drm/panel/panel-himax-hx83121a.c +++ b/drivers/gpu/drm/panel/panel-himax-hx83121a.c @@ -596,8 +596,8 @@ static int himax_probe(struct mipi_dsi_device *dsi) ctx = devm_drm_panel_alloc(dev, struct himax, panel, &himax_panel_funcs, DRM_MODE_CONNECTOR_DSI); - if (!ctx) - return -ENOMEM; + if (IS_ERR(ctx)) + return PTR_ERR(ctx); ret = devm_regulator_bulk_get_const(&dsi->dev, ARRAY_SIZE(himax_supplies), From defab7b01e0848e004077d7d8dcc04d305ea1a27 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 13 Apr 2026 09:10:19 +0200 Subject: [PATCH 348/972] drm/panel: hx83121a: select DRM_DISPLAY_DSC_HELPER Like a number of other panel drivers, this newly merged driver needs DRM_DISPLAY_DSC_HELPER to be enabled: arm-linux-gnueabi-ld: drivers/gpu/drm/panel/panel-himax-hx83121a.o: in function `himax_prepare': panel-himax-hx83121a.c:(.text+0x1024): undefined reference to `drm_dsc_pps_payload_pack' Fixes: a7c61963b727 ("drm/panel: Add Himax HX83121A panel driver") Signed-off-by: Arnd Bergmann Reviewed-by: Neil Armstrong Reviewed-by: David Heidelberg Signed-off-by: Neil Armstrong Link: https://patch.msgid.link/20260413071043.3829868-1-arnd@kernel.org --- drivers/gpu/drm/panel/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/panel/Kconfig b/drivers/gpu/drm/panel/Kconfig index d6863b28ddc559..d592f4f4b939a9 100644 --- a/drivers/gpu/drm/panel/Kconfig +++ b/drivers/gpu/drm/panel/Kconfig @@ -208,6 +208,7 @@ config DRM_PANEL_HIMAX_HX83121A depends on OF depends on DRM_MIPI_DSI depends on BACKLIGHT_CLASS_DEVICE + select DRM_DISPLAY_DSC_HELPER select DRM_KMS_HELPER help Say Y here if you want to enable support for Himax HX83121A-based From c67e8787f6743101c90c7a9c4bb7cf6f1f739f83 Mon Sep 17 00:00:00 2001 From: Christian Van Date: Sat, 25 Apr 2026 01:39:48 -0400 Subject: [PATCH 349/972] drm/panel: feiyang-fy07024di26a30d: return display-on error mipi_dsi_dcs_set_display_on() returns an error code, but feiyang_enable() currently ignores it and always reports success. Return the DCS command result so callers can observe enable failures. Signed-off-by: Christian Van Reviewed-by: Neil Armstrong Signed-off-by: Neil Armstrong Link: https://patch.msgid.link/20260425053948.117714-1-cvan20191@gmail.com --- drivers/gpu/drm/panel/panel-feiyang-fy07024di26a30d.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-feiyang-fy07024di26a30d.c b/drivers/gpu/drm/panel/panel-feiyang-fy07024di26a30d.c index 4f8d6d8c07e4d7..dbdb7e3cb7b625 100644 --- a/drivers/gpu/drm/panel/panel-feiyang-fy07024di26a30d.c +++ b/drivers/gpu/drm/panel/panel-feiyang-fy07024di26a30d.c @@ -98,9 +98,7 @@ static int feiyang_enable(struct drm_panel *panel) /* T12 (video & logic signal rise + backlight rise) T12 >= 200ms */ msleep(200); - mipi_dsi_dcs_set_display_on(ctx->dsi); - - return 0; + return mipi_dsi_dcs_set_display_on(ctx->dsi); } static int feiyang_disable(struct drm_panel *panel) From 570cf799e87ae805eacfab3b4ba66676b5fccdb6 Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Sun, 3 May 2026 17:17:08 +0800 Subject: [PATCH 350/972] drm/panel: boe-tv101wum-nl6: restore MODE_LPM after sending disable cmds When preparing the panel, it seems that it always expects commands to be transferred in LP mode. However, the disable function removes the MIPI_DSI_MODE_LPM flag, and no other function re-adds it. As the unprepare function contains no DSI commands, re-adding the flag just after disabling the panel should be safe. Add the code re-adding the flag after the two commands for disabling the panel are sent. This fixes error messages shown in kernel log when unblanking on mt8183-kukui-kodama-sku32 device. Cc: stable@vger.kernel.org Fixes: a869b9db7adf ("drm/panel: support for boe tv101wum-nl6 wuxga dsi video mode panel") Signed-off-by: Icenowy Zheng Reviewed-by: Neil Armstrong Reviewed-by: Douglas Anderson Signed-off-by: Neil Armstrong Link: https://patch.msgid.link/20260503091708.1079962-1-zhengxingda@iscas.ac.cn --- drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c b/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c index d5fe105bdbdde5..658ce64c71eb2b 100644 --- a/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c +++ b/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c @@ -1324,6 +1324,8 @@ static int boe_panel_disable(struct drm_panel *panel) mipi_dsi_dcs_set_display_off_multi(&ctx); mipi_dsi_dcs_enter_sleep_mode_multi(&ctx); + boe->dsi->mode_flags |= MIPI_DSI_MODE_LPM; + mipi_dsi_msleep(&ctx, 150); return ctx.accum_err; From 2d4e80271f784aa0c7b17676e9762c7e8156be1c Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Sun, 26 Apr 2026 00:57:51 +0800 Subject: [PATCH 351/972] drm/panel: himax-hx83102: restore MODE_LPM after sending disable cmds When preparing the panel, it seems that it always expects commands to be transferred in LP mode. However, the disable function removes the MIPI_DSI_MODE_LPM flag, and no other function re-adds it. As the unprepare function contains no DSI commands, re-adding the flag just after disabling the panel should be safe. Add the code re-adding the flag after the two commands for disabling the panel are sent. This fixes screen unblanking (after blanking once) on mt8188-geralt-ciri-sku1 device. Cc: stable@vger.kernel.org # 6.11+ Fixes: 0ef94554dc40 ("drm/panel: himax-hx83102: Break out as separate driver") Signed-off-by: Icenowy Zheng Reviewed-by: Neil Armstrong Reviewed-by: Douglas Anderson Signed-off-by: Neil Armstrong Link: https://patch.msgid.link/20260425165751.1716569-1-zhengxingda@iscas.ac.cn --- drivers/gpu/drm/panel/panel-himax-hx83102.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/panel/panel-himax-hx83102.c b/drivers/gpu/drm/panel/panel-himax-hx83102.c index 8b2a68ee851e39..a5e5c9ea7a73f5 100644 --- a/drivers/gpu/drm/panel/panel-himax-hx83102.c +++ b/drivers/gpu/drm/panel/panel-himax-hx83102.c @@ -937,6 +937,8 @@ static int hx83102_disable(struct drm_panel *panel) mipi_dsi_dcs_set_display_off_multi(&dsi_ctx); mipi_dsi_dcs_enter_sleep_mode_multi(&dsi_ctx); + dsi->mode_flags |= MIPI_DSI_MODE_LPM; + mipi_dsi_msleep(&dsi_ctx, 150); return dsi_ctx.accum_err; From 8cf5dd235eff6008cb04c3d8064d2acfa90616f1 Mon Sep 17 00:00:00 2001 From: Prasanna Kumar T S M Date: Wed, 1 Apr 2026 04:18:56 -0700 Subject: [PATCH 352/972] EDAC/versalnet: Fix device name memory leak The device name allocated via kzalloc() in init_one_mc() is assigned to dev->init_name but never freed on the normal removal path. device_register() copies init_name and then sets dev->init_name to NULL, so the name pointer becomes unreachable from the device. Thus leaking memory. Use a stack-local char array instead of using kzalloc() for name. Fixes: d5fe2fec6c40 ("EDAC: Add a driver for the AMD Versal NET DDR controller") Signed-off-by: Prasanna Kumar T S M Signed-off-by: Borislav Petkov (AMD) Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260401111856.2342975-1-ptsm@linux.microsoft.com --- drivers/edac/versalnet_edac.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/edac/versalnet_edac.c b/drivers/edac/versalnet_edac.c index ec13155824141d..97ec05d68bbbce 100644 --- a/drivers/edac/versalnet_edac.c +++ b/drivers/edac/versalnet_edac.c @@ -777,9 +777,9 @@ static int init_one_mc(struct mc_priv *priv, struct platform_device *pdev, int i u32 num_chans, rank, dwidth, config; struct edac_mc_layer layers[2]; struct mem_ctl_info *mci; + char name[MC_NAME_LEN]; struct device *dev; enum dev_type dt; - char *name; int rc; config = priv->adec[CONF + i * ADEC_NUM]; @@ -813,13 +813,9 @@ static int init_one_mc(struct mc_priv *priv, struct platform_device *pdev, int i layers[1].is_virt_csrow = false; rc = -ENOMEM; - name = kzalloc(MC_NAME_LEN, GFP_KERNEL); - if (!name) - return rc; - dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) - goto err_name_free; + return rc; mci = edac_mc_alloc(i, ARRAY_SIZE(layers), layers, sizeof(struct mc_priv)); if (!mci) { @@ -858,8 +854,6 @@ static int init_one_mc(struct mc_priv *priv, struct platform_device *pdev, int i edac_mc_free(mci); err_dev_free: kfree(dev); -err_name_free: - kfree(name); return rc; } From 83861c48ba122f85cc8384780764b3a791341678 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 30 Apr 2026 23:32:50 +0200 Subject: [PATCH 353/972] openvswitch: vport: fix race between tunnel creation and linking When a tunnel vport is created it first creates the tunnel device, e.g., with geneve_dev_create_fb(), then it calls ovs_netdev_link() to take a reference and link it to the device that represents openvswitch datapath. The creation of the device is happening under RTNL, but then RTNL is released and re-acquired to find the device by name. It is technically possible for the tunnel device to be re-named or deleted within that window while RTNL is not held, and some other device created in its place. This will cause a non-tunnel device to be referenced in the vport and tunnel-specific functions used on it, e.g. vxlan_get_options() that directly casts the private netdev data into a struct vxlan_dev causing an invalid memory access: BUG: KASAN: slab-use-after-free in vxlan_get_options+0x323/0x3a0 vxlan_get_options+0x323/0x3a0 ovs_vport_cmd_new+0x6e3/0xd30 Fix that by taking a reference to the just created device before releasing RTNL. This ensures that the device in the vport is always the one that was just created. The search by name is only needed for a standard vport-netdev that links pre-existing devices, so that functionality and device type checks are moved to netdev_create(). It is also awkward that ovs_netdev_link() takes ownership of the vport and destroys it on failure. It doesn't know the type of the port it is dealing with, so we need to pass down the indicator that it's a tunnel, so the link can be properly deleted on failure. It's possible to refactor the logic to make the ovs_netdev_link() do only the linking part and let the callers perform a proper destruction, but it will be much more code for each legacy tunnel port type, so it is not worth it for the bug fix. Fixes: 614732eaa12d ("openvswitch: Use regular VXLAN net_device device") Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Reported-by: Yang Yang Signed-off-by: Ilya Maximets Acked-by: Eelco Chaudron Link: https://patch.msgid.link/20260430213349.407991-1-i.maximets@ovn.org Signed-off-by: Paolo Abeni --- net/openvswitch/vport-geneve.c | 5 ++- net/openvswitch/vport-gre.c | 5 ++- net/openvswitch/vport-netdev.c | 58 ++++++++++++++++++++-------------- net/openvswitch/vport-netdev.h | 2 +- net/openvswitch/vport-vxlan.c | 5 ++- 5 files changed, 48 insertions(+), 27 deletions(-) diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index b10e1602c6b147..cb5ea4424ffc80 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -97,6 +97,9 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) goto error; } + vport->dev = dev; + netdev_hold(vport->dev, &vport->dev_tracker, GFP_KERNEL); + rtnl_unlock(); return vport; error: @@ -111,7 +114,7 @@ static struct vport *geneve_create(const struct vport_parms *parms) if (IS_ERR(vport)) return vport; - return ovs_netdev_link(vport, parms->name); + return ovs_netdev_link(vport, true); } static struct vport_ops ovs_geneve_vport_ops = { diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 4014c9b5eb7987..6cb5a697b396a3 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -63,6 +63,9 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms) return ERR_PTR(err); } + vport->dev = dev; + netdev_hold(vport->dev, &vport->dev_tracker, GFP_KERNEL); + rtnl_unlock(); return vport; } @@ -75,7 +78,7 @@ static struct vport *gre_create(const struct vport_parms *parms) if (IS_ERR(vport)) return vport; - return ovs_netdev_link(vport, parms->name); + return ovs_netdev_link(vport, true); } static struct vport_ops ovs_gre_vport_ops = { diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 12055af832dc08..a92ca8b37f96ab 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -73,37 +73,21 @@ static struct net_device *get_dpdev(const struct datapath *dp) return local->dev; } -struct vport *ovs_netdev_link(struct vport *vport, const char *name) +struct vport *ovs_netdev_link(struct vport *vport, bool tunnel) { int err; - vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name); - if (!vport->dev) { + if (WARN_ON_ONCE(!vport->dev)) { err = -ENODEV; goto error_free_vport; } - /* Ensure that the device exists and that the provided - * name is not one of its aliases. - */ - if (strcmp(name, ovs_vport_name(vport))) { - err = -ENODEV; - goto error_put; - } - netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL); - if (vport->dev->flags & IFF_LOOPBACK || - (vport->dev->type != ARPHRD_ETHER && - vport->dev->type != ARPHRD_NONE) || - ovs_is_internal_dev(vport->dev)) { - err = -EINVAL; - goto error_put; - } rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, get_dpdev(vport->dp), NULL, NULL, NULL); if (err) - goto error_unlock; + goto error_put_unlock; err = netdev_rx_handler_register(vport->dev, netdev_frame_hook, vport); @@ -119,10 +103,11 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) error_master_upper_dev_unlink: netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp)); -error_unlock: - rtnl_unlock(); -error_put: +error_put_unlock: + if (tunnel && vport->dev->reg_state == NETREG_REGISTERED) + rtnl_delete_link(vport->dev, 0, NULL); netdev_put(vport->dev, &vport->dev_tracker); + rtnl_unlock(); error_free_vport: ovs_vport_free(vport); return ERR_PTR(err); @@ -132,12 +117,39 @@ EXPORT_SYMBOL_GPL(ovs_netdev_link); static struct vport *netdev_create(const struct vport_parms *parms) { struct vport *vport; + int err; vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms); if (IS_ERR(vport)) return vport; - return ovs_netdev_link(vport, parms->name); + vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name); + if (!vport->dev) { + err = -ENODEV; + goto error_free_vport; + } + netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL); + + /* Ensure that the provided name is not an alias. */ + if (strcmp(parms->name, ovs_vport_name(vport))) { + err = -ENODEV; + goto error_put; + } + + if (vport->dev->flags & IFF_LOOPBACK || + (vport->dev->type != ARPHRD_ETHER && + vport->dev->type != ARPHRD_NONE) || + ovs_is_internal_dev(vport->dev)) { + err = -EINVAL; + goto error_put; + } + + return ovs_netdev_link(vport, false); +error_put: + netdev_put(vport->dev, &vport->dev_tracker); +error_free_vport: + ovs_vport_free(vport); + return ERR_PTR(err); } static void vport_netdev_free(struct rcu_head *rcu) diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h index c5d83a43bfc49b..6c0d7366f9862e 100644 --- a/net/openvswitch/vport-netdev.h +++ b/net/openvswitch/vport-netdev.h @@ -13,7 +13,7 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev); -struct vport *ovs_netdev_link(struct vport *vport, const char *name); +struct vport *ovs_netdev_link(struct vport *vport, bool tunnel); void ovs_netdev_detach_dev(struct vport *); int __init ovs_netdev_init(void); diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 0b881b043bcf4a..c1b37b50d29e15 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -126,6 +126,9 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) goto error; } + vport->dev = dev; + netdev_hold(vport->dev, &vport->dev_tracker, GFP_KERNEL); + rtnl_unlock(); return vport; error: @@ -140,7 +143,7 @@ static struct vport *vxlan_create(const struct vport_parms *parms) if (IS_ERR(vport)) return vport; - return ovs_netdev_link(vport, parms->name); + return ovs_netdev_link(vport, true); } static struct vport_ops ovs_vxlan_netdev_vport_ops = { From aa69918bd418e700309fdd08509dba324fb24296 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 1 May 2026 01:38:37 +0200 Subject: [PATCH 354/972] openvswitch: vport: fix self-deadlock on release of tunnel ports vports are used concurrently and protected by RCU, so netdev_put() must happen after the RCU grace period. So, either in an RCU call or after the synchronize_net(). The rtnl_delete_link() must happen under RTNL and so can't be executed in RCU context. Calling synchronize_net() while holding RTNL is not a good idea for performance and system stability under load in general, so calling netdev_put() in RCU call is the right solution here. However, when the device is deleted, rtnl_unlock() will call netdev_run_todo() and block until all the references are gone. In the current code this means that we never reach the call_rcu() and the vport is never freed and the reference is never released, causing a self-deadlock on device removal. Fix that by moving the rcu_call() before the rtnl_unlock(), so the scheduled RCU callback will be executed when synchronize_net() is called from the rtnl_unlock()->netdev_run_todo() while the RTNL itself is already released. Fixes: 6931d21f87bc ("openvswitch: defer tunnel netdev_put to RCU release") Cc: stable@vger.kernel.org Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets Acked-by: Aaron Conole Link: https://patch.msgid.link/20260430233848.440994-2-i.maximets@ovn.org Signed-off-by: Paolo Abeni --- net/openvswitch/vport-netdev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index a92ca8b37f96ab..c42642075685d7 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -208,9 +208,13 @@ void ovs_netdev_tunnel_destroy(struct vport *vport) */ if (vport->dev->reg_state == NETREG_REGISTERED) rtnl_delete_link(vport->dev, 0, NULL); - rtnl_unlock(); + /* We can't put the device reference yet, since it can still be in + * use, but rtnl_unlock()->netdev_run_todo() will block until all + * the references are released, so the RCU call must be before it. + */ call_rcu(&vport->rcu, vport_netdev_free); + rtnl_unlock(); } EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy); From 05416ada37aa4efe93f25b0532f551d424fb7b3d Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 1 May 2026 01:38:38 +0200 Subject: [PATCH 355/972] selftests: openvswitch: add tests for tunnel vport refcounting There were a few issues found with the tunnel vport types around the vport destruction code. Add some basic tests, so at least we know that they can be properly added and removed without obvious issues. The test creates OVS datapath, adds a non-LWT tunnel port, makes sure they are created, and then removes the datapath and waits for all the ports to be gone. The dpctl script had a few bugs in the none-lwt tunnel creation code, so fixing them as well to make the testing possible: - The type of the --lwt option changed in order to properly disable it. - Removed byte order conversion for the port numbers, as the value supposed to be in the host order. - Added missing 'gre' choice for the tunnel type. Signed-off-by: Ilya Maximets Acked-by: Eelco Chaudron Acked-by: Aaron Conole Link: https://patch.msgid.link/20260430233848.440994-3-i.maximets@ovn.org Signed-off-by: Paolo Abeni --- .../selftests/net/openvswitch/openvswitch.sh | 37 +++++++++++++++++++ .../selftests/net/openvswitch/ovs-dpctl.py | 19 +++++++--- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh index b327d3061ed53a..3cdd953f681324 100755 --- a/tools/testing/selftests/net/openvswitch/openvswitch.sh +++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh @@ -26,6 +26,7 @@ tests=" netlink_checks ovsnl: validate netlink attrs and settings upcall_interfaces ovs: test the upcall interfaces tunnel_metadata ovs: test extraction of tunnel metadata + tunnel_refcount ovs: test tunnel vport reference cleanup drop_reason drop: test drop reasons are emitted psample psample: Sampling packets with psample" @@ -830,6 +831,42 @@ test_tunnel_metadata() { return 0 } +test_tunnel_refcount() { + sbxname="test_tunnel_refcount" + sbx_add "${sbxname}" || return 1 + + ovs_sbx "${sbxname}" ip netns add trefns || return 1 + on_exit "ovs_sbx ${sbxname} ip netns del trefns" + + for tun_type in gre vxlan geneve; do + info "testing ${tun_type} tunnel vport refcount" + + ovs_sbx "${sbxname}" ip netns exec trefns \ + python3 $ovs_base/ovs-dpctl.py \ + add-dp dp-${tun_type} || return 1 + + ovs_sbx "${sbxname}" ip netns exec trefns \ + python3 $ovs_base/ovs-dpctl.py \ + add-if --no-lwt -t ${tun_type} \ + dp-${tun_type} ovs-${tun_type}0 || return 1 + + ovs_wait ip -netns trefns link show \ + ovs-${tun_type}0 >/dev/null 2>&1 || return 1 + + info "deleting dp - may hang if reference counting is broken" + ovs_sbx "${sbxname}" ip netns exec trefns \ + python3 $ovs_base/ovs-dpctl.py \ + del-dp dp-${tun_type} & + + dev_removed() { + ! ip -netns trefns link show "$1" >/dev/null 2>&1 + } + ovs_wait dev_removed dp-${tun_type} || return 1 + ovs_wait dev_removed ovs-${tun_type}0 || return 1 + done + return 0 +} + run_test() { ( tname="$1" diff --git a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py index 848f61fdcee09a..bbe35e2718d269 100644 --- a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py +++ b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py @@ -11,7 +11,6 @@ import math import multiprocessing import re -import socket import struct import sys import time @@ -2069,7 +2068,7 @@ def str_to_type(vport_type): elif vport_type == "internal": return OvsVport.OVS_VPORT_TYPE_INTERNAL elif vport_type == "gre": - return OvsVport.OVS_VPORT_TYPE_INTERNAL + return OvsVport.OVS_VPORT_TYPE_GRE elif vport_type == "vxlan": return OvsVport.OVS_VPORT_TYPE_VXLAN elif vport_type == "geneve": @@ -2121,6 +2120,7 @@ def attach(self, dpindex, vport_ifname, ptype, dport, lwt): ) TUNNEL_DEFAULTS = [("geneve", 6081), + ("gre", 0), ("vxlan", 4789)] for tnl in TUNNEL_DEFAULTS: @@ -2129,9 +2129,13 @@ def attach(self, dpindex, vport_ifname, ptype, dport, lwt): dport = tnl[1] if not lwt: + if tnl[0] == "gre": + # GRE tunnels have no options. + break + vportopt = OvsVport.ovs_vport_msg.vportopts() vportopt["attrs"].append( - ["OVS_TUNNEL_ATTR_DST_PORT", socket.htons(dport)] + ["OVS_TUNNEL_ATTR_DST_PORT", dport] ) msg["attrs"].append( ["OVS_VPORT_ATTR_OPTIONS", vportopt] @@ -2145,6 +2149,9 @@ def attach(self, dpindex, vport_ifname, ptype, dport, lwt): geneve_port=dport, geneve_collect_metadata=True, geneve_udp_zero_csum6_rx=1) + elif tnl[0] == "gre": + ipr.link("add", ifname=vport_ifname, kind="gretap", + gre_collect_metadata=True) elif tnl[0] == "vxlan": ipr.link("add", ifname=vport_ifname, kind=tnl[0], vxlan_learning=0, vxlan_collect_metadata=1, @@ -2563,7 +2570,7 @@ def print_ovsdp_full(dp_lookup_rep, ifindex, ndb=NDB(), vpl=OvsVport()): if vpo: dpo = vpo.get_attr("OVS_TUNNEL_ATTR_DST_PORT") if dpo: - opts += " tnl-dport:%s" % socket.ntohs(dpo) + opts += " tnl-dport:%s" % dpo print( " port %d: %s (%s%s)" % ( @@ -2632,7 +2639,7 @@ def main(argv): "--ptype", type=str, default="netdev", - choices=["netdev", "internal", "geneve", "vxlan"], + choices=["netdev", "internal", "gre", "geneve", "vxlan"], help="Interface type (default netdev)", ) addifcmd.add_argument( @@ -2645,7 +2652,7 @@ def main(argv): addifcmd.add_argument( "-l", "--lwt", - type=bool, + action=argparse.BooleanOptionalAction, default=True, help="Use LWT infrastructure instead of vport (default true)." ) From 44b550d88b267320459d518c0743a241ab2108fa Mon Sep 17 00:00:00 2001 From: Nan Li Date: Fri, 1 May 2026 09:08:44 +0800 Subject: [PATCH 356/972] net/rds: handle zerocopy send cleanup before the message is queued A zerocopy send can fail after user pages have been pinned but before the message is attached to the sending socket. The purge path currently infers zerocopy state from rm->m_rs, so an unqueued message can be cleaned up as if it owned normal payload pages. However, zerocopy ownership is really determined by the presence of op_mmp_znotifier, regardless of whether the message has reached the socket queue. Capture op_mmp_znotifier up front in rds_message_purge() and use it as the cleanup discriminator. If the message is already associated with a socket, keep the existing completion path. Otherwise, drop the pinned page accounting directly and release the notifier before putting the payload pages. This keeps early send failure cleanup consistent with the zerocopy lifetime rules without changing the normal queued completion path. Fixes: 0cebaccef3ac ("rds: zerocopy Tx support.") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Co-developed-by: Xiao Liu Signed-off-by: Xiao Liu Signed-off-by: Nan Li Signed-off-by: Ren Wei Reviewed-by: Allison Henderson Link: https://patch.msgid.link/d2ea98a6313d5467bac00f7c9fef8c7acddb9258.1777550074.git.tonanli66@gmail.com Signed-off-by: Paolo Abeni --- net/rds/message.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/net/rds/message.c b/net/rds/message.c index eaa6f22601a447..25fedcb3cd00ec 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -131,24 +131,34 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs, */ static void rds_message_purge(struct rds_message *rm) { + struct rds_znotifier *znotifier; unsigned long i, flags; - bool zcopy = false; + bool zcopy; if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) return; spin_lock_irqsave(&rm->m_rs_lock, flags); + znotifier = rm->data.op_mmp_znotifier; + rm->data.op_mmp_znotifier = NULL; + zcopy = !!znotifier; + if (rm->m_rs) { struct rds_sock *rs = rm->m_rs; - if (rm->data.op_mmp_znotifier) { - zcopy = true; - rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier); + if (znotifier) { + rds_rm_zerocopy_callback(rs, znotifier); rds_wake_sk_sleep(rs); - rm->data.op_mmp_znotifier = NULL; } sock_put(rds_rs_to_sk(rs)); rm->m_rs = NULL; + } else if (znotifier) { + /* + * Zerocopy can fail before the message is queued on the + * socket, so there is no rs to carry the notification. + */ + mm_unaccount_pinned_pages(&znotifier->z_mmp); + kfree(rds_info_from_znotifier(znotifier)); } spin_unlock_irqrestore(&rm->m_rs_lock, flags); From 95084f1883a760e0d4290698346759d58e2b944a Mon Sep 17 00:00:00 2001 From: Dipayaan Roy Date: Thu, 30 Apr 2026 19:47:12 -0700 Subject: [PATCH 357/972] net: mana: Fix crash from unvalidated SHM offset read from BAR0 during FLR During Function Level Reset recovery, the MANA driver reads hardware BAR0 registers that may temporarily contain garbage values. The SHM (Shared Memory) offset read from GDMA_REG_SHM_OFFSET is used to compute gc->shm_base, which is later dereferenced via readl() in mana_smc_poll_register(). If the hardware returns an unaligned or out-of-range value, the driver must not blindly use it, as this would propagate the hardware error into a kernel crash. The following crash was observed on an arm64 Hyper-V guest running kernel 6.17.0-3013-azure during VF reset recovery triggered by HWC timeout. [13291.785274] Unable to handle kernel paging request at virtual address ffff8000a200001b [13291.785311] Mem abort info: [13291.785332] ESR = 0x0000000096000021 [13291.785343] EC = 0x25: DABT (current EL), IL = 32 bits [13291.785355] SET = 0, FnV = 0 [13291.785363] EA = 0, S1PTW = 0 [13291.785372] FSC = 0x21: alignment fault [13291.785382] Data abort info: [13291.785391] ISV = 0, ISS = 0x00000021, ISS2 = 0x00000000 [13291.785404] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [13291.785412] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [13291.785421] swapper pgtable: 4k pages, 48-bit VAs, pgdp=00000014df3a1000 [13291.785432] [ffff8000a200001b] pgd=1000000100438403, p4d=1000000100438403, pud=1000000100439403, pmd=0068000fc2000711 [13291.785703] Internal error: Oops: 0000000096000021 [#1] SMP [13291.830975] Modules linked in: tls qrtr mana_ib ib_uverbs ib_core xt_owner xt_tcpudp xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nft_compat nf_tables cfg80211 8021q garp mrp stp llc binfmt_misc joydev serio_raw nls_iso8859_1 hid_generic aes_ce_blk aes_ce_cipher polyval_ce ghash_ce sm4_ce_gcm sm4_ce_ccm sm4_ce sm4_ce_cipher hid_hyperv sm4 sm3_ce sha3_ce hv_netvsc hid vmgenid hyperv_keyboard hyperv_drm sch_fq_codel nvme_fabrics efi_pstore dm_multipath nfnetlink vsock_loopback vmw_vsock_virtio_transport_common hv_sock vmw_vsock_vmci_transport vmw_vmci vsock dmi_sysfs ip_tables x_tables autofs4 [13291.862630] CPU: 122 UID: 0 PID: 61796 Comm: kworker/122:2 Tainted: G W 6.17.0-3013-azure #13-Ubuntu VOLUNTARY [13291.869902] Tainted: [W]=WARN [13291.871901] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 01/08/2026 [13291.878086] Workqueue: events mana_serv_func [13291.880718] pstate: 62400005 (nZCv daif +PAN -UAO +TCO -DIT -SSBS BTYPE=--) [13291.884835] pc : mana_smc_poll_register+0x48/0xb0 [13291.887902] lr : mana_smc_setup_hwc+0x70/0x1c0 [13291.890493] sp : ffff8000ab79bbb0 [13291.892364] x29: ffff8000ab79bbb0 x28: ffff00410c8b5900 x27: ffff00410d630680 [13291.896252] x26: ffff004171f9fd80 x25: 000000016ed55000 x24: 000000017f37e000 [13291.899990] x23: 0000000000000000 x22: 000000016ed55000 x21: 0000000000000000 [13291.904497] x20: ffff8000a200001b x19: 0000000000004e20 x18: ffff8000a6183050 [13291.908308] x17: 0000000000000000 x16: 0000000000000000 x15: 000000000000000a [13291.912542] x14: 0000000000000004 x13: 0000000000000000 x12: 0000000000000000 [13291.916298] x11: 0000000000000000 x10: 0000000000000001 x9 : ffffc45006af1bd8 [13291.920945] x8 : ffff000151129000 x7 : 0000000000000000 x6 : 0000000000000000 [13291.925293] x5 : 000000015f214000 x4 : 000000017217a000 x3 : 000000016ed50000 [13291.930436] x2 : 000000016ed55000 x1 : 0000000000000000 x0 : ffff8000a1ffffff [13291.934342] Call trace: [13291.935736] mana_smc_poll_register+0x48/0xb0 (P) [13291.938611] mana_smc_setup_hwc+0x70/0x1c0 [13291.941113] mana_hwc_create_channel+0x1a0/0x3a0 [13291.944283] mana_gd_setup+0x16c/0x398 [13291.946584] mana_gd_resume+0x24/0x70 [13291.948917] mana_do_service+0x13c/0x1d0 [13291.951583] mana_serv_func+0x34/0x68 [13291.953732] process_one_work+0x168/0x3d0 [13291.956745] worker_thread+0x2ac/0x480 [13291.959104] kthread+0xf8/0x110 [13291.961026] ret_from_fork+0x10/0x20 [13291.963560] Code: d2807d00 9417c551 71000673 54000220 (b9400281) [13291.967299] ---[ end trace 0000000000000000 ]--- Disassembly of mana_smc_poll_register() around the crash site: Disassembly of section .text: 00000000000047c8 : 47c8: d503201f nop 47cc: d503201f nop 47d0: d503233f paciasp 47d4: f800865e str x30, [x18], #8 47d8: a9bd7bfd stp x29, x30, [sp, #-48]! 47dc: 910003fd mov x29, sp 47e0: a90153f3 stp x19, x20, [sp, #16] 47e4: 91007014 add x20, x0, #0x1c 47e8: 5289c413 mov w19, #0x4e20 47ec: f90013f5 str x21, [sp, #32] 47f0: 12001c35 and w21, w1, #0xff 47f4: 14000008 b 4814 47f8: 36f801e1 tbz w1, #31, 4834 47fc: 52800042 mov w2, #0x2 4800: d280fa01 mov x1, #0x7d0 4804: d2807d00 mov x0, #0x3e8 4808: 94000000 bl 0 480c: 71000673 subs w19, w19, #0x1 4810: 54000200 b.eq 4850 4814: b9400281 ldr w1, [x20] <-- **** CRASHED HERE ***** 4818: d50331bf dmb oshld 481c: 2a0103e2 mov w2, w1 ... From the crash signature x20 = ffff8000a200001b, this address ends in 0x1b which is not 4-byte aligned, so the 'ldr w1, [x20]' instruction (readl) triggers the arm64 alignment fault (FSC = 0x21). The root cause is in mana_gd_init_vf_regs(), which computes: gc->shm_base = gc->bar0_va + mana_gd_r64(gc, GDMA_REG_SHM_OFFSET); The offset is used without any validation. The same problem exists in mana_gd_init_pf_regs() for sriov_base_off and sriov_shm_off. Fix this by validating all offsets before use: - VF: check shm_off is within BAR0, properly aligned to 4 bytes (readl requirement), and leaves room for the full 256-bit (32-byte) SMC aperture. - PF: check sriov_base_off is within BAR0, aligned to 8 bytes (readq requirement), and leaves room to safely read the sriov_shm_off register at sriov_base_off + GDMA_PF_REG_SHM_OFF. Then check sriov_shm_off leaves room for the full SMC aperture. All arithmetic uses subtraction rather than addition to avoid integer overflow on garbage values. Define SMC_APERTURE_SIZE (32 bytes, derived from the 256-bit aperture width) Return -EPROTO on invalid values. The existing recovery path in mana_serv_reset() already handles -EPROTO by falling through to PCI device rescan, giving the hardware another chance to present valid register values after reset. Fixes: 9bf66036d686 ("net: mana: Handle hardware recovery events when probing the device") Signed-off-by: Dipayaan Roy Link: https://patch.msgid.link/afQUMClyjmBVfD+u@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net Signed-off-by: Paolo Abeni --- .../net/ethernet/microsoft/mana/gdma_main.c | 40 ++++++++++++++++--- .../net/ethernet/microsoft/mana/shm_channel.c | 5 --- include/net/mana/shm_channel.h | 6 +++ 3 files changed, 41 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 098fbda0d128a5..d8e816882f02c2 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -43,8 +43,9 @@ static u64 mana_gd_r64(struct gdma_context *g, u64 offset) static int mana_gd_init_pf_regs(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); - void __iomem *sriov_base_va; + u64 remaining_barsize; u64 sriov_base_off; + u64 sriov_shm_off; gc->db_page_size = mana_gd_r32(gc, GDMA_PF_REG_DB_PAGE_SIZE) & 0xFFFF; @@ -73,10 +74,28 @@ static int mana_gd_init_pf_regs(struct pci_dev *pdev) gc->phys_db_page_base = gc->bar0_pa + gc->db_page_off; sriov_base_off = mana_gd_r64(gc, GDMA_SRIOV_REG_CFG_BASE_OFF); + if (sriov_base_off >= gc->bar0_size || + gc->bar0_size - sriov_base_off < + GDMA_PF_REG_SHM_OFF + sizeof(u64) || + !IS_ALIGNED(sriov_base_off, sizeof(u64))) { + dev_err(gc->dev, + "SRIOV base offset 0x%llx out of range or unaligned (BAR0 size 0x%llx)\n", + sriov_base_off, (u64)gc->bar0_size); + return -EPROTO; + } - sriov_base_va = gc->bar0_va + sriov_base_off; - gc->shm_base = sriov_base_va + - mana_gd_r64(gc, sriov_base_off + GDMA_PF_REG_SHM_OFF); + remaining_barsize = gc->bar0_size - sriov_base_off; + sriov_shm_off = mana_gd_r64(gc, sriov_base_off + GDMA_PF_REG_SHM_OFF); + if (sriov_shm_off >= remaining_barsize || + remaining_barsize - sriov_shm_off < SMC_APERTURE_SIZE || + !IS_ALIGNED(sriov_shm_off, sizeof(u32))) { + dev_err(gc->dev, + "SRIOV SHM offset 0x%llx out of range or unaligned (BAR0 size 0x%llx)\n", + sriov_shm_off, (u64)gc->bar0_size); + return -EPROTO; + } + + gc->shm_base = gc->bar0_va + sriov_base_off + sriov_shm_off; return 0; } @@ -84,6 +103,7 @@ static int mana_gd_init_pf_regs(struct pci_dev *pdev) static int mana_gd_init_vf_regs(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); + u64 shm_off; gc->db_page_size = mana_gd_r32(gc, GDMA_REG_DB_PAGE_SIZE) & 0xFFFF; @@ -111,7 +131,17 @@ static int mana_gd_init_vf_regs(struct pci_dev *pdev) gc->db_page_base = gc->bar0_va + gc->db_page_off; gc->phys_db_page_base = gc->bar0_pa + gc->db_page_off; - gc->shm_base = gc->bar0_va + mana_gd_r64(gc, GDMA_REG_SHM_OFFSET); + shm_off = mana_gd_r64(gc, GDMA_REG_SHM_OFFSET); + if (shm_off >= gc->bar0_size || + gc->bar0_size - shm_off < SMC_APERTURE_SIZE || + !IS_ALIGNED(shm_off, sizeof(u32))) { + dev_err(gc->dev, + "SHM offset 0x%llx out of range or unaligned (BAR0 size 0x%llx)\n", + shm_off, (u64)gc->bar0_size); + return -EPROTO; + } + + gc->shm_base = gc->bar0_va + shm_off; return 0; } diff --git a/drivers/net/ethernet/microsoft/mana/shm_channel.c b/drivers/net/ethernet/microsoft/mana/shm_channel.c index 0f1679ebad96ba..d21b5db06e5092 100644 --- a/drivers/net/ethernet/microsoft/mana/shm_channel.c +++ b/drivers/net/ethernet/microsoft/mana/shm_channel.c @@ -61,11 +61,6 @@ union smc_proto_hdr { }; }; /* HW DATA */ -#define SMC_APERTURE_BITS 256 -#define SMC_BASIC_UNIT (sizeof(u32)) -#define SMC_APERTURE_DWORDS (SMC_APERTURE_BITS / (SMC_BASIC_UNIT * 8)) -#define SMC_LAST_DWORD (SMC_APERTURE_DWORDS - 1) - static int mana_smc_poll_register(void __iomem *base, bool reset) { void __iomem *ptr = base + SMC_LAST_DWORD * SMC_BASIC_UNIT; diff --git a/include/net/mana/shm_channel.h b/include/net/mana/shm_channel.h index 5199b41497fff9..dbabcfb95daf3e 100644 --- a/include/net/mana/shm_channel.h +++ b/include/net/mana/shm_channel.h @@ -4,6 +4,12 @@ #ifndef _SHM_CHANNEL_H #define _SHM_CHANNEL_H +#define SMC_APERTURE_BITS 256 +#define SMC_BASIC_UNIT (sizeof(u32)) +#define SMC_APERTURE_DWORDS (SMC_APERTURE_BITS / (SMC_BASIC_UNIT * 8)) +#define SMC_LAST_DWORD (SMC_APERTURE_DWORDS - 1) +#define SMC_APERTURE_SIZE (SMC_APERTURE_BITS / 8) + struct shm_channel { struct device *dev; void __iomem *base; From b9eac6a9d93c952c4b7775a24d5c7a1bbf4c3c00 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Apr 2026 00:47:54 +0200 Subject: [PATCH 358/972] rseq: Revert to historical performance killing behaviour The recent RSEQ optimization work broke the TCMalloc abuse of the RSEQ ABI as it not longer unconditionally updates the CPU, node, mm_cid fields, which are documented as read only for user space. Due to the observed behavior of the kernel it was possible for TCMalloc to overwrite the cpu_id_start field for their own purposes and rely on the kernel to update it unconditionally after each context switch and before signal delivery. The RSEQ ABI only guarantees that these fields are updated when the data changes, i.e. the task is migrated or the MMCID of the task changes due to switching from or to per CPU ownership mode. The optimization work eliminated the unconditional updates and reduced them to the documented ABI guarantees, which results in a massive performance win for syscall, scheduling heavy work loads, which in turn breaks the TCMalloc expectations. There have been several options discussed to restore the TCMalloc functionality while preserving the optimization benefits. They all end up in a series of hard to maintain workarounds, which in the worst case introduce overhead for everyone, e.g. in the scheduler. The requirements of TCMalloc and the optimization work are diametral and the required work arounds are a maintainence burden. They end up as fragile constructs, which are blocking further optimization work and are pretty much guaranteed to cause more subtle issues down the road. The optimization work heavily depends on the generic entry code, which is not used by all architectures yet. So the rework preserved the original mechanism moslty unmodified to keep the support for architectures, which handle rseq in their own exit to user space loop. That code is currently optimized out by the compiler on architectures which use the generic entry code. This allows to revert back to the original behaviour by replacing the compile time constant conditions with a runtime condition where required, which disables the optimization and the dependend time slice extension feature until the run-time condition can be enabled in the RSEQ registration code on a per task basis again. The following changes are required to restore the original behavior, which makes TCMalloc work again: 1) Replace the compile time constant conditionals with runtime conditionals where appropriate to prevent the compiler from optimizing the legacy mode out 2) Enforce unconditional update of IDs on context switch for the non-optimized v1 mode 3) Enforce update of IDs in the pre signal delivery path for the non-optimized v1 mode 4) Enforce update of IDs in the membarrier(RSEQ) IPI for the non-optimized v1 mode 5) Make time slice and future extensions depend on optimized v2 mode This brings back the full performance problems, but preserves the v2 optimization code and for generic entry code using architectures also the TIF_RSEQ optimization which avoids a full evaluation of the exit to user mode loop in many cases. Fixes: 566d8015f7ee ("rseq: Avoid CPU/MM CID updates when no event pending") Reported-by: Mathias Stearn Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Closes: https://lore.kernel.org/CAHnCjA25b+nO2n5CeifknSKHssJpPrjnf+dtr7UgzRw4Zgu=oA@mail.gmail.com Link: https://patch.msgid.link/20260428224427.517051752%40kernel.org Cc: stable@vger.kernel.org --- include/linux/rseq.h | 35 ++++++++++++++++++++++----------- include/linux/rseq_entry.h | 39 +++++++++++++++++++++++++++---------- include/linux/rseq_types.h | 9 ++++++++- kernel/rseq.c | 40 +++++++++++++++++++++++++++++++------- kernel/sched/membarrier.c | 11 ++++++++++- 5 files changed, 104 insertions(+), 30 deletions(-) diff --git a/include/linux/rseq.h b/include/linux/rseq.h index f446909551df05..7ef79b25e714b9 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -9,6 +9,11 @@ void __rseq_handle_slowpath(struct pt_regs *regs); +static __always_inline bool rseq_v2(struct task_struct *t) +{ + return IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && likely(t->rseq.event.has_rseq > 1); +} + /* Invoked from resume_user_mode_work() */ static inline void rseq_handle_slowpath(struct pt_regs *regs) { @@ -16,8 +21,7 @@ static inline void rseq_handle_slowpath(struct pt_regs *regs) if (current->rseq.event.slowpath) __rseq_handle_slowpath(regs); } else { - /* '&' is intentional to spare one conditional branch */ - if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) + if (current->rseq.event.sched_switch && current->rseq.event.has_rseq) __rseq_handle_slowpath(regs); } } @@ -30,9 +34,9 @@ void __rseq_signal_deliver(int sig, struct pt_regs *regs); */ static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { - /* '&' is intentional to spare one conditional branch */ - if (current->rseq.event.has_rseq & current->rseq.event.user_irq) + if (rseq_v2(current)) { + /* has_rseq is implied in rseq_v2() */ + if (current->rseq.event.user_irq) __rseq_signal_deliver(ksig->sig, regs); } else { if (current->rseq.event.has_rseq) @@ -50,15 +54,22 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t) { struct rseq_event *ev = &t->rseq.event; - if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* + * Only apply the user_irq optimization for RSEQ ABI V2 registrations. + * Legacy users like TCMalloc rely on the original ABI V1 behaviour + * which updates IDs on every context swtich. + */ + if (rseq_v2(t)) { /* - * Avoid a boat load of conditionals by using simple logic - * to determine whether NOTIFY_RESUME needs to be raised. + * Avoid a boat load of conditionals by using simple logic to + * determine whether TIF_NOTIFY_RESUME or TIF_RSEQ needs to be + * raised. * - * It's required when the CPU or MM CID has changed or - * the entry was from user space. + * It's required when the CPU or MM CID has changed or the entry + * was via interrupt from user space. ev->has_rseq does not have + * to be evaluated here because rseq_v2() implies has_rseq. */ - bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq; + bool raise = ev->user_irq | ev->ids_changed; if (raise) { ev->sched_switch = true; @@ -66,6 +77,7 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t) } } else { if (ev->has_rseq) { + t->rseq.event.ids_changed = true; t->rseq.event.sched_switch = true; rseq_raise_notify_resume(t); } @@ -161,6 +173,7 @@ static inline unsigned int rseq_alloc_align(void) } #else /* CONFIG_RSEQ */ +static inline bool rseq_v2(struct task_struct *t) { return false; } static inline void rseq_handle_slowpath(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index f11ebd34f8b955..934db41ec78261 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -111,6 +111,20 @@ static __always_inline void rseq_slice_clear_grant(struct task_struct *t) t->rseq.slice.state.granted = false; } +/* + * Open coded, so it can be invoked within a user access region. + * + * This clears the user space state of the time slice extensions field only when + * the task has registered the optimized RSEQ_ABI V2. Some legacy registrations, + * e.g. TCMalloc, have conflicting non-ABI fields in struct RSEQ, which would be + * overwritten by an unconditional write. + */ +#define rseq_slice_clear_user(rseq, efault) \ +do { \ + if (rseq_slice_extension_enabled()) \ + unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); \ +} while (0) + static __always_inline bool __rseq_grant_slice_extension(bool work_pending) { struct task_struct *curr = current; @@ -230,6 +244,7 @@ static __always_inline bool rseq_slice_extension_enabled(void) { return false; } static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; } static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { } static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } +#define rseq_slice_clear_user(rseq, efault) do { } while (0) #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); @@ -517,11 +532,9 @@ bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, if (csaddr) unsafe_get_user(*csaddr, &rseq->rseq_cs, efault); - /* Open coded, so it's in the same user access region */ - if (rseq_slice_extension_enabled()) { - /* Unconditionally clear it, no point in conditionals */ - unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); - } + /* RSEQ ABI V2 only operations */ + if (rseq_v2(t)) + rseq_slice_clear_user(rseq, efault); } rseq_slice_clear_grant(t); @@ -612,6 +625,14 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t * interrupts disabled */ guard(pagefault)(); + /* + * This optimization is only valid when the task registered for the + * optimized RSEQ_ABI_V2 variant. Some legacy users rely on the original + * RSEQ implementation behaviour which unconditionally updated the IDs. + * rseq_sched_switch_event() ensures that legacy registrations always + * have both sched_switch and ids_changed set, which is compatible with + * the historical TIF_NOTIFY_RESUME behaviour. + */ if (likely(!t->rseq.event.ids_changed)) { struct rseq __user *rseq = t->rseq.usrptr; /* @@ -623,11 +644,9 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t scoped_user_rw_access(rseq, efault) { unsafe_get_user(csaddr, &rseq->rseq_cs, efault); - /* Open coded, so it's in the same user access region */ - if (rseq_slice_extension_enabled()) { - /* Unconditionally clear it, no point in conditionals */ - unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); - } + /* RSEQ ABI V2 only operations */ + if (rseq_v2(t)) + rseq_slice_clear_user(rseq, efault); } rseq_slice_clear_grant(t); diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 0b42045988db00..a469c1870849c4 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -9,6 +9,12 @@ #ifdef CONFIG_RSEQ struct rseq; +/* + * rseq_event::has_rseq contains the ABI version number so preserving it + * in AND operations requires a mask. + */ +#define RSEQ_HAS_RSEQ_VERSION_MASK 0xff + /** * struct rseq_event - Storage for rseq related event management * @all: Compound to initialize and clear the data efficiently @@ -17,7 +23,8 @@ struct rseq; * exit to user * @ids_changed: Indicator that IDs need to be updated * @user_irq: True on interrupt entry from user mode - * @has_rseq: True if the task has a rseq pointer installed + * @has_rseq: Greater than 0 if the task has a rseq pointer installed. + * Contains the RSEQ version number * @error: Compound error code for the slow path to analyze * @fatal: User space data corrupted or invalid * @slowpath: Indicator that slow path processing via TIF_NOTIFY_RESUME diff --git a/kernel/rseq.c b/kernel/rseq.c index 586f58f652c6e7..aa25753ea1350c 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -253,11 +253,14 @@ static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) static void rseq_slowpath_update_usr(struct pt_regs *regs) { /* - * Preserve rseq state and user_irq state. The generic entry code - * clears user_irq on the way out, the non-generic entry - * architectures are not having user_irq. + * Preserve has_rseq and user_irq state. The generic entry code clears + * user_irq on the way out, the non-generic entry architectures are not + * setting user_irq. */ - const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, }; + const struct rseq_event evt_mask = { + .has_rseq = RSEQ_HAS_RSEQ_VERSION_MASK, + .user_irq = true, + }; struct task_struct *t = current; struct rseq_ids ids; u32 node_id; @@ -330,8 +333,9 @@ void __rseq_handle_slowpath(struct pt_regs *regs) void __rseq_signal_deliver(int sig, struct pt_regs *regs) { rseq_stat_inc(rseq_stats.signal); + /* - * Don't update IDs, they are handled on exit to user if + * Don't update IDs yet, they are handled on exit to user if * necessary. The important thing is to abort a critical section of * the interrupted context as after this point the instruction * pointer in @regs points to the signal handler. @@ -344,6 +348,13 @@ void __rseq_signal_deliver(int sig, struct pt_regs *regs) current->rseq.event.error = 0; force_sigsegv(sig); } + + /* + * In legacy mode, force the update of IDs before returning to user + * space to stay compatible. + */ + if (!rseq_v2(current)) + rseq_force_update(); } /* @@ -408,6 +419,7 @@ static bool rseq_reset_ids(void) SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { u32 rseqfl = 0; + u8 version = 1; if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) @@ -461,7 +473,11 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 if (!access_ok(rseq, rseq_len)) return -EFAULT; - if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { + /* + * The version check effectivly disables time slice extensions until the + * RSEQ ABI V2 registration are implemented. + */ + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION) && version > 1) { if (rseq_slice_extension_enabled()) { rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; if (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON) @@ -484,7 +500,15 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); unsafe_put_user(0U, &rseq->node_id, efault); unsafe_put_user(0U, &rseq->mm_cid, efault); - unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); + + /* + * All fields past mm_cid are only valid for non-legacy v2 + * registrations. + */ + if (version > 1) { + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) + unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); + } } /* @@ -712,6 +736,8 @@ int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) return -ENOTSUPP; if (!current->rseq.usrptr) return -ENXIO; + if (!rseq_v2(current)) + return -ENOTSUPP; /* No change? */ if (enable == !!current->rseq.slice.state.enabled) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 62344560372526..226a6329f3e928 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -199,7 +199,16 @@ static void ipi_rseq(void *info) * is negligible. */ smp_mb(); - rseq_sched_switch_event(current); + /* + * Legacy mode requires that IDs are written and the critical section is + * evaluated. V2 optimized mode handles the critical section and IDs are + * only updated if they change as a consequence of preemption after + * return from this IPI. + */ + if (rseq_v2(current)) + rseq_sched_switch_event(current); + else + rseq_force_update(); } static void ipi_sync_rq_state(void *info) From 02b44d943b3adddc3a15c1da97045e205b7d14c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Apr 2026 15:46:06 +0200 Subject: [PATCH 359/972] selftests/rseq: Skip tests if time slice extensions are not available Don't fail, skip the test if the extensions are not enabled at compile or runtime. Fixes: 830969e7821a ("selftests/rseq: Implement time slice extension test") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.597838491%40kernel.org Cc: stable@vger.kernel.org --- tools/testing/selftests/rseq/slice_test.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/rseq/slice_test.c b/tools/testing/selftests/rseq/slice_test.c index 357122dcb48703..77e668ff74d7f0 100644 --- a/tools/testing/selftests/rseq/slice_test.c +++ b/tools/testing/selftests/rseq/slice_test.c @@ -124,6 +124,13 @@ FIXTURE_SETUP(slice_ext) { cpu_set_t affinity; + if (rseq_register_current_thread()) + SKIP(return, "RSEQ not supported\n"); + + if (prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET, + PR_RSEQ_SLICE_EXT_ENABLE, 0, 0)) + SKIP(return, "Time slice extension not supported\n"); + ASSERT_EQ(sched_getaffinity(0, sizeof(affinity), &affinity), 0); /* Pin it on a single CPU. Avoid CPU 0 */ @@ -137,11 +144,6 @@ FIXTURE_SETUP(slice_ext) break; } - ASSERT_EQ(rseq_register_current_thread(), 0); - - ASSERT_EQ(prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET, - PR_RSEQ_SLICE_EXT_ENABLE, 0, 0), 0); - self->noise_params.noise_nsecs = variant->noise_nsecs; self->noise_params.sleep_nsecs = variant->sleep_nsecs; self->noise_params.run = 1; From d97cb2ef0b221b068e90b6058aa97faa0626bdab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Apr 2026 18:13:54 +0200 Subject: [PATCH 360/972] selftests/rseq: Make registration flexible for legacy and optimized mode rseq_register_current_thread() either uses the glibc registered RSEQ region or registers it's own region with the legacy size of 32 bytes. That worked so far, but becomes a problem when the kernel implements a distinction between legacy and performance optimized behavior based on the registration size as that does not allow to test both modes with the self test suite. Add two arguments to the function. One to enforce that the registration is not using libc provided mode and one to tell the registration to use the legacy size and not the kernel advertised size. Rename it and make the original one a inline wrapper which preserves the existing behavior. Fixes: 566d8015f7ee ("rseq: Avoid CPU/MM CID updates when no event pending") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.677889423%40kernel.org Cc: stable@vger.kernel.org --- tools/testing/selftests/rseq/rseq-abi.h | 7 ++++- tools/testing/selftests/rseq/rseq.c | 39 ++++++++++++------------- tools/testing/selftests/rseq/rseq.h | 8 ++++- 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/rseq/rseq-abi.h b/tools/testing/selftests/rseq/rseq-abi.h index ecef315204b271..5f4ea2152c2fd8 100644 --- a/tools/testing/selftests/rseq/rseq-abi.h +++ b/tools/testing/selftests/rseq/rseq-abi.h @@ -191,10 +191,15 @@ struct rseq_abi { */ struct rseq_abi_slice_ctrl slice_ctrl; + /* + * Place holder to push the size above 32 bytes. + */ + __u8 __reserved; + /* * Flexible array member at end of structure, after last feature field. */ char end[]; -} __attribute__((aligned(4 * sizeof(__u64)))); +} __attribute__((aligned(256))); #endif /* _RSEQ_ABI_H */ diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c index a736727b83c1eb..be0d0a97031ef2 100644 --- a/tools/testing/selftests/rseq/rseq.c +++ b/tools/testing/selftests/rseq/rseq.c @@ -56,6 +56,7 @@ ptrdiff_t rseq_offset; * unsuccessful. */ unsigned int rseq_size = -1U; +static unsigned int rseq_alloc_size; /* Flags used during rseq registration. */ unsigned int rseq_flags; @@ -115,29 +116,17 @@ bool rseq_available(void) } } -/* The rseq areas need to be at least 32 bytes. */ -static -unsigned int get_rseq_min_alloc_size(void) -{ - unsigned int alloc_size = rseq_size; - - if (alloc_size < ORIG_RSEQ_ALLOC_SIZE) - alloc_size = ORIG_RSEQ_ALLOC_SIZE; - return alloc_size; -} - /* * Return the feature size supported by the kernel. * * Depending on the value returned by getauxval(AT_RSEQ_FEATURE_SIZE): * - * 0: Return ORIG_RSEQ_FEATURE_SIZE (20) + * 0: Return ORIG_RSEQ_FEATURE_SIZE (20) * > 0: Return the value from getauxval(AT_RSEQ_FEATURE_SIZE). * * It should never return a value below ORIG_RSEQ_FEATURE_SIZE. */ -static -unsigned int get_rseq_kernel_feature_size(void) +static unsigned int get_rseq_kernel_feature_size(void) { unsigned long auxv_rseq_feature_size, auxv_rseq_align; @@ -152,15 +141,24 @@ unsigned int get_rseq_kernel_feature_size(void) return ORIG_RSEQ_FEATURE_SIZE; } -int rseq_register_current_thread(void) +int __rseq_register_current_thread(bool nolibc, bool legacy) { + unsigned int size; int rc; if (!rseq_ownership) { /* Treat libc's ownership as a successful registration. */ - return 0; + return nolibc ? -EBUSY : 0; } - rc = sys_rseq(&__rseq.abi, get_rseq_min_alloc_size(), 0, RSEQ_SIG); + + /* The minimal allocation size is 32, which is the legacy allocation size */ + size = get_rseq_kernel_feature_size(); + if (legacy || size < ORIG_RSEQ_ALLOC_SIZE) + rseq_alloc_size = ORIG_RSEQ_ALLOC_SIZE; + else + rseq_alloc_size = size; + + rc = sys_rseq(&__rseq.abi, rseq_alloc_size, 0, RSEQ_SIG); if (rc) { /* * After at least one thread has registered successfully @@ -179,9 +177,8 @@ int rseq_register_current_thread(void) * The first thread to register sets the rseq_size to mimic the libc * behavior. */ - if (RSEQ_READ_ONCE(rseq_size) == 0) { - RSEQ_WRITE_ONCE(rseq_size, get_rseq_kernel_feature_size()); - } + if (RSEQ_READ_ONCE(rseq_size) == 0) + RSEQ_WRITE_ONCE(rseq_size, size); return 0; } @@ -194,7 +191,7 @@ int rseq_unregister_current_thread(void) /* Treat libc's ownership as a successful unregistration. */ return 0; } - rc = sys_rseq(&__rseq.abi, get_rseq_min_alloc_size(), RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); + rc = sys_rseq(&__rseq.abi, rseq_alloc_size, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); if (rc) return -1; return 0; diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h index f51a5fdb044431..c62ebb9290c010 100644 --- a/tools/testing/selftests/rseq/rseq.h +++ b/tools/testing/selftests/rseq/rseq.h @@ -8,6 +8,7 @@ #ifndef RSEQ_H #define RSEQ_H +#include #include #include #include @@ -142,7 +143,12 @@ static inline struct rseq_abi *rseq_get_abi(void) * succeed. A restartable sequence executed from a non-registered * thread will always fail. */ -int rseq_register_current_thread(void); +int __rseq_register_current_thread(bool nolibc, bool legacy); + +static inline int rseq_register_current_thread(void) +{ + return __rseq_register_current_thread(false, false); +} /* * Unregister rseq for current thread. From 9b4e3495d1bd2469bf94b74930c153c2d534ddb7 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Mon, 20 Apr 2026 11:55:57 -0400 Subject: [PATCH 361/972] drm/amdkfd: Make all TLB-flushes heavy-weight With only one sequence number we cannot track the need for legacy vs heavy-weight flushes reliably. Always use heavy-weight. Signed-off-by: Felix Kuehling Reviewed-by: Philip Yang Signed-off-by: Alex Deucher (cherry picked from commit c1a3ff1d327820cd9a52bc1056b98681fc088949) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 ++-- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 6 +++--- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 6 +++--- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f829d65a79b43e..f95bf6d9553463 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1360,7 +1360,7 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]); if (WARN_ON_ONCE(!peer_pdd)) continue; - kfd_flush_tlb(peer_pdd, TLB_FLUSH_LEGACY); + kfd_flush_tlb(peer_pdd); } kfree(devices_arr); @@ -1455,7 +1455,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, if (WARN_ON_ONCE(!peer_pdd)) continue; if (flush_tlb) - kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT); + kfd_flush_tlb(peer_pdd); /* Remove dma mapping after tlb flush to avoid IO_PAGE_FAULT */ err = amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index ab3b2e7be9bd04..9185ebe4c079a8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -572,7 +572,7 @@ static int allocate_vmid(struct device_queue_manager *dqm, qpd->vmid, qpd->page_table_base); /* invalidate the VM context after pasid and vmid mapping is set up */ - kfd_flush_tlb(qpd_to_pdd(qpd), TLB_FLUSH_LEGACY); + kfd_flush_tlb(qpd_to_pdd(qpd)); if (dqm->dev->kfd2kgd->set_scratch_backing_va) dqm->dev->kfd2kgd->set_scratch_backing_va(dqm->dev->adev, @@ -610,7 +610,7 @@ static void deallocate_vmid(struct device_queue_manager *dqm, if (flush_texture_cache_nocpsch(q->device, qpd)) dev_err(dev, "Failed to flush TC\n"); - kfd_flush_tlb(qpd_to_pdd(qpd), TLB_FLUSH_LEGACY); + kfd_flush_tlb(qpd_to_pdd(qpd)); /* Release the vmid mapping */ set_pasid_vmid_mapping(dqm, 0, qpd->vmid); @@ -1284,7 +1284,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, dqm->dev->adev, qpd->vmid, qpd->page_table_base); - kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY); + kfd_flush_tlb(pdd); } /* Take a safe reference to the mm_struct, which may otherwise diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 163d665a6074cc..7b5b1220691987 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -1554,13 +1554,13 @@ void kfd_signal_reset_event(struct kfd_node *dev); void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid); void kfd_signal_process_terminate_event(struct kfd_process *p); -static inline void kfd_flush_tlb(struct kfd_process_device *pdd, - enum TLB_FLUSH_TYPE type) +static inline void kfd_flush_tlb(struct kfd_process_device *pdd) { struct amdgpu_device *adev = pdd->dev->adev; struct amdgpu_vm *vm = drm_priv_to_vm(pdd->drm_priv); - amdgpu_vm_flush_compute_tlb(adev, vm, type, pdd->dev->xcc_mask); + amdgpu_vm_flush_compute_tlb(adev, vm, TLB_FLUSH_HEAVYWEIGHT, + pdd->dev->xcc_mask); } static inline bool kfd_flush_tlb_after_unmap(struct kfd_dev *dev) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 38085a0a0f58ee..35ec67d9739bdd 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1424,7 +1424,7 @@ svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start, if (r) break; } - kfd_flush_tlb(pdd, TLB_FLUSH_HEAVYWEIGHT); + kfd_flush_tlb(pdd); } return r; @@ -1571,7 +1571,7 @@ svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset, } } - kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY); + kfd_flush_tlb(pdd); } return r; From 7bbfb2559bcec39d1a4e1182d931a2046112c352 Mon Sep 17 00:00:00 2001 From: "John B. Moore" Date: Tue, 28 Apr 2026 11:35:12 -0500 Subject: [PATCH 362/972] drm/amdgpu/gfx9: drop unnecessary 64-bit fence flag check in KIQ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the BUG_ON(flags & AMDGPU_FENCE_FLAG_64BIT) assertion from gfx_v9_0_ring_emit_fence_kiq(). The KIQ hardware supports 64-bit fence writes; the 32-bit writeback address constraint is an upper-layer convention, not a hardware limitation. The check serves no purpose and should not be present. Found by code inspection while investigating related BUG_ON assertions in the GFX and compute ring emission paths. Reviewed-by: Christian König Signed-off-by: John B. Moore Signed-off-by: Alex Deucher (cherry picked from commit 1b1101a46a426bb4328116bb5273c326a2780389) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 95be105671ece8..86c7c2a429b713 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -5660,9 +5660,6 @@ static void gfx_v9_0_ring_emit_fence_kiq(struct amdgpu_ring *ring, u64 addr, { struct amdgpu_device *adev = ring->adev; - /* we only allocate 32bit for each seq wb address */ - BUG_ON(flags & AMDGPU_FENCE_FLAG_64BIT); - /* write fence seq to the "addr" */ amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3)); amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) | From 2a561b361b7681509710f3cfc3d95d54c87ac69f Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 27 Apr 2026 11:38:58 -0400 Subject: [PATCH 363/972] drm/amdgpu/pm: add missing revision check for CI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ci_populate_all_memory_levels() workaround only applies to revision 0 SKUs. Link: https://gitlab.freedesktop.org/drm/amd/-/work_items/1816 Fixes: 9f4b35411cfe ("drm/amd/powerplay: add CI asics support to smumgr (v3)") Reviewed-by: Timur Kristóf Reviewed-by: Kent Russell Signed-off-by: Alex Deucher (cherry picked from commit 1db15ba8f72f400bbad8ae0ce24fafc43429d4bd) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/powerplay/smumgr/ci_smumgr.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/powerplay/smumgr/ci_smumgr.c b/drivers/gpu/drm/amd/pm/powerplay/smumgr/ci_smumgr.c index 731355bdb9bc30..0a3a0722b5c977 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/smumgr/ci_smumgr.c +++ b/drivers/gpu/drm/amd/pm/powerplay/smumgr/ci_smumgr.c @@ -1333,8 +1333,9 @@ static int ci_populate_all_memory_levels(struct pp_hwmgr *hwmgr) dev_id = adev->pdev->device; - if ((dpm_table->mclk_table.count >= 2) - && ((dev_id == 0x67B0) || (dev_id == 0x67B1))) { + if ((dpm_table->mclk_table.count >= 2) && + ((dev_id == 0x67B0) || (dev_id == 0x67B1)) && + (adev->pdev->revision == 0)) { smu_data->smc_state_table.MemoryLevel[1].MinVddci = smu_data->smc_state_table.MemoryLevel[0].MinVddci; smu_data->smc_state_table.MemoryLevel[1].MinMvdd = From 1987c79b4fe5789dfa14423e78b5c25f6acf3e9d Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 28 Apr 2026 10:42:49 -0400 Subject: [PATCH 364/972] drm/amdgpu/pm: align Hawaii mclk workaround with radeon MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Align the hawaii mclk workaround with radeon and windows. Link: https://gitlab.freedesktop.org/drm/amd/-/work_items/1816 Fixes: 9f4b35411cfe ("drm/amd/powerplay: add CI asics support to smumgr (v3)") Reviewed-by: Timur Kristóf Reviewed-by: Kent Russell Signed-off-by: Alex Deucher (cherry picked from commit 9649528b637f668c5af9f2b83ca4ad8576ae2121) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/powerplay/smumgr/ci_smumgr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/powerplay/smumgr/ci_smumgr.c b/drivers/gpu/drm/amd/pm/powerplay/smumgr/ci_smumgr.c index 0a3a0722b5c977..3650e7beeb6712 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/smumgr/ci_smumgr.c +++ b/drivers/gpu/drm/amd/pm/powerplay/smumgr/ci_smumgr.c @@ -1336,10 +1336,10 @@ static int ci_populate_all_memory_levels(struct pp_hwmgr *hwmgr) if ((dpm_table->mclk_table.count >= 2) && ((dev_id == 0x67B0) || (dev_id == 0x67B1)) && (adev->pdev->revision == 0)) { - smu_data->smc_state_table.MemoryLevel[1].MinVddci = - smu_data->smc_state_table.MemoryLevel[0].MinVddci; - smu_data->smc_state_table.MemoryLevel[1].MinMvdd = - smu_data->smc_state_table.MemoryLevel[0].MinMvdd; + smu_data->smc_state_table.MemoryLevel[1].MinVddc = + smu_data->smc_state_table.MemoryLevel[0].MinVddc; + smu_data->smc_state_table.MemoryLevel[1].MinVddcPhases = + smu_data->smc_state_table.MemoryLevel[0].MinVddcPhases; } smu_data->smc_state_table.MemoryLevel[0].ActivityLevel = 0x1F; CONVERT_FROM_HOST_TO_SMC_US(smu_data->smc_state_table.MemoryLevel[0].ActivityLevel); From 17223816498f7b117d138d18eb0eba63604dc74e Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 27 Apr 2026 11:40:25 -0400 Subject: [PATCH 365/972] drm/radeon: add missing revision check for CI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The memory level workarounds only apply to revision 0 SKUs. Link: https://gitlab.freedesktop.org/drm/amd/-/work_items/1816 Fixes: 127e056e2a82 ("drm/radeon: fix mclk vddc configuration for cards for hawaii") Fixes: 21b8a369046f ("drm/radeon: fix dram timing for certain hawaii boards") Fixes: 90b2fee35cb9 ("drm/radeon: fix dpm mc init for certain hawaii boards") Reviewed-by: Timur Kristóf Reviewed-by: Kent Russell Signed-off-by: Alex Deucher (cherry picked from commit 4d8dcc14311515077062b5740f39f427075de5c9) Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/ci_dpm.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/radeon/ci_dpm.c b/drivers/gpu/drm/radeon/ci_dpm.c index 22321eb95b7d5d..703848fac18933 100644 --- a/drivers/gpu/drm/radeon/ci_dpm.c +++ b/drivers/gpu/drm/radeon/ci_dpm.c @@ -2461,7 +2461,8 @@ static void ci_register_patching_mc_arb(struct radeon_device *rdev, if (patch && ((rdev->pdev->device == 0x67B0) || - (rdev->pdev->device == 0x67B1))) { + (rdev->pdev->device == 0x67B1)) && + (rdev->pdev->revision == 0)) { if ((memory_clock > 100000) && (memory_clock <= 125000)) { tmp2 = (((0x31 * engine_clock) / 125000) - 1) & 0xff; *dram_timimg2 &= ~0x00ff0000; @@ -3304,7 +3305,8 @@ static int ci_populate_all_memory_levels(struct radeon_device *rdev) pi->smc_state_table.MemoryLevel[0].EnabledForActivity = 1; if ((dpm_table->mclk_table.count >= 2) && - ((rdev->pdev->device == 0x67B0) || (rdev->pdev->device == 0x67B1))) { + ((rdev->pdev->device == 0x67B0) || (rdev->pdev->device == 0x67B1)) && + (rdev->pdev->revision == 0)) { pi->smc_state_table.MemoryLevel[1].MinVddc = pi->smc_state_table.MemoryLevel[0].MinVddc; pi->smc_state_table.MemoryLevel[1].MinVddcPhases = @@ -4493,7 +4495,8 @@ static int ci_register_patching_mc_seq(struct radeon_device *rdev, if (patch && ((rdev->pdev->device == 0x67B0) || - (rdev->pdev->device == 0x67B1))) { + (rdev->pdev->device == 0x67B1)) && + (rdev->pdev->revision == 0)) { for (i = 0; i < table->last; i++) { if (table->last >= SMU7_DISCRETE_MC_REGISTER_ARRAY_SIZE) return -EINVAL; From 78d2e624fa073c14970aa097adcf3ea31c157a66 Mon Sep 17 00:00:00 2001 From: "John B. Moore" Date: Mon, 27 Apr 2026 16:06:28 -0500 Subject: [PATCH 366/972] drm/amdgpu/sdma4: replace BUG_ON with WARN_ON in fence emission MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sdma_v4_0_ring_emit_fence() contains two BUG_ON(addr & 0x3) assertions that verify fence writeback addresses are dword-aligned. These assertions can be reached from unprivileged userspace via crafted DRM_IOCTL_AMDGPU_CS submissions, causing a fatal kernel panic in a scheduler worker thread. Replace both BUG_ON() calls with WARN_ON() to log the condition without crashing the kernel. A misaligned fence address at this point indicates a driver bug, but crashing the kernel is never the correct response when the assertion is reachable from userspace. The CS IOCTL path is the correct place to filter invalid submissions; the ring emission callback is too late to do anything about it. Fixes: 2130f89ced2c ("drm/amdgpu: add SDMA v4.0 implementation (v2)") Reviewed-by: Christian König Signed-off-by: John B. Moore Signed-off-by: Alex Deucher (cherry picked from commit b90250bd933afd1ba94d86d6b13821997b22b18e) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index 44f0f23e114843..e64f2f6df9a9e2 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -889,7 +889,7 @@ static void sdma_v4_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr, u64 se /* write the fence */ amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_FENCE)); /* zero in first two bits */ - BUG_ON(addr & 0x3); + WARN_ON(addr & 0x3); amdgpu_ring_write(ring, lower_32_bits(addr)); amdgpu_ring_write(ring, upper_32_bits(addr)); amdgpu_ring_write(ring, lower_32_bits(seq)); @@ -899,7 +899,7 @@ static void sdma_v4_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr, u64 se addr += 4; amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_FENCE)); /* zero in first two bits */ - BUG_ON(addr & 0x3); + WARN_ON(addr & 0x3); amdgpu_ring_write(ring, lower_32_bits(addr)); amdgpu_ring_write(ring, upper_32_bits(addr)); amdgpu_ring_write(ring, upper_32_bits(seq)); From e6c2e6c2e1fa066968a16aca1cb66cd1bdde7741 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Mon, 27 Apr 2026 09:30:23 -0400 Subject: [PATCH 367/972] drm/amdgpu: zero-initialize GART table on allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GART TLB is flushed after unmapping but not after mapping. Since amdgpu_bo_create_kernel() does not zero-initialize the buffer, when a single PTE is written the TLB may speculatively load other uninitialized entries from the same cacheline. Those garbage entries can appear valid, and a subsequent write to another PTE in the same cacheline may cause the GPU to use a stale garbage PTE from the TLB. Fix this by calling memset_io() to zero-initialize the GART table with gart_pte_flags immediately after allocation. Using AMDGPU_GEM_CREATE_VRAM_CLEARED, SDMA-based clear will not work since SDMA needs GART to be initialized to work. Suggested-by: Felix Kuehling Signed-off-by: Philip Yang Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit d9af8263b82b6eaa60c5718e0c6631c5037e4b24) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c index bc772ca3dab726..b6f849d51c2e77 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c @@ -262,12 +262,19 @@ void amdgpu_gart_table_ram_free(struct amdgpu_device *adev) */ int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev) { + int r; + if (adev->gart.bo != NULL) return 0; - return amdgpu_bo_create_kernel(adev, adev->gart.table_size, PAGE_SIZE, - AMDGPU_GEM_DOMAIN_VRAM, &adev->gart.bo, - NULL, (void *)&adev->gart.ptr); + r = amdgpu_bo_create_kernel(adev, adev->gart.table_size, PAGE_SIZE, + AMDGPU_GEM_DOMAIN_VRAM, &adev->gart.bo, + NULL, (void *)&adev->gart.ptr); + if (r) + return r; + + memset_io(adev->gart.ptr, adev->gart.gart_pte_flags, adev->gart.table_size); + return 0; } /** From 81665e35f143d93adef654f3be1360def9196e72 Mon Sep 17 00:00:00 2001 From: Xiaogang Chen Date: Fri, 24 Apr 2026 13:47:01 -0500 Subject: [PATCH 368/972] drm/amdkfd: Check if there are kfd porcesses using adev by kfd_processes_count During gpu hot-unplug need check if there are kfd porcesses still using the being removed gpu before clean resources of the device. Current driver checks if kfd_processes_table is empty. kfd processes are not terminated after removed from kfd_processes_table immediately. They are still alive and may access the device until kfd_process_wq work queue got ran. Check kfd->kfd_processes_count value that is updated after kfd process got uninitialized when its ref becomes zero. Fixes: 6cca686dfce7 ("drm/amdkfd: kfd driver supports hot unplug/replug amdgpu devices") Signed-off-by: Xiaogang Chen Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher (cherry picked from commit d12d05c4bc4c15585130af43e897923ff292df7b) --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 33 +------------------------ 1 file changed, 1 insertion(+), 32 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 8ff97bf7d95a0b..b7f8f7ff819834 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -1737,37 +1737,6 @@ bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entr return false; } -/* check if there is kfd process still uses adev */ -static bool kgd2kfd_check_device_idle(struct amdgpu_device *adev) -{ - struct kfd_process *p; - struct hlist_node *p_temp; - unsigned int temp; - struct kfd_node *dev; - - mutex_lock(&kfd_processes_mutex); - - if (hash_empty(kfd_processes_table)) { - mutex_unlock(&kfd_processes_mutex); - return true; - } - - /* check if there is device still use adev */ - hash_for_each_safe(kfd_processes_table, temp, p_temp, p, kfd_processes) { - for (int i = 0; i < p->n_pdds; i++) { - dev = p->pdds[i]->dev; - if (dev->adev == adev) { - mutex_unlock(&kfd_processes_mutex); - return false; - } - } - } - - mutex_unlock(&kfd_processes_mutex); - - return true; -} - /** kgd2kfd_teardown_processes - gracefully tear down existing * kfd processes that use adev * @@ -1800,7 +1769,7 @@ void kgd2kfd_teardown_processes(struct amdgpu_device *adev) mutex_unlock(&kfd_processes_mutex); /* wait all kfd processes use adev terminate */ - while (!kgd2kfd_check_device_idle(adev)) + while (!!atomic_read(&adev->kfd.dev->kfd_processes_count)) cond_resched(); } From 6da7b1242da4455b11c24ce667d1cab1a348c8ea Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Mon, 4 May 2026 18:21:17 +0530 Subject: [PATCH 369/972] drm/amdgpu/userq: fix access to stale wptr mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use drm_exec to take both locks i.e vm root bo and wptr_obj bo to access the mapping data properly. This fixes the security issue of unmap the wptr_obj while a queue creation is in progress and passing other bo at same address. Signed-off-by: Sunil Khatri Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit 1fc6c8ab45dbee096469c08c13f6099d57a52d6c) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 95 +++++++++------------- 1 file changed, 37 insertions(+), 58 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c index 2fc39a6938f6db..5b4121ddc78c6f 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c @@ -30,34 +30,6 @@ #define AMDGPU_USERQ_PROC_CTX_SZ PAGE_SIZE #define AMDGPU_USERQ_GANG_CTX_SZ PAGE_SIZE -static int -mes_userq_map_gtt_bo_to_gart(struct amdgpu_bo *bo) -{ - int ret; - - ret = amdgpu_bo_reserve(bo, true); - if (ret) { - DRM_ERROR("Failed to reserve bo. ret %d\n", ret); - goto err_reserve_bo_failed; - } - - ret = amdgpu_ttm_alloc_gart(&bo->tbo); - if (ret) { - DRM_ERROR("Failed to bind bo to GART. ret %d\n", ret); - goto err_map_bo_gart_failed; - } - - amdgpu_bo_unreserve(bo); - bo = amdgpu_bo_ref(bo); - - return 0; - -err_map_bo_gart_failed: - amdgpu_bo_unreserve(bo); -err_reserve_bo_failed: - return ret; -} - static int mes_userq_create_wptr_mapping(struct amdgpu_device *adev, struct amdgpu_userq_mgr *uq_mgr, @@ -65,55 +37,62 @@ mes_userq_create_wptr_mapping(struct amdgpu_device *adev, uint64_t wptr) { struct amdgpu_bo_va_mapping *wptr_mapping; - struct amdgpu_vm *wptr_vm; struct amdgpu_userq_obj *wptr_obj = &queue->wptr_obj; + struct amdgpu_bo *obj; + struct amdgpu_vm *vm = queue->vm; + struct drm_exec exec; int ret; - wptr_vm = queue->vm; - ret = amdgpu_bo_reserve(wptr_vm->root.bo, false); - if (ret) - return ret; - wptr &= AMDGPU_GMC_HOLE_MASK; - wptr_mapping = amdgpu_vm_bo_lookup_mapping(wptr_vm, wptr >> PAGE_SHIFT); - amdgpu_bo_unreserve(wptr_vm->root.bo); - if (!wptr_mapping) { - DRM_ERROR("Failed to lookup wptr bo\n"); - return -EINVAL; - } - wptr_obj->obj = wptr_mapping->bo_va->base.bo; - if (wptr_obj->obj->tbo.base.size > PAGE_SIZE) { - DRM_ERROR("Requested GART mapping for wptr bo larger than one page\n"); - return -EINVAL; - } + drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 2); + drm_exec_until_all_locked(&exec) { + ret = amdgpu_vm_lock_pd(vm, &exec, 1); + drm_exec_retry_on_contention(&exec); + if (unlikely(ret)) + goto fail_lock; + + wptr_mapping = amdgpu_vm_bo_lookup_mapping(vm, wptr >> PAGE_SHIFT); + if (!wptr_mapping) { + ret = -EINVAL; + goto fail_lock; + } - ret = mes_userq_map_gtt_bo_to_gart(wptr_obj->obj); - if (ret) { - DRM_ERROR("Failed to map wptr bo to GART\n"); - return ret; + obj = wptr_mapping->bo_va->base.bo; + ret = drm_exec_lock_obj(&exec, &obj->tbo.base); + drm_exec_retry_on_contention(&exec); + if (unlikely(ret)) + goto fail_lock; } - ret = amdgpu_bo_reserve(wptr_obj->obj, true); - if (ret) { - DRM_ERROR("Failed to reserve wptr bo\n"); - return ret; + wptr_obj->obj = amdgpu_bo_ref(wptr_mapping->bo_va->base.bo); + if (wptr_obj->obj->tbo.base.size > PAGE_SIZE) { + ret = -EINVAL; + goto fail_map; } /* TODO use eviction fence instead of pinning. */ ret = amdgpu_bo_pin(wptr_obj->obj, AMDGPU_GEM_DOMAIN_GTT); if (ret) { - drm_file_err(uq_mgr->file, "[Usermode queues] Failed to pin wptr bo\n"); - goto unresv_bo; + DRM_ERROR("Failed to pin wptr bo. ret %d\n", ret); + goto fail_map; + } + + ret = amdgpu_ttm_alloc_gart(&wptr_obj->obj->tbo); + if (ret) { + DRM_ERROR("Failed to bind bo to GART. ret %d\n", ret); + goto fail_map; } queue->wptr_obj.gpu_addr = amdgpu_bo_gpu_offset(wptr_obj->obj); - amdgpu_bo_unreserve(wptr_obj->obj); + drm_exec_fini(&exec); return 0; -unresv_bo: - amdgpu_bo_unreserve(wptr_obj->obj); +fail_map: + amdgpu_bo_unref(&wptr_obj->obj); +fail_lock: + drm_exec_fini(&exec); return ret; } From 4e02e0afa95f691dc7cc17538cdd648089a843f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 13 Oct 2025 15:26:02 +0200 Subject: [PATCH 370/972] drm/amdgpu: nuke amdgpu_userq_fence_slab v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As preparation for independent fences remove the extra slab, kmalloc should do just fine. v2: use GFP_KERNEL instead of GFP_ATOMIC Signed-off-by: Christian König Reviewed-by: Prike Liang Reviewed-by: Sunil Khatri Signed-off-by: Alex Deucher (cherry picked from commit 0d831487b5be0ae59cac865a0aa87b0acc3dc717) --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 13 ++------- .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 28 +++---------------- .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.h | 3 -- 3 files changed, 7 insertions(+), 37 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 46aae3fad4bf6c..60debd543e44ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -3149,11 +3149,7 @@ static int __init amdgpu_init(void) r = amdgpu_sync_init(); if (r) - goto error_sync; - - r = amdgpu_userq_fence_slab_init(); - if (r) - goto error_fence; + return r; amdgpu_register_atpx_handler(); amdgpu_acpi_detect(); @@ -3161,7 +3157,7 @@ static int __init amdgpu_init(void) /* Ignore KFD init failures when CONFIG_HSA_AMD is not set. */ r = amdgpu_amdkfd_init(); if (r && r != -ENOENT) - goto error_fence; + goto error_fini_sync; if (amdgpu_pp_feature_mask & PP_OVERDRIVE_MASK) { add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); @@ -3172,10 +3168,8 @@ static int __init amdgpu_init(void) /* let modprobe override vga console setting */ return pci_register_driver(&amdgpu_kms_pci_driver); -error_fence: +error_fini_sync: amdgpu_sync_fini(); - -error_sync: return r; } @@ -3186,7 +3180,6 @@ static void __exit amdgpu_exit(void) amdgpu_unregister_atpx_handler(); amdgpu_acpi_release(); amdgpu_sync_fini(); - amdgpu_userq_fence_slab_fini(); mmu_notifier_synchronize(); amdgpu_xcp_drv_release(); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index da39ac862f37cf..e2d5f04296e1c0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -32,29 +32,9 @@ #include "amdgpu.h" #include "amdgpu_userq_fence.h" -static const struct dma_fence_ops amdgpu_userq_fence_ops; -static struct kmem_cache *amdgpu_userq_fence_slab; - #define AMDGPU_USERQ_MAX_HANDLES (1U << 16) -int amdgpu_userq_fence_slab_init(void) -{ - amdgpu_userq_fence_slab = kmem_cache_create("amdgpu_userq_fence", - sizeof(struct amdgpu_userq_fence), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!amdgpu_userq_fence_slab) - return -ENOMEM; - - return 0; -} - -void amdgpu_userq_fence_slab_fini(void) -{ - rcu_barrier(); - kmem_cache_destroy(amdgpu_userq_fence_slab); -} +static const struct dma_fence_ops amdgpu_userq_fence_ops; static inline struct amdgpu_userq_fence *to_amdgpu_userq_fence(struct dma_fence *f) { @@ -231,7 +211,7 @@ void amdgpu_userq_fence_driver_put(struct amdgpu_userq_fence_driver *fence_drv) static int amdgpu_userq_fence_alloc(struct amdgpu_userq_fence **userq_fence) { - *userq_fence = kmem_cache_alloc(amdgpu_userq_fence_slab, GFP_ATOMIC); + *userq_fence = kmalloc(sizeof(**userq_fence), GFP_KERNEL); return *userq_fence ? 0 : -ENOMEM; } @@ -342,7 +322,7 @@ static void amdgpu_userq_fence_free(struct rcu_head *rcu) amdgpu_userq_fence_driver_put(fence_drv); kvfree(userq_fence->fence_drv_array); - kmem_cache_free(amdgpu_userq_fence_slab, userq_fence); + kfree(userq_fence); } static void amdgpu_userq_fence_release(struct dma_fence *f) @@ -545,7 +525,7 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, r = amdgpu_userq_fence_create(queue, userq_fence, wptr, &fence); if (r) { mutex_unlock(&userq_mgr->userq_mutex); - kmem_cache_free(amdgpu_userq_fence_slab, userq_fence); + kfree(userq_fence); goto put_gobj_write; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h index d56246ad8c260b..d355a0eecc07fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h @@ -58,9 +58,6 @@ struct amdgpu_userq_fence_driver { char timeline_name[TASK_COMM_LEN]; }; -int amdgpu_userq_fence_slab_init(void); -void amdgpu_userq_fence_slab_fini(void); - void amdgpu_userq_fence_driver_get(struct amdgpu_userq_fence_driver *fence_drv); void amdgpu_userq_fence_driver_put(struct amdgpu_userq_fence_driver *fence_drv); int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, From 26f6654a9a60eb4d241f42a0ec85412e8821480b Mon Sep 17 00:00:00 2001 From: Osama Abdelkader Date: Thu, 23 Apr 2026 22:06:20 +0200 Subject: [PATCH 371/972] drm/exynos: remove bridge when component_add fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use devm_drm_bridge_add() so the bridge is released if probe fails after registration, and drop the manual drm_bridge_remove() in remove(). Check the return value of devm_drm_bridge_add(). Signed-off-by: Osama Abdelkader Fixes: 576d72fbfb45 ("drm/exynos: mic: add a bridge at probe") Cc: stable@vger.kernel.org Reviewed-by: Raphaël Gallais-Pou Reviewed-by: Luca Ceresoli Link: https://patch.msgid.link/20260423200622.325076-2-osama.abdelkader@gmail.com Signed-off-by: Luca Ceresoli --- drivers/gpu/drm/exynos/exynos_drm_mic.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_mic.c b/drivers/gpu/drm/exynos/exynos_drm_mic.c index 29a8366513fa70..e68c954ec3e614 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_mic.c +++ b/drivers/gpu/drm/exynos/exynos_drm_mic.c @@ -423,7 +423,9 @@ static int exynos_mic_probe(struct platform_device *pdev) mic->bridge.of_node = dev->of_node; - drm_bridge_add(&mic->bridge); + ret = devm_drm_bridge_add(dev, &mic->bridge); + if (ret) + goto err; pm_runtime_enable(dev); @@ -443,12 +445,8 @@ static int exynos_mic_probe(struct platform_device *pdev) static void exynos_mic_remove(struct platform_device *pdev) { - struct exynos_mic *mic = platform_get_drvdata(pdev); - component_del(&pdev->dev, &exynos_mic_component_ops); pm_runtime_disable(&pdev->dev); - - drm_bridge_remove(&mic->bridge); } static const struct of_device_id exynos_mic_of_match[] = { From ac8eb3e18f41e2cc8492cc1d358bcb786c850270 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 5 May 2026 15:15:40 +0200 Subject: [PATCH 372/972] wifi: mac80211: use safe list iteration in radar detect work The call to ieee80211_dfs_cac_cancel can cause the iterated chanctx to be freed and removed from the list. Guard against this to avoid a slab-use-after-free error. Cc: stable@vger.kernel.org Fixes: bca8bc0399ac ("wifi: mac80211: handle ieee80211_radar_detected() for MLO") Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20260505151539.236d63a1b736.I35dbb9e96a2d4a480be208770fdd99ba3b817b79@changeid Signed-off-by: Johannes Berg --- net/mac80211/util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index b093bc203c8159..2529b01e2cd55c 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3700,11 +3700,11 @@ void ieee80211_dfs_radar_detected_work(struct wiphy *wiphy, struct ieee80211_local *local = container_of(work, struct ieee80211_local, radar_detected_work); struct cfg80211_chan_def chandef; - struct ieee80211_chanctx *ctx; + struct ieee80211_chanctx *ctx, *tmp; lockdep_assert_wiphy(local->hw.wiphy); - list_for_each_entry(ctx, &local->chanctx_list, list) { + list_for_each_entry_safe(ctx, tmp, &local->chanctx_list, list) { if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) continue; From 644132a48f4e28a1d949d162160869286f3e75de Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Tue, 5 May 2026 08:49:48 -0400 Subject: [PATCH 373/972] selinux: prune /sys/fs/selinux/checkreqprot commit a7e4676e8e2cb ("selinux: remove the 'checkreqprot' functionality") removed the ability to modify the checkreqprot setting but left everything except the updating of the checkreqprot value intact. Aside from unnecessary processing, this could produce a local DoS from log spam and incorrectly calls selinux_ima_measure_state() on each write even though no state has changed. Prune it to just log an error message once and return count (i.e. all bytes written successfully) so that userspace never breaks. Cc: stable@vger.kernel.org Signed-off-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/selinuxfs.c | 47 ++++++------------------------------ 1 file changed, 7 insertions(+), 40 deletions(-) diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 83aa765a09f98d..6f74f87cb2b027 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -689,46 +689,13 @@ static ssize_t sel_read_checkreqprot(struct file *filp, char __user *buf, static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - char *page; - ssize_t length; - unsigned int new_value; - - length = avc_has_perm(current_sid(), SECINITSID_SECURITY, - SECCLASS_SECURITY, SECURITY__SETCHECKREQPROT, - NULL); - if (length) - return length; - - if (count >= PAGE_SIZE) - return -ENOMEM; - - /* No partial writes. */ - if (*ppos != 0) - return -EINVAL; - - page = memdup_user_nul(buf, count); - if (IS_ERR(page)) - return PTR_ERR(page); - - if (sscanf(page, "%u", &new_value) != 1) { - length = -EINVAL; - goto out; - } - length = count; - - if (new_value) { - char comm[sizeof(current->comm)]; - - strscpy(comm, current->comm); - pr_err("SELinux: %s (%d) set checkreqprot to 1. This is no longer supported.\n", - comm, current->pid); - } - - selinux_ima_measure_state(); - -out: - kfree(page); - return length; + /* + * Setting checkreqprot is no longer supported, see + * https://github.com/SELinuxProject/selinux-kernel/wiki/DEPRECATE-checkreqprot + */ + pr_err_once("SELinux: %s (%d) wrote to checkreqprot. This is no longer supported.\n", + current->comm, current->pid); + return count; } static const struct file_operations sel_checkreqprot_ops = { .read = sel_read_checkreqprot, From 19cfa0099024bb9cd40f6d950caa7f47ff8e77f6 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Tue, 5 May 2026 08:49:49 -0400 Subject: [PATCH 374/972] selinux: prune /sys/fs/selinux/disable Commit f22f9aaf6c3d ("selinux: remove the runtime disable functionality") removed the underlying SELinux runtime disable functionality but left everything else intact and started logging an error message to warn any residual users. Prune it to just log an error message once and to return count (i.e. all bytes written successfully) to avoid breaking userspace. This also fixes a local DoS from logspam. Cc: stable@vger.kernel.org Signed-off-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/selinuxfs.c | 36 +++++++----------------------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 6f74f87cb2b027..343303b73d6f72 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -272,35 +272,13 @@ static ssize_t sel_write_disable(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - char *page; - ssize_t length; - int new_value; - - if (count >= PAGE_SIZE) - return -ENOMEM; - - /* No partial writes. */ - if (*ppos != 0) - return -EINVAL; - - page = memdup_user_nul(buf, count); - if (IS_ERR(page)) - return PTR_ERR(page); - - if (sscanf(page, "%d", &new_value) != 1) { - length = -EINVAL; - goto out; - } - length = count; - - if (new_value) { - pr_err("SELinux: https://github.com/SELinuxProject/selinux-kernel/wiki/DEPRECATE-runtime-disable\n"); - pr_err("SELinux: Runtime disable is not supported, use selinux=0 on the kernel cmdline.\n"); - } - -out: - kfree(page); - return length; + /* + * Setting disable is no longer supported, see + * https://github.com/SELinuxProject/selinux-kernel/wiki/DEPRECATE-runtime-disable + */ + pr_err_once("SELinux: %s (%d) wrote to disable. This is no longer supported.\n", + current->comm, current->pid); + return count; } static const struct file_operations sel_disable_ops = { From ad1ac3d740cc6b858a99ab9c45c8c0574be7d1d3 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Tue, 5 May 2026 08:49:50 -0400 Subject: [PATCH 375/972] selinux: prune /sys/fs/selinux/user Remove the previously deprecated /sys/fs/selinux/user interface aside from a residual stub for userspace compatibility. Commit d7b6918e22c7 ("selinux: Deprecate /sys/fs/selinux/user") started the deprecation process for /sys/fs/selinux/user: The selinuxfs "user" node allows userspace to request a list of security contexts that can be reached for a given SELinux user from a given starting context. This was used by libselinux when various login-style programs requested contexts for users, but libselinux stopped using it in 2020. Kernel support will be removed no sooner than Dec 2025. A pr_warn() message has been in place since Linux v6.13, and a 5 second sleep was introduced since Linux v6.17 to help make it more noticeable. We are now past the stated deadline of Dec 2025, so remove the underlying functionality and replace it with a stub that returns a '0\0' buffer to avoid breaking userspace. This also avoids a local DoS from logspam and an uninterruptible sleep delay. Cc: stable@vger.kernel.org Signed-off-by: Stephen Smalley Signed-off-by: Paul Moore --- .../{obsolete => removed}/sysfs-selinux-user | 0 security/selinux/include/security.h | 2 - security/selinux/selinuxfs.c | 68 +--------- security/selinux/ss/services.c | 125 ------------------ 4 files changed, 5 insertions(+), 190 deletions(-) rename Documentation/ABI/{obsolete => removed}/sysfs-selinux-user (100%) diff --git a/Documentation/ABI/obsolete/sysfs-selinux-user b/Documentation/ABI/removed/sysfs-selinux-user similarity index 100% rename from Documentation/ABI/obsolete/sysfs-selinux-user rename to Documentation/ABI/removed/sysfs-selinux-user diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index d1f16d7f684de3..0babb89921816a 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -312,8 +312,6 @@ int security_context_to_sid_default(const char *scontext, u32 scontext_len, int security_context_to_sid_force(const char *scontext, u32 scontext_len, u32 *sid); -int security_get_user_sids(u32 fromsid, const char *username, u32 **sids, u32 *nel); - int security_port_sid(u8 protocol, u16 port, u32 *out_sid); int security_ib_pkey_sid(u64 subnet_prefix, u16 pkey_num, u32 *out_sid); diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 343303b73d6f72..6ed66930913235 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -1018,69 +1018,11 @@ static ssize_t sel_write_relabel(struct file *file, char *buf, size_t size) static ssize_t sel_write_user(struct file *file, char *buf, size_t size) { - char *con = NULL, *user = NULL, *ptr; - u32 sid, *sids = NULL; - ssize_t length; - char *newcon; - int rc; - u32 i, len, nsids; - - pr_warn_ratelimited("SELinux: %s (%d) wrote to /sys/fs/selinux/user!" - " This will not be supported in the future; please update your" - " userspace.\n", current->comm, current->pid); - ssleep(5); - - length = avc_has_perm(current_sid(), SECINITSID_SECURITY, - SECCLASS_SECURITY, SECURITY__COMPUTE_USER, - NULL); - if (length) - goto out; - - length = -ENOMEM; - con = kzalloc(size + 1, GFP_KERNEL); - if (!con) - goto out; - - length = -ENOMEM; - user = kzalloc(size + 1, GFP_KERNEL); - if (!user) - goto out; - - length = -EINVAL; - if (sscanf(buf, "%s %s", con, user) != 2) - goto out; - - length = security_context_str_to_sid(con, &sid, GFP_KERNEL); - if (length) - goto out; - - length = security_get_user_sids(sid, user, &sids, &nsids); - if (length) - goto out; - - length = sprintf(buf, "%u", nsids) + 1; - ptr = buf + length; - for (i = 0; i < nsids; i++) { - rc = security_sid_to_context(sids[i], &newcon, &len); - if (rc) { - length = rc; - goto out; - } - if ((length + len) >= SIMPLE_TRANSACTION_LIMIT) { - kfree(newcon); - length = -ERANGE; - goto out; - } - memcpy(ptr, newcon, len); - kfree(newcon); - ptr += len; - length += len; - } -out: - kfree(sids); - kfree(user); - kfree(con); - return length; + pr_err_once("SELinux: %s (%d) wrote to user. This is no longer supported.\n", + current->comm, current->pid); + buf[0] = '0'; + buf[1] = 0; + return 2; } static ssize_t sel_write_member(struct file *file, char *buf, size_t size) diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index e8e7ccbd1e4485..143021c5e326d8 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -2746,131 +2746,6 @@ int security_node_sid(u16 domain, return rc; } -#define SIDS_NEL 25 - -/** - * security_get_user_sids - Obtain reachable SIDs for a user. - * @fromsid: starting SID - * @username: username - * @sids: array of reachable SIDs for user - * @nel: number of elements in @sids - * - * Generate the set of SIDs for legal security contexts - * for a given user that can be reached by @fromsid. - * Set *@sids to point to a dynamically allocated - * array containing the set of SIDs. Set *@nel to the - * number of elements in the array. - */ - -int security_get_user_sids(u32 fromsid, - const char *username, - u32 **sids, - u32 *nel) -{ - struct selinux_policy *policy; - struct policydb *policydb; - struct sidtab *sidtab; - struct context *fromcon, usercon; - u32 *mysids = NULL, *mysids2, sid; - u32 i, j, mynel, maxnel = SIDS_NEL; - struct user_datum *user; - struct role_datum *role; - struct ebitmap_node *rnode, *tnode; - int rc; - - *sids = NULL; - *nel = 0; - - if (!selinux_initialized()) - return 0; - - mysids = kcalloc(maxnel, sizeof(*mysids), GFP_KERNEL); - if (!mysids) - return -ENOMEM; - -retry: - mynel = 0; - rcu_read_lock(); - policy = rcu_dereference(selinux_state.policy); - policydb = &policy->policydb; - sidtab = policy->sidtab; - - context_init(&usercon); - - rc = -EINVAL; - fromcon = sidtab_search(sidtab, fromsid); - if (!fromcon) - goto out_unlock; - - rc = -EINVAL; - user = symtab_search(&policydb->p_users, username); - if (!user) - goto out_unlock; - - usercon.user = user->value; - - ebitmap_for_each_positive_bit(&user->roles, rnode, i) { - role = policydb->role_val_to_struct[i]; - usercon.role = i + 1; - ebitmap_for_each_positive_bit(&role->types, tnode, j) { - usercon.type = j + 1; - - if (mls_setup_user_range(policydb, fromcon, user, - &usercon)) - continue; - - rc = sidtab_context_to_sid(sidtab, &usercon, &sid); - if (rc == -ESTALE) { - rcu_read_unlock(); - goto retry; - } - if (rc) - goto out_unlock; - if (mynel < maxnel) { - mysids[mynel++] = sid; - } else { - rc = -ENOMEM; - maxnel += SIDS_NEL; - mysids2 = kcalloc(maxnel, sizeof(*mysids2), GFP_ATOMIC); - if (!mysids2) - goto out_unlock; - memcpy(mysids2, mysids, mynel * sizeof(*mysids2)); - kfree(mysids); - mysids = mysids2; - mysids[mynel++] = sid; - } - } - } - rc = 0; -out_unlock: - rcu_read_unlock(); - if (rc || !mynel) { - kfree(mysids); - return rc; - } - - rc = -ENOMEM; - mysids2 = kcalloc(mynel, sizeof(*mysids2), GFP_KERNEL); - if (!mysids2) { - kfree(mysids); - return rc; - } - for (i = 0, j = 0; i < mynel; i++) { - struct av_decision dummy_avd; - rc = avc_has_perm_noaudit(fromsid, mysids[i], - SECCLASS_PROCESS, /* kernel value */ - PROCESS__TRANSITION, AVC_STRICT, - &dummy_avd); - if (!rc) - mysids2[j++] = mysids[i]; - cond_resched(); - } - kfree(mysids); - *sids = mysids2; - *nel = j; - return 0; -} - /** * __security_genfs_sid - Helper to obtain a SID for a file in a filesystem * @policy: policy From a02cd6805562305f936e807da83e253b719dd965 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Tue, 5 May 2026 10:06:38 -0400 Subject: [PATCH 376/972] selinux: allow multiple opens of /sys/fs/selinux/policy Currently there can only be a single open of /sys/fs/selinux/policy at any time. This allows any process to block any other process from reading the kernel policy. The original motivation seems to have been a mix of preventing an inconsistent view of the policy size and preventing userspace from allocating kernel memory without bound, but this is arguably equally bad. Eliminate the policy_opened flag and shrink the critical section that the policy mutex is held. While we are making changes here, drop a couple of extraneous BUG_ONs. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/selinux/20100726193414.19538.64028.stgit@paris.rdu.redhat.com/ Signed-off-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/selinuxfs.c | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 6ed66930913235..a43a38a3ae25be 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -76,7 +76,6 @@ struct selinux_fs_info { int *bool_pending_values; struct dentry *class_dir; unsigned long last_class_ino; - bool policy_opened; unsigned long last_ino; struct super_block *sb; }; @@ -340,44 +339,31 @@ struct policy_load_memory { static int sel_open_policy(struct inode *inode, struct file *filp) { - struct selinux_fs_info *fsi = inode->i_sb->s_fs_info; struct policy_load_memory *plm = NULL; int rc; - BUG_ON(filp->private_data); - - mutex_lock(&selinux_state.policy_mutex); - rc = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__READ_POLICY, NULL); if (rc) - goto err; - - rc = -EBUSY; - if (fsi->policy_opened) - goto err; + return rc; - rc = -ENOMEM; plm = kzalloc_obj(*plm); if (!plm) - goto err; + return -ENOMEM; + mutex_lock(&selinux_state.policy_mutex); rc = security_read_policy(&plm->data, &plm->len); if (rc) goto err; - if ((size_t)i_size_read(inode) != plm->len) { inode_lock(inode); i_size_write(inode, plm->len); inode_unlock(inode); } - - fsi->policy_opened = 1; + mutex_unlock(&selinux_state.policy_mutex); filp->private_data = plm; - mutex_unlock(&selinux_state.policy_mutex); - return 0; err: mutex_unlock(&selinux_state.policy_mutex); @@ -390,13 +376,8 @@ static int sel_open_policy(struct inode *inode, struct file *filp) static int sel_release_policy(struct inode *inode, struct file *filp) { - struct selinux_fs_info *fsi = inode->i_sb->s_fs_info; struct policy_load_memory *plm = filp->private_data; - BUG_ON(!plm); - - fsi->policy_opened = 0; - vfree(plm->data); kfree(plm); From 868f31e4061eca8c3cd607d79d954d5e54f204aa Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Thu, 30 Apr 2026 14:36:52 -0400 Subject: [PATCH 377/972] selinux: shrink critical section in sel_write_load() Currently sel_write_load() takes the policy mutex earlier than necessary. Move the taking of the mutex later. This avoids holding it unnecessarily across the vmalloc() and copy_from_user() of the policy data. Cc: stable@vger.kernel.org Signed-off-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/selinuxfs.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index a43a38a3ae25be..25ca7d71401440 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -553,34 +553,31 @@ static ssize_t sel_write_load(struct file *file, const char __user *buf, if (!count) return -EINVAL; - mutex_lock(&selinux_state.policy_mutex); - length = avc_has_perm(current_sid(), SECINITSID_SECURITY, SECCLASS_SECURITY, SECURITY__LOAD_POLICY, NULL); if (length) - goto out; + return length; data = vmalloc(count); - if (!data) { - length = -ENOMEM; - goto out; - } + if (!data) + return -ENOMEM; if (copy_from_user(data, buf, count) != 0) { length = -EFAULT; goto out; } + mutex_lock(&selinux_state.policy_mutex); length = security_load_policy(data, count, &load_state); if (length) { pr_warn_ratelimited("SELinux: failed to load policy\n"); - goto out; + goto out_unlock; } fsi = file_inode(file)->i_sb->s_fs_info; length = sel_make_policy_nodes(fsi, load_state.policy); if (length) { pr_warn_ratelimited("SELinux: failed to initialize selinuxfs\n"); selinux_policy_cancel(&load_state); - goto out; + goto out_unlock; } selinux_policy_commit(&load_state); @@ -590,8 +587,9 @@ static ssize_t sel_write_load(struct file *file, const char __user *buf, from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); -out: +out_unlock: mutex_unlock(&selinux_state.policy_mutex); +out: vfree(data); return length; } From 60a1e131a811b68703da58fd805ab359b704ab03 Mon Sep 17 00:00:00 2001 From: Gustavo Sousa Date: Thu, 16 Apr 2026 15:17:19 -0300 Subject: [PATCH 378/972] drm/xe/hdcp: Add NULL check for media_gt in intel_hdcp_gsc_check_status() When media GT is disabled via configfs, there is no allocation for media_gt, which is kept as NULL. In such scenario, intel_hdcp_gsc_check_status() results in a kernel pagefault error due to >->uc.gsc being evaluated as an invalid memory address. Fix that by introducing a NULL check on media_gt and bailing out early if so. While at it, also drop the NULL check for gsc, since it can't be NULL if media_gt is not NULL. v2: - Get address for gsc only after checking that gt is not NULL. (Shuicheng) - Drop the NULL check for gsc. (Shuicheng) v3: - Add "Fixes" and "Cc: " tags. (Matt) Fixes: 4af50beb4e0f ("drm/xe: Use gsc_proxy_init_done to check proxy status") Cc: # v6.10+ Reviewed-by: Matt Roper Reviewed-by: Shuicheng Lin Link: https://patch.msgid.link/20260416-check-for-null-media_gt-in-intel_hdcp_gsc_check_status-v2-1-9adb9fd3b621@intel.com Signed-off-by: Gustavo Sousa (cherry picked from commit bfaf87e84ca3ca3f6e275f9ae56da47a8b55ffd1) Signed-off-by: Matthew Brost --- drivers/gpu/drm/xe/display/xe_hdcp_gsc.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/display/xe_hdcp_gsc.c b/drivers/gpu/drm/xe/display/xe_hdcp_gsc.c index 29c72aa4b0d2d7..33494b86205d2e 100644 --- a/drivers/gpu/drm/xe/display/xe_hdcp_gsc.c +++ b/drivers/gpu/drm/xe/display/xe_hdcp_gsc.c @@ -37,9 +37,17 @@ static bool intel_hdcp_gsc_check_status(struct drm_device *drm) struct xe_device *xe = to_xe_device(drm); struct xe_tile *tile = xe_device_get_root_tile(xe); struct xe_gt *gt = tile->media_gt; - struct xe_gsc *gsc = >->uc.gsc; + struct xe_gsc *gsc; + + if (!gt) { + drm_dbg_kms(&xe->drm, + "not checking GSC status for HDCP2.x: media GT not present or disabled\n"); + return false; + } + + gsc = >->uc.gsc; - if (!gsc || !xe_uc_fw_is_available(&gsc->fw)) { + if (!xe_uc_fw_is_available(&gsc->fw)) { drm_dbg_kms(&xe->drm, "GSC Components not ready for HDCP2.x\n"); return false; From d01012c740bbb298b957e30cc0848e482c6f486f Mon Sep 17 00:00:00 2001 From: Shuicheng Lin Date: Tue, 28 Apr 2026 20:14:48 +0000 Subject: [PATCH 379/972] drm/xe/pf: Fix EAGAIN sign in pf_migration_consume() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PTR_ERR() returns a negative value, so comparing against the positive EAGAIN is always true for ERR_PTR(-EAGAIN), causing pf_migration_consume() to bail out instead of continuing to the remaining GTs. On multi-GT platforms this can skip GTs that already have data ready. Compare against -EAGAIN to match the intent (and the following line that correctly uses -EAGAIN). While at it, gate PTR_ERR() with IS_ERR(). v2: add IS_ERR() guard before PTR_ERR(). (Gustavo) Fixes: 67df4a5cbc58 ("drm/xe/pf: Add data structures and handlers for migration rings") Cc: Michał Winiarski Reviewed-by: Gustavo Sousa Link: https://patch.msgid.link/20260428201448.3999428-1-shuicheng.lin@intel.com Signed-off-by: Shuicheng Lin (cherry picked from commit 9d770e72e1edb54beacfce5f402edb51632811e3) Signed-off-by: Matthew Brost --- drivers/gpu/drm/xe/xe_sriov_pf_migration.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_sriov_pf_migration.c b/drivers/gpu/drm/xe/xe_sriov_pf_migration.c index 6c4b16409cc9af..150a241110fbde 100644 --- a/drivers/gpu/drm/xe/xe_sriov_pf_migration.c +++ b/drivers/gpu/drm/xe/xe_sriov_pf_migration.c @@ -149,10 +149,11 @@ pf_migration_consume(struct xe_device *xe, unsigned int vfid) for_each_gt(gt, xe, gt_id) { data = xe_gt_sriov_pf_migration_save_consume(gt, vfid); - if (data && PTR_ERR(data) != EAGAIN) + if (!data) + continue; + if (!IS_ERR(data) || PTR_ERR(data) != -EAGAIN) return data; - if (PTR_ERR(data) == -EAGAIN) - more_data = true; + more_data = true; } if (!more_data) From b87951a0ae9f95ca6590bf0939edced7d36929dd Mon Sep 17 00:00:00 2001 From: Shuicheng Lin Date: Wed, 29 Apr 2026 19:22:59 +0000 Subject: [PATCH 380/972] drm/xe/pf: Fix MMIO access using PF view instead of VF view during migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pf_migration_mmio_save() and pf_migration_mmio_restore() initialize a local VF-specific MMIO view via xe_mmio_init_vf_view() but then pass >->mmio (the PF base) to all xe_mmio_read32()/xe_mmio_write32() calls instead of the local &mmio. This causes the PF own SW flag registers to be saved/restored rather than the target VF registers, silently corrupting migration state. Use the VF MMIO view for all register accesses, matching the correct pattern used in pf_clear_vf_scratch_regs(). Fixes: b7c1b990f719 ("drm/xe/pf: Handle MMIO migration data as part of PF control") Cc: Michał Winiarski Assisted-by: Claude:claude-opus-4.6 Reviewed-by: Michal Wajdeczko Reviewed-by: Stuart Summers Link: https://patch.msgid.link/20260429192259.4009211-1-shuicheng.lin@intel.com Signed-off-by: Shuicheng Lin (cherry picked from commit 7d9c39cfb31ff389490ca1308767c2807a9829a6) Signed-off-by: Matthew Brost --- drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c index 87a164efcc33c8..01fe03b9efe85c 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_migration.c @@ -385,10 +385,10 @@ static int pf_migration_mmio_save(struct xe_gt *gt, unsigned int vfid, void *buf if (xe_gt_is_media_type(gt)) for (n = 0; n < MED_VF_SW_FLAG_COUNT; n++) - regs[n] = xe_mmio_read32(>->mmio, MED_VF_SW_FLAG(n)); + regs[n] = xe_mmio_read32(&mmio, MED_VF_SW_FLAG(n)); else for (n = 0; n < VF_SW_FLAG_COUNT; n++) - regs[n] = xe_mmio_read32(>->mmio, VF_SW_FLAG(n)); + regs[n] = xe_mmio_read32(&mmio, VF_SW_FLAG(n)); return 0; } @@ -407,10 +407,10 @@ static int pf_migration_mmio_restore(struct xe_gt *gt, unsigned int vfid, if (xe_gt_is_media_type(gt)) for (n = 0; n < MED_VF_SW_FLAG_COUNT; n++) - xe_mmio_write32(>->mmio, MED_VF_SW_FLAG(n), regs[n]); + xe_mmio_write32(&mmio, MED_VF_SW_FLAG(n), regs[n]); else for (n = 0; n < VF_SW_FLAG_COUNT; n++) - xe_mmio_write32(>->mmio, VF_SW_FLAG(n), regs[n]); + xe_mmio_write32(&mmio, VF_SW_FLAG(n), regs[n]); return 0; } From b29987dfd943e655df6e3b641ecffad5cc1509c2 Mon Sep 17 00:00:00 2001 From: Satyanarayana K V P Date: Mon, 4 May 2026 09:49:26 +0000 Subject: [PATCH 381/972] drm/xe/guc: Exclude indirect ring state page from ADS engine state size The engine state size reported to GuC via ADS should only include the engine state portion and should not include the indirect ring state page that comes after it in the context image. The GuC uses this size to overwrite the engine state in the LRC on watchdog resets and we don't want it to overwrite the indirect ring state as well. Fixes: d6219e1cd5e3 ("drm/xe: Add Indirect Ring State support") Suggested-by: Daniele Ceraolo Spurio Signed-off-by: Satyanarayana K V P Cc: Michal Wajdeczko Cc: Matthew Brost Reviewed-by: Matthew Brost Reviewed-by: Daniele Ceraolo Spurio Signed-off-by: Daniele Ceraolo Spurio Link: https://patch.msgid.link/20260504094924.3760713-4-satyanarayana.k.v.p@intel.com (cherry picked from commit 3ec5f003f6c377beda8bd5438941f5a7795e1848) Signed-off-by: Matthew Brost --- drivers/gpu/drm/xe/xe_guc_ads.c | 5 +---- drivers/gpu/drm/xe/xe_lrc.c | 11 +++++++++-- drivers/gpu/drm/xe/xe_lrc.h | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c index 81b5f01b1f65cc..2b835d48b5652f 100644 --- a/drivers/gpu/drm/xe/xe_guc_ads.c +++ b/drivers/gpu/drm/xe/xe_guc_ads.c @@ -512,12 +512,9 @@ static void guc_golden_lrc_init(struct xe_guc_ads *ads) * that starts after the execlists LRC registers. This is * required to allow the GuC to restore just the engine state * when a watchdog reset occurs. - * We calculate the engine state size by removing the size of - * what comes before it in the context image (which is identical - * on all engines). */ ads_blob_write(ads, ads.eng_state_size[guc_class], - real_size - xe_lrc_skip_size(xe)); + xe_lrc_engine_state_size(gt, class)); ads_blob_write(ads, ads.golden_context_lrca[guc_class], addr_ggtt); diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c index c725cde4508d20..4af9f0d7c6f3ba 100644 --- a/drivers/gpu/drm/xe/xe_lrc.c +++ b/drivers/gpu/drm/xe/xe_lrc.c @@ -746,9 +746,16 @@ size_t xe_lrc_reg_size(struct xe_device *xe) return 80 * sizeof(u32); } -size_t xe_lrc_skip_size(struct xe_device *xe) +/** + * xe_lrc_engine_state_size() - Get size of the engine state within LRC + * @gt: the &xe_gt struct instance + * @class: Hardware engine class + * + * Returns: Size of the engine state + */ +size_t xe_lrc_engine_state_size(struct xe_gt *gt, enum xe_engine_class class) { - return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe); + return xe_gt_lrc_hang_replay_size(gt, class) - xe_lrc_reg_size(gt_to_xe(gt)); } static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc) diff --git a/drivers/gpu/drm/xe/xe_lrc.h b/drivers/gpu/drm/xe/xe_lrc.h index e7c975f9e2d975..5440663183f6d7 100644 --- a/drivers/gpu/drm/xe/xe_lrc.h +++ b/drivers/gpu/drm/xe/xe_lrc.h @@ -130,7 +130,7 @@ u32 xe_lrc_parallel_ggtt_addr(struct xe_lrc *lrc); struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc); size_t xe_lrc_reg_size(struct xe_device *xe); -size_t xe_lrc_skip_size(struct xe_device *xe); +size_t xe_lrc_engine_state_size(struct xe_gt *gt, enum xe_engine_class class); void xe_lrc_dump_default(struct drm_printer *p, struct xe_gt *gt, From 901a7d9e2f280a9e76e6c58406a519cb11ad5ff8 Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Sun, 3 May 2026 21:25:16 +0200 Subject: [PATCH 382/972] ipv6: default IPV6_SIT to m This basically defaulted to m until recently, since IPV6 defaulted to m. Since IPV6 was changed to a boolean with a default of y, IPV6_SIT started defaulting to built-in as well. This results in a surprise sit0 device by default for defconfig (and defconfig-derived config) users at boot. For me, this broke an (admittedly non-robust) script. Preserve the behaviour of most configs by avoiding building this module, that's probably overall seldom used compared to IPv6 as a whole, into the kernel. Fixes: 309b905deee59 ("ipv6: convert CONFIG_IPV6 to built-in only and clean up Kconfigs") Signed-off-by: Alyssa Ross Reviewed-by: Fernando Fernandez Mancera Link: https://patch.msgid.link/20260503192515.290900-2-hi@alyssa.is Signed-off-by: Jakub Kicinski --- net/ipv6/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index c024aa77f25ba6..c3806c6ac96f95 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -164,7 +164,7 @@ config IPV6_SIT select INET_TUNNEL select NET_IP_TUNNEL select IPV6_NDISC_NODETYPE - default y + default m help Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the @@ -172,7 +172,7 @@ config IPV6_SIT into IPv4 packets. This is useful if you want to connect two IPv6 networks over an IPv4-only path. - Saying M here will produce a module called sit. If unsure, say Y. + Saying M here will produce a module called sit. If unsure, say M. config IPV6_SIT_6RD bool "IPv6: IPv6 Rapid Deployment (6RD)" From 5ad509c1fdad4bf0993b72d1b3d462f036d8a0d8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 4 May 2026 06:43:13 +0000 Subject: [PATCH 383/972] ipv6: Fix null-ptr-deref in fib6_mtu(). syzbot reported null-ptr-deref in fib6_mtu(). [0] When res->f6i->fib6_pmtu is 0 in fib6_mtu(), it fetches MTU from __in6_dev_get(nh->fib_nh_dev)->cnf.mtu6. However, __in6_dev_get() could return NULL when the device is being unregistered. Let's return 0 MTU if __in6_dev_get() returns NULL in fib6_mtu(). [0]: Oops: general protection fault, probably for non-canonical address 0xdffffc00000000bc: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x00000000000005e0-0x00000000000005e7] CPU: 0 UID: 0 PID: 7890 Comm: syz.2.502 Tainted: G L syzkaller #0 PREEMPT(full) Tainted: [L]=SOFTLOCKUP Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:fib6_mtu net/ipv6/route.c:1648 [inline] RIP: 0010:rt6_insert_exception+0x9eb/0x10a0 net/ipv6/route.c:1753 Code: 3b 14 cf f7 45 85 f6 0f 85 1d 02 00 00 e8 7d 19 cf f7 48 8d bb e0 05 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 89 RSP: 0000:ffffc9000610f120 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffc9000c001000 RDX: 00000000000000bc RSI: ffffffff8a38bc83 RDI: 00000000000005e0 RBP: ffff888052f06000 R08: 0000000000000005 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: ffff888042d16c00 R13: ffff888042d16cc8 R14: 0000000000000001 R15: 0000000000000500 FS: 0000000000000000(0000) GS:ffff88809717d000(0063) knlGS:00000000f540db40 CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 CR2: 00000000f73c6d50 CR3: 000000006eff0000 CR4: 0000000000352ef0 Call Trace: __ip6_rt_update_pmtu+0x555/0xd60 net/ipv6/route.c:2982 ip6_update_pmtu+0x34f/0x3b0 net/ipv6/route.c:3014 icmpv6_err+0x2a2/0x3f0 net/ipv6/icmp.c:82 icmpv6_notify+0x35e/0x820 net/ipv6/icmp.c:1087 icmpv6_rcv+0x10bf/0x1ae0 net/ipv6/icmp.c:1228 ip6_protocol_deliver_rcu+0xf97/0x1500 net/ipv6/ip6_input.c:478 ip6_input_finish+0x1e4/0x4a0 net/ipv6/ip6_input.c:529 NF_HOOK include/linux/netfilter.h:318 [inline] NF_HOOK include/linux/netfilter.h:312 [inline] ip6_input+0x105/0x2f0 net/ipv6/ip6_input.c:540 ip6_mc_input+0x513/0xf50 net/ipv6/ip6_input.c:630 dst_input include/net/dst.h:480 [inline] ip6_rcv_finish net/ipv6/ip6_input.c:119 [inline] NF_HOOK include/linux/netfilter.h:318 [inline] NF_HOOK include/linux/netfilter.h:312 [inline] ipv6_rcv+0x34c/0x3d0 net/ipv6/ip6_input.c:351 __netif_receive_skb_one_core+0x12d/0x1e0 net/core/dev.c:6202 __netif_receive_skb+0x1f/0x120 net/core/dev.c:6315 netif_receive_skb_internal net/core/dev.c:6401 [inline] netif_receive_skb+0x13b/0x7f0 net/core/dev.c:6460 tun_rx_batched.isra.0+0x3f6/0x750 drivers/net/tun.c:1511 tun_get_user+0x1e31/0x3c20 drivers/net/tun.c:1955 tun_chr_write_iter+0xdc/0x200 drivers/net/tun.c:2001 new_sync_write fs/read_write.c:595 [inline] vfs_write+0x6ac/0x1070 fs/read_write.c:688 ksys_write+0x12a/0x250 fs/read_write.c:740 do_syscall_32_irqs_on arch/x86/entry/syscall_32.c:83 [inline] do_int80_emulation+0x141/0x700 arch/x86/entry/syscall_32.c:172 asm_int80_emulation+0x1a/0x20 arch/x86/include/asm/idtentry.h:621 RIP: 0023:0xf715616b Code: 57 56 53 8b 44 24 14 f6 00 08 75 23 8b 44 24 18 8b 5c 24 1c 8b 4c 24 20 8b 54 24 24 8b 74 24 28 8b 7c 24 2c 8b 6c 24 30 cd 80 <5b> 5e 5f 5d c3 5b 5e 5f 5d e9 f7 a1 ff ff 66 90 66 90 66 90 90 53 RSP: 002b:00000000f540d44c EFLAGS: 00000246 ORIG_RAX: 0000000000000004 RAX: ffffffffffffffda RBX: 00000000000000c8 RCX: 0000000080000640 RDX: 000000000000007a RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000292 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Fixes: dcd1f572954f ("net/ipv6: Remove fib6_idev") Reported-by: syzbot+01f005f9c6387ca6f6dd@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/69f83f22.170a0220.13cc2.0004.GAE@google.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260504064316.3820775-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0dc0316530ca18..e3d355d1fbd627 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1645,6 +1645,10 @@ static unsigned int fib6_mtu(const struct fib6_result *res) rcu_read_lock(); idev = __in6_dev_get(dev); + if (!idev) { + rcu_read_unlock(); + return 0; + } mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); } From 07f44433355f70fa97d4c44b4c0d2e86adc082fb Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 4 May 2026 14:06:08 +0530 Subject: [PATCH 384/972] bnxt_en: Delay for 5 seconds after AER DPC for all chips The FW on all chips is requiring a 5-second delay after Downstream Port Containment (DPC) AER. The previously added 900 msec delay was not long enough in all cases because the chip's CRS (Configuration Request Retry Status) mechanism is not always reliable. Fixes: d5ab32e9b02d ("bnxt_en: Add delay to handle Downstream Port Containment (DPC) AER") Reviewed-by: Kalesh AP Signed-off-by: Michael Chan Signed-off-by: Pavan Chebbi Link: https://patch.msgid.link/20260504083611.1383776-2-pavan.chebbi@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 8c55874f44ca6d..3db951d0c6907a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -17360,9 +17360,14 @@ static pci_ers_result_t bnxt_io_slot_reset(struct pci_dev *pdev) netdev_info(bp->dev, "PCI Slot Reset\n"); - if (!(bp->flags & BNXT_FLAG_CHIP_P5_PLUS) && - test_bit(BNXT_STATE_PCI_CHANNEL_IO_FROZEN, &bp->state)) - msleep(900); + if (test_bit(BNXT_STATE_PCI_CHANNEL_IO_FROZEN, &bp->state)) { + /* After DPC, the chip should return CRS when the vendor ID + * config register is read until it is ready. On all chips, + * this is not happening reliably so add a 5-second delay as a + * workaround. + */ + msleep(5000); + } netdev_lock(netdev); From 54c28fab2fa5afd681c9c4b10f4f6da1efdd397a Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 4 May 2026 14:06:09 +0530 Subject: [PATCH 385/972] bnxt_en: Set bp->max_tpa according to what the FW supports Fix the logic to set bp->max_tpa no higher than what the FW supports. On P5 chips, some older FW sets max_tpa very low so we override it to prevent performance regressions with the older FW. Fixes: 79632e9ba386 ("bnxt_en: Expand bnxt_tpa_info struct to support 57500 chips.") Reviewed-by: Kalesh AP Reviewed-by: Colin Winegarden Reviewed-by: Rukhsana Ansari Signed-off-by: Michael Chan Signed-off-by: Pavan Chebbi Link: https://patch.msgid.link/20260504083611.1383776-3-pavan.chebbi@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 3db951d0c6907a..008c34cff7b46c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3825,7 +3825,10 @@ static int bnxt_alloc_tpa_info(struct bnxt *bp) if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) { if (!bp->max_tpa_v2) return 0; - bp->max_tpa = max_t(u16, bp->max_tpa_v2, MAX_TPA_P5); + bp->max_tpa = min_t(u16, bp->max_tpa_v2, MAX_TPA_P5); + /* Older P5 FW sets max_tpa_v2 low by mistake except NPAR */ + if (bp->max_tpa <= 32 && BNXT_CHIP_P5(bp) && !BNXT_NPAR(bp)) + bp->max_tpa = MAX_TPA_P5; } for (i = 0; i < bp->rx_nr_rings; i++) { From 16517bc98a56004274472cc9949194cb4d2ad0b7 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Mon, 4 May 2026 14:06:10 +0530 Subject: [PATCH 386/972] bnxt_en: Check return value of bnxt_hwrm_vnic_cfg When the bnxt RDMA driver is loaded, it calls bnxt_register_dev(). As part of this, driver sends HWRM_VNIC_CFG firmware command to configure the VNIC to operate in dual VNIC mode. Currently the driver ignores the result of this firmware command. The RDMA driver must know the result since it affects its functioning. Check return value of call to bnxt_hwrm_vnic_cfg() in bnxt_register_dev() and return failure on error. Fixes: a588e4580a7e ("bnxt_en: Add interface to support RDMA driver.") Reviewed-by: Michael Chan Signed-off-by: Kalesh AP Signed-off-by: Pavan Chebbi Link: https://patch.msgid.link/20260504083611.1383776-4-pavan.chebbi@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index 052bf69cfa4cdd..5c751933da6a9d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -175,8 +175,14 @@ int bnxt_register_dev(struct bnxt_en_dev *edev, ulp->handle = handle; rcu_assign_pointer(ulp->ulp_ops, ulp_ops); - if (test_bit(BNXT_STATE_OPEN, &bp->state)) - bnxt_hwrm_vnic_cfg(bp, &bp->vnic_info[BNXT_VNIC_DEFAULT]); + if (test_bit(BNXT_STATE_OPEN, &bp->state)) { + rc = bnxt_hwrm_vnic_cfg(bp, &bp->vnic_info[BNXT_VNIC_DEFAULT]); + if (rc) { + netdev_err(dev, "Failed to configure dual VNIC mode\n"); + RCU_INIT_POINTER(ulp->ulp_ops, NULL); + goto exit; + } + } edev->ulp_tbl->msix_requested = bnxt_get_ulp_msix_num(bp); From bd279e104e5f5400307d56116a36756b35ab345a Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Mon, 4 May 2026 14:06:11 +0530 Subject: [PATCH 387/972] bnxt_en: Use absolute target ns from ptp_clock_request There is no need to calculate the target PHC cycles required to make phase adjustment on the PPS OUT signal. This is because the application supplies absolute n_sec value in the future and is already the actual desired target value. Remove the unnecessary code. Fixes: 9e518f25802c ("bnxt_en: 1PPS functions to configure TSIO pins") Reviewed-by: Kalesh AP Cc: Richard Cochran Signed-off-by: Pavan Chebbi Reviewed-by: Vadim Fedorenko Tested-by: Vadim Fedorenko Link: https://patch.msgid.link/20260504083611.1383776-5-pavan.chebbi@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c | 29 ++++--------------- 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c index 53f336db4fcc06..5d41dc1bc78209 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c @@ -419,31 +419,13 @@ void bnxt_ptp_reapply_pps(struct bnxt *bp) } } -static int bnxt_get_target_cycles(struct bnxt_ptp_cfg *ptp, u64 target_ns, - u64 *cycles_delta) -{ - u64 cycles_now; - u64 nsec_now, nsec_delta; - int rc; - - rc = bnxt_refclk_read(ptp->bp, NULL, &cycles_now); - if (rc) - return rc; - - nsec_now = bnxt_timecounter_cyc2time(ptp, cycles_now); - - nsec_delta = target_ns - nsec_now; - *cycles_delta = div64_u64(nsec_delta << ptp->cc.shift, ptp->cc.mult); - return 0; -} - static int bnxt_ptp_perout_cfg(struct bnxt_ptp_cfg *ptp, struct ptp_clock_request *rq) { struct hwrm_func_ptp_cfg_input *req; struct bnxt *bp = ptp->bp; struct timespec64 ts; - u64 target_ns, delta; + u64 target_ns; u16 enables; int rc; @@ -451,10 +433,6 @@ static int bnxt_ptp_perout_cfg(struct bnxt_ptp_cfg *ptp, ts.tv_nsec = rq->perout.start.nsec; target_ns = timespec64_to_ns(&ts); - rc = bnxt_get_target_cycles(ptp, target_ns, &delta); - if (rc) - return rc; - rc = hwrm_req_init(bp, req, HWRM_FUNC_PTP_CFG); if (rc) return rc; @@ -468,7 +446,10 @@ static int bnxt_ptp_perout_cfg(struct bnxt_ptp_cfg *ptp, req->ptp_freq_adj_dll_phase = 0; req->ptp_freq_adj_ext_period = cpu_to_le32(NSEC_PER_SEC); req->ptp_freq_adj_ext_up = 0; - req->ptp_freq_adj_ext_phase_lower = cpu_to_le32(delta); + req->ptp_freq_adj_ext_phase_lower = + cpu_to_le32(lower_32_bits(target_ns)); + req->ptp_freq_adj_ext_phase_upper = + cpu_to_le32(upper_32_bits(target_ns)); return hwrm_req_send(bp, req); } From f83e07b29246f468bc7c99f98ca1897843fa8167 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 May 2026 16:38:42 +0000 Subject: [PATCH 388/972] net/sched: sch_fq_codel: annotate data-races from fq_codel_dump_class_stats() fq_codel_dump_class_stats() acquires qdisc spinlock only when requested to follow flow->head chain. As we did in sch_cake recently, add the missing READ_ONCE()/WRITE_ONCE() annotations. Fixes: edb09eb17ed8 ("net: sched: do not acquire qdisc spinlock in qdisc/class stats dump") Signed-off-by: Eric Dumazet Reviewed-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260504163842.1162001-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_fq_codel.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 0664b2f2d6f280..24db54684e8a59 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -117,7 +117,7 @@ static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow) { struct sk_buff *skb = flow->head; - flow->head = skb->next; + WRITE_ONCE(flow->head, skb->next); skb_mark_not_on_list(skb); return skb; } @@ -127,7 +127,7 @@ static inline void flow_queue_add(struct fq_codel_flow *flow, struct sk_buff *skb) { if (flow->head == NULL) - flow->head = skb; + WRITE_ONCE(flow->head, skb); else flow->tail->next = skb; flow->tail = skb; @@ -173,8 +173,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets, } while (++i < max_packets && len < threshold); /* Tell codel to increase its signal strength also */ - flow->cvars.count += i; - q->backlogs[idx] -= len; + WRITE_ONCE(flow->cvars.count, flow->cvars.count + i); + WRITE_ONCE(q->backlogs[idx], q->backlogs[idx] - len); q->memory_usage -= mem; sch->qstats.drops += i; sch->qstats.backlog -= len; @@ -204,13 +204,13 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch, codel_set_enqueue_time(skb); flow = &q->flows[idx]; flow_queue_add(flow, skb); - q->backlogs[idx] += qdisc_pkt_len(skb); + WRITE_ONCE(q->backlogs[idx], q->backlogs[idx] + qdisc_pkt_len(skb)); qdisc_qstats_backlog_inc(sch, skb); if (list_empty(&flow->flowchain)) { list_add_tail(&flow->flowchain, &q->new_flows); q->new_flow_count++; - flow->deficit = q->quantum; + WRITE_ONCE(flow->deficit, q->quantum); } get_codel_cb(skb)->mem_usage = skb->truesize; q->memory_usage += get_codel_cb(skb)->mem_usage; @@ -263,7 +263,8 @@ static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx) flow = container_of(vars, struct fq_codel_flow, cvars); if (flow->head) { skb = dequeue_head(flow); - q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb); + WRITE_ONCE(q->backlogs[flow - q->flows], + q->backlogs[flow - q->flows] - qdisc_pkt_len(skb)); q->memory_usage -= get_codel_cb(skb)->mem_usage; sch->q.qlen--; sch->qstats.backlog -= qdisc_pkt_len(skb); @@ -296,7 +297,7 @@ static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) flow = list_first_entry(head, struct fq_codel_flow, flowchain); if (flow->deficit <= 0) { - flow->deficit += q->quantum; + WRITE_ONCE(flow->deficit, flow->deficit + q->quantum); list_move_tail(&flow->flowchain, &q->old_flows); goto begin; } @@ -314,7 +315,7 @@ static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) goto begin; } qdisc_bstats_update(sch, skb); - flow->deficit -= qdisc_pkt_len(skb); + WRITE_ONCE(flow->deficit, flow->deficit - qdisc_pkt_len(skb)); if (q->cstats.drop_count) { qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, @@ -328,7 +329,7 @@ static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) static void fq_codel_flow_purge(struct fq_codel_flow *flow) { rtnl_kfree_skbs(flow->head, flow->tail); - flow->head = NULL; + WRITE_ONCE(flow->head, NULL); } static void fq_codel_reset(struct Qdisc *sch) @@ -656,21 +657,21 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, memset(&xstats, 0, sizeof(xstats)); xstats.type = TCA_FQ_CODEL_XSTATS_CLASS; - xstats.class_stats.deficit = flow->deficit; + xstats.class_stats.deficit = READ_ONCE(flow->deficit); xstats.class_stats.ldelay = - codel_time_to_us(flow->cvars.ldelay); - xstats.class_stats.count = flow->cvars.count; - xstats.class_stats.lastcount = flow->cvars.lastcount; - xstats.class_stats.dropping = flow->cvars.dropping; - if (flow->cvars.dropping) { - codel_tdiff_t delta = flow->cvars.drop_next - + codel_time_to_us(READ_ONCE(flow->cvars.ldelay)); + xstats.class_stats.count = READ_ONCE(flow->cvars.count); + xstats.class_stats.lastcount = READ_ONCE(flow->cvars.lastcount); + xstats.class_stats.dropping = READ_ONCE(flow->cvars.dropping); + if (xstats.class_stats.dropping) { + codel_tdiff_t delta = READ_ONCE(flow->cvars.drop_next) - codel_get_time(); xstats.class_stats.drop_next = (delta >= 0) ? codel_time_to_us(delta) : -codel_time_to_us(-delta); } - if (flow->head) { + if (READ_ONCE(flow->head)) { sch_tree_lock(sch); skb = flow->head; while (skb) { @@ -679,7 +680,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, } sch_tree_unlock(sch); } - qs.backlog = q->backlogs[idx]; + qs.backlog = READ_ONCE(q->backlogs[idx]); qs.drops = 0; } if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) From 54d54f33813d7911555226b4220737177a2ba8d6 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Sat, 14 Mar 2026 18:54:30 +0530 Subject: [PATCH 389/972] powerpc/pseries/htmdump: Free the global buffers in htmdump module exit htmdump modules uses global memory buffers to capture details like capabilities, status of specified HTM, read the trace buffer. These are initialized during module init and hence needs to be freed in module exit. Patch adds freeing of the memory in module exit. The change also includes minor clean up for the variable name. The read call back for the debugfs interface file saves filp->private_data to local variable name which is same as global variable name for the memory buffers. Rename these local variable names. Signed-off-by: Athira Rajeev Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260314132432.25581-1-atrajeev@linux.ibm.com --- arch/powerpc/platforms/pseries/htmdump.c | 31 +++++++++++++----------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/platforms/pseries/htmdump.c b/arch/powerpc/platforms/pseries/htmdump.c index 742ec52c9d4df9..93f0cc2dc7fb66 100644 --- a/arch/powerpc/platforms/pseries/htmdump.c +++ b/arch/powerpc/platforms/pseries/htmdump.c @@ -86,7 +86,7 @@ static ssize_t htm_return_check(long rc) static ssize_t htmdump_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - void *htm_buf = filp->private_data; + void *htm_buf_data = filp->private_data; unsigned long page, read_size, available; loff_t offset; long rc, ret; @@ -100,7 +100,7 @@ static ssize_t htmdump_read(struct file *filp, char __user *ubuf, * - last three values are address, size and offset */ rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, - htmtype, H_HTM_OP_DUMP_DATA, virt_to_phys(htm_buf), + htmtype, H_HTM_OP_DUMP_DATA, virt_to_phys(htm_buf_data), PAGE_SIZE, page); ret = htm_return_check(rc); @@ -112,7 +112,7 @@ static ssize_t htmdump_read(struct file *filp, char __user *ubuf, available = PAGE_SIZE; read_size = min(count, available); *ppos += read_size; - return simple_read_from_buffer(ubuf, count, &offset, htm_buf, available); + return simple_read_from_buffer(ubuf, count, &offset, htm_buf_data, available); } static const struct file_operations htmdump_fops = { @@ -226,7 +226,7 @@ static int htmstart_get(void *data, u64 *val) static ssize_t htmstatus_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - void *htm_status_buf = filp->private_data; + void *htm_status_data = filp->private_data; long rc, ret; u64 *num_entries; u64 to_copy; @@ -238,7 +238,7 @@ static ssize_t htmstatus_read(struct file *filp, char __user *ubuf, * - last three values as addr, size and offset */ rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, - htmtype, H_HTM_OP_STATUS, virt_to_phys(htm_status_buf), + htmtype, H_HTM_OP_STATUS, virt_to_phys(htm_status_data), PAGE_SIZE, 0); ret = htm_return_check(rc); @@ -255,13 +255,13 @@ static ssize_t htmstatus_read(struct file *filp, char __user *ubuf, * So total count to copy is: * 32 bytes (for first 7 fields) + (number of HTM entries * entry size) */ - num_entries = htm_status_buf + 0x10; + num_entries = htm_status_data + 0x10; if (htmtype == 0x2) htmstatus_flag = 0x8; else htmstatus_flag = 0x6; to_copy = 32 + (be64_to_cpu(*num_entries) * htmstatus_flag); - return simple_read_from_buffer(ubuf, count, ppos, htm_status_buf, to_copy); + return simple_read_from_buffer(ubuf, count, ppos, htm_status_data, to_copy); } static const struct file_operations htmstatus_fops = { @@ -273,7 +273,7 @@ static const struct file_operations htmstatus_fops = { static ssize_t htminfo_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - void *htm_info_buf = filp->private_data; + void *htm_info_data = filp->private_data; long rc, ret; u64 *num_entries; u64 to_copy; @@ -284,7 +284,7 @@ static ssize_t htminfo_read(struct file *filp, char __user *ubuf, * - last three values as addr, size and offset */ rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, - htmtype, H_HTM_OP_DUMP_SYSPROC_CONF, virt_to_phys(htm_info_buf), + htmtype, H_HTM_OP_DUMP_SYSPROC_CONF, virt_to_phys(htm_info_data), PAGE_SIZE, 0); ret = htm_return_check(rc); @@ -301,15 +301,15 @@ static ssize_t htminfo_read(struct file *filp, char __user *ubuf, * So total count to copy is: * 32 bytes (for first 5 fields) + (number of HTM entries * entry size) */ - num_entries = htm_info_buf + 0x10; + num_entries = htm_info_data + 0x10; to_copy = 32 + (be64_to_cpu(*num_entries) * 16); - return simple_read_from_buffer(ubuf, count, ppos, htm_info_buf, to_copy); + return simple_read_from_buffer(ubuf, count, ppos, htm_info_data, to_copy); } static ssize_t htmcaps_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - void *htm_caps_buf = filp->private_data; + void *htm_caps_data = filp->private_data; long rc, ret; /* @@ -319,7 +319,7 @@ static ssize_t htmcaps_read(struct file *filp, char __user *ubuf, * and zero */ rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, - htmtype, H_HTM_OP_CAPABILITIES, virt_to_phys(htm_caps_buf), + htmtype, H_HTM_OP_CAPABILITIES, virt_to_phys(htm_caps_data), 0x80, 0); ret = htm_return_check(rc); @@ -328,7 +328,7 @@ static ssize_t htmcaps_read(struct file *filp, char __user *ubuf, return ret; } - return simple_read_from_buffer(ubuf, count, ppos, htm_caps_buf, 0x80); + return simple_read_from_buffer(ubuf, count, ppos, htm_caps_data, 0x80); } static const struct file_operations htminfo_fops = { @@ -482,6 +482,9 @@ static void __exit htmdump_exit(void) { debugfs_remove_recursive(htmdump_debugfs_dir); kfree(htm_buf); + kfree(htm_status_buf); + kfree(htm_info_buf); + kfree(htm_caps_buf); } module_init(htmdump_init); From 4ec6ade73f1626a74d46e61ff247cf2b1b96446b Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Sat, 14 Mar 2026 18:54:31 +0530 Subject: [PATCH 390/972] powerpc/pseries/htmdump: Fix the offset value used in processor configuration dump H_HTM call is invoked using three parameters specifying the address of the buffer, size of the buffer and offset where to read from. offset used was always zero. "offset" is value from output buffer header that points to next entry to dump. zero is the first entry to dump. next entry is read from the output bufferbyte offset 0x8. Update htminfo_read() function to use right offset. Return when offset points to -1 Fixes: dea7384e14e7 ("powerpc/pseries/htmdump: Add htm info support to htmdump module") Signed-off-by: Athira Rajeev Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260314132432.25581-2-atrajeev@linux.ibm.com --- arch/powerpc/platforms/pseries/htmdump.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/htmdump.c b/arch/powerpc/platforms/pseries/htmdump.c index 93f0cc2dc7fb66..34978a794eba1e 100644 --- a/arch/powerpc/platforms/pseries/htmdump.c +++ b/arch/powerpc/platforms/pseries/htmdump.c @@ -277,15 +277,26 @@ static ssize_t htminfo_read(struct file *filp, char __user *ubuf, long rc, ret; u64 *num_entries; u64 to_copy; + loff_t offset = 0; + u64 info_offset = 0; /* * Invoke H_HTM call with: * - operation as htm status (H_HTM_OP_STATUS) * - last three values as addr, size and offset + * "offset" is value from output buffer header + * that points to next entry to dump. 0 is the first + * entry to dump. next entry is read from the output + * bufferbyte offset 0x8. */ + if (*ppos) { + info_offset = *(u64 *)(htm_info_data + 0x8); + if (info_offset == -1) + return 0; + } rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, htmtype, H_HTM_OP_DUMP_SYSPROC_CONF, virt_to_phys(htm_info_data), - PAGE_SIZE, 0); + PAGE_SIZE, be64_to_cpu(info_offset)); ret = htm_return_check(rc); if (ret <= 0) { @@ -303,7 +314,9 @@ static ssize_t htminfo_read(struct file *filp, char __user *ubuf, */ num_entries = htm_info_data + 0x10; to_copy = 32 + (be64_to_cpu(*num_entries) * 16); - return simple_read_from_buffer(ubuf, count, ppos, htm_info_data, to_copy); + + *ppos += to_copy; + return simple_read_from_buffer(ubuf, count, &offset, htm_info_data, to_copy); } static ssize_t htmcaps_read(struct file *filp, char __user *ubuf, From 0031424cbc0a6eafc56a8f9201a4a69d3649981e Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Sat, 14 Mar 2026 18:54:32 +0530 Subject: [PATCH 391/972] powerpc/pseries/htmdump: Fix the offset value used in htm status dump H_HTM call is invoked using three parameters specifying the address of the buffer, size of the buffer and offset where to read from. offset used was always zero. "offset" is value from output buffer header that points to next entry to dump. zero is the first entry to dump. next entry is read from the output bufferbyte offset 0x8. Update htmstatus_read() function to use right offset. Return when offset points to -1 Fixes: 627cf584f4c3 ("powerpc/pseries/htmdump: Add htm status support to htmdump module") Signed-off-by: Athira Rajeev Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260314132432.25581-3-atrajeev@linux.ibm.com --- arch/powerpc/platforms/pseries/htmdump.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/pseries/htmdump.c b/arch/powerpc/platforms/pseries/htmdump.c index 34978a794eba1e..76444c6c5cc122 100644 --- a/arch/powerpc/platforms/pseries/htmdump.c +++ b/arch/powerpc/platforms/pseries/htmdump.c @@ -231,15 +231,26 @@ static ssize_t htmstatus_read(struct file *filp, char __user *ubuf, u64 *num_entries; u64 to_copy; int htmstatus_flag; + loff_t offset = 0; + u64 status_offset = 0; /* * Invoke H_HTM call with: * - operation as htm status (H_HTM_OP_STATUS) - * - last three values as addr, size and offset + * - last three values as addr, size and offset. + * "offset" is value from output buffer header + * that points to next entry to dump. 0 is the first + * entry to dump. next entry is read from the output + * bufferbyte offset 0x8. */ + if (*ppos) { + status_offset = *(u64 *)(htm_status_data + 0x8); + if (status_offset == -1) + return 0; + } rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, htmtype, H_HTM_OP_STATUS, virt_to_phys(htm_status_data), - PAGE_SIZE, 0); + PAGE_SIZE, be64_to_cpu(status_offset)); ret = htm_return_check(rc); if (ret <= 0) { @@ -261,7 +272,9 @@ static ssize_t htmstatus_read(struct file *filp, char __user *ubuf, else htmstatus_flag = 0x6; to_copy = 32 + (be64_to_cpu(*num_entries) * htmstatus_flag); - return simple_read_from_buffer(ubuf, count, ppos, htm_status_data, to_copy); + *ppos += to_copy; + + return simple_read_from_buffer(ubuf, count, &offset, htm_status_data, to_copy); } static const struct file_operations htmstatus_fops = { From 0342545a29bc2edfbe132c68c68b51c92a12444c Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Sat, 14 Mar 2026 18:59:53 +0530 Subject: [PATCH 392/972] powerpc/pseries/htmdump: Add memory configuration dump support to htmdump module H_HTM (Hardware Trace Macro) hypervisor call has capability to capture SystemMemory Configuration. This information helps to understand the address mapping for the partitions in the system. Support dumping system memory configuration from Hardware Trace Macro (HTM) function via debugfs interface. Under debugfs folder "/sys/kernel/debug/powerpc/htmdump", add file "htmsystem_mem". The interface allows only read of this file which will present the content of HTM buffer from the hcall. The 16th offset of HTM buffer has value for the number of entries for array of processors. Use this information to copy data to the debugfs file Signed-off-by: Athira Rajeev Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260314132953.27269-1-atrajeev@linux.ibm.com --- arch/powerpc/platforms/pseries/htmdump.c | 70 ++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/arch/powerpc/platforms/pseries/htmdump.c b/arch/powerpc/platforms/pseries/htmdump.c index 76444c6c5cc122..489a80e8708218 100644 --- a/arch/powerpc/platforms/pseries/htmdump.c +++ b/arch/powerpc/platforms/pseries/htmdump.c @@ -16,6 +16,7 @@ static void *htm_buf; static void *htm_status_buf; static void *htm_info_buf; static void *htm_caps_buf; +static void *htm_mem_buf; static u32 nodeindex; static u32 nodalchipindex; static u32 coreindexonchip; @@ -115,12 +116,72 @@ static ssize_t htmdump_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, count, &offset, htm_buf_data, available); } +static ssize_t htmsystem_mem_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + void *htm_mem_data = filp->private_data; + long rc, ret; + u64 *num_entries; + u64 to_copy = 0; + loff_t offset = 0; + u64 mem_offset = 0; + + /* + * Invoke H_HTM call with: + * - operation as htm status (H_HTM_OP_STATUS) + * - last three values as addr, size and offset. "offset" + * is value from output buffer header that points to next + * entry to dump. 0 is the first entry to dump. next entry + * is read from the output bufferbyte offset 0x8. + * + * When first time hcall is invoked, mem_offset should be + * zero because zero is the first entry. + * In the next hcall, offset of next entry to read from is + * picked from output buffer header itself. So don't fill + * mem_offset for first read. + * + * If there is no further data to read in next iteration, + * offset value from output buffer header will point to -1. + */ + if (*ppos) { + mem_offset = *(u64 *)(htm_mem_data + 0x8); + if (mem_offset == -1) + return 0; + } + rc = htm_hcall_wrapper(htmflags, nodeindex, nodalchipindex, coreindexonchip, + htmtype, H_HTM_OP_DUMP_SYSMEM_CONF, virt_to_phys(htm_mem_data), + PAGE_SIZE, be64_to_cpu(mem_offset)); + ret = htm_return_check(rc); + if (ret <= 0) { + pr_debug("H_HTM hcall returned for op: H_HTM_OP_DUMP_SYSMEM_CONF with hcall returning %ld\n", ret); + return ret; + } + + /* + * HTM system mem buffer, start of buffer + 0x10 gives the + * number of HTM entries in the buffer. + * So total count to copy is: + * 32 bytes (for first 5 fields) + (number of HTM entries * entry size) + */ + num_entries = htm_mem_data + 0x10; + to_copy = 32 + (be64_to_cpu(*num_entries) * 32); + + *ppos += to_copy; + return simple_read_from_buffer(ubuf, count, &offset, htm_mem_data, to_copy); +} + static const struct file_operations htmdump_fops = { .llseek = NULL, .read = htmdump_read, .open = simple_open, }; +static const struct file_operations htmsystem_mem_fops = { + .llseek = NULL, + .read = htmsystem_mem_read, + .open = simple_open, +}; + static int htmconfigure_set(void *data, u64 val) { long rc, ret; @@ -483,9 +544,17 @@ static int htmdump_init_debugfs(void) return -ENOMEM; } + /* Memory to present HTM system memory configuration */ + htm_mem_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!htm_mem_buf) { + pr_err("Failed to allocate htm mem buf\n"); + return -ENOMEM; + } + debugfs_create_file("htmstatus", 0400, htmdump_debugfs_dir, htm_status_buf, &htmstatus_fops); debugfs_create_file("htminfo", 0400, htmdump_debugfs_dir, htm_info_buf, &htminfo_fops); debugfs_create_file("htmcaps", 0400, htmdump_debugfs_dir, htm_caps_buf, &htmcaps_fops); + debugfs_create_file("htmsystem_mem", 0400, htmdump_debugfs_dir, htm_mem_buf, &htmsystem_mem_fops); return 0; } @@ -511,6 +580,7 @@ static void __exit htmdump_exit(void) kfree(htm_status_buf); kfree(htm_info_buf); kfree(htm_caps_buf); + kfree(htm_mem_buf); } module_init(htmdump_init); From 7a4f0846ee6cc8cf44ae0046ed42e3259d1dd45b Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 1 May 2026 09:41:40 +0530 Subject: [PATCH 393/972] pseries/papr-hvpipe: Fix race with interrupt handler While executing ->ioctl handler or ->release handler, if an interrupt fires on the same cpu, then we can enter into a deadlock. This patch fixes both these handlers to take spin_lock_irq{save|restore} versions of the lock to prevent this deadlock. Cc: stable@vger.kernel.org Fixes: 814ef095f12c9 ("powerpc/pseries: Add papr-hvpipe char driver for HVPIPE interfaces") Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/e4ed435c44fc191f2eb23c7907ba6f72f193e6aa.1777606826.git.ritesh.list@gmail.com --- arch/powerpc/platforms/pseries/papr-hvpipe.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c index 14ae480d060a4d..c41d45e1986d14 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.c +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c @@ -444,13 +444,14 @@ static int papr_hvpipe_handle_release(struct inode *inode, struct file *file) { struct hvpipe_source_info *src_info; + unsigned long flags; /* * Hold the lock, remove source from src_list, reset the * hvpipe status and release the lock to prevent any race * with message event IRQ. */ - spin_lock(&hvpipe_src_list_lock); + spin_lock_irqsave(&hvpipe_src_list_lock, flags); src_info = file->private_data; list_del(&src_info->list); file->private_data = NULL; @@ -461,10 +462,10 @@ static int papr_hvpipe_handle_release(struct inode *inode, */ if (src_info->hvpipe_status & HVPIPE_MSG_AVAILABLE) { src_info->hvpipe_status = 0; - spin_unlock(&hvpipe_src_list_lock); + spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); hvpipe_rtas_recv_msg(NULL, 0); } else - spin_unlock(&hvpipe_src_list_lock); + spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); kfree(src_info); return 0; @@ -480,20 +481,21 @@ static const struct file_operations papr_hvpipe_handle_ops = { static int papr_hvpipe_dev_create_handle(u32 srcID) { struct hvpipe_source_info *src_info __free(kfree) = NULL; + unsigned long flags; - spin_lock(&hvpipe_src_list_lock); + spin_lock_irqsave(&hvpipe_src_list_lock, flags); /* * Do not allow more than one process communicates with * each source. */ src_info = hvpipe_find_source(srcID); if (src_info) { - spin_unlock(&hvpipe_src_list_lock); + spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); pr_err("pid(%d) is already using the source(%d)\n", src_info->tsk->pid, srcID); return -EALREADY; } - spin_unlock(&hvpipe_src_list_lock); + spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); src_info = kzalloc_obj(*src_info, GFP_KERNEL_ACCOUNT); if (!src_info) @@ -510,18 +512,18 @@ static int papr_hvpipe_dev_create_handle(u32 srcID) return fdf.err; retain_and_null_ptr(src_info); - spin_lock(&hvpipe_src_list_lock); + spin_lock_irqsave(&hvpipe_src_list_lock, flags); /* * If two processes are executing ioctl() for the same * source ID concurrently, prevent the second process to * acquire FD. */ if (hvpipe_find_source(srcID)) { - spin_unlock(&hvpipe_src_list_lock); + spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); return -EALREADY; } list_add(&src_info->list, &hvpipe_src_list); - spin_unlock(&hvpipe_src_list_lock); + spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); return fd_publish(fdf); } From cefeed44296261173a806bef988b26bc565da4be Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 1 May 2026 09:41:41 +0530 Subject: [PATCH 394/972] pseries/papr-hvpipe: Prevent kernel stack memory leak to userspace The hdr variable is allocated on the stack and only hdr.version and hdr.flags are initialized explicitly. Because the struct papr_hvpipe_hdr contains reserved padding bytes (reserved[3] and reserved2[40]), these could leak the uninitialized bytes to userspace after copy_to_user(). This patch fixes that by initializing the whole struct to 0. Cc: stable@vger.kernel.org Fixes: cebdb522fd3ed ("powerpc/pseries: Receive payload with ibm,receive-hvpipe-msg RTAS") Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/7bfe03b65a282c856ed8182d1871bb973c0b78f2.1777606826.git.ritesh.list@gmail.com --- arch/powerpc/platforms/pseries/papr-hvpipe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c index c41d45e1986d14..3392874ebdf686 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.c +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c @@ -327,7 +327,7 @@ static ssize_t papr_hvpipe_handle_read(struct file *file, { struct hvpipe_source_info *src_info = file->private_data; - struct papr_hvpipe_hdr hdr; + struct papr_hvpipe_hdr hdr = {}; long ret; /* From 1b9f7aafa44f5ce852c00509104d10fd9eb0f402 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 1 May 2026 09:41:42 +0530 Subject: [PATCH 395/972] pseries/papr-hvpipe: Fix null ptr deref in papr_hvpipe_dev_create_handle() commit 6d3789d347a7 ("papr-hvpipe: convert papr_hvpipe_dev_create_handle() to FD_PREPARE()"), changed the create handle to FD_PREPARE(), but it caused kernel null-ptr-deref because after call to retain_and_null_ptr(src_info), src_info is re-used for adding it to the global list. Getting the following kernel panic in papr_hvpipe_dev_create_handle() when trying to add src_info to the list. Kernel attempted to write user page (0) - exploit attempt? (uid: 0) BUG: Kernel NULL pointer dereference on write at 0x00000000 Faulting instruction address: 0xc0000000001b44a0 Oops: Kernel access of bad area, sig: 11 [#1] ... Call Trace: papr_hvpipe_dev_ioctl+0x1f4/0x48c (unreliable) sys_ioctl+0x528/0x1064 system_call_exception+0x128/0x360 system_call_vectored_common+0x15c/0x2ec Now, the error handling with FD_PREPARE's file cleanup and __free(kfree) auto cleanup is getting too convoluted. This is mainly because we need to ensure only 1 user get the srcID handle. To simplify this, we allocate prepare the src_info in the beginning and add it to the global list under a spinlock after checking that no duplicates exist. This simplify the error handling where if the FD_ADD fails, we can simply remove the src_info from the list and consume any pending msg in hvpipe to be cleared, after src_info became visible in the global list. Cc: stable@vger.kernel.org Fixes: 6d3789d347a7 ("papr-hvpipe: convert papr_hvpipe_dev_create_handle() to FD_PREPARE()") Reported-by: Haren Myneni Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/31ad94bc89d44156ee700c5bd006cb47a748e3cb.1777606826.git.ritesh.list@gmail.com --- arch/powerpc/platforms/pseries/papr-hvpipe.c | 57 ++++++++++---------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c index 3392874ebdf686..402781299497a5 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.c +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c @@ -480,23 +480,10 @@ static const struct file_operations papr_hvpipe_handle_ops = { static int papr_hvpipe_dev_create_handle(u32 srcID) { - struct hvpipe_source_info *src_info __free(kfree) = NULL; + struct hvpipe_source_info *src_info; + int fd; unsigned long flags; - spin_lock_irqsave(&hvpipe_src_list_lock, flags); - /* - * Do not allow more than one process communicates with - * each source. - */ - src_info = hvpipe_find_source(srcID); - if (src_info) { - spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); - pr_err("pid(%d) is already using the source(%d)\n", - src_info->tsk->pid, srcID); - return -EALREADY; - } - spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); - src_info = kzalloc_obj(*src_info, GFP_KERNEL_ACCOUNT); if (!src_info) return -ENOMEM; @@ -505,26 +492,42 @@ static int papr_hvpipe_dev_create_handle(u32 srcID) src_info->tsk = current; init_waitqueue_head(&src_info->recv_wqh); - FD_PREPARE(fdf, O_RDONLY | O_CLOEXEC, - anon_inode_getfile("[papr-hvpipe]", &papr_hvpipe_handle_ops, - (void *)src_info, O_RDWR)); - if (fdf.err) - return fdf.err; - - retain_and_null_ptr(src_info); - spin_lock_irqsave(&hvpipe_src_list_lock, flags); /* - * If two processes are executing ioctl() for the same - * source ID concurrently, prevent the second process to - * acquire FD. + * Do not allow more than one process communicates with + * each source. */ + spin_lock_irqsave(&hvpipe_src_list_lock, flags); if (hvpipe_find_source(srcID)) { spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); + pr_err("pid(%d) could not get the source(%d)\n", + src_info->tsk->pid, srcID); + kfree(src_info); return -EALREADY; } list_add(&src_info->list, &hvpipe_src_list); spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); - return fd_publish(fdf); + + fd = FD_ADD(O_RDONLY | O_CLOEXEC, + anon_inode_getfile("[papr-hvpipe]", &papr_hvpipe_handle_ops, + (void *)src_info, O_RDWR)); + if (fd < 0) { + spin_lock_irqsave(&hvpipe_src_list_lock, flags); + list_del(&src_info->list); + spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); + /* + * if we fail to add FD, that means no userspace program is + * polling. In that case if there is a msg pending because the + * interrupt was fired after the src_info was added to the + * global list, then let's consume it here, to unblock the + * hvpipe + */ + if (src_info->hvpipe_status & HVPIPE_MSG_AVAILABLE) + hvpipe_rtas_recv_msg(NULL, 0); + kfree(src_info); + return fd; + } + + return fd; } /* From 713e468cdbc2277db6ce949c32c1acbd83501733 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 1 May 2026 09:41:43 +0530 Subject: [PATCH 396/972] pseries/papr-hvpipe: Fix & simplify error handling in papr_hvpipe_init() Remove such 3 levels of nesting patterns to check success return values from function calls. ret = enable_hvpipe_IRQ() if (!ret) ret = set_hvpipe_sys_param(1) if (!ret) ret = misc_register() Instead just bail out to "out*:" labels, in case of any error. This simplifies the init flow. While at it let's also fix the following error handling logic: We have already enabled interrupt sources and enabled hvpipe to received interrupts, if misc_register() fails, we will destroy the workqueue, but the HMC might send us a msg via hvpipe which will call, queue work on the workqueue which might be destroyed. So instead, let's reverse the order of enabling set_hvpipe_sys_param(1) and in case of an error let's remove the misc dev by calling misc_deregister(). Cc: stable@vger.kernel.org Fixes: 39a08a4f94980 ("powerpc/pseries: Enable hvpipe with ibm,set-system-parameter RTAS") Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/f2141eafb80e7780395e03aa9a22e8a37be80513.1777606826.git.ritesh.list@gmail.com --- arch/powerpc/platforms/pseries/papr-hvpipe.c | 28 ++++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c index 402781299497a5..800649f309a573 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.c +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c @@ -780,23 +780,29 @@ static int __init papr_hvpipe_init(void) } ret = enable_hvpipe_IRQ(); - if (!ret) { - ret = set_hvpipe_sys_param(1); - if (!ret) - ret = misc_register(&papr_hvpipe_dev); - } + if (ret) + goto out_wq; - if (!ret) { - pr_info("hvpipe feature is enabled\n"); - hvpipe_feature = true; - return 0; - } + ret = misc_register(&papr_hvpipe_dev); + if (ret) + goto out_wq; - pr_err("hvpipe feature is not enabled %d\n", ret); + ret = set_hvpipe_sys_param(1); + if (ret) + goto out_misc; + + pr_info("hvpipe feature is enabled\n"); + hvpipe_feature = true; + return 0; + +out_misc: + misc_deregister(&papr_hvpipe_dev); +out_wq: destroy_workqueue(papr_hvpipe_wq); out: kfree(papr_hvpipe_work); papr_hvpipe_work = NULL; + pr_err("hvpipe feature is not enabled %d\n", ret); return ret; } machine_device_initcall(pseries, papr_hvpipe_init); From d48654bd8b1a75f662e224d257db54de475120dc Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 1 May 2026 09:41:44 +0530 Subject: [PATCH 397/972] pseries/papr-hvpipe: Fix the usage of copy_to_user() copy_to_user() return bytes_not_copied to the user buffer. If there was an error writing bytes into the user buffer, i.e. if copy_to_user returns a non-zero value, then we should simply return -EFAULT from the ->read() call. Otherwise, in the non-patched version, we may end up mixing "bytes_not_copied + bytes_copied (HVPIPE_HDR_LEN)" as the return value to the user in ->read() call Also let's make sure we clear the hvpipe_status flag, if we have consumed the hvpipe msg by making the rtas call. ret = -EFAULT means copy_to_user has failed but that still means that the msg was read from the hvpipe, hence for both cases, success & -EFAULT, we should clear the HVPIPE_MSG_AVAILABLE flag in hvpipe_status. Cc: stable@vger.kernel.org Fixes: cebdb522fd3edd1 ("powerpc/pseries: Receive payload with ibm,receive-hvpipe-msg RTAS") Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/8fda3212a1ad48879c174e92f67472d9b9f1c3b7.1777606826.git.ritesh.list@gmail.com --- arch/powerpc/platforms/pseries/papr-hvpipe.c | 23 ++++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c index 800649f309a573..c007560d2d8ce9 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.c +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c @@ -206,10 +206,11 @@ static int hvpipe_rtas_recv_msg(char __user *buf, int size) bytes_written, size); bytes_written = size; } - ret = copy_to_user(buf, + if (copy_to_user(buf, rtas_work_area_raw_buf(work_area), - bytes_written); - if (!ret) + bytes_written)) + ret = -EFAULT; + else ret = bytes_written; } } else { @@ -328,7 +329,7 @@ static ssize_t papr_hvpipe_handle_read(struct file *file, struct hvpipe_source_info *src_info = file->private_data; struct papr_hvpipe_hdr hdr = {}; - long ret; + ssize_t ret = 0; /* * Return -ENXIO during migration @@ -376,7 +377,7 @@ static ssize_t papr_hvpipe_handle_read(struct file *file, ret = copy_to_user(buf, &hdr, HVPIPE_HDR_LEN); if (ret) - return ret; + return -EFAULT; /* * Message event has payload, so get the payload with @@ -385,19 +386,23 @@ static ssize_t papr_hvpipe_handle_read(struct file *file, if (hdr.flags & HVPIPE_MSG_AVAILABLE) { ret = hvpipe_rtas_recv_msg(buf + HVPIPE_HDR_LEN, size - HVPIPE_HDR_LEN); - if (ret > 0) { + /* + * Always clear MSG_AVAILABLE once the RTAS call has drained + * the message, regardless of whether copy_to_user succeeded. + */ + if (ret >= 0 || ret == -EFAULT) src_info->hvpipe_status &= ~HVPIPE_MSG_AVAILABLE; - ret += HVPIPE_HDR_LEN; - } } else if (hdr.flags & HVPIPE_LOST_CONNECTION) { /* * Hypervisor is closing the pipe for the specific * source. So notify user space. */ src_info->hvpipe_status &= ~HVPIPE_LOST_CONNECTION; - ret = HVPIPE_HDR_LEN; } + if (ret >= 0) + ret += HVPIPE_HDR_LEN; + return ret; } From 2eeac577480848801b35885b3a8201aa35f46236 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 1 May 2026 09:41:45 +0530 Subject: [PATCH 398/972] pseries/papr-hvpipe: Simplify spin unlock usage in papr_hvpipe_handle_release() Once the src_info is removed from the global list, no one can access it. This simplies the usage of spin_unlock_irqrestore() in papr_hvpipe_handle_release() Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/4a980331557af3d10aada8576aaa16cddc691c65.1777606826.git.ritesh.list@gmail.com --- arch/powerpc/platforms/pseries/papr-hvpipe.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c index c007560d2d8ce9..5aa37f6ad8c998 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.c +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c @@ -460,6 +460,7 @@ static int papr_hvpipe_handle_release(struct inode *inode, src_info = file->private_data; list_del(&src_info->list); file->private_data = NULL; + spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); /* * If the pipe for this specific source has any pending * payload, issue recv HVPIPE RTAS so that pipe will not @@ -467,10 +468,8 @@ static int papr_hvpipe_handle_release(struct inode *inode, */ if (src_info->hvpipe_status & HVPIPE_MSG_AVAILABLE) { src_info->hvpipe_status = 0; - spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); hvpipe_rtas_recv_msg(NULL, 0); - } else - spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); + } kfree(src_info); return 0; From 4e2d83c80495a9327141e8636f25dde13155f14f Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 1 May 2026 09:41:46 +0530 Subject: [PATCH 399/972] pseries/papr-hvpipe: Kill task_struct pointer from struct hvpipe_source_info We don't really use task_struct pointer for anything meaningful. So just kill it for now, and we can bring back later if we need this for any future debug purposes. Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/895e061e45cdc95db36fa7f27aa1922b81eed867.1777606826.git.ritesh.list@gmail.com --- arch/powerpc/platforms/pseries/papr-hvpipe.c | 5 ++--- arch/powerpc/platforms/pseries/papr-hvpipe.h | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c index 5aa37f6ad8c998..46159f1c1cf101 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.c +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c @@ -493,7 +493,6 @@ static int papr_hvpipe_dev_create_handle(u32 srcID) return -ENOMEM; src_info->srcID = srcID; - src_info->tsk = current; init_waitqueue_head(&src_info->recv_wqh); /* @@ -503,8 +502,8 @@ static int papr_hvpipe_dev_create_handle(u32 srcID) spin_lock_irqsave(&hvpipe_src_list_lock, flags); if (hvpipe_find_source(srcID)) { spin_unlock_irqrestore(&hvpipe_src_list_lock, flags); - pr_err("pid(%d) could not get the source(%d)\n", - src_info->tsk->pid, srcID); + pr_err("pid(%s:%d) could not get the source(%d)\n", + current->comm, task_pid_nr(current), srcID); kfree(src_info); return -EALREADY; } diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.h b/arch/powerpc/platforms/pseries/papr-hvpipe.h index c343f4230865c0..4bdf7bb2fc4dbe 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.h +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.h @@ -21,7 +21,6 @@ struct hvpipe_source_info { u32 srcID; u32 hvpipe_status; wait_queue_head_t recv_wqh; /* wake up poll() waitq */ - struct task_struct *tsk; }; /* From fe53d2ae82c06aa2d6402624af01e8f8ddfcd5b3 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 1 May 2026 09:41:47 +0530 Subject: [PATCH 400/972] pseries/papr-hvpipe: Refactor and simplify hvpipe_rtas_recv_msg() Simplify hvpipe_rtas_recv_msg() by removing three levels of nesting... if (!ret) if (buf) if (size < bytes_written) ... this refactoring of the function bails out to "out:" label first, in case of any error. This simplifies the init flow. Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/bbe7ddf8b8e25c9be8fc5e2c4aea9e5fca128bf4.1777606826.git.ritesh.list@gmail.com --- arch/powerpc/platforms/pseries/papr-hvpipe.c | 52 ++++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c index 46159f1c1cf101..3688b2be0445e8 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.c +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c @@ -190,34 +190,34 @@ static int hvpipe_rtas_recv_msg(char __user *buf, int size) return -ENOMEM; } - ret = rtas_ibm_receive_hvpipe_msg(work_area, &srcID, - &bytes_written); - if (!ret) { - /* - * Recv HVPIPE RTAS is successful. - * When releasing FD or no one is waiting on the - * specific source, issue recv HVPIPE RTAS call - * so that pipe is not blocked - this func is called - * with NULL buf. - */ - if (buf) { - if (size < bytes_written) { - pr_err("Received the payload size = %d, but the buffer size = %d\n", - bytes_written, size); - bytes_written = size; - } - if (copy_to_user(buf, - rtas_work_area_raw_buf(work_area), - bytes_written)) - ret = -EFAULT; - else - ret = bytes_written; - } - } else { - pr_err("ibm,receive-hvpipe-msg failed with %d\n", - ret); + /* + * Recv HVPIPE RTAS is successful. + * When releasing FD or no one is waiting on the + * specific source, issue recv HVPIPE RTAS call + * so that pipe is not blocked - this func is called + * with NULL buf. + */ + ret = rtas_ibm_receive_hvpipe_msg(work_area, &srcID, &bytes_written); + if (ret) { + pr_err("ibm,receive-hvpipe-msg failed with %d\n", ret); + goto out; } + if (!buf) + goto out; + + if (size < bytes_written) { + pr_err("Received the payload size = %d, but the buffer size = %d\n", + bytes_written, size); + bytes_written = size; + } + + if (copy_to_user(buf, rtas_work_area_raw_buf(work_area), bytes_written)) + ret = -EFAULT; + else + ret = bytes_written; + +out: rtas_work_area_free(work_area); return ret; } From 629d1a901de57490d29a495273df11e64993ec04 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Fri, 1 May 2026 09:41:48 +0530 Subject: [PATCH 401/972] pseries/papr-hvpipe: Fix style and checkpatch issues in enable_hvpipe_IRQ() While at it let's also fix the similar style issue in enable_hvpipe_IRQ() function. This also fixes a minor checkpatch warning which I got due to an extra space before " ==". Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/1174f60d0ae128e773dbefd11dd8d46d69e7f50e.1777606826.git.ritesh.list@gmail.com --- arch/powerpc/platforms/pseries/papr-hvpipe.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c index 3688b2be0445e8..0c40bdde45e27f 100644 --- a/arch/powerpc/platforms/pseries/papr-hvpipe.c +++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c @@ -693,20 +693,19 @@ static int __init enable_hvpipe_IRQ(void) struct device_node *np; hvpipe_check_exception_token = rtas_function_token(RTAS_FN_CHECK_EXCEPTION); - if (hvpipe_check_exception_token == RTAS_UNKNOWN_SERVICE) + if (hvpipe_check_exception_token == RTAS_UNKNOWN_SERVICE) return -ENODEV; /* hvpipe events */ np = of_find_node_by_path("/event-sources/ibm,hvpipe-msg-events"); - if (np != NULL) { - request_event_sources_irqs(np, hvpipe_event_interrupt, - "HPIPE_EVENT"); - of_node_put(np); - } else { - pr_err("Can not enable hvpipe event IRQ\n"); + if (!np) { + pr_err("No device node found, could not enable hvpipe event IRQ\n"); return -ENODEV; } + request_event_sources_irqs(np, hvpipe_event_interrupt, "HPIPE_EVENT"); + of_node_put(np); + return 0; } From b3a97f9484080c6e71db9e803e3cc1bb372a9bc7 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 7 Apr 2026 18:13:44 +0530 Subject: [PATCH 402/972] powerpc/kdump: fix KASAN sanitization flag for core_$(BITS).o MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KASAN instrumentation is intended to be disabled for the kexec core code, but the existing Makefile entry misses the object suffix. As a result, the flag is not applied correctly to core_$(BITS).o. So when KASAN is enabled, kexec_copy_flush and copy_segments in kexec/core_64.c are instrumented, which can result in accesses to shadow memory via normal address translation paths. Since these run with the MMU disabled, such accesses may trigger page faults (bad_page_fault) that cannot be handled in the kdump path, ultimately causing a hang and preventing the kdump kernel from booting. The same is true for kexec as well, since the same functions are used there. Update the entry to include the “.o” suffix so that KASAN instrumentation is properly disabled for this object file. Fixes: 2ab2d5794f14 ("powerpc/kasan: Disable address sanitization in kexec paths") Reported-by: Venkat Rao Bagalkote Closes: https://lore.kernel.org/all/1dee8891-8bcc-46b4-93f3-fc3a774abd5b@linux.ibm.com/ Cc: stable@vger.kernel.org Reviewed-by: Ritesh Harjani (IBM) Tested-by: Venkat Rao Bagalkote Acked-by: Mahesh Salgaonkar Reviewed-by: Aboorva Devarajan Tested-by: Aboorva Devarajan Signed-off-by: Sourabh Jain Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260407124349.1698552-1-sourabhjain@linux.ibm.com --- arch/powerpc/kexec/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile index 470eb0453e17f3..ec7a0eed75dc59 100644 --- a/arch/powerpc/kexec/Makefile +++ b/arch/powerpc/kexec/Makefile @@ -16,4 +16,4 @@ GCOV_PROFILE_core_$(BITS).o := n KCOV_INSTRUMENT_core_$(BITS).o := n UBSAN_SANITIZE_core_$(BITS).o := n KASAN_SANITIZE_core.o := n -KASAN_SANITIZE_core_$(BITS) := n +KASAN_SANITIZE_core_$(BITS).o := n From 38e989d504fc52900a3786b7144fb53cd67e0389 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 7 Apr 2026 18:13:45 +0530 Subject: [PATCH 403/972] powerpc/vmx: avoid KASAN instrumentation in enter_vmx_ops() for kexec The kexec sequence invokes enter_vmx_ops() via copy_page() with the MMU disabled. In this context, code must not rely on normal virtual address translations or trigger page faults. With KASAN enabled, functions get instrumented and may access shadow memory using regular address translation. When executed with the MMU off, this can lead to page faults (bad_page_fault) from which the kernel cannot recover in the kexec path, resulting in a hang. The kexec path sets preempt_count to HARDIRQ_OFFSET before entering the MMU-off copy sequence. current_thread_info()->preempt_count = HARDIRQ_OFFSET kexec_sequence(..., copy_with_mmu_off = 1) -> kexec_copy_flush(image) copy_segments() -> copy_page(dest, addr) bl enter_vmx_ops() if (in_interrupt()) return 0 beq .Lnonvmx_copy Since kexec sets preempt_count to HARDIRQ_OFFSET, in_interrupt() evaluates to true and enter_vmx_ops() returns early. As in_interrupt() (and preempt_count()) are always inlined, mark enter_vmx_ops() with __no_sanitize_address to avoid KASAN instrumentation and shadow memory access with MMU disabled, helping kexec boot fine with KASAN enabled. Reported-by: Aboorva Devarajan Reviewed-by: Aboorva Devarajan Tested-by: Aboorva Devarajan Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Sourabh Jain Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260407124349.1698552-2-sourabhjain@linux.ibm.com --- arch/powerpc/lib/vmx-helper.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c index 554b248002b4fd..57e897b60db86c 100644 --- a/arch/powerpc/lib/vmx-helper.c +++ b/arch/powerpc/lib/vmx-helper.c @@ -52,7 +52,14 @@ int exit_vmx_usercopy(void) } EXPORT_SYMBOL(exit_vmx_usercopy); -int enter_vmx_ops(void) +/* + * Can be called from kexec copy_page() path with MMU off. The kexec + * code sets preempt_count to HARDIRQ_OFFSET so we return early here. + * Since in_interrupt() is always inline, __no_sanitize_address on this + * function is sufficient to avoid KASAN shadow memory accesses in real + * mode. + */ +int __no_sanitize_address enter_vmx_ops(void) { if (in_interrupt()) return 0; From 0e7c074cfcd9bd93765505f9eb8b42f03ed2a744 Mon Sep 17 00:00:00 2001 From: Pavitra Jha Date: Fri, 1 May 2026 07:07:12 -0400 Subject: [PATCH 404/972] net: wwan: t7xx: validate port_count against message length in t7xx_port_enum_msg_handler t7xx_port_enum_msg_handler() uses the modem-supplied port_count field as a loop bound over port_msg->data[] without checking that the message buffer contains sufficient data. A modem sending port_count=65535 in a 12-byte buffer triggers a slab-out-of-bounds read of up to 262140 bytes. Add a sizeof(*port_msg) check before accessing the port message header fields to guard against undersized messages. Add a struct_size() check after extracting port_count and before the loop. In t7xx_parse_host_rt_data(), guard the rt_feature header read with a remaining-buffer check before accessing data_len, validate feat_data_len against the actual remaining buffer to prevent OOB reads and signed integer overflow on offset. Pass msg_len from both call sites: skb->len at the DPMAIF path after skb_pull(), and the validated feat_data_len at the handshake path. Fixes: da45d2566a1d ("net: wwan: t7xx: Add control port") Cc: stable@vger.kernel.org Signed-off-by: Pavitra Jha Link: https://patch.msgid.link/20260501110713.145563-1-jhapavitra98@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/wwan/t7xx/t7xx_modem_ops.c | 20 +++++++++++++++++--- drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c | 18 ++++++++++++++++-- drivers/net/wwan/t7xx/t7xx_port_proxy.h | 2 +- 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/drivers/net/wwan/t7xx/t7xx_modem_ops.c b/drivers/net/wwan/t7xx/t7xx_modem_ops.c index 7968e208dd37c1..adb29d30c63fe7 100644 --- a/drivers/net/wwan/t7xx/t7xx_modem_ops.c +++ b/drivers/net/wwan/t7xx/t7xx_modem_ops.c @@ -457,8 +457,20 @@ static int t7xx_parse_host_rt_data(struct t7xx_fsm_ctl *ctl, struct t7xx_sys_inf offset = sizeof(struct feature_query); for (i = 0; i < FEATURE_COUNT && offset < data_length; i++) { + size_t remaining = data_length - offset; + size_t feat_data_len, feat_total; + + if (remaining < sizeof(*rt_feature)) + break; + rt_feature = data + offset; - offset += sizeof(*rt_feature) + le32_to_cpu(rt_feature->data_len); + feat_data_len = le32_to_cpu(rt_feature->data_len); + + if (feat_data_len > remaining - sizeof(*rt_feature)) + break; + + feat_total = sizeof(*rt_feature) + feat_data_len; + offset += feat_total; ft_spt_cfg = FIELD_GET(FEATURE_MSK, core->feature_set[i]); if (ft_spt_cfg != MTK_FEATURE_MUST_BE_SUPPORTED) @@ -468,8 +480,10 @@ static int t7xx_parse_host_rt_data(struct t7xx_fsm_ctl *ctl, struct t7xx_sys_inf if (ft_spt_st != MTK_FEATURE_MUST_BE_SUPPORTED) return -EINVAL; - if (i == RT_ID_MD_PORT_ENUM || i == RT_ID_AP_PORT_ENUM) - t7xx_port_enum_msg_handler(ctl->md, rt_feature->data); + if (i == RT_ID_MD_PORT_ENUM || i == RT_ID_AP_PORT_ENUM) { + t7xx_port_enum_msg_handler(ctl->md, rt_feature->data, + feat_data_len); + } } return 0; diff --git a/drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c b/drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c index ae632ef966983e..f869e4ed9ee9a9 100644 --- a/drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c +++ b/drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c @@ -117,6 +117,7 @@ static int fsm_ee_message_handler(struct t7xx_port *port, struct t7xx_fsm_ctl *c * t7xx_port_enum_msg_handler() - Parse the port enumeration message to create/remove nodes. * @md: Modem context. * @msg: Message. + * @msg_len: Length of @msg in bytes. * * Used to control create/remove device node. * @@ -124,12 +125,18 @@ static int fsm_ee_message_handler(struct t7xx_port *port, struct t7xx_fsm_ctl *c * * 0 - Success. * * -EFAULT - Message check failure. */ -int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg) +int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg, size_t msg_len) { struct device *dev = &md->t7xx_dev->pdev->dev; unsigned int version, port_count, i; struct port_msg *port_msg = msg; + if (msg_len < sizeof(*port_msg)) { + dev_err(dev, "Port enum msg too short for header: need %zu, have %zu\n", + sizeof(*port_msg), msg_len); + return -EINVAL; + } + version = FIELD_GET(PORT_MSG_VERSION, le32_to_cpu(port_msg->info)); if (version != PORT_ENUM_VER || le32_to_cpu(port_msg->head_pattern) != PORT_ENUM_HEAD_PATTERN || @@ -141,6 +148,13 @@ int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg) } port_count = FIELD_GET(PORT_MSG_PRT_CNT, le32_to_cpu(port_msg->info)); + + if (msg_len < struct_size(port_msg, data, port_count)) { + dev_err(dev, "Port enum msg too short: need %zu, have %zu\n", + struct_size(port_msg, data, port_count), msg_len); + return -EINVAL; + } + for (i = 0; i < port_count; i++) { u32 port_info = le32_to_cpu(port_msg->data[i]); unsigned int ch_id; @@ -191,7 +205,7 @@ static int control_msg_handler(struct t7xx_port *port, struct sk_buff *skb) case CTL_ID_PORT_ENUM: skb_pull(skb, sizeof(*ctrl_msg_h)); - ret = t7xx_port_enum_msg_handler(ctl->md, (struct port_msg *)skb->data); + ret = t7xx_port_enum_msg_handler(ctl->md, (struct port_msg *)skb->data, skb->len); if (!ret) ret = port_ctl_send_msg_to_md(port, CTL_ID_PORT_ENUM, 0); else diff --git a/drivers/net/wwan/t7xx/t7xx_port_proxy.h b/drivers/net/wwan/t7xx/t7xx_port_proxy.h index f0918b36e899bd..7c3190bf0fcf39 100644 --- a/drivers/net/wwan/t7xx/t7xx_port_proxy.h +++ b/drivers/net/wwan/t7xx/t7xx_port_proxy.h @@ -103,7 +103,7 @@ void t7xx_port_proxy_reset(struct port_proxy *port_prox); void t7xx_port_proxy_uninit(struct port_proxy *port_prox); int t7xx_port_proxy_init(struct t7xx_modem *md); void t7xx_port_proxy_md_status_notify(struct port_proxy *port_prox, unsigned int state); -int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg); +int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg, size_t msg_len); int t7xx_port_proxy_chl_enable_disable(struct port_proxy *port_prox, unsigned int ch_id, bool en_flag); void t7xx_port_proxy_set_cfg(struct t7xx_modem *md, enum port_cfg_id cfg_id); From da107152c43915211e1fc0319b9526f4241abca2 Mon Sep 17 00:00:00 2001 From: "Christophe Leroy (CS GROUP)" Date: Tue, 21 Apr 2026 08:26:08 +0200 Subject: [PATCH 405/972] powerpc/8xx: Fix interrupt mask in cpm1_gpiochip_add16() Allthough fsl,cpm1-gpio-irq-mask always contains a 16 bits value, it is a standard u32 OF property as documented in Documentation/devicetree/bindings/soc/fsl/cpm_qe/gpio.txt The driver erroneously uses of_property_read_u16() leading to a mask which is always 0. Fix it by using of_property_read_u32() instead. Fixes: 726bd223105c ("powerpc/8xx: Adding support of IRQ in MPC8xx GPIO") Signed-off-by: Christophe Leroy (CS GROUP) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/bb0b6d6c4543238c38d5d29a776d0674a8c0c180.1776752750.git.chleroy@kernel.org --- arch/powerpc/platforms/8xx/cpm1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/8xx/cpm1.c b/arch/powerpc/platforms/8xx/cpm1.c index 7433be7d66ee96..f00734f0590cf7 100644 --- a/arch/powerpc/platforms/8xx/cpm1.c +++ b/arch/powerpc/platforms/8xx/cpm1.c @@ -477,7 +477,7 @@ int cpm1_gpiochip_add16(struct device *dev) struct device_node *np = dev->of_node; struct cpm1_gpio16_chip *cpm1_gc; struct gpio_chip *gc; - u16 mask; + u32 mask; cpm1_gc = devm_kzalloc(dev, sizeof(*cpm1_gc), GFP_KERNEL); if (!cpm1_gc) @@ -485,7 +485,7 @@ int cpm1_gpiochip_add16(struct device *dev) spin_lock_init(&cpm1_gc->lock); - if (!of_property_read_u16(np, "fsl,cpm1-gpio-irq-mask", &mask)) { + if (!of_property_read_u32(np, "fsl,cpm1-gpio-irq-mask", &mask)) { int i, j; for (i = 0, j = 0; i < 16; i++) From 131717e656b379addb95af2dcb2d90c723bae24b Mon Sep 17 00:00:00 2001 From: Shivani Nittor Date: Tue, 21 Apr 2026 20:36:28 +0530 Subject: [PATCH 406/972] powerpc/perf: Update check for PERF_SAMPLE_DATA_SRC marked events The core-book3s PMU sampling code validates the SIER TYPE field when PERF_SAMPLE_DATA_SRC is requested. The SIER TYPE field indicates the instruction type and is only valid for random sampling (marked events). To handle cases observed where SIER TYPE could be zero even for marked events,validation was added to drop such samples and increment event->lost_samples. However, this validation was applied to all samples, including continuous sampling. In continuous sampling mode, the PMU does not set the SIER TYPE field, so it remains zero. As a result, valid continuous samples were incorrectly treated as invalid and dropped. Fixed this by gating the SIER TYPE validation with mark_event, so the check runs only for marked (random) events. Continuous samples now skip this check and are recorded normally in the final data recording path. Fixes: 2ffb26afa642 ("arch/powerpc/perf: Check the instruction type before creating sample with perf_mem_data_src") Signed-off-by: Shivani Nittor Reviewed-by: Mukesh Kumar Chaurasiya (IBM) Reviewed-by: Athira Rajeev [Maddy: Fixed reviewed-by tag] Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260421150628.96500-1-shivani@linux.ibm.com --- arch/powerpc/perf/core-book3s.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 8b0081441f85d1..2e6adf5b95c404 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2242,6 +2242,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, const u64 last_period = event->hw.last_period; s64 prev, delta, left; int record = 0; + int mark_event = regs->dsisr & MMCRA_SAMPLE_ENABLE; if (event->hw.state & PERF_HES_STOPPED) { write_pmc(event->hw.idx, 0); @@ -2304,9 +2305,9 @@ static void record_and_restart(struct perf_event *event, unsigned long val, * In ISA v3.0 and before values "0" and "7" are considered reserved. * In ISA v3.1, value "7" has been used to indicate "larx/stcx". * Drop the sample if "type" has reserved values for this field with a - * ISA version check. + * ISA version check for marked events. */ - if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC && + if (mark_event && event->attr.sample_type & PERF_SAMPLE_DATA_SRC && ppmu->get_mem_data_src) { val = (regs->dar & SIER_TYPE_MASK) >> SIER_TYPE_SHIFT; if (val == 0 || (val == 7 && !cpu_has_feature(CPU_FTR_ARCH_31))) { From ae9582cd0b9ccc4a121af300df68fd27f72e9822 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Mon, 4 May 2026 21:10:58 +0300 Subject: [PATCH 407/972] net/mlx5e: psp: Fix invalid access on PSP dev registration fail priv->psp->psp is initialized with the PSP device as returned by psp_dev_create(). This could also return an error, in which case a future psp_dev_unregister() will result in unpleasantness. Avoid that by using a local variable and only saving the PSP device when registration succeeds. In case psp_dev_create() fails, priv->psp and steering structs are left in place, but they will be inert. The unchecked access of priv->psp in mlx5e_psp_offload_handle_rx_skb() won't happen because without a PSP device, there can be no SAs added and therefore no packets will be successfully decrypted and be handed off to the SW handler. Fixes: 89ee2d92f66c ("net/mlx5e: Support PSP offload functionality") Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260504181100.269334-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/en_accel/psp.c | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/psp.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/psp.c index 6a50b6dec0fa82..1ff818fb48df2c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/psp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/psp.c @@ -1070,29 +1070,37 @@ static struct psp_dev_ops mlx5_psp_ops = { void mlx5e_psp_unregister(struct mlx5e_priv *priv) { - if (!priv->psp || !priv->psp->psp) + struct mlx5e_psp *psp = priv->psp; + + if (!psp || !psp->psp) return; - psp_dev_unregister(priv->psp->psp); + psp_dev_unregister(psp->psp); + psp->psp = NULL; } void mlx5e_psp_register(struct mlx5e_priv *priv) { + struct mlx5e_psp *psp = priv->psp; + struct psp_dev *psd; + /* FW Caps missing */ if (!priv->psp) return; - priv->psp->caps.assoc_drv_spc = sizeof(u32); - priv->psp->caps.versions = 1 << PSP_VERSION_HDR0_AES_GCM_128; + psp->caps.assoc_drv_spc = sizeof(u32); + psp->caps.versions = 1 << PSP_VERSION_HDR0_AES_GCM_128; if (MLX5_CAP_PSP(priv->mdev, psp_crypto_esp_aes_gcm_256_encrypt) && MLX5_CAP_PSP(priv->mdev, psp_crypto_esp_aes_gcm_256_decrypt)) - priv->psp->caps.versions |= 1 << PSP_VERSION_HDR0_AES_GCM_256; + psp->caps.versions |= 1 << PSP_VERSION_HDR0_AES_GCM_256; - priv->psp->psp = psp_dev_create(priv->netdev, &mlx5_psp_ops, - &priv->psp->caps, NULL); - if (IS_ERR(priv->psp->psp)) + psd = psp_dev_create(priv->netdev, &mlx5_psp_ops, &psp->caps, NULL); + if (IS_ERR(psd)) { mlx5_core_err(priv->mdev, "PSP failed to register due to %pe\n", - priv->psp->psp); + psd); + return; + } + psp->psp = psd; } int mlx5e_psp_init(struct mlx5e_priv *priv) From 50690733db59fbb3de9fa811b606af324eeb4e37 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Mon, 4 May 2026 21:10:59 +0300 Subject: [PATCH 408/972] net/mlx5e: psp: Expose only a fully initialized priv->psp Currently, during PSP init, priv->psp is initialized to an incompletely built psp struct. Additionally, on fs init failure priv->psp is reset to NULL. Change this so that only a fully initialized priv->psp is set, which makes the code easier to reason about in failure scenarios. Fixes: af2196f49480 ("net/mlx5e: Implement PSP operations .assoc_add and .assoc_del") Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260504181100.269334-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/psp.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/psp.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/psp.c index 1ff818fb48df2c..d9adb993e64d95 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/psp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/psp.c @@ -1139,22 +1139,18 @@ int mlx5e_psp_init(struct mlx5e_priv *priv) if (!psp) return -ENOMEM; - priv->psp = psp; fs = mlx5e_accel_psp_fs_init(priv); if (IS_ERR(fs)) { err = PTR_ERR(fs); - goto out_err; + kfree(psp); + return err; } psp->fs = fs; + priv->psp = psp; mlx5_core_dbg(priv->mdev, "PSP attached to netdevice\n"); return 0; - -out_err: - priv->psp = NULL; - kfree(psp); - return err; } void mlx5e_psp_cleanup(struct mlx5e_priv *priv) From c4a5c46199b5addf0157934da3aa89c33eb02a6d Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Mon, 4 May 2026 21:11:00 +0300 Subject: [PATCH 409/972] net/mlx5e: psp: Hook PSP dev reg/unreg to profile enable/disable devlink reload while PSP connections are active does: mlx5_unload_one_devl_locked() -> mlx5_detach_device() -> _mlx5e_suspend() -> mlx5e_detach_netdev() -> profile->cleanup_rx -> profile->cleanup_tx -> mlx5e_destroy_mdev_resources() -> mlx5_core_dealloc_pd() fails: ... mlx5_core 0000:08:00.0: mlx5_cmd_out_err:821:(pid 19722): DEALLOC_PD(0x801) op_mod(0x0) failed, status bad resource state(0x9), syndrome (0xef0c8a), err(-22) ... The reason for failure is the existence of TX keys, which are removed by the PSP dev unregistration happening in: profile->cleanup() -> mlx5e_psp_unregister() -> mlx5e_psp_cleanup() -> psp_dev_unregister() ...but this isn't invoked in the devlink reload flow, only when changing the NIC profile (e.g. when transitioning to switchdev mode) or on dev teardown. Move PSP device registration into mlx5e_nic_enable(), and unregistration into the corresponding mlx5e_nic_disable(). These functions are called during netdev attach/detach after RX & TX are set up. This ensures that the keys will be gone by the time the PD is destroyed. Fixes: 89ee2d92f66c ("net/mlx5e: Support PSP offload functionality") Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260504181100.269334-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 5a46870c4b749e..8e9443caa93338 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -6023,7 +6023,6 @@ static int mlx5e_nic_init(struct mlx5_core_dev *mdev, if (take_rtnl) rtnl_lock(); - mlx5e_psp_register(priv); /* update XDP supported features */ mlx5e_set_xdp_feature(priv); @@ -6036,7 +6035,6 @@ static int mlx5e_nic_init(struct mlx5_core_dev *mdev, static void mlx5e_nic_cleanup(struct mlx5e_priv *priv) { mlx5e_health_destroy_reporters(priv); - mlx5e_psp_unregister(priv); mlx5e_ktls_cleanup(priv); mlx5e_psp_cleanup(priv); mlx5e_fs_cleanup(priv->fs); @@ -6160,6 +6158,7 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv) mlx5e_fs_init_l2_addr(priv->fs, netdev); mlx5e_ipsec_init(priv); + mlx5e_psp_register(priv); err = mlx5e_macsec_init(priv); if (err) @@ -6230,6 +6229,7 @@ static void mlx5e_nic_disable(struct mlx5e_priv *priv) mlx5_lag_remove_netdev(mdev, priv->netdev); mlx5_vxlan_reset_to_default(mdev->vxlan); mlx5e_macsec_cleanup(priv); + mlx5e_psp_unregister(priv); mlx5e_ipsec_cleanup(priv); } From 4052b932041614675fa2dbf48f725557c23ebf05 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Wed, 1 Apr 2026 10:30:03 +0200 Subject: [PATCH 410/972] arch/powerpc: Drop CONFIG_FIRMWARE_EDID from defconfig files CONFIG_FIRMWARE_EDID=y depends on X86 or EFI_GENERIC_STUB. Neither is true here, so drop the lines from the defconfig files. Signed-off-by: Thomas Zimmermann Reviewed-by: Christophe Leroy (CS GROUP) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260401083023.214426-1-tzimmermann@suse.de --- arch/powerpc/configs/amigaone_defconfig | 1 - arch/powerpc/configs/chrp32_defconfig | 1 - arch/powerpc/configs/g5_defconfig | 1 - arch/powerpc/configs/pasemi_defconfig | 1 - arch/powerpc/configs/powernv_defconfig | 1 - arch/powerpc/configs/ppc64_defconfig | 1 - arch/powerpc/configs/ppc64e_defconfig | 1 - arch/powerpc/configs/skiroot_defconfig | 1 - 8 files changed, 8 deletions(-) diff --git a/arch/powerpc/configs/amigaone_defconfig b/arch/powerpc/configs/amigaone_defconfig index 69ef3dc31c4b65..7a515390646b8c 100644 --- a/arch/powerpc/configs/amigaone_defconfig +++ b/arch/powerpc/configs/amigaone_defconfig @@ -76,7 +76,6 @@ CONFIG_SERIAL_8250_CONSOLE=y # CONFIG_HW_RANDOM is not set # CONFIG_HWMON is not set CONFIG_FB=y -CONFIG_FIRMWARE_EDID=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_RADEON=y CONFIG_FB_3DFX=y diff --git a/arch/powerpc/configs/chrp32_defconfig b/arch/powerpc/configs/chrp32_defconfig index b799c95480ae6d..66eae5b7e16c6f 100644 --- a/arch/powerpc/configs/chrp32_defconfig +++ b/arch/powerpc/configs/chrp32_defconfig @@ -76,7 +76,6 @@ CONFIG_SERIAL_8250_CONSOLE=y CONFIG_NVRAM=y # CONFIG_HWMON is not set CONFIG_FB=y -CONFIG_FIRMWARE_EDID=y CONFIG_FB_OF=y CONFIG_FB_MATROX=y CONFIG_FB_MATROX_MILLENIUM=y diff --git a/arch/powerpc/configs/g5_defconfig b/arch/powerpc/configs/g5_defconfig index 04bbb37f5978aa..e9996711b362d7 100644 --- a/arch/powerpc/configs/g5_defconfig +++ b/arch/powerpc/configs/g5_defconfig @@ -121,7 +121,6 @@ CONFIG_I2C_CHARDEV=y CONFIG_AGP=m CONFIG_AGP_UNINORTH=m CONFIG_FB=y -CONFIG_FIRMWARE_EDID=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_OF=y CONFIG_FB_NVIDIA=y diff --git a/arch/powerpc/configs/pasemi_defconfig b/arch/powerpc/configs/pasemi_defconfig index 8bbf51b38480ec..89bcbeb05067af 100644 --- a/arch/powerpc/configs/pasemi_defconfig +++ b/arch/powerpc/configs/pasemi_defconfig @@ -98,7 +98,6 @@ CONFIG_SENSORS_LM85=y CONFIG_SENSORS_LM90=y CONFIG_DRM=y CONFIG_DRM_RADEON=y -CONFIG_FIRMWARE_EDID=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_VGA16=y CONFIG_FB_NVIDIA=y diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index cc980242023758..5d32c2767a6558 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -196,7 +196,6 @@ CONFIG_I2C_CHARDEV=y # CONFIG_PTP_1588_CLOCK is not set CONFIG_DRM=y CONFIG_DRM_AST=y -CONFIG_FIRMWARE_EDID=y CONFIG_FB_OF=y CONFIG_FB_MATROX=m CONFIG_FB_MATROX_MILLENIUM=y diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 3bf518e3a573a3..6316ca4df25d39 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -249,7 +249,6 @@ CONFIG_I2C_CHARDEV=y CONFIG_I2C_AMD8111=y CONFIG_I2C_PASEMI=y CONFIG_FB=y -CONFIG_FIRMWARE_EDID=y CONFIG_FB_OF=y CONFIG_FB_MATROX=y CONFIG_FB_MATROX_MILLENIUM=y diff --git a/arch/powerpc/configs/ppc64e_defconfig b/arch/powerpc/configs/ppc64e_defconfig index 0fd49f67331ff3..20cc17dce94d7b 100644 --- a/arch/powerpc/configs/ppc64e_defconfig +++ b/arch/powerpc/configs/ppc64e_defconfig @@ -118,7 +118,6 @@ CONFIG_SERIAL_8250_CONSOLE=y CONFIG_I2C_CHARDEV=y CONFIG_I2C_AMD8111=y CONFIG_FB=y -CONFIG_FIRMWARE_EDID=y CONFIG_FB_OF=y CONFIG_FB_MATROX=y CONFIG_FB_MATROX_MILLENIUM=y diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index ff1bed4b6d2cb1..005536ee75bb94 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -214,7 +214,6 @@ CONFIG_SENSORS_IBMPOWERNV=m CONFIG_DRM=m CONFIG_DRM_AST=m CONFIG_FB=y -CONFIG_FIRMWARE_EDID=y # CONFIG_VGA_CONSOLE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y From 3abcedfdfd3125431ed404fa75724118beac630b Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 4 May 2026 21:02:03 +0300 Subject: [PATCH 411/972] net/mlx5: SD: Serialize init/cleanup mlx5_sd_init() / mlx5_sd_cleanup() may run from multiple PFs in the same Socket-Direct group. This can cause the SD bring-up/tear-down sequence to be executed more than once or interleaved across PFs. Protect SD init/cleanup with mlx5_devcom_comp_lock() and track the SD group state on the primary device. Skip init if the primary is already UP, and skip cleanup unless the primary is UP. The state check on cleanup is needed because sd_register() drops the devcom comp lock between marking the comp ready and assigning primary_dev on each peer. A concurrent cleanup that acquires the lock in this window would observe devcom_is_ready==true while primary_dev is still NULL (causing mlx5_sd_get_primary() to return NULL) or while the FW alias setup performed by mlx5_sd_init()'s body has not yet run (causing sd_cmd_unset_primary() to dereference a NULL tx_ft). Gate the cleanup body on primary_sd->state == MLX5_SD_STATE_UP, which is set only at the very end of mlx5_sd_init() under the same comp lock - so observing UP guarantees primary_dev, secondaries[], tx_ft, and dfs are all populated. Also bail explicitly if mlx5_sd_get_primary() returns NULL, in case state is checked on a peer whose primary_dev hasn't been assigned yet. In addition, move mlx5_devcom_comp_set_ready(false) from sd_unregister() into the cleanup's locked section, including the !primary and state != UP early-exit paths, so the device cannot unregister and free its struct mlx5_sd while devcom is still marked ready. A concurrent init acquiring the devcom lock will now observe devcom is no longer ready and bail out immediately. Fixes: 381978d28317 ("net/mlx5e: Create single netdev per SD group") Signed-off-by: Shay Drory Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260504180206.268568-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/lib/sd.c | 56 +++++++++++++++++-- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index 762c783156b411..ec42685bdece6d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -18,6 +18,7 @@ struct mlx5_sd { u8 host_buses; struct mlx5_devcom_comp_dev *devcom; struct dentry *dfs; + u8 state; bool primary; union { struct { /* primary */ @@ -31,6 +32,11 @@ struct mlx5_sd { }; }; +enum mlx5_sd_state { + MLX5_SD_STATE_DOWN = 0, + MLX5_SD_STATE_UP, +}; + static int mlx5_sd_get_host_buses(struct mlx5_core_dev *dev) { struct mlx5_sd *sd = mlx5_get_sd(dev); @@ -270,9 +276,6 @@ static void sd_unregister(struct mlx5_core_dev *dev) { struct mlx5_sd *sd = mlx5_get_sd(dev); - mlx5_devcom_comp_lock(sd->devcom); - mlx5_devcom_comp_set_ready(sd->devcom, false); - mlx5_devcom_comp_unlock(sd->devcom); mlx5_devcom_unregister_component(sd->devcom); } @@ -426,6 +429,7 @@ int mlx5_sd_init(struct mlx5_core_dev *dev) struct mlx5_core_dev *primary, *pos, *to; struct mlx5_sd *sd = mlx5_get_sd(dev); u8 alias_key[ACCESS_KEY_LEN]; + struct mlx5_sd *primary_sd; int err, i; err = sd_init(dev); @@ -440,10 +444,17 @@ int mlx5_sd_init(struct mlx5_core_dev *dev) if (err) goto err_sd_cleanup; + mlx5_devcom_comp_lock(sd->devcom); if (!mlx5_devcom_comp_is_ready(sd->devcom)) - return 0; + goto out; primary = mlx5_sd_get_primary(dev); + if (!primary) + goto out; + + primary_sd = mlx5_get_sd(primary); + if (primary_sd->state != MLX5_SD_STATE_DOWN) + goto out; for (i = 0; i < ACCESS_KEY_LEN; i++) alias_key[i] = get_random_u8(); @@ -472,6 +483,9 @@ int mlx5_sd_init(struct mlx5_core_dev *dev) sd->group_id, mlx5_devcom_comp_get_size(sd->devcom)); sd_print_group(primary); + primary_sd->state = MLX5_SD_STATE_UP; +out: + mlx5_devcom_comp_unlock(sd->devcom); return 0; err_unset_secondaries: @@ -481,6 +495,15 @@ int mlx5_sd_init(struct mlx5_core_dev *dev) sd_cmd_unset_primary(primary); debugfs_remove_recursive(sd->dfs); err_sd_unregister: + mlx5_sd_for_each_secondary(i, primary, pos) { + struct mlx5_sd *peer_sd = mlx5_get_sd(pos); + + primary_sd->secondaries[i - 1] = NULL; + peer_sd->primary_dev = NULL; + } + primary_sd->primary = false; + mlx5_devcom_comp_set_ready(sd->devcom, false); + mlx5_devcom_comp_unlock(sd->devcom); sd_unregister(dev); err_sd_cleanup: sd_cleanup(dev); @@ -491,22 +514,43 @@ void mlx5_sd_cleanup(struct mlx5_core_dev *dev) { struct mlx5_sd *sd = mlx5_get_sd(dev); struct mlx5_core_dev *primary, *pos; + struct mlx5_sd *primary_sd; int i; if (!sd) return; + mlx5_devcom_comp_lock(sd->devcom); if (!mlx5_devcom_comp_is_ready(sd->devcom)) - goto out; + goto out_unlock; primary = mlx5_sd_get_primary(dev); + if (!primary) + goto out_ready_false; + + primary_sd = mlx5_get_sd(primary); + if (primary_sd->state != MLX5_SD_STATE_UP) + goto out_clear_peers; + mlx5_sd_for_each_secondary(i, primary, pos) sd_cmd_unset_secondary(pos); sd_cmd_unset_primary(primary); debugfs_remove_recursive(sd->dfs); sd_info(primary, "group id %#x, uncombined\n", sd->group_id); -out: + primary_sd->state = MLX5_SD_STATE_DOWN; +out_clear_peers: + mlx5_sd_for_each_secondary(i, primary, pos) { + struct mlx5_sd *peer_sd = mlx5_get_sd(pos); + + primary_sd->secondaries[i - 1] = NULL; + peer_sd->primary_dev = NULL; + } + primary_sd->primary = false; +out_ready_false: + mlx5_devcom_comp_set_ready(sd->devcom, false); +out_unlock: + mlx5_devcom_comp_unlock(sd->devcom); sd_unregister(dev); sd_cleanup(dev); } From 05217e4ffbb229e7218cf318e0033780abadb624 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 4 May 2026 21:02:04 +0300 Subject: [PATCH 412/972] net/mlx5: SD, Keep multi-pf debugfs entries on primary mlx5_sd_init() creates the "multi-pf" debugfs directory under the primary device debugfs root, but stored the dentry in the calling device's sd struct. When sd_cleanup() run on a different PF, this leads to using the wrong sd->dfs for removing entries, which results in memory leak and an error in when re-creating the SD.[1] Fix it by explicitly storing the debugfs dentry in the primary device sd struct and use it for all per-group files. [1] debugfs: 'multi-pf' already exists in '0000:08:00.1' Fixes: 4375130bf527 ("net/mlx5: SD, Add debugfs") Signed-off-by: Shay Drory Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260504180206.268568-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/lib/sd.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index ec42685bdece6d..89b7e4d67303c0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -463,9 +463,13 @@ int mlx5_sd_init(struct mlx5_core_dev *dev) if (err) goto err_sd_unregister; - sd->dfs = debugfs_create_dir("multi-pf", mlx5_debugfs_get_dev_root(primary)); - debugfs_create_x32("group_id", 0400, sd->dfs, &sd->group_id); - debugfs_create_file("primary", 0400, sd->dfs, primary, &dev_fops); + primary_sd->dfs = + debugfs_create_dir("multi-pf", + mlx5_debugfs_get_dev_root(primary)); + debugfs_create_x32("group_id", 0400, primary_sd->dfs, + &primary_sd->group_id); + debugfs_create_file("primary", 0400, primary_sd->dfs, primary, + &dev_fops); mlx5_sd_for_each_secondary(i, primary, pos) { char name[32]; @@ -475,7 +479,8 @@ int mlx5_sd_init(struct mlx5_core_dev *dev) goto err_unset_secondaries; snprintf(name, sizeof(name), "secondary_%d", i - 1); - debugfs_create_file(name, 0400, sd->dfs, pos, &dev_fops); + debugfs_create_file(name, 0400, primary_sd->dfs, pos, + &dev_fops); } @@ -493,7 +498,8 @@ int mlx5_sd_init(struct mlx5_core_dev *dev) mlx5_sd_for_each_secondary_to(i, primary, to, pos) sd_cmd_unset_secondary(pos); sd_cmd_unset_primary(primary); - debugfs_remove_recursive(sd->dfs); + debugfs_remove_recursive(primary_sd->dfs); + primary_sd->dfs = NULL; err_sd_unregister: mlx5_sd_for_each_secondary(i, primary, pos) { struct mlx5_sd *peer_sd = mlx5_get_sd(pos); @@ -535,7 +541,8 @@ void mlx5_sd_cleanup(struct mlx5_core_dev *dev) mlx5_sd_for_each_secondary(i, primary, pos) sd_cmd_unset_secondary(pos); sd_cmd_unset_primary(primary); - debugfs_remove_recursive(sd->dfs); + debugfs_remove_recursive(primary_sd->dfs); + primary_sd->dfs = NULL; sd_info(primary, "group id %#x, uncombined\n", sd->group_id); primary_sd->state = MLX5_SD_STATE_DOWN; From 3564222cfdde83a2d760b80192155a3ada1c9bdd Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 4 May 2026 21:02:05 +0300 Subject: [PATCH 413/972] net/mlx5e: SD, Fix missing cleanup on probe error When _mlx5e_probe() fails, the preceding successful mlx5_sd_init() is not undone. Auxiliary bus probe failure skips binding, so mlx5e_remove() is never called for that adev and the matching mlx5_sd_cleanup() never runs - leaking the per-dev SD struct. Call mlx5_sd_cleanup() on the probe error path to balance mlx5_sd_init(). A similar gap exists on the resume path: mlx5_sd_init() and mlx5_sd_cleanup() are currently bundled with both probe/remove and suspend/resume, even though only the FW alias state actually needs to follow the suspend/resume lifecycle - the sd struct allocation and devcom membership are software state that should track the full bound lifetime. As a result, a failed resume can leave a still-bound device with sd == NULL, which mlx5_sd_get_adev() can't distinguish from a non-SD device. Fixing this requires sd_suspend/resume APIs which will only destroy FW resources and is left for a follow-up series. Fixes: 381978d28317 ("net/mlx5e: Create single netdev per SD group") Signed-off-by: Shay Drory Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260504180206.268568-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 8e9443caa93338..62b70334a13d1c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -6775,8 +6775,8 @@ static int mlx5e_resume(struct auxiliary_device *adev) actual_adev = mlx5_sd_get_adev(mdev, adev, edev->idx); if (actual_adev) - return _mlx5e_resume(actual_adev); - return 0; + err = _mlx5e_resume(actual_adev); + return err; } static int _mlx5e_suspend(struct auxiliary_device *adev, bool pre_netdev_reg) @@ -6912,9 +6912,16 @@ static int mlx5e_probe(struct auxiliary_device *adev, return err; actual_adev = mlx5_sd_get_adev(mdev, adev, edev->idx); - if (actual_adev) - return _mlx5e_probe(actual_adev); + if (actual_adev) { + err = _mlx5e_probe(actual_adev); + if (err) + goto sd_cleanup; + } return 0; + +sd_cleanup: + mlx5_sd_cleanup(mdev); + return err; } static void _mlx5e_remove(struct auxiliary_device *adev) From d466ddda5500b6b8ae060909d2317811f2c32a6a Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 4 May 2026 21:02:06 +0300 Subject: [PATCH 414/972] net/mlx5e: SD, Fix race condition in secondary device probe/remove When utilizing Socket-Direct single netdev functionality the driver resolves the actual auxiliary device using mlx5_sd_get_adev(). However, the current implementation returns the primary ETH auxiliary device without holding the device lock, leading to a potential race condition where the ETH device could be unbound or removed concurrently during probe, suspend, resume, or remove operations.[1] Fix this by introducing mlx5_sd_put_adev() and updating mlx5_sd_get_adev() so that secondaries devices would get a ref and acquire the device lock of the returned auxiliary device. After the lock is acquired, a second devcom check is needed[2]. In addition, update The callers to pair the get operation with the new put operation, ensuring the lock is held while the auxiliary device is being operated on and released afterwards. The "primary" designation is determined once in sd_register(). It's set before devcom is marked ready, and it never changes after that. In Addition, The primary path never locks a secondary: When the primary device invoke mlx5_sd_get_adev(), it sees dev == primary and returns. no additional lock is taken. Therefore lock ordering is always: secondary_lock -> primary_lock. The reverse never happens, so ABBA deadlock is impossible. [1] for example: BUG: kernel NULL pointer dereference, address: 0000000000000370 PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP CPU: 4 UID: 0 PID: 3945 Comm: bash Not tainted 6.19.0-rc3+ #1 NONE Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:mlx5e_dcbnl_dscp_app+0x23/0x100 [mlx5_core] Call Trace: mlx5e_remove+0x82/0x12a [mlx5_core] device_release_driver_internal+0x194/0x1f0 bus_remove_device+0xc6/0x140 device_del+0x159/0x3c0 ? devl_param_driverinit_value_get+0x29/0x80 mlx5_rescan_drivers_locked+0x92/0x160 [mlx5_core] mlx5_unregister_device+0x34/0x50 [mlx5_core] mlx5_uninit_one+0x43/0xb0 [mlx5_core] remove_one+0x4e/0xc0 [mlx5_core] pci_device_remove+0x39/0xa0 device_release_driver_internal+0x194/0x1f0 unbind_store+0x99/0xa0 kernfs_fop_write_iter+0x12e/0x1e0 vfs_write+0x215/0x3d0 ksys_write+0x5f/0xd0 do_syscall_64+0x55/0xe90 entry_SYSCALL_64_after_hwframe+0x4b/0x53 [2] CPU0 (primary) CPU1 (secondary) ========================================================================== mlx5e_remove() (device_lock held) mlx5e_remove() (2nd device_lock held) mlx5_sd_get_adev() mlx5_devcom_comp_is_ready() => true device_lock(primary) mlx5_sd_get_adev() ==> ret adev _mlx5e_remove() mlx5_sd_cleanup() // mlx5e_remove finished // releasing device_lock //need another check here... mlx5_devcom_comp_is_ready() => false Fixes: 381978d28317 ("net/mlx5e: Create single netdev per SD group") Signed-off-by: Shay Drory Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260504180206.268568-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 11 +++++- .../net/ethernet/mellanox/mlx5/core/lib/sd.c | 39 +++++++++++++++++-- .../net/ethernet/mellanox/mlx5/core/lib/sd.h | 2 + 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 62b70334a13d1c..8f2b3abe009214 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -6774,8 +6774,10 @@ static int mlx5e_resume(struct auxiliary_device *adev) return err; actual_adev = mlx5_sd_get_adev(mdev, adev, edev->idx); - if (actual_adev) + if (actual_adev) { err = _mlx5e_resume(actual_adev); + mlx5_sd_put_adev(actual_adev, adev); + } return err; } @@ -6815,6 +6817,8 @@ static int mlx5e_suspend(struct auxiliary_device *adev, pm_message_t state) err = _mlx5e_suspend(actual_adev, false); mlx5_sd_cleanup(mdev); + if (actual_adev) + mlx5_sd_put_adev(actual_adev, adev); return err; } @@ -6916,11 +6920,14 @@ static int mlx5e_probe(struct auxiliary_device *adev, err = _mlx5e_probe(actual_adev); if (err) goto sd_cleanup; + mlx5_sd_put_adev(actual_adev, adev); } return 0; sd_cleanup: mlx5_sd_cleanup(mdev); + if (actual_adev) + mlx5_sd_put_adev(actual_adev, adev); return err; } @@ -6973,6 +6980,8 @@ static void mlx5e_remove(struct auxiliary_device *adev) _mlx5e_remove(actual_adev); mlx5_sd_cleanup(mdev); + if (actual_adev) + mlx5_sd_put_adev(actual_adev, adev); } static const struct auxiliary_device_id mlx5e_id_table[] = { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index 89b7e4d67303c0..6e199161b0083a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -562,22 +562,55 @@ void mlx5_sd_cleanup(struct mlx5_core_dev *dev) sd_cleanup(dev); } +/* Lock order: + * primary: actual_adev_lock -> SD devcom comp lock + * secondary: SD devcom comp lock -> (drop) -> actual_adev_lock + * The two locks are never held together, so no ABBA. + */ struct auxiliary_device *mlx5_sd_get_adev(struct mlx5_core_dev *dev, struct auxiliary_device *adev, int idx) { struct mlx5_sd *sd = mlx5_get_sd(dev); struct mlx5_core_dev *primary; + struct mlx5_adev *primary_adev; if (!sd) return adev; - if (!mlx5_devcom_comp_is_ready(sd->devcom)) + mlx5_devcom_comp_lock(sd->devcom); + if (!mlx5_devcom_comp_is_ready(sd->devcom)) { + mlx5_devcom_comp_unlock(sd->devcom); return NULL; + } primary = mlx5_sd_get_primary(dev); - if (dev == primary) + if (!primary || dev == primary) { + mlx5_devcom_comp_unlock(sd->devcom); return adev; + } + + primary_adev = primary->priv.adev[idx]; + get_device(&primary_adev->adev.dev); + mlx5_devcom_comp_unlock(sd->devcom); - return &primary->priv.adev[idx]->adev; + device_lock(&primary_adev->adev.dev); + /* Primary may have completed remove between dropping devcom and + * acquiring device_lock; recheck. + */ + if (!mlx5_devcom_comp_is_ready(sd->devcom)) { + device_unlock(&primary_adev->adev.dev); + put_device(&primary_adev->adev.dev); + return NULL; + } + return &primary_adev->adev; +} + +void mlx5_sd_put_adev(struct auxiliary_device *actual_adev, + struct auxiliary_device *adev) +{ + if (actual_adev != adev) { + device_unlock(&actual_adev->dev); + put_device(&actual_adev->dev); + } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h index 137efaf9aabcc0..9bfd5b9756b507 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h @@ -15,6 +15,8 @@ struct mlx5_core_dev *mlx5_sd_ch_ix_get_dev(struct mlx5_core_dev *primary, int c struct auxiliary_device *mlx5_sd_get_adev(struct mlx5_core_dev *dev, struct auxiliary_device *adev, int idx); +void mlx5_sd_put_adev(struct auxiliary_device *actual_adev, + struct auxiliary_device *adev); int mlx5_sd_init(struct mlx5_core_dev *dev); void mlx5_sd_cleanup(struct mlx5_core_dev *dev); From 525cb7ba6661074c1c5cc3772bccc6afab6791ef Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Tue, 5 May 2026 05:34:03 +0000 Subject: [PATCH 415/972] platform/chrome: cros_ec_typec: Init mutex in Thunderbolt registration cros_typec_register_thunderbolt() missed initializing the `adata->lock` mutex. This leads to a NULL dereference when the mutex is later acquired (e.g. in cros_typec_altmode_work()). Initialize the mutex in cros_typec_register_thunderbolt() to fix the issue. Cc: stable@vger.kernel.org Fixes: 3b00be26b16a ("platform/chrome: cros_ec_typec: Thunderbolt support") Reviewed-by: Benson Leung Reviewed-by: Abhishek Pandit-Subedi Link: https://lore.kernel.org/r/20260505053403.3335740-1-tzungbi@kernel.org Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_typec_altmode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/chrome/cros_typec_altmode.c b/drivers/platform/chrome/cros_typec_altmode.c index 557340b53af03b..66c546bf89b532 100644 --- a/drivers/platform/chrome/cros_typec_altmode.c +++ b/drivers/platform/chrome/cros_typec_altmode.c @@ -359,6 +359,7 @@ cros_typec_register_thunderbolt(struct cros_typec_port *port, } INIT_WORK(&adata->work, cros_typec_altmode_work); + mutex_init(&adata->lock); adata->alt = alt; adata->port = port; adata->ap_mode_entry = true; From 60c71369ee356d11ad845ddeb28ceb70ec6cd70e Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 11 Mar 2026 17:39:56 -0700 Subject: [PATCH 416/972] powerpc/vdso: Drop -DCC_USING_PATCHABLE_FUNCTION_ENTRY from 32-bit flags with clang After commit 73cdf24e81e4 ("powerpc64: make clang cross-build friendly"), building 64-bit little endian + CONFIG_COMPAT=y with clang results in many warnings along the lines of: $ cat arch/powerpc/configs/compat.config CONFIG_COMPAT=y $ make -skj"$(nproc)" ARCH=powerpc LLVM=1 ppc64le_defconfig compat.config arch/powerpc/kernel/vdso/ ... In file included from :4: In file included from lib/vdso/gettimeofday.c:6: In file included from include/vdso/datapage.h:15: In file included from include/vdso/cache.h:5: arch/powerpc/include/asm/cache.h:77:8: warning: unknown attribute 'patchable_function_entry' ignored [-Wunknown-attributes] 77 | static inline u32 l1_icache_bytes(void) | ^~~~~~ include/linux/compiler_types.h:235:58: note: expanded from macro 'inline' 235 | #define inline inline __gnu_inline __inline_maybe_unused notrace | ^~~~~~~ include/linux/compiler_types.h:215:34: note: expanded from macro 'notrace' 215 | #define notrace __attribute__((patchable_function_entry(0, 0))) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ... arch/powerpc/Makefile adds -DCC_USING_PATCHABLE_FUNCTION_ENTRY to KBUILD_CPPFLAGS, which is inherited by the 32-bit vDSO. However, the 32-bit little endian target does not support '-fpatchable-function-entry', resulting in the warnings above. Remove -DCC_USING_PATCHABLE_FUNCTION_ENTRY from the 32-bit vDSO flags when building with clang to avoid the warnings. Fixes: 73cdf24e81e4 ("powerpc64: make clang cross-build friendly") Signed-off-by: Nathan Chancellor Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260311-ppc-vdso-drop-cc-using-pfe-define-clang-v1-1-66c790e22650@kernel.org --- arch/powerpc/kernel/vdso/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/kernel/vdso/Makefile b/arch/powerpc/kernel/vdso/Makefile index 8834dfe9d72796..368759f8170841 100644 --- a/arch/powerpc/kernel/vdso/Makefile +++ b/arch/powerpc/kernel/vdso/Makefile @@ -62,6 +62,12 @@ CC32FLAGSREMOVE += -fno-stack-clash-protection # 32-bit one. clang validates the values passed to these arguments during # parsing, even when -fno-stack-protector is passed afterwards. CC32FLAGSREMOVE += -mstack-protector-guard% +# ftrace is disabled for the vdso but arch/powerpc/Makefile adds this define to +# KBUILD_CPPFLAGS, which enables use of the 'patchable_function_entry' +# attribute in the 'inline' define via 'notrace'. This attribute is not +# supported for the powerpcle target, resulting in many instances of +# -Wunknown-attributes. +CC32FLAGSREMOVE += -DCC_USING_PATCHABLE_FUNCTION_ENTRY endif LD32FLAGS := -Wl,-soname=linux-vdso32.so.1 AS32FLAGS := -D__VDSO32__ From 8333e4916040e529bd5b56b82d573aba51e88a14 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 17 Mar 2026 14:08:24 +0100 Subject: [PATCH 417/972] powerpc/ps3: Drop redundant result assignment Return value of ps3_start_probe_thread() is not used, so code can be simplified to fix W=1 clang warnings: arch/powerpc/platforms/ps3/device-init.c:953:6: error: variable 'result' set but not used [-Werror,-Wunused-but-set-variable] Signed-off-by: Krzysztof Kozlowski Reviewed-by: Geert Uytterhoeven Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260317130823.240279-3-krzysztof.kozlowski@oss.qualcomm.com --- arch/powerpc/platforms/ps3/device-init.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c index 12c473768c397d..9109c218a060a2 100644 --- a/arch/powerpc/platforms/ps3/device-init.c +++ b/arch/powerpc/platforms/ps3/device-init.c @@ -950,8 +950,6 @@ static int __init ps3_start_probe_thread(enum ps3_bus_type bus_type) static int __init ps3_register_devices(void) { - int result; - if (!firmware_has_feature(FW_FEATURE_PS3_LV1)) return -ENODEV; @@ -959,7 +957,7 @@ static int __init ps3_register_devices(void) /* ps3_repository_dump_bus_info(); */ - result = ps3_start_probe_thread(PS3_BUS_TYPE_STORAGE); + ps3_start_probe_thread(PS3_BUS_TYPE_STORAGE); ps3_register_vuart_devices(); From f583bd5f64d40e083dde5bb22846c4d93e59d471 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 17 Mar 2026 14:08:25 +0100 Subject: [PATCH 418/972] powerpc/pasemi: Drop redundant res assignment Return value of pas_add_bridge() is not used, so code can be simplified to fix W=1 clang warnings: arch/powerpc/platforms/pasemi/pci.c:275:6: error: variable 'res' set but not used [-Werror,-Wunused-but-set-variable] Signed-off-by: Krzysztof Kozlowski Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260317130823.240279-4-krzysztof.kozlowski@oss.qualcomm.com --- arch/powerpc/platforms/pasemi/pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pasemi/pci.c b/arch/powerpc/platforms/pasemi/pci.c index 60f990a336c470..2df9552746529c 100644 --- a/arch/powerpc/platforms/pasemi/pci.c +++ b/arch/powerpc/platforms/pasemi/pci.c @@ -272,13 +272,12 @@ void __init pas_pci_init(void) { struct device_node *root = of_find_node_by_path("/"); struct device_node *np; - int res; pci_set_flags(PCI_SCAN_ALL_PCIE_DEVS); np = of_find_compatible_node(root, NULL, "pasemi,rootbus"); if (np) { - res = pas_add_bridge(np); + pas_add_bridge(np); of_node_put(np); } of_node_put(root); From d73a9a63f9f7f7c17637731fd28daf3665992d1e Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 2 May 2026 23:07:15 +0300 Subject: [PATCH 419/972] xsk: reject sw-csum UMEM binding to IFF_TX_SKB_NO_LINEAR devices skb_checksum_help() is a common helper that writes the folded 16-bit checksum back via skb->data + csum_start + csum_offset, i.e. it relies on the skb's linear head and fails (with WARN_ONCE and -EINVAL) when skb_headlen() is 0. AF_XDP generic xmit takes two very different paths depending on the netdev. Drivers that advertise IFF_TX_SKB_NO_LINEAR (e.g. virtio_net) skip the "copy payload into a linear head" step on purpose as a performance optimisation: xsk_build_skb_zerocopy() only attaches UMEM pages as frags and never calls skb_put(), so skb_headlen() stays 0 for the whole skb. For these skbs there is simply no linear area for skb_checksum_help() to write the csum into - the sw-csum fallback is structurally inapplicable. The patch tries to catch this and reject the combination with error at setup time. Rejecting at bind() converts this silent per-packet failure into a synchronous, actionable -EOPNOTSUPP at setup time. HW csum and launch_time metadata on IFF_TX_SKB_NO_LINEAR drivers are unaffected because they do not call skb_checksum_help(). Without the patch, every descriptor carrying 'XDP_TX_METADATA | XDP_TXMD_FLAGS_CHECKSUM' produces: 1) a WARN_ONCE "offset (N) >= skb_headlen() (0)" from skb_checksum_help(), 2) sendmsg() returning -EINVAL without consuming the descriptor (invalid_descs is not incremented), 3) a wedged TX ring: __xsk_generic_xmit() does not advance the consumer on non-EOVERFLOW errors, so the next sendmsg() re-reads the same descriptor and re-hits the same WARN until the socket is closed. Closes: https://lore.kernel.org/all/20260419045822.843BFC2BCAF@smtp.kernel.org/#t Acked-by: Stanislav Fomichev Signed-off-by: Jason Xing Acked-by: Stanislav Fomichev Signed-off-by: Jason Xing Reviewed-by: Alexander Lobakin Fixes: 30c3055f9c0d ("xsk: wrap generic metadata handling onto separate function") Link: https://patch.msgid.link/20260502200722.53960-2-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk_buff_pool.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index cd7bc50872f6b5..d981cfdd853578 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -175,6 +175,9 @@ int xp_assign_dev(struct xsk_buff_pool *pool, if (force_zc && force_copy) return -EINVAL; + if (pool->tx_sw_csum && (netdev->priv_flags & IFF_TX_SKB_NO_LINEAR)) + return -EOPNOTSUPP; + if (xsk_get_pool_from_qid(netdev, queue_id)) return -EBUSY; From 0bb7a9caf5c1d6e25ba376ea6b39261ad28550f4 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 2 May 2026 23:07:16 +0300 Subject: [PATCH 420/972] xsk: free the skb when hitting the upper bound MAX_SKB_FRAGS Fix it by explicitly adding kfree_skb() before returning back to its caller. How to reproduce it in virtio_net: 1. the current skb is the first one (which means xs->skb is NULL) and hit the limit MAX_SKB_FRAGS. 2. xsk_build_skb_zerocopy() returns -EOVERFLOW. 3. the caller xsk_build_skb() clears skb by using 'skb = NULL;'. This is why bug can be triggered. 4. there is no chance to free this skb anymore. Note that if in this case the xs->skb is not NULL, xsk_build_skb() will call xsk_drop_skb(xs->skb) to do the right thing. Fixes: cf24f5a5feea ("xsk: add support for AF_XDP multi-buffer on Tx path") Acked-by: Stanislav Fomichev Signed-off-by: Jason Xing Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20260502200722.53960-3-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 887abed2546680..d706b1e0bf6029 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -856,8 +856,11 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, addr = buffer - pool->addrs; for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) { - if (unlikely(i >= MAX_SKB_FRAGS)) + if (unlikely(i >= MAX_SKB_FRAGS)) { + if (!xs->skb) + kfree_skb(skb); return ERR_PTR(-EOVERFLOW); + } page = pool->umem->pgs[addr >> PAGE_SHIFT]; get_page(page); From 8cd3c1c6e7d9a1f0954159ec5f2fdaa7f6a48bd8 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 2 May 2026 23:07:17 +0300 Subject: [PATCH 421/972] xsk: handle NULL dereference of the skb without frags issue When a first descriptor (xs->skb == NULL) triggers -EOVERFLOW in xsk_build_skb_zerocopy() (e.g., MAX_SKB_FRAGS exceeded), the free_err -EOVERFLOW handler unconditionally dereferences xs->skb via xsk_inc_num_desc(xs->skb) and xsk_drop_skb(xs->skb), causing a NULL pointer dereference. Fix this by guarding the existing xsk_inc_num_desc()/xsk_drop_skb() calls with an xs->skb check (for the continuation case), and add an else branch for the first-descriptor case that manually cancels the one reserved CQ slot and increments invalid_descs by one to account for the single invalid descriptor. Fixes: cf24f5a5feea ("xsk: add support for AF_XDP multi-buffer on Tx path") Acked-by: Stanislav Fomichev Signed-off-by: Jason Xing Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20260502200722.53960-4-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index d706b1e0bf6029..06ee260f3afc05 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -976,9 +976,14 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, kfree_skb(skb); if (err == -EOVERFLOW) { - /* Drop the packet */ - xsk_inc_num_desc(xs->skb); - xsk_drop_skb(xs->skb); + if (xs->skb) { + /* Drop the packet */ + xsk_inc_num_desc(xs->skb); + xsk_drop_skb(xs->skb); + } else { + xsk_cq_cancel_locked(xs->pool, 1); + xs->tx->invalid_descs++; + } xskq_cons_release(xs->tx); } else { /* Let application retry */ From 0f3776583d282550dbafe6082a914efcf9094d59 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 2 May 2026 23:07:18 +0300 Subject: [PATCH 422/972] xsk: fix use-after-free of xs->skb in xsk_build_skb() free_err path When xsk_build_skb() processes multi-buffer packets in copy mode, the first descriptor stores data into the skb linear area without adding any frags, so nr_frags stays at 0. The caller then sets xs->skb = skb to accumulate subsequent descriptors. If a continuation descriptor fails (e.g. alloc_page returns NULL with -EAGAIN), we jump to free_err where the condition: if (skb && !skb_shinfo(skb)->nr_frags) kfree_skb(skb); evaluates to true because nr_frags is still 0 (the first descriptor used the linear area, not frags). This frees the skb while xs->skb still points to it, creating a dangling pointer. On the next transmit attempt or socket close, xs->skb is dereferenced, causing a use-after-free or double-free. Fix by using a !xs->skb check to handle first frag situation, ensuring we only free skbs that were freshly allocated in this call (xs->skb is NULL) and never free an in-progress multi-buffer skb that the caller still references. Closes: https://lore.kernel.org/all/20260415082654.21026-4-kerneljasonxing@gmail.com/ Fixes: 6b9c129c2f93 ("xsk: remove @first_frag from xsk_build_skb()") Acked-by: Stanislav Fomichev Signed-off-by: Jason Xing Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20260502200722.53960-5-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 06ee260f3afc05..55378c3855d5d0 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -972,7 +972,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, return skb; free_err: - if (skb && !skb_shinfo(skb)->nr_frags) + if (skb && !xs->skb) kfree_skb(skb); if (err == -EOVERFLOW) { From 3dec153ae484e3b2ddac841156e197ba54c8df94 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 2 May 2026 23:07:19 +0300 Subject: [PATCH 423/972] xsk: prevent CQ desync when freeing half-built skbs in xsk_build_skb() Once xsk_skb_init_misc() has been called on an skb, its destructor is set to xsk_destruct_skb(), which submits the descriptor address(es) to the completion queue and advances the CQ producer. If such an skb is subsequently freed via kfree_skb() along an error path - before the skb has ever been handed to the driver - the destructor still runs and submits a bogus, half-initialized address to the CQ. Postpone the init phase when we believe the allocation of first frag is successfully completed. Before this init, skb can be safely freed by kfree_skb(). Closes: https://lore.kernel.org/all/20260419045822.843BFC2BCAF@smtp.kernel.org/ Fixes: c30d084960cf ("xsk: avoid overwriting skb fields for multi-buffer traffic") Acked-by: Stanislav Fomichev Signed-off-by: Jason Xing Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20260502200722.53960-6-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 55378c3855d5d0..af3c5752bb63db 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -819,8 +819,6 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, return ERR_PTR(err); skb_reserve(skb, hr); - - xsk_skb_init_misc(skb, xs, desc->addr); if (desc->options & XDP_TX_METADATA) { err = xsk_skb_metadata(skb, buffer, desc, pool, hr); if (unlikely(err)) @@ -917,7 +915,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, if (unlikely(err)) goto free_err; - xsk_skb_init_misc(skb, xs, desc->addr); if (desc->options & XDP_TX_METADATA) { err = xsk_skb_metadata(skb, buffer, desc, xs->pool, hr); @@ -967,6 +964,8 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, } } + if (!xs->skb) + xsk_skb_init_misc(skb, xs, desc->addr); xsk_inc_num_desc(skb); return skb; From 8c2cff50afdd2b53c7cc2ca2297301c0ffd3e802 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 2 May 2026 23:07:20 +0300 Subject: [PATCH 424/972] xsk: avoid skb leak in XDP_TX_METADATA case Fix it by explicitly adding kfree_skb() before returning back to its caller. How to reproduce it in virtio_net: 1. the current skb is the first one (which means no frag and xs->skb is NULL) and users enable metadata feature. 2. xsk_skb_metadata() returns a error code. 3. the caller xsk_build_skb() clears skb by using 'skb = NULL;'. 4. there is no chance to free this skb anymore. Closes: https://lore.kernel.org/all/20260415085204.3F87AC19424@smtp.kernel.org/ Fixes: 30c3055f9c0d ("xsk: wrap generic metadata handling onto separate function") Acked-by: Stanislav Fomichev Signed-off-by: Jason Xing Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20260502200722.53960-7-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index af3c5752bb63db..770ba4695a9d53 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -821,8 +821,10 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, skb_reserve(skb, hr); if (desc->options & XDP_TX_METADATA) { err = xsk_skb_metadata(skb, buffer, desc, pool, hr); - if (unlikely(err)) + if (unlikely(err)) { + kfree_skb(skb); return ERR_PTR(err); + } } } else { struct xsk_addrs *xsk_addr; From e0f229025a8e774a695017a376c4a01279c0e66e Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 2 May 2026 23:07:21 +0300 Subject: [PATCH 425/972] xsk: fix xsk_addrs slab leak on multi-buffer error path When xsk_build_skb() / xsk_build_skb_zerocopy() sees the first continuation descriptor, it promotes destructor_arg from an inlined address to a freshly allocated xsk_addrs (num_descs = 1). The counter is bumped to >= 2 only at the very end of a successful build (by calling xsk_inc_num_desc()). If the build fails in between (e.g. alloc_page() returns NULL with -EAGAIN, or the MAX_SKB_FRAGS overflow hits), we jump to free_err, skip calling xsk_inc_num_desc() to increment num_descs and leave the half-built skb attached to xs->skb for the app to retry. The skb now has 1) destructor_arg = a real xsk_addrs pointer, 2) num_descs = 1 If the app never retries and just close()s the socket, xsk_release() calls xsk_drop_skb() -> xsk_consume_skb(), which decides whether to free xsk_addrs by testing num_descs > 1: if (unlikely(num_descs > 1)) kmem_cache_free(xsk_tx_generic_cache, destructor_arg); Because num_descs is exactly 1 the branch is skipped and the xsk_addrs object is leaked to the xsk_tx_generic_cache slab. Fix it by directly testing if destructor_arg is still addr. Or else it is modified and used to store the newly allocated memory from xsk_tx_generic_cache regardless of increment of num_desc, which we need to handle. Closes: https://lore.kernel.org/all/20260419045824.D9E5EC2BCAF@smtp.kernel.org/ Fixes: 0ebc27a4c67d ("xsk: avoid data corruption on cq descriptor number") Acked-by: Stanislav Fomichev Signed-off-by: Jason Xing Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20260502200722.53960-8-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 770ba4695a9d53..079abd4bcb69d7 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -685,7 +685,7 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, spin_lock_irqsave(&pool->cq_prod_lock, flags); idx = xskq_get_prod(pool->cq); - if (unlikely(num_descs > 1)) { + if (unlikely(!xsk_skb_destructor_is_addr(skb))) { xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; for (i = 0; i < num_descs; i++) { @@ -740,7 +740,7 @@ static void xsk_consume_skb(struct sk_buff *skb) u32 num_descs = xsk_get_num_desc(skb); struct xsk_addrs *xsk_addr; - if (unlikely(num_descs > 1)) { + if (unlikely(!xsk_skb_destructor_is_addr(skb))) { xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; kmem_cache_free(xsk_tx_generic_cache, xsk_addr); } From 203cee647f551abc87b992045cd920b117ff990a Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 2 May 2026 23:07:22 +0300 Subject: [PATCH 426/972] xsk: fix u64 descriptor address truncation on 32-bit architectures In copy mode TX, xsk_skb_destructor_set_addr() stores the 64-bit descriptor address into skb_shinfo(skb)->destructor_arg (void *) via a uintptr_t cast: skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL); On 32-bit architectures uintptr_t is 32 bits, so the upper 32 bits of the descriptor address are silently dropped. In XDP_ZEROCOPY unaligned mode the chunk offset is encoded in bits 48-63 of the descriptor address (XSK_UNALIGNED_BUF_OFFSET_SHIFT = 48), meaning the offset is lost entirely. The completion queue then returns a truncated address to userspace, making buffer recycling impossible. Fix this by handling the 32-bit case directly in xsk_skb_destructor_set_addr(): when !CONFIG_64BIT, allocate an xsk_addrs struct (the same path already used for multi-descriptor SKBs) to store the full u64 address. The existing tagged-pointer logic in xsk_skb_destructor_is_addr() stays unchanged: slab pointers returned from kmem_cache_zalloc() are always word-aligned and therefore have bit 0 clear, which correctly identifies them as a struct pointer rather than an inline tagged address on every architecture. Factor the shared kmem_cache_zalloc + destructor_arg assignment into __xsk_addrs_alloc() and add a wrapper xsk_addrs_alloc() that handles the inline-to-list upgrade (is_addr check + get_addr + num_descs = 1). The three former open-coded kmem_cache_zalloc call sites now reduce to a single call each. Propagate the -ENOMEM from xsk_skb_destructor_set_addr() through xsk_skb_init_misc() so the caller can clean up the skb via kfree_skb() before skb->destructor is installed. The overhead is one extra kmem_cache_zalloc per first descriptor on 32-bit only; 64-bit builds are completely unchanged. Closes: https://lore.kernel.org/all/20260419045824.D9E5EC2BCAF@smtp.kernel.org/ Fixes: 0ebc27a4c67d ("xsk: avoid data corruption on cq descriptor number") Signed-off-by: Jason Xing Acked-by: Stanislav Fomichev Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20260502200722.53960-9-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 88 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 32 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 079abd4bcb69d7..5e5786cd9af55a 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -646,9 +646,42 @@ static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb) return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL); } -static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr) +static struct xsk_addrs *__xsk_addrs_alloc(struct sk_buff *skb, u64 addr) { - skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL); + struct xsk_addrs *xsk_addr; + + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); + if (unlikely(!xsk_addr)) + return NULL; + + xsk_addr->addrs[0] = addr; + skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; + return xsk_addr; +} + +static struct xsk_addrs *xsk_addrs_alloc(struct sk_buff *skb) +{ + struct xsk_addrs *xsk_addr; + + if (!xsk_skb_destructor_is_addr(skb)) + return (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + + xsk_addr = __xsk_addrs_alloc(skb, xsk_skb_destructor_get_addr(skb)); + if (likely(xsk_addr)) + xsk_addr->num_descs = 1; + return xsk_addr; +} + +static int xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr) +{ + if (IS_ENABLED(CONFIG_64BIT)) { + skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL); + return 0; + } + + if (unlikely(!__xsk_addrs_alloc(skb, addr))) + return -ENOMEM; + return 0; } static void xsk_inc_num_desc(struct sk_buff *skb) @@ -724,14 +757,20 @@ void xsk_destruct_skb(struct sk_buff *skb) sock_wfree(skb); } -static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs, - u64 addr) +static int xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs, + u64 addr) { + int err; + + err = xsk_skb_destructor_set_addr(skb, addr); + if (unlikely(err)) + return err; + skb->dev = xs->dev; skb->priority = READ_ONCE(xs->sk.sk_priority); skb->mark = READ_ONCE(xs->sk.sk_mark); skb->destructor = xsk_destruct_skb; - xsk_skb_destructor_set_addr(skb, addr); + return 0; } static void xsk_consume_skb(struct sk_buff *skb) @@ -829,18 +868,9 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, } else { struct xsk_addrs *xsk_addr; - if (xsk_skb_destructor_is_addr(skb)) { - xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, - GFP_KERNEL); - if (!xsk_addr) - return ERR_PTR(-ENOMEM); - - xsk_addr->num_descs = 1; - xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb); - skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; - } else { - xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; - } + xsk_addr = xsk_addrs_alloc(skb); + if (!xsk_addr) + return ERR_PTR(-ENOMEM); /* in case of -EOVERFLOW that could happen below, * xsk_consume_skb() will release this node as whole skb @@ -929,19 +959,10 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct page *page; u8 *vaddr; - if (xsk_skb_destructor_is_addr(skb)) { - xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, - GFP_KERNEL); - if (!xsk_addr) { - err = -ENOMEM; - goto free_err; - } - - xsk_addr->num_descs = 1; - xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb); - skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; - } else { - xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + xsk_addr = xsk_addrs_alloc(skb); + if (!xsk_addr) { + err = -ENOMEM; + goto free_err; } if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { @@ -966,8 +987,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, } } - if (!xs->skb) - xsk_skb_init_misc(skb, xs, desc->addr); + if (!xs->skb) { + err = xsk_skb_init_misc(skb, xs, desc->addr); + if (unlikely(err)) + goto free_err; + } xsk_inc_num_desc(skb); return skb; From bd3c45dd01283ada23b0a388c578dcf5600deb8a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 23 Apr 2026 18:53:49 +0200 Subject: [PATCH 427/972] timers/migration: Fix another hotplug activation race The hotplug control CPU is assumed to be active in the hierarchy but that doesn't imply that the root is active. If the current CPU is not the one that activated the current hierarchy, and the CPU performing this duty is still halfway through the tree, the root may still be observed inactive. And this can break the activation of a new root as in the following scenario: 1) Initially, the whole system has 64 CPUs and only CPU 63 is awake. [GRP1:0] active / | \ / | \ [GRP0:0] [...] [GRP0:7] idle idle active / | \ | CPU 0 CPU 1 ... CPU 63 idle idle active 2) CPU 63 goes idle _but_ due to a #VMEXIT it hasn't yet reached the [GRP1:0]->parent dereference (that would be NULL and stop the walk) in __walk_groups_from(). [GRP1:0] idle / | \ / | \ [GRP0:0] [...] [GRP0:7] idle idle idle / | \ | CPU 0 CPU 1 ... CPU 63 idle idle idle 3) CPU 1 wakes up, activates GRP0:0 but didn't yet manage to propagate up to GRP1:0 due to yet another #VMEXIT. [GRP1:0] idle / | \ / | \ [GRP0:0] [...] [GRP0:7] active idle idle / | \ | CPU 0 CPU 1 ... CPU 63 idle active idle 3) CPU 0 wakes up and doesn't need to walk above GRP0:0 as it's CPU 1 role. [GRP1:0] idle / | \ / | \ [GRP0:0] [...] [GRP0:7] active idle idle / | \ | CPU 0 CPU 1 ... CPU 63 active active idle 4) CPU 0 boots CPU 64. It creates a new root for it. [GRP2:0] idle / \ / \ [GRP1:0] [GRP1:1] idle idle / | \ \ / | \ \ [GRP0:0] [...] [GRP0:7] [GRP0:8] active idle idle idle / | \ | | CPU 0 CPU 1 ... CPU 63 CPU 64 active active idle offline 5) CPU 0 activates the new root, but note that GRP1:0 is still idle, waiting for CPU 1 to resume from #VMEXIT and activate it. [GRP2:0] active / \ / \ [GRP1:0] [GRP1:1] idle idle / | \ \ / | \ \ [GRP0:0] [...] [GRP0:7] [GRP0:8] active idle idle idle / | \ | | CPU 0 CPU 1 ... CPU 63 CPU 64 active active idle offline 6) CPU 63 resumes after #VMEXIT and sees the new GRP1:0 parent. Therefore it propagates the stale inactive state of GRP1:0 up to GRP2:0. [GRP2:0] idle / \ / \ [GRP1:0] [GRP1:1] idle idle / | \ \ / | \ \ [GRP0:0] [...] [GRP0:7] [GRP0:8] active idle idle idle / | \ | | CPU 0 CPU 1 ... CPU 63 CPU 64 active active idle offline 7) CPU 1 resumes after #VMEXIT and finally activates GRP1:0. But it doesn't observe its parent link because no ordering enforced that. Therefore GRP2:0 is spuriously left idle. [GRP2:0] idle / \ / \ [GRP1:0] [GRP1:1] active idle / | \ \ / | \ \ [GRP0:0] [...] [GRP0:7] [GRP0:8] active idle idle idle / | \ | | CPU 0 CPU 1 ... CPU 63 CPU 64 active active idle offline Such races are highly theoretical and the problem would solve itself once the old root ever becomes idle again. But it still leaves a taste of discomfort. Fix it with enforcing a fully ordered atomic read of the old root state before propagating the activate state up to the new root. It has a two directions ordering effect: * Acquire + release of the latest old root state: If the hotplug control CPU is not the one that woke up the old root, make sure to acquire its active state and propagate it upwards through the ordered chain of activation (the acquire pairs with the cmpxchg() in tmigr_active_up() and subsequent releases will pair with atomic_read_acquire() and smp_mb__after_atomic() in tmigr_inactive_up()). * Release: If the hotplug control CPU is not the one that must wake up the old root, but the CPU covering that is lagging behind its duty, publish the links from the old root to the new parents. This way the lagging CPU will propagate the active state itself. Fixes: 7ee988770326 ("timers: Implement the hierarchical pull model") Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260423165354.95152-2-frederic@kernel.org --- kernel/time/timer_migration.c | 40 +++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 155eeaea411337..1d0d3a4058d597 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -1860,19 +1860,37 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node, * child to the new parents. So tmigr_active_up() activates the * new parents while walking up from the old root to the new. * - * * It is ensured that @start is active, as this setup path is - * executed in hotplug prepare callback. This is executed by an - * already connected and !idle CPU. Even if all other CPUs go idle, - * the CPU executing the setup will be responsible up to current top - * level group. And the next time it goes inactive, it will release - * the new childmask and parent to subsequent walkers through this - * @child. Therefore propagate active state unconditionally. + * * It is ensured that @start is active, (or on the way to be activated + * by another CPU that woke up before the current one) as this setup path + * is executed in hotplug prepare callback. This is executed by an already + * connected and !idle CPU in the hierarchy. + * + * * The below RmW atomic operation ensures that: + * + * 1) If the old root has been completely activated, the latest state is + * acquired (the below implicit acquire pairs with the implicit release + * from cmpxchg() in tmigr_active_up()). + * + * 2) If the old root is still on the way to be activated, the lagging behind + * CPU performing the activation will acquire the links up to the new root. + * (The below implicit release pairs with the implicit acquire from cmpxchg() + * in tmigr_active_up()). + * + * 3) Every subsequent CPU below the old root will acquire the new links while + * walking through the old root (The below implicit release pairs with the + * implicit acquire from cmpxchg() in either tmigr_active_up()) or + * tmigr_inactive_up(). */ - state.state = atomic_read(&start->migr_state); - WARN_ON_ONCE(!state.active); + state.state = atomic_fetch_or(0, &start->migr_state); WARN_ON_ONCE(!start->parent); - data.childmask = start->groupmask; - __walk_groups_from(tmigr_active_up, &data, start, start->parent); + /* + * If the state of the old root is inactive, another CPU is on its way to activate + * it and propagate to the new root. + */ + if (state.active) { + data.childmask = start->groupmask; + __walk_groups_from(tmigr_active_up, &data, start, start->parent); + } } /* Root update */ From 92429ca999db99febced82f23362a71b2ba4c1d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Wed, 6 May 2026 00:15:48 -0300 Subject: [PATCH 428/972] ALSA: seq: Fix UMP group 16 filtering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sequencer UAPI defines group_filter as an unsigned int bitmap. Bit 0 filters groupless messages and bits 1-16 filter UMP groups 1-16. The internal snd_seq_client storage is only unsigned short, so bit 16 is truncated when userspace sets the filter. The same truncation affects the automatic UMP client filter used to avoid delivery to inactive groups, so events for group 16 cannot be filtered. Store the internal bitmap as unsigned int and keep both userspace-provided and automatically generated values limited to the defined UAPI bits. Fixes: d2b706077792 ("ALSA: seq: Add UMP group filter") Cc: stable@vger.kernel.org Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260506-alsa-seq-ump-group16-filter-v1-1-b75160bf6993@gmail.com Signed-off-by: Takashi Iwai --- sound/core/seq/seq_clientmgr.c | 2 +- sound/core/seq/seq_clientmgr.h | 5 ++++- sound/core/seq/seq_ump_client.c | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index 75a7a2af9d8c96..5719637575a911 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -1253,7 +1253,7 @@ static int snd_seq_ioctl_set_client_info(struct snd_seq_client *client, if (client->user_pversion >= SNDRV_PROTOCOL_VERSION(1, 0, 3)) client->midi_version = client_info->midi_version; memcpy(client->event_filter, client_info->event_filter, 32); - client->group_filter = client_info->group_filter; + client->group_filter = client_info->group_filter & SND_SEQ_GROUP_FILTER_MASK; /* notify the change */ snd_seq_system_client_ev_client_change(client->number); diff --git a/sound/core/seq/seq_clientmgr.h b/sound/core/seq/seq_clientmgr.h index ece02c58db702d..feea8bb7d9870a 100644 --- a/sound/core/seq/seq_clientmgr.h +++ b/sound/core/seq/seq_clientmgr.h @@ -14,6 +14,9 @@ /* client manager */ +#define SND_SEQ_GROUP_FILTER_MASK GENMASK(SNDRV_UMP_MAX_GROUPS, 0) +#define SND_SEQ_GROUP_FILTER_GROUPS GENMASK(SNDRV_UMP_MAX_GROUPS, 1) + struct snd_seq_user_client { struct file *file; /* file struct of client */ /* ... */ @@ -40,7 +43,7 @@ struct snd_seq_client { int number; /* client number */ unsigned int filter; /* filter flags */ DECLARE_BITMAP(event_filter, 256); - unsigned short group_filter; + unsigned int group_filter; snd_use_lock_t use_lock; int event_lost; /* ports */ diff --git a/sound/core/seq/seq_ump_client.c b/sound/core/seq/seq_ump_client.c index fdc76f23e03f48..9079ccfdc8666d 100644 --- a/sound/core/seq/seq_ump_client.c +++ b/sound/core/seq/seq_ump_client.c @@ -369,7 +369,7 @@ static void setup_client_group_filter(struct seq_ump_client *client) cptr = snd_seq_kernel_client_get(client->seq_client); if (!cptr) return; - filter = ~(1U << 0); /* always allow groupless messages */ + filter = SND_SEQ_GROUP_FILTER_GROUPS; /* always allow groupless messages */ for (p = 0; p < SNDRV_UMP_MAX_GROUPS; p++) { if (client->ump->groups[p].active) filter &= ~(1U << (p + 1)); From 01801e20d69346e1e6cec0d908f1cea3a49e51b5 Mon Sep 17 00:00:00 2001 From: Rodrigo Faria Date: Tue, 5 May 2026 19:55:18 +0100 Subject: [PATCH 429/972] ALSA: hda/realtek: Add mute LED fixup for HP Pavilion 15-cs1xxx Add a new fixup for the mute LED on the HP Pavilion 15-cs1xxx series using the VREF on NID 0x1b. The BIOS on these models (tested up to F.32) incorrectly reports the mute LED on NID 0x18 via DMI OEM strings, which lacks VREF capabilities. This fixup overrides the LED pin to the correct NID 0x1b. Signed-off-by: Rodrigo Faria Link: https://patch.msgid.link/20260505185518.23625-1-rodrigofilipefaria@gmail.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index a7525ed83e2be3..11d0ea8ed8598a 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -1669,6 +1669,21 @@ static void alc295_fixup_hp_mute_led_coefbit11(struct hda_codec *codec, } } +/* Override wrong pin to NID 0x1b (F.32 BIOS reports 0x18 via DMI OEM string) + * on HP pavilion 15-cs1xxx laptops + */ +static void alc295_fixup_hp_pavilion_mute_led_1b(struct hda_codec *codec, + const struct hda_fixup *fix, + int action) +{ + struct alc_spec *spec = codec->spec; + + alc269_fixup_hp_mute_led(codec, fix, action); + + if (action == HDA_FIXUP_ACT_PRE_PROBE) + spec->mute_led_nid = 0x1b; +} + static void alc233_fixup_lenovo_coef_micmute_led(struct hda_codec *codec, const struct hda_fixup *fix, int action) { @@ -3870,6 +3885,7 @@ enum { ALC290_FIXUP_SUBWOOFER, ALC290_FIXUP_SUBWOOFER_HSJACK, ALC295_FIXUP_HP_MUTE_LED_COEFBIT11, + ALC295_FIXUP_HP_PAVILION_MUTE_LED_1B, ALC269_FIXUP_THINKPAD_ACPI, ALC269_FIXUP_LENOVO_XPAD_ACPI, ALC269_FIXUP_DMIC_THINKPAD_ACPI, @@ -5715,6 +5731,10 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc295_fixup_hp_mute_led_coefbit11, }, + [ALC295_FIXUP_HP_PAVILION_MUTE_LED_1B] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc295_fixup_hp_pavilion_mute_led_1b, + }, [ALC298_FIXUP_SAMSUNG_AMP] = { .type = HDA_FIXUP_FUNC, .v.func = alc298_fixup_samsung_amp, @@ -6931,6 +6951,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8537, "HP ProBook 440 G6", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8548, "HP EliteBook x360 830 G6", ALC285_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x854a, "HP EliteBook 830 G6", ALC285_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x856a, "HP Pavilion 15-cs1xxx", ALC295_FIXUP_HP_PAVILION_MUTE_LED_1B), SND_PCI_QUIRK(0x103c, 0x85c6, "HP Pavilion x360 Convertible 14-dy1xxx", ALC295_FIXUP_HP_MUTE_LED_COEFBIT11), SND_PCI_QUIRK(0x103c, 0x85de, "HP Envy x360 13-ar0xxx", ALC285_FIXUP_HP_ENVY_X360), SND_PCI_QUIRK(0x103c, 0x8603, "HP Omen 17-cb0xxx", ALC285_FIXUP_HP_MUTE_LED), From 5337213381df578058e2e41da93cbd0e4639935f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Wed, 6 May 2026 00:34:47 -0300 Subject: [PATCH 430/972] ALSA: core: Serialize deferred fasync state checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit snd_fasync_helper() updates fasync->on under snd_fasync_lock, and snd_fasync_work_fn() now also evaluates fasync->on under the same lock. snd_kill_fasync() still tests the flag before taking the lock, leaving an unsynchronized read against FASYNC enable/disable updates. Move the enabled-state check into the locked section. Also clear fasync->on under snd_fasync_lock in snd_fasync_free() before unlinking the pending entry. Together with the locked sender-side check, this publishes teardown before flushing the deferred work and prevents a racing sender from requeueing the entry after free has started. Fixes: ef34a0ae7a26 ("ALSA: core: Add async signal helpers") Fixes: 8146cd333d23 ("ALSA: core: Fix potential data race at fasync handling") Cc: stable@vger.kernel.org Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260506-alsa-core-fasync-on-lock-v1-1-ea48c77d6ca4@gmail.com Signed-off-by: Takashi Iwai --- sound/core/misc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sound/core/misc.c b/sound/core/misc.c index 5aca09edf9718a..833124c8e4fa83 100644 --- a/sound/core/misc.c +++ b/sound/core/misc.c @@ -148,9 +148,11 @@ EXPORT_SYMBOL_GPL(snd_fasync_helper); void snd_kill_fasync(struct snd_fasync *fasync, int signal, int poll) { - if (!fasync || !fasync->on) + if (!fasync) return; guard(spinlock_irqsave)(&snd_fasync_lock); + if (!fasync->on) + return; fasync->signal = signal; fasync->poll = poll; list_move(&fasync->list, &snd_fasync_list); @@ -163,8 +165,10 @@ void snd_fasync_free(struct snd_fasync *fasync) if (!fasync) return; - scoped_guard(spinlock_irq, &snd_fasync_lock) + scoped_guard(spinlock_irq, &snd_fasync_lock) { + fasync->on = 0; list_del_init(&fasync->list); + } flush_work(&snd_fasync_work); kfree(fasync); From 2bcbb163162789d3488562073dbb99d9bd71a762 Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Tue, 5 May 2026 20:18:54 -0700 Subject: [PATCH 431/972] ALSA: sparc/dbri: add missing fallthrough Fixes compiler error with probably newer compilers: sound/sparc/dbri.c:595:2: error: unannotated fall-through between switch labels [-Werror,-Wimplicit-fallthrough] 595 | case 1: | ^ sound/sparc/dbri.c:595:2: note: insert 'break;' to avoid fall-through 595 | case 1: | ^ | break; Signed-off-by: Rosen Penev Link: https://patch.msgid.link/20260506031854.780411-1-rosenp@gmail.com Signed-off-by: Takashi Iwai --- sound/sparc/dbri.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/sparc/dbri.c b/sound/sparc/dbri.c index 75f82a92ff44fd..2f5f62079fa4a5 100644 --- a/sound/sparc/dbri.c +++ b/sound/sparc/dbri.c @@ -592,6 +592,7 @@ static __u32 reverse_bytes(__u32 b, int len) fallthrough; case 2: b = ((b & 0xaaaaaaaa) >> 1) | ((b & 0x55555555) << 1); + fallthrough; case 1: case 0: break; From 283fc9e44ff5b5ac967439b4951b80bd4299f4e4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 May 2026 15:15:34 +0200 Subject: [PATCH 432/972] wifi: mac80211: remove station if connection prep fails If connection preparation fails for MLO connections, then the interface is completely reset to non-MLD. In this case, we must not keep the station since it's related to the link of the vif being removed. Delete an existing station. Any "new_sta" is already being removed, so that doesn't need changes. This fixes a use-after-free/double-free in debugfs if that's enabled, because a vif going from MLD (and to MLD, but that's not relevant here) recreates its entire debugfs. Cc: stable@vger.kernel.org Fixes: 81151ce462e5 ("wifi: mac80211: support MLO authentication/association with one link") Reviewed-by: Miriam Rachel Korenblit Link: https://patch.msgid.link/20260505151533.c4e52deb06ad.Iafe56cec7de8512626169496b134bce3a6c17010@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 298ebff6bbf84b..0a0f27836d5706 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -9149,7 +9149,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss *bss = (void *)cbss->priv; struct sta_info *new_sta = NULL; struct ieee80211_link_data *link; - bool have_sta = false; + struct sta_info *have_sta = NULL; bool mlo; int err; u16 new_links; @@ -9168,11 +9168,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, mlo = false; } - if (assoc) { - rcu_read_lock(); + if (assoc) have_sta = sta_info_get(sdata, ap_mld_addr); - rcu_read_unlock(); - } if (mlo && !have_sta && WARN_ON(sdata->vif.valid_links || sdata->vif.active_links)) @@ -9336,6 +9333,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, out_release_chan: ieee80211_link_release_channel(link); out_err: + if (mlo && have_sta) + WARN_ON(__sta_info_destroy(have_sta)); ieee80211_vif_set_links(sdata, 0, 0); return err; } From 0f3c0a197309717d74729568f88957d448847937 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 May 2026 13:38:37 +0200 Subject: [PATCH 433/972] wifi: nl80211: fix NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST usage This is documented as a u8 and has a policy of NLA_U8, but uses nla_get_u32() which means it's completely broken on big-endian. Fix it to use nla_get_u8(). Fixes: 9bb7e0f24e7e ("cfg80211: add peer measurement with FTM initiator API") Link: https://patch.msgid.link/20260505113837.260159-2-johannes@sipsolutions.net Signed-off-by: Johannes Berg --- net/wireless/pmsr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index 4c8ea0583f9403..d6cd0de64d1f84 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -88,7 +88,7 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, out->ftm.ftms_per_burst = 0; if (tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]) out->ftm.ftms_per_burst = - nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]); + nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]); if (capa->ftm.max_ftms_per_burst && (out->ftm.ftms_per_burst > capa->ftm.max_ftms_per_burst || From 15994bb0cbb8fc4879da7552ddd08c1896261c39 Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Wed, 6 May 2026 14:48:53 +0800 Subject: [PATCH 434/972] wifi: nl80211: require CAP_NET_ADMIN over the target netns in SET_WIPHY_NETNS NL80211_CMD_SET_WIPHY_NETNS dispatches with GENL_UNS_ADMIN_PERM, which verifies that the caller has CAP_NET_ADMIN for the source netns. It doesn't verify that the caller has CAP_NET_ADMIN over the target netns selected by NL80211_ATTR_NETNS_FD or NL80211_ATTR_PID. This diverges from the convention enforced in net/core/rtnetlink.c::rtnl_get_net_ns_capable(): /* For now, the caller is required to have CAP_NET_ADMIN in * the user namespace owning the target net ns. */ if (!sk_ns_capable(sk, net->user_ns, CAP_NET_ADMIN)) return ERR_PTR(-EACCES); A user with CAP_NET_ADMIN in their own user namespace can therefore push a wiphy into an arbitrary netns (including init_net) over which they have no privilege. Mirror the rtnetlink convention by requiring CAP_NET_ADMIN in the target netns before calling cfg80211_switch_netns(). Signed-off-by: Maoyi Xie Link: https://patch.msgid.link/20260506064854.2207105-2-maoyixie.tju@gmail.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 67088804dcc796..db546dd93d0845 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13867,6 +13867,19 @@ static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(net)) return PTR_ERR(net); + /* + * The caller already has CAP_NET_ADMIN over the source netns + * (enforced by GENL_UNS_ADMIN_PERM on the genl op). Mirror the + * convention used by net/core/rtnetlink.c::rtnl_get_net_ns_capable() + * and require CAP_NET_ADMIN over the target netns as well, so that + * a caller that is privileged in their own user namespace cannot + * push a wiphy into a netns where they have no privilege. + */ + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { + put_net(net); + return -EPERM; + } + err = 0; /* check if anything to do */ From 79240f3f6d766b342b57c32397d643e1cfa26b81 Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Wed, 6 May 2026 14:48:54 +0800 Subject: [PATCH 435/972] wifi: nl80211: re-check wiphy netns in nl80211_prepare_wdev_dump() continuation NL80211_CMD_GET_SCAN is implemented as a multi-call dumpit. The first invocation of nl80211_prepare_wdev_dump() validates the requested wdev against the caller's netns via __cfg80211_wdev_from_attrs(). Subsequent invocations look up the same wiphy by its global index and do not check that the wiphy is still in the caller's netns. Add the same filter to the continuation path. If the wiphy's netns no longer matches the caller's, return -ENODEV and the netlink dump machinery terminates the walk cleanly. Signed-off-by: Maoyi Xie Link: https://patch.msgid.link/20260506064854.2207105-3-maoyixie.tju@gmail.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index db546dd93d0845..7db9cd4338011a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1276,6 +1276,18 @@ static int nl80211_prepare_wdev_dump(struct netlink_callback *cb, rtnl_unlock(); return -ENODEV; } + + /* + * The first invocation validated the wdev's netns against + * the caller via __cfg80211_wdev_from_attrs(). The wiphy + * may have moved netns between dumpit invocations (via + * NL80211_CMD_SET_WIPHY_NETNS), so re-check here. + */ + if (!net_eq(wiphy_net(wiphy), sock_net(cb->skb->sk))) { + rtnl_unlock(); + return -ENODEV; + } + *rdev = wiphy_to_rdev(wiphy); *wdev = NULL; From 86f33ca9bea30cf011f2b1edad4593faea9c6e98 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 6 May 2026 16:22:38 +0800 Subject: [PATCH 436/972] ublk: validate physical_bs_shift, io_min_shift and io_opt_shift ublk_validate_params() checks logical_bs_shift is within [9, PAGE_SHIFT] but has no upper bound for physical_bs_shift, io_min_shift, or io_opt_shift. A malicious userspace can set any of these to a large value (e.g., 44), causing undefined behavior from `1 << shift` in ublk_ctrl_start_dev() since the result is stored in 32-bit unsigned int. Cap all three at ilog2(SZ_256M) (28). 256M is big enough to cover all practical block sizes, and originates from the maximum physical block size possible in NVMe (lba_size * (1 + npwg), where npwg is 16-bit). Also zero out ub->params with memset() when copy_from_user() fails or ublk_validate_params() returns error, so that no stale or partial params survive for a subsequent START_DEV to consume. Fixes: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver") Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260506082238.22363-1-tom.leiming@gmail.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index d10460d29e4abc..57ec900f0ce0f6 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -900,6 +900,20 @@ static int ublk_validate_params(const struct ublk_device *ub) if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9) return -EINVAL; + /* + * 256M is a reasonable upper bound for physical block size, + * io_min and io_opt; it aligns with the maximum physical + * block size possible in NVMe. + */ + if (p->physical_bs_shift > ilog2(SZ_256M)) + return -EINVAL; + + if (p->io_min_shift > ilog2(SZ_256M)) + return -EINVAL; + + if (p->io_opt_shift > ilog2(SZ_256M)) + return -EINVAL; + if (p->logical_bs_shift > p->physical_bs_shift) return -EINVAL; @@ -4992,13 +5006,15 @@ static int ublk_ctrl_set_params(struct ublk_device *ub, */ ret = -EACCES; } else if (copy_from_user(&ub->params, argp, ph.len)) { + /* zero out partial copy so no stale params survive */ + memset(&ub->params, 0, sizeof(ub->params)); ret = -EFAULT; } else { /* clear all we don't support yet */ ub->params.types &= UBLK_PARAM_TYPE_ALL; ret = ublk_validate_params(ub); if (ret) - ub->params.types = 0; + memset(&ub->params, 0, sizeof(ub->params)); } mutex_unlock(&ub->mutex); From 9cc6bac1bebf8310d2950d1411a91479e86d69a1 Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Mon, 4 May 2026 23:37:54 +0800 Subject: [PATCH 437/972] io_uring/timeout: honour caller's time namespace for IORING_TIMEOUT_ABS io_uring's IORING_OP_TIMEOUT and IORING_OP_LINK_TIMEOUT accept a timespec from the caller via io_parse_user_time(). With IORING_TIMEOUT_ABS, the timestamp is an absolute deadline on the selected clock. The clock is CLOCK_MONOTONIC by default. CLOCK_BOOTTIME and CLOCK_REALTIME are also selectable. A submitter inside a CLONE_NEWTIME time namespace observes CLOCK_MONOTONIC and CLOCK_BOOTTIME shifted by the namespace's offsets relative to the host. Every other ABS timer interface in the kernel converts the caller's absolute time to host view via timens_ktime_to_host() before arming an hrtimer: kernel/time/posix-timers.c -- timer_settime(TIMER_ABSTIME) kernel/time/posix-stubs.c -- clock_nanosleep(TIMER_ABSTIME) kernel/time/alarmtimer.c -- alarm_timer_nsleep(TIMER_ABSTIME) fs/timerfd.c -- timerfd_settime(TFD_TIMER_ABSTIME) io_parse_user_time() does not. As a result, an absolute timeout submitted from within a time namespace is interpreted in host view. That is generally a different point in time. It may already be in the past, causing the timer to fire immediately, or far in the future, causing the timer not to fire when expected. Reproducer: in unshare --user --time, with a -10s monotonic offset, submit IORING_OP_TIMEOUT with IORING_TIMEOUT_ABS and deadline = now + 1s. The CQE is delivered after <1ms instead of the expected ~1s. Apply timens_ktime_to_host() to the parsed time when IORING_TIMEOUT_ABS is set. Split the existing clock id resolver in io_timeout_get_clock() into a flags only helper io_flags_to_clock(), so io_parse_user_time() can resolve the clock without a struct io_timeout_data. timens_ktime_to_host() is a no-op for clocks not affected by time namespaces, e.g. CLOCK_REALTIME. It is also a no-op for callers in the initial time namespace. The fast path is unchanged. SQPOLL is also covered. The SQPOLL kernel thread is created via create_io_thread() with CLONE_THREAD and no CLONE_NEW* flag. copy_namespaces() therefore shares the submitter's nsproxy by reference. Inside the SQPOLL kthread, current->nsproxy->time_ns is the submitter's time_ns. timens_ktime_to_host() resolves correctly. Suggested-by: Pavel Begunkov Suggested-by: Jens Axboe Signed-off-by: Maoyi Xie Link: https://patch.msgid.link/20260504153755.1293932-2-maoyi.xie@ntu.edu.sg Signed-off-by: Jens Axboe --- io_uring/timeout.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 4cfdfc519770ab..e2595cae2b073e 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -35,6 +36,22 @@ struct io_timeout_rem { bool ltimeout; }; +static clockid_t io_flags_to_clock(unsigned flags) +{ + switch (flags & IORING_TIMEOUT_CLOCK_MASK) { + case IORING_TIMEOUT_BOOTTIME: + return CLOCK_BOOTTIME; + case IORING_TIMEOUT_REALTIME: + return CLOCK_REALTIME; + default: + /* can't happen, vetted at prep time */ + WARN_ON_ONCE(1); + fallthrough; + case 0: + return CLOCK_MONOTONIC; + } +} + static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags) { struct timespec64 ts; @@ -43,7 +60,7 @@ static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags) *time = ns_to_ktime(arg); if (*time < 0) return -EINVAL; - return 0; + goto out; } if (get_timespec64(&ts, u64_to_user_ptr(arg))) @@ -51,6 +68,9 @@ static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags) if (ts.tv_sec < 0 || ts.tv_nsec < 0) return -EINVAL; *time = timespec64_to_ktime(ts); +out: + if (flags & IORING_TIMEOUT_ABS) + *time = timens_ktime_to_host(io_flags_to_clock(flags), *time); return 0; } @@ -399,18 +419,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) static clockid_t io_timeout_get_clock(struct io_timeout_data *data) { - switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { - case IORING_TIMEOUT_BOOTTIME: - return CLOCK_BOOTTIME; - case IORING_TIMEOUT_REALTIME: - return CLOCK_REALTIME; - default: - /* can't happen, vetted at prep time */ - WARN_ON_ONCE(1); - fallthrough; - case 0: - return CLOCK_MONOTONIC; - } + return io_flags_to_clock(data->flags); } static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, From 45d2b37a37ab98484693533496395c610a2cab96 Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Mon, 4 May 2026 23:37:55 +0800 Subject: [PATCH 438/972] io_uring/wait: honour caller's time namespace for IORING_ENTER_ABS_TIMER io_uring_enter() with IORING_ENTER_ABS_TIMER takes an absolute timespec from the caller via ext_arg->ts. It arms an ABS mode hrtimer in __io_cqring_wait_schedule(). The conversion path in io_uring/wait.c parses ext_arg->ts inline rather than going through io_parse_user_time(). It therefore does not pick up the time namespace conversion added by the previous patch. Apply timens_ktime_to_host() to the parsed time on the IORING_ENTER_ABS_TIMER branch. This mirrors the IORING_TIMEOUT_ABS fix in io_parse_user_time(). Use ctx->clockid as the clock id. ctx->clockid is set either at ring creation or via IORING_REGISTER_CLOCK. timens_ktime_to_host() is a no-op for clocks not affected by time namespaces. It is also a no-op for callers in the initial time namespace. The fast path is unchanged. Reproducer: in unshare --user --time, with a -10s monotonic offset, call io_uring_enter with min_complete=1, IORING_ENTER_ABS_TIMER, and ts = now + 1s. The call returns -ETIME after <1ms instead of after the expected ~1s. Suggested-by: Pavel Begunkov Suggested-by: Jens Axboe Signed-off-by: Maoyi Xie Link: https://patch.msgid.link/20260504153755.1293932-3-maoyi.xie@ntu.edu.sg Signed-off-by: Jens Axboe --- io_uring/wait.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/io_uring/wait.c b/io_uring/wait.c index 91df86ce0d18c1..ec01e78a216d6c 100644 --- a/io_uring/wait.c +++ b/io_uring/wait.c @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -229,7 +230,10 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, if (ext_arg->ts_set) { iowq.timeout = timespec64_to_ktime(ext_arg->ts); - if (!(flags & IORING_ENTER_ABS_TIMER)) + if (flags & IORING_ENTER_ABS_TIMER) + iowq.timeout = timens_ktime_to_host(ctx->clockid, + iowq.timeout); + else iowq.timeout = ktime_add(iowq.timeout, start_time); } From 5cbb61bf4168859d97c068d88d364f4f1f440325 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 5 May 2026 09:02:13 -0700 Subject: [PATCH 439/972] arm64/fpsimd: ptrace: zero target's fpsimd_state, not the tracer's sve_set_common() is the backend for PTRACE_SETREGSET(NT_ARM_SVE) and PTRACE_SETREGSET(NT_ARM_SSVE). Every write in the function operates on the tracee (target) - except a single memset that uses current instead, zeroing the tracer's saved V0-V31 / FPSR / FPCR shadow on every ptrace SETREGSET call. The memset is meant to give the tracee a defined zero register image before the user-supplied payload is copied in (for partial writes, header-only writes, and FPSIMD<->SVE format switches). Aiming it at current both denies the tracee that clean slate and silently corrupts the tracer. The corruption of the tracer's saved FPSIMD state is not always observable. Where the tracer's state is live on a CPU, this may be reused without loading the corrupted state from memory, and will eventually be written back over the corrupted state. Where the tracer's state is saved in SVE_PT_REGS_SVE format, only the FPSR and FPCR are clobbered, and the effective copy of the vectors is in the task's sve_state. Reproducible on an arm64 kernel with SVE: a single-threaded tracer that loads a known pattern into V0-V31, issues PTRACE_SETREGSET(NT_ARM_SVE) on a child, and reads V0-V31 back observes them all zeroed within tens of thousands of iterations when a sibling thread keeps stealing the FPSIMD CPU binding. Fixes: 316283f276eb ("arm64/fpsimd: ptrace: Consistently handle partial writes to NT_ARM_(S)SVE") Cc: Signed-off-by: Breno Leitao Acked-by: Mark Rutland Signed-off-by: Catalin Marinas --- arch/arm64/kernel/ptrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index ba5eab23fd9008..4d08598e2891d3 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -983,8 +983,8 @@ static int sve_set_common(struct task_struct *target, } /* Always zero V regs, FPSR, and FPCR */ - memset(¤t->thread.uw.fpsimd_state, 0, - sizeof(current->thread.uw.fpsimd_state)); + memset(&target->thread.uw.fpsimd_state, 0, + sizeof(target->thread.uw.fpsimd_state)); /* Registers: FPSIMD-only case */ From 628497e6d925d43efb56e3ffecef0a9d217926b3 Mon Sep 17 00:00:00 2001 From: Fenglin Wu Date: Wed, 6 May 2026 02:28:51 -0700 Subject: [PATCH 440/972] regulator: qcom-rpmh: Fix index for pmh0101 ldo16 The wrong index is assigned to pmh0101 ldo16, which results incorrect rpmh resource being used when the regulator device is voted. Fix it. Fixes: 65efe5404d15 ("regulator: rpmh-regulator: Add RPMH regulator support for Glymur") Signed-off-by: Fenglin Wu Reviewed-by: Konrad Dybcio Link: https://patch.msgid.link/20260506-fix_pmh0101_ldo16_index-v1-1-cdc8708b01f4@oss.qualcomm.com Signed-off-by: Mark Brown --- drivers/regulator/qcom-rpmh-regulator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/qcom-rpmh-regulator.c b/drivers/regulator/qcom-rpmh-regulator.c index 6e4cb2871fca8d..0dcb50bf5c35b1 100644 --- a/drivers/regulator/qcom-rpmh-regulator.c +++ b/drivers/regulator/qcom-rpmh-regulator.c @@ -1512,7 +1512,7 @@ static const struct rpmh_vreg_init_data pmh0101_vreg_data[] = { RPMH_VREG("ldo13", LDO, 13, &pmic5_pldo530_mvp150, "vdd-l2-l13-l14"), RPMH_VREG("ldo14", LDO, 14, &pmic5_pldo530_mvp150, "vdd-l2-l13-l14"), RPMH_VREG("ldo15", LDO, 15, &pmic5_nldo530, "vdd-l15"), - RPMH_VREG("ldo16", LDO, 15, &pmic5_pldo530_mvp600, "vdd-l5-l16"), + RPMH_VREG("ldo16", LDO, 16, &pmic5_pldo530_mvp600, "vdd-l5-l16"), RPMH_VREG("ldo17", LDO, 17, &pmic5_pldo515_mv, "vdd-l17"), RPMH_VREG("ldo18", LDO, 18, &pmic5_nldo530, "vdd-l18"), RPMH_VREG("bob1", BOB, 1, &pmic5_bob, "vdd-bob1"), From 4bacec2317527ba04b7172145848f1c206999ea1 Mon Sep 17 00:00:00 2001 From: Jiawei Liu Date: Wed, 6 May 2026 14:24:12 +0800 Subject: [PATCH 441/972] spi: ch341: correct company name in MODULE_DESCRIPTION The company name "QiHeng Electronics" is incorrect. The correct legal name is "Nanjing Qinheng Microelectronics Co., Ltd.". Update the module description accordingly. Signed-off-by: Jiawei Liu Link: https://patch.msgid.link/20260506062412.371034-1-ljw@wch.cn Signed-off-by: Mark Brown --- drivers/spi/spi-ch341.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-ch341.c b/drivers/spi/spi-ch341.c index 3eaa8f176f63aa..6448a44a8b67e1 100644 --- a/drivers/spi/spi-ch341.c +++ b/drivers/spi/spi-ch341.c @@ -250,5 +250,5 @@ static struct usb_driver ch341a_usb_driver = { module_usb_driver(ch341a_usb_driver); MODULE_AUTHOR("Johannes Thumshirn "); -MODULE_DESCRIPTION("QiHeng Electronics ch341 USB2SPI"); +MODULE_DESCRIPTION("Nanjing Qinheng Microelectronics CH341 USB2SPI driver"); MODULE_LICENSE("GPL v2"); From 0c8c88b8eb82a2a41bec5f17c076d6312dc40316 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Tue, 5 May 2026 15:42:57 -0700 Subject: [PATCH 442/972] ovl: fix verity lazy-load guard broken by fsverity_active() semantic change Commit f77f281b6118 ("fsverity: use a hashtable to find the fsverity_info") made fsverity_active() check whether the inode has the verity flag, rather than whether the inode's fsverity_info is loaded. This broke ovl_ensure_verity_loaded(), which wants to load the fsverity_info for any verity inodes that haven't had it loaded yet. Therefore, to check that the fsverity_info hasn't yet been loaded, use fsverity_get_info(inode) == NULL instead of !fsverity_active(inode). Also, since fsverity_get_info() now involves a hash table lookup, put the more lightweight IS_VERITY() flag check first. Fixes: f77f281b6118 ("fsverity: use a hashtable to find the fsverity_info") Cc: stable@vger.kernel.org Link: https://github.com/bootc-dev/bootc/issues/2174 Signed-off-by: Colin Walters Acked-by: Amir Goldstein Link: https://patch.msgid.link/20260505224257.23213-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/overlayfs/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 7b86a6bac6449a..b41f4788e4f06b 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1354,7 +1354,7 @@ int ovl_ensure_verity_loaded(const struct path *datapath) struct inode *inode = d_inode(datapath->dentry); struct file *filp; - if (!fsverity_active(inode) && IS_VERITY(inode)) { + if (IS_VERITY(inode) && fsverity_get_info(inode) == NULL) { /* * If this inode was not yet opened, the verity info hasn't been * loaded yet, so we need to do that here to force it into memory. From fdf4eb632683bfc2840acebe62716cb468d43e10 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Apr 2026 17:51:07 +0200 Subject: [PATCH 443/972] selftests/rseq: Validate legacy behavior The RSEQ legacy mode behavior requires that the ID fields in the rseq region are unconditionally updated on every context switch and before signal delivery even if not required by the ABI specification. To ensure that this behavior is preserved for legacy users in the future, add a test which validates that with a sleep() and a signal sent to self. Provide a run script which prevents GLIBC from registering a RSEQ region, so that the test can register it's own legacy sized region. Fixes: 566d8015f7ee ("rseq: Avoid CPU/MM CID updates when no event pending") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.764705536%40kernel.org Cc: stable@vger.kernel.org --- tools/testing/selftests/rseq/Makefile | 5 +- tools/testing/selftests/rseq/legacy_check.c | 65 +++++++++++++++++++ .../selftests/rseq/run_legacy_check.sh | 4 ++ 3 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/rseq/legacy_check.c create mode 100755 tools/testing/selftests/rseq/run_legacy_check.sh diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile index 0d1947c0d6235f..0293a2f17f5115 100644 --- a/tools/testing/selftests/rseq/Makefile +++ b/tools/testing/selftests/rseq/Makefile @@ -22,9 +22,10 @@ TEST_GEN_PROGS_EXTENDED = librseq.so \ param_test_compare_twice \ param_test_mm_cid \ param_test_mm_cid_compare_twice \ - syscall_errors_test + syscall_errors_test \ + legacy_check -TEST_PROGS = run_param_test.sh run_syscall_errors_test.sh +TEST_PROGS = run_param_test.sh run_syscall_errors_test.sh run_legacy_check.sh TEST_FILES := settings diff --git a/tools/testing/selftests/rseq/legacy_check.c b/tools/testing/selftests/rseq/legacy_check.c new file mode 100644 index 00000000000000..3f7de4e2830330 --- /dev/null +++ b/tools/testing/selftests/rseq/legacy_check.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include + +#include "rseq.h" + +#include "../kselftest_harness.h" + +FIXTURE(legacy) +{ +}; + +static int cpu_id_in_sigfn = -1; + +static void sigfn(int sig) +{ + struct rseq_abi *rs = rseq_get_abi(); + + cpu_id_in_sigfn = rs->cpu_id_start; +} + +FIXTURE_SETUP(legacy) +{ + int res = __rseq_register_current_thread(true, true); + + switch (res) { + case -ENOSYS: + SKIP(return, "RSEQ not enabled\n"); + case -EBUSY: + SKIP(return, "GLIBC owns RSEQ. Disable GLIBC RSEQ registration\n"); + default: + ASSERT_EQ(res, 0); + } + + ASSERT_NE(signal(SIGUSR1, sigfn), SIG_ERR); +} + +FIXTURE_TEARDOWN(legacy) +{ +} + +TEST_F(legacy, legacy_test) +{ + struct rseq_abi *rs = rseq_get_abi(); + + ASSERT_NE(rs, NULL); + + /* Overwrite rs::cpu_id_start */ + rs->cpu_id_start = -1; + sleep(1); + ASSERT_NE(rs->cpu_id_start, -1); + + rs->cpu_id_start = -1; + ASSERT_EQ(raise(SIGUSR1), 0); + ASSERT_NE(rs->cpu_id_start, -1); + ASSERT_NE(cpu_id_in_sigfn, -1); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/rseq/run_legacy_check.sh b/tools/testing/selftests/rseq/run_legacy_check.sh new file mode 100755 index 00000000000000..5577b46ea09272 --- /dev/null +++ b/tools/testing/selftests/rseq/run_legacy_check.sh @@ -0,0 +1,4 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +GLIBC_TUNABLES="${GLIBC_TUNABLES:-}:glibc.pthread.rseq=0" ./legacy_check From 82f572449cfe75f12ea985986da60e11f308f77d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Apr 2026 16:21:02 +0200 Subject: [PATCH 444/972] rseq: Implement read only ABI enforcement for optimized RSEQ V2 mode The optimized RSEQ V2 mode requires that user space adheres to the ABI specification and does not modify the read-only fields cpu_id_start, cpu_id, node_id and mm_cid behind the kernel's back. While the kernel does not rely on these fields, the adherence to this is a fundamental prerequisite to allow multiple entities, e.g. libraries, in an application to utilize the full potential of RSEQ without stepping on each other toes. Validate this adherence on every update of these fields. If the kernel detects that user space modified the fields, the application is force terminated. Fixes: d6200245c75e ("rseq: Allow registering RSEQ with slice extension") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.845230956%40kernel.org Cc: stable@vger.kernel.org --- include/linux/rseq_entry.h | 83 ++++++++++++++------------------------ include/linux/rseq_types.h | 4 +- kernel/rseq.c | 5 +-- 3 files changed, 35 insertions(+), 57 deletions(-) diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 934db41ec78261..2d0295df5107c7 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -248,7 +248,6 @@ static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, un #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); -bool rseq_debug_validate_ids(struct task_struct *t); static __always_inline void rseq_note_user_irq_entry(void) { @@ -368,43 +367,6 @@ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, return false; } -/* - * On debug kernels validate that user space did not mess with it if the - * debug branch is enabled. - */ -bool rseq_debug_validate_ids(struct task_struct *t) -{ - struct rseq __user *rseq = t->rseq.usrptr; - u32 cpu_id, uval, node_id; - - /* - * On the first exit after registering the rseq region CPU ID is - * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0! - */ - node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ? - cpu_to_node(t->rseq.ids.cpu_id) : 0; - - scoped_user_read_access(rseq, efault) { - unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault); - if (cpu_id != t->rseq.ids.cpu_id) - goto die; - unsafe_get_user(uval, &rseq->cpu_id, efault); - if (uval != cpu_id) - goto die; - unsafe_get_user(uval, &rseq->node_id, efault); - if (uval != node_id) - goto die; - unsafe_get_user(uval, &rseq->mm_cid, efault); - if (uval != t->rseq.ids.mm_cid) - goto die; - } - return true; -die: - t->rseq.event.fatal = true; -efault: - return false; -} - #endif /* RSEQ_BUILD_SLOW_PATH */ /* @@ -514,20 +476,32 @@ rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long c * faults in task context are fatal too. */ static rseq_inline -bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, - u32 node_id, u64 *csaddr) +bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, u64 *csaddr) { struct rseq __user *rseq = t->rseq.usrptr; - if (static_branch_unlikely(&rseq_debug_enabled)) { - if (!rseq_debug_validate_ids(t)) - return false; - } - scoped_user_rw_access(rseq, efault) { + /* Validate the R/O fields for debug and optimized mode */ + if (static_branch_unlikely(&rseq_debug_enabled) || rseq_v2(t)) { + u32 cpu_id, uval; + + unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault); + if (cpu_id != t->rseq.ids.cpu_id) + goto die; + unsafe_get_user(uval, &rseq->cpu_id, efault); + if (uval != cpu_id) + goto die; + unsafe_get_user(uval, &rseq->node_id, efault); + if (uval != t->rseq.ids.node_id) + goto die; + unsafe_get_user(uval, &rseq->mm_cid, efault); + if (uval != t->rseq.ids.mm_cid) + goto die; + } + unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault); unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault); - unsafe_put_user(node_id, &rseq->node_id, efault); + unsafe_put_user(ids->node_id, &rseq->node_id, efault); unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault); if (csaddr) unsafe_get_user(*csaddr, &rseq->rseq_cs, efault); @@ -539,10 +513,13 @@ bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, rseq_slice_clear_grant(t); /* Cache the new values */ - t->rseq.ids.cpu_cid = ids->cpu_cid; + t->rseq.ids = *ids; rseq_stat_inc(rseq_stats.ids); rseq_trace_update(t, ids); return true; + +die: + t->rseq.event.fatal = true; efault: return false; } @@ -552,11 +529,11 @@ bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, * is in a critical section. */ static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs, - struct rseq_ids *ids, u32 node_id) + struct rseq_ids *ids) { u64 csaddr; - if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr)) + if (!rseq_set_ids_get_csaddr(t, ids, &csaddr)) return false; /* @@ -659,12 +636,12 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t } struct rseq_ids ids = { - .cpu_id = task_cpu(t), - .mm_cid = task_mm_cid(t), + .cpu_id = task_cpu(t), + .mm_cid = task_mm_cid(t), + .node_id = cpu_to_node(ids.cpu_id), }; - u32 node_id = cpu_to_node(ids.cpu_id); - return rseq_update_usr(t, regs, &ids, node_id); + return rseq_update_usr(t, regs, &ids); efault: return false; } diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index a469c1870849c4..85739a63e85e6f 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -66,8 +66,9 @@ struct rseq_event { * compiler emit a single compare on 64-bit * @cpu_id: The CPU ID which was written last to user space * @mm_cid: The MM CID which was written last to user space + * @node_id: The node ID which was written last to user space * - * @cpu_id and @mm_cid are updated when the data is written to user space. + * @cpu_id, @mm_cid and @node_id are updated when the data is written to user space. */ struct rseq_ids { union { @@ -77,6 +78,7 @@ struct rseq_ids { u32 mm_cid; }; }; + u32 node_id; }; /** diff --git a/kernel/rseq.c b/kernel/rseq.c index aa25753ea1350c..101612027f6a36 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -263,7 +263,6 @@ static void rseq_slowpath_update_usr(struct pt_regs *regs) }; struct task_struct *t = current; struct rseq_ids ids; - u32 node_id; bool event; if (unlikely(t->flags & PF_EXITING)) @@ -299,9 +298,9 @@ static void rseq_slowpath_update_usr(struct pt_regs *regs) if (!event) return; - node_id = cpu_to_node(ids.cpu_id); + ids.node_id = cpu_to_node(ids.cpu_id); - if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) { + if (unlikely(!rseq_update_usr(t, regs, &ids))) { /* * Clear the errors just in case this might survive magically, but * leave the rest intact. From 99428157dcf32fdac97355aa1cc1364dbc9e073c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 26 Apr 2026 10:01:56 +0200 Subject: [PATCH 445/972] rseq: Reenable performance optimizations conditionally Due to the incompatibility with TCMalloc the RSEQ optimizations and extended features (time slice extensions) have been disabled and made run-time conditional. The original RSEQ implementation, which TCMalloc depends on, registers a 32 byte region (ORIG_RSEG_SIZE). This region has a 32 byte alignment requirement. The extension safe newer variant exposes the kernel RSEQ feature size via getauxval(AT_RSEQ_FEATURE_SIZE) and the alignment requirement via getauxval(AT_RSEQ_ALIGN). The alignment requirement is that the registered RSEQ region is aligned to the next power of two of the feature size. The kernel currently has a feature size of 33 bytes, which means the alignment requirement is 64 bytes. The TCMalloc RSEQ region is embedded into a cache line aligned data structure starting at offset 32 bytes so that bytes 28-31 and the cpu_id_start field at bytes 32-35 form a 64-bit little endian pointer with the top-most bit (63 set) to check whether the kernel has overwritten cpu_id_start with an actual CPU id value, which is guaranteed to not have the top most bit set. As this is part of their performance tuned magic, it's a pretty safe assumption, that TCMalloc won't use a larger RSEQ size. This allows the kernel to declare that registrations with a size greater than the original size of 32 bytes, which is the cases since time slice extensions got introduced, as RSEQ ABI v2 with the following differences to the original behaviour: 1) Unconditional updates of the user read only fields (CPU, node, MMCID) are removed. Those fields are only updated on registration, task migration and MMCID changes. 2) Unconditional evaluation of the criticial section pointer is removed. It's only evaluated when user space was interrupted and was scheduled out or before delivering a signal in the interrupted context. 3) The read/only requirement of the ID fields is enforced. When the kernel detects that userspace manipulated the fields, the process is terminated. This ensures that multiple entities (libraries) can utilize RSEQ without interfering. 4) Todays extended RSEQ feature (time slice extensions) and future extensions are only enabled in the v2 enabled mode. Registrations with the original size of 32 bytes operate in backwards compatible legacy mode without performance improvements and extended features. Unfortunately that also affects users of older GLIBC versions which register the original size of 32 bytes and do not evaluate the kernel required size in the auxiliary vector AT_RSEQ_FEATURE_SIZE. That's the result of the lack of enforcement in the original implementation and the unwillingness of a single entity to cooperate with the larger ecosystem for many years. Implement the required registration changes by restructuring the spaghetti code and adding the size/version check. Also add documentation about the differences of legacy and optimized RSEQ V2 mode. Thanks to Mathieu for pointing out the ORIG_RSEQ_SIZE constraints! Fixes: d6200245c75e ("rseq: Allow registering RSEQ with slice extension") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224427.927160119%40kernel.org Cc: stable@vger.kernel.org --- Documentation/userspace-api/rseq.rst | 94 ++++++++++++++++- kernel/rseq.c | 144 ++++++++++++++++----------- 2 files changed, 178 insertions(+), 60 deletions(-) diff --git a/Documentation/userspace-api/rseq.rst b/Documentation/userspace-api/rseq.rst index 3cd27a3c7c7e5a..8549a6c61531cc 100644 --- a/Documentation/userspace-api/rseq.rst +++ b/Documentation/userspace-api/rseq.rst @@ -24,6 +24,97 @@ Quick access to CPU number, node ID Allows to implement per CPU data efficiently. Documentation is in code and selftests. :( +Optimized RSEQ V2 +----------------- + +On architectures which utilize the generic entry code and generic TIF bits +the kernel supports runtime optimizations for RSEQ, which also enable +enhanced features like scheduler time slice extensions. + +To enable them a task has to register the RSEQ region with at least the +length advertised by getauxval(AT_RSEQ_FEATURE_SIZE). + +If existing binaries register with RSEQ_ORIG_SIZE (32 bytes), the kernel +keeps the legacy low performance mode enabled to fulfil the expectations +of existing users regarding the original RSEQ implementation behaviour. + +The following table documents the ABI and behavioral guarantees of the +legacy and the optimized V2 mode. + +.. list-table:: RSEQ modes + :header-rows: 1 + + * - Nr + - What + + - Legacy + - Optimized V2 + + * - 1 + - The cpu_id_start, cpu_id, node_id and mm_cid fields (User mode read + only) + .. Legacy + - Updated by the kernel unconditionally after each context switch and + before signal delivery + .. Optimized V2 + - Updated by the kernel if and only if they change, i.e. if the task + is migrated or mm_cid changes + + * - 2 + - The rseq_cs critical section field + .. Legacy + - Evaluated and handled unconditionally after each context switch and + before signal delivery + .. Optimized V2 + - Evaluated and handled conditionally only when user space was + interrupted and was scheduled out or before delivering a signal in + the interrupted context. + + * - 3 + - Read only fields + .. Legacy + - No strict enforcement except in debug mode + .. Optimized V2 + - Strict enforcement + + * - 4 + - membarrier(...RSEQ) + .. Legacy + - All running threads of the process are interrupted and the ID fields + are rewritten and eventually active critical sections are aborted + before they return to user space. All threads which are scheduled + out whether voluntary or not are covered by #1/#2 above. + .. Optimized V2 + - All running threads of the process are interrupted and eventually + active critical sections are aborted before these threads return to + user space. The ID fields are only updated if changed as a + consequence of the interrupt. All threads which are scheduled out + whether voluntary or not are covered by #1/#2 above. + + * - 5 + - Time slice extensions + .. Legacy + - Not supported + .. Optimized V2 + - Supported + +The legacy mode is obviously less performant as it does unconditional +updates and critical section checks even if not strictly required by the +ABI contract. That can't be changed anymore as some users depend on that +observed behavior, which in turn enables them to violate the ABI and +overwrite the cpu_id_start field for their own purposes. This is obviously +discouraged as it renders RSEQ incompatible with the intended usage and +breaks the expectation of other libraries in the same application. + +The ABI compliant optimized v2 mode, which respects the read only fields, +does not require unconditional updates and therefore is way more +performant. The kernel validates the read only fields for compliance. If +user space modifies them, the process is killed. Compliant usage allows +multiple libraries in the same application to benefit from the RSEQ +functionality without disturbing each other. The ABI compliant optimized v2 +mode also enables extended RSEQ features like time slice extensions. + + Scheduler time slice extensions ------------------------------- @@ -37,7 +128,8 @@ The prerequisites for this functionality are: * Enabled at boot time (default is enabled) - * A rseq userspace pointer has been registered for the thread + * A rseq userspace pointer has been registered for the thread in + optimized V2 mode The thread has to enable the functionality via prctl(2):: diff --git a/kernel/rseq.c b/kernel/rseq.c index 101612027f6a36..e75e3a5e312c8a 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -412,70 +412,23 @@ static bool rseq_reset_ids(void) /* The original rseq structure size (including padding) is 32 bytes. */ #define ORIG_RSEQ_SIZE 32 -/* - * sys_rseq - setup restartable sequences for caller thread. - */ -SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) +static long rseq_register(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig) { u32 rseqfl = 0; u8 version = 1; - if (flags & RSEQ_FLAG_UNREGISTER) { - if (flags & ~RSEQ_FLAG_UNREGISTER) - return -EINVAL; - /* Unregister rseq for current thread. */ - if (current->rseq.usrptr != rseq || !current->rseq.usrptr) - return -EINVAL; - if (rseq_len != current->rseq.len) - return -EINVAL; - if (current->rseq.sig != sig) - return -EPERM; - if (!rseq_reset_ids()) - return -EFAULT; - rseq_reset(current); - return 0; - } - - if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))) - return -EINVAL; - - if (current->rseq.usrptr) { - /* - * If rseq is already registered, check whether - * the provided address differs from the prior - * one. - */ - if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) - return -EINVAL; - if (current->rseq.sig != sig) - return -EPERM; - /* Already registered. */ - return -EBUSY; - } - - /* - * If there was no rseq previously registered, ensure the provided rseq - * is properly aligned, as communcated to user-space through the ELF - * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq - * size, the required alignment is the original struct rseq alignment. - * - * The rseq_len is required to be greater or equal to the original rseq - * size. In order to be valid, rseq_len is either the original rseq size, - * or large enough to contain all supported fields, as communicated to - * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. - */ - if (rseq_len < ORIG_RSEQ_SIZE || - (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || - (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) || - rseq_len < offsetof(struct rseq, end)))) - return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; /* - * The version check effectivly disables time slice extensions until the - * RSEQ ABI V2 registration are implemented. + * Architectures, which use the generic IRQ entry code (at least) enable + * registrations with a size greater than the original v1 fixed sized + * @rseq_len, which has been validated already to utilize the optimized + * v2 ABI mode which also enables extended RSEQ features beyond MMCID. */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && rseq_len > ORIG_RSEQ_SIZE) + version = 2; + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION) && version > 1) { if (rseq_slice_extension_enabled()) { rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; @@ -523,11 +476,10 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 #endif /* - * If rseq was previously inactive, and has just been - * registered, ensure the cpu_id_start and cpu_id fields - * are updated before returning to user-space. + * Ensure the cpu_id_start and cpu_id fields are updated before + * returning to user-space. */ - current->rseq.event.has_rseq = true; + current->rseq.event.has_rseq = version; rseq_force_update(); return 0; @@ -535,6 +487,80 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 return -EFAULT; } +static long rseq_unregister(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig) +{ + if (flags & ~RSEQ_FLAG_UNREGISTER) + return -EINVAL; + if (current->rseq.usrptr != rseq || !current->rseq.usrptr) + return -EINVAL; + if (rseq_len != current->rseq.len) + return -EINVAL; + if (current->rseq.sig != sig) + return -EPERM; + if (!rseq_reset_ids()) + return -EFAULT; + rseq_reset(current); + return 0; +} + +static long rseq_reregister(struct rseq __user * rseq, u32 rseq_len, u32 sig) +{ + /* + * If rseq is already registered, check whether the provided address + * differs from the prior one. + */ + if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) + return -EINVAL; + if (current->rseq.sig != sig) + return -EPERM; + /* Already registered. */ + return -EBUSY; +} + +static bool rseq_length_valid(struct rseq __user *rseq, unsigned int rseq_len) +{ + /* + * Ensure the provided rseq is properly aligned, as communicated to + * user-space through the ELF auxiliary vector AT_RSEQ_ALIGN. If + * rseq_len is the original rseq size, the required alignment is the + * original struct rseq alignment. + * + * In order to be valid, rseq_len is either the original rseq size, or + * large enough to contain all supported fields, as communicated to + * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. + */ + if (rseq_len < ORIG_RSEQ_SIZE) + return false; + + if (rseq_len == ORIG_RSEQ_SIZE) + return IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE); + + return IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) && + rseq_len >= offsetof(struct rseq, end); +} + +#define RSEQ_FLAGS_SUPPORTED (RSEQ_FLAG_SLICE_EXT_DEFAULT_ON) + +/* + * sys_rseq - Register or unregister restartable sequences for the caller thread. + */ +SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) +{ + if (flags & RSEQ_FLAG_UNREGISTER) + return rseq_unregister(rseq, rseq_len, flags, sig); + + if (unlikely(flags & ~RSEQ_FLAGS_SUPPORTED)) + return -EINVAL; + + if (current->rseq.usrptr) + return rseq_reregister(rseq, rseq_len, sig); + + if (!rseq_length_valid(rseq, rseq_len)) + return -EINVAL; + + return rseq_register(rseq, rseq_len, flags, sig); +} + #ifdef CONFIG_RSEQ_SLICE_EXTENSION struct slice_timer { struct hrtimer timer; From e744060076871eebc2647b24420b550ff44b2b65 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Apr 2026 14:48:23 +0200 Subject: [PATCH 446/972] selftests/rseq: Expand for optimized RSEQ ABI v2 Update the selftests so they are executed for legacy (32 bytes RSEQ region) and optimized RSEQ ABI v2 mode. Fixes: d6200245c75e ("rseq: Allow registering RSEQ with slice extension") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Link: https://patch.msgid.link/20260428224428.009121296%40kernel.org Cc: stable@vger.kernel.org --- tools/testing/selftests/rseq/Makefile | 11 ++++-- .../testing/selftests/rseq/check_optimized.c | 17 ++++++++ tools/testing/selftests/rseq/param_test.c | 25 +++++++----- .../testing/selftests/rseq/run_param_test.sh | 39 +++++++++++++++++++ .../selftests/rseq/run_timeslice_test.sh | 14 +++++++ tools/testing/selftests/rseq/slice_test.c | 2 +- 6 files changed, 95 insertions(+), 13 deletions(-) create mode 100644 tools/testing/selftests/rseq/check_optimized.c create mode 100755 tools/testing/selftests/rseq/run_timeslice_test.sh diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile index 0293a2f17f5115..50d69e22ee7a67 100644 --- a/tools/testing/selftests/rseq/Makefile +++ b/tools/testing/selftests/rseq/Makefile @@ -15,7 +15,7 @@ LDLIBS += -lpthread -ldl OVERRIDE_TARGETS = 1 TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test \ - param_test_benchmark param_test_mm_cid_benchmark slice_test + param_test_benchmark param_test_mm_cid_benchmark TEST_GEN_PROGS_EXTENDED = librseq.so \ param_test \ @@ -23,9 +23,11 @@ TEST_GEN_PROGS_EXTENDED = librseq.so \ param_test_mm_cid \ param_test_mm_cid_compare_twice \ syscall_errors_test \ - legacy_check + legacy_check \ + slice_test \ + check_optimized -TEST_PROGS = run_param_test.sh run_syscall_errors_test.sh run_legacy_check.sh +TEST_PROGS = run_param_test.sh run_syscall_errors_test.sh run_legacy_check.sh run_timeslice_test.sh TEST_FILES := settings @@ -66,3 +68,6 @@ $(OUTPUT)/syscall_errors_test: syscall_errors_test.c $(TEST_GEN_PROGS_EXTENDED) $(OUTPUT)/slice_test: slice_test.c $(TEST_GEN_PROGS_EXTENDED) rseq.h rseq-*.h $(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@ + +$(OUTPUT)/check_optimized: check_optimized.c $(TEST_GEN_PROGS_EXTENDED) rseq.h rseq-*.h + $(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@ diff --git a/tools/testing/selftests/rseq/check_optimized.c b/tools/testing/selftests/rseq/check_optimized.c new file mode 100644 index 00000000000000..a13e3f2c8fc62f --- /dev/null +++ b/tools/testing/selftests/rseq/check_optimized.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: LGPL-2.1 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include "rseq.h" + +int main(int argc, char **argv) +{ + if (__rseq_register_current_thread(true, false)) + return -1; + return 0; +} diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c index 05d03e679e0608..e1e98dbabe4bcf 100644 --- a/tools/testing/selftests/rseq/param_test.c +++ b/tools/testing/selftests/rseq/param_test.c @@ -38,7 +38,7 @@ static int opt_modulo, verbose; static int opt_yield, opt_signal, opt_sleep, opt_disable_rseq, opt_threads = 200, opt_disable_mod = 0, opt_test = 's'; - +static bool opt_rseq_legacy; static long long opt_reps = 5000; static __thread __attribute__((tls_model("initial-exec"))) @@ -281,9 +281,12 @@ unsigned int yield_mod_cnt, nr_abort; } \ } +#define rseq_no_glibc true + #else #define printf_verbose(fmt, ...) +#define rseq_no_glibc false #endif /* BENCHMARK */ @@ -481,7 +484,7 @@ void *test_percpu_spinlock_thread(void *arg) long long i, reps; if (!opt_disable_rseq && thread_data->reg && - rseq_register_current_thread()) + __rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy)) abort(); reps = thread_data->reps; for (i = 0; i < reps; i++) { @@ -558,7 +561,7 @@ void *test_percpu_inc_thread(void *arg) long long i, reps; if (!opt_disable_rseq && thread_data->reg && - rseq_register_current_thread()) + __rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy)) abort(); reps = thread_data->reps; for (i = 0; i < reps; i++) { @@ -712,7 +715,7 @@ void *test_percpu_list_thread(void *arg) long long i, reps; struct percpu_list *list = (struct percpu_list *)arg; - if (!opt_disable_rseq && rseq_register_current_thread()) + if (!opt_disable_rseq && __rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy)) abort(); reps = opt_reps; @@ -895,7 +898,7 @@ void *test_percpu_buffer_thread(void *arg) long long i, reps; struct percpu_buffer *buffer = (struct percpu_buffer *)arg; - if (!opt_disable_rseq && rseq_register_current_thread()) + if (!opt_disable_rseq && __rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy)) abort(); reps = opt_reps; @@ -1105,7 +1108,7 @@ void *test_percpu_memcpy_buffer_thread(void *arg) long long i, reps; struct percpu_memcpy_buffer *buffer = (struct percpu_memcpy_buffer *)arg; - if (!opt_disable_rseq && rseq_register_current_thread()) + if (!opt_disable_rseq && __rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy)) abort(); reps = opt_reps; @@ -1258,7 +1261,7 @@ void *test_membarrier_worker_thread(void *arg) const int iters = opt_reps; int i; - if (rseq_register_current_thread()) { + if (__rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy)) { fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n", errno, strerror(errno)); abort(); @@ -1323,7 +1326,7 @@ void *test_membarrier_manager_thread(void *arg) intptr_t expect_a = 0, expect_b = 0; int cpu_a = 0, cpu_b = 0; - if (rseq_register_current_thread()) { + if (__rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy)) { fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n", errno, strerror(errno)); abort(); @@ -1475,6 +1478,7 @@ static void show_usage(int argc, char **argv) printf(" [-D M] Disable rseq for each M threads\n"); printf(" [-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement, membarrie(r)\n"); printf(" [-M] Push into buffer and memcpy buffer with memory barriers.\n"); + printf(" [-O] Test with optimized RSEQ\n"); printf(" [-v] Verbose output.\n"); printf(" [-h] Show this help.\n"); printf("\n"); @@ -1602,6 +1606,9 @@ int main(int argc, char **argv) case 'M': opt_mo = RSEQ_MO_RELEASE; break; + case 'L': + opt_rseq_legacy = true; + break; default: show_usage(argc, argv); goto error; @@ -1618,7 +1625,7 @@ int main(int argc, char **argv) if (set_signal_handler()) goto error; - if (!opt_disable_rseq && rseq_register_current_thread()) + if (!opt_disable_rseq && __rseq_register_current_thread(rseq_no_glibc, opt_rseq_legacy)) goto error; if (!opt_disable_rseq && !rseq_validate_cpu_id()) { fprintf(stderr, "Error: cpu id getter unavailable\n"); diff --git a/tools/testing/selftests/rseq/run_param_test.sh b/tools/testing/selftests/rseq/run_param_test.sh index 8d31426ab41f22..69a3fa049929f3 100755 --- a/tools/testing/selftests/rseq/run_param_test.sh +++ b/tools/testing/selftests/rseq/run_param_test.sh @@ -34,6 +34,11 @@ REPS=1000 SLOW_REPS=100 NR_THREADS=$((6*${NR_CPUS})) +# Prevent GLIBC from registering RSEQ so the selftest can run in legacy and +# performance optimized mode. +GLIBC_TUNABLES="${GLIBC_TUNABLES:-}:glibc.pthread.rseq=0" +export GLIBC_TUNABLES + function do_tests() { local i=0 @@ -103,6 +108,40 @@ function inject_blocking() NR_LOOPS= } +echo "Testing in legacy RSEQ mode" +echo "Yield injection (25%)" +inject_blocking -m 4 -y -L + +echo "Yield injection (50%)" +inject_blocking -m 2 -y -L + +echo "Yield injection (100%)" +inject_blocking -m 1 -y -L + +echo "Kill injection (25%)" +inject_blocking -m 4 -k -L + +echo "Kill injection (50%)" +inject_blocking -m 2 -k -L + +echo "Kill injection (100%)" +inject_blocking -m 1 -k -L + +echo "Sleep injection (1ms, 25%)" +inject_blocking -m 4 -s 1 -L + +echo "Sleep injection (1ms, 50%)" +inject_blocking -m 2 -s 1 -L + +echo "Sleep injection (1ms, 100%)" +inject_blocking -m 1 -s 1 -L + +./check_optimized || { + echo "Skipping optimized RSEQ mode test. Not supported"; + exit 0 +} + +echo "Testing in optimized RSEQ mode" echo "Yield injection (25%)" inject_blocking -m 4 -y diff --git a/tools/testing/selftests/rseq/run_timeslice_test.sh b/tools/testing/selftests/rseq/run_timeslice_test.sh new file mode 100755 index 00000000000000..551ebed71ec61c --- /dev/null +++ b/tools/testing/selftests/rseq/run_timeslice_test.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ + +# Prevent GLIBC from registering RSEQ so the selftest can run in legacy +# and performance optimized mode. +GLIBC_TUNABLES="${GLIBC_TUNABLES:-}:glibc.pthread.rseq=0" +export GLIBC_TUNABLES + +./check_optimized || { + echo "Skipping optimized RSEQ mode test. Not supported"; + exit 0 +} + +./slice_test diff --git a/tools/testing/selftests/rseq/slice_test.c b/tools/testing/selftests/rseq/slice_test.c index 77e668ff74d7f0..e402d4440bc27f 100644 --- a/tools/testing/selftests/rseq/slice_test.c +++ b/tools/testing/selftests/rseq/slice_test.c @@ -124,7 +124,7 @@ FIXTURE_SETUP(slice_ext) { cpu_set_t affinity; - if (rseq_register_current_thread()) + if (__rseq_register_current_thread(true, false)) SKIP(return, "RSEQ not supported\n"); if (prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET, From b6eee96843e8d088200f01b035da98e72067c5fe Mon Sep 17 00:00:00 2001 From: Zhan Xusheng Date: Fri, 1 May 2026 12:40:06 +0200 Subject: [PATCH 447/972] sched/fair: Fix overflow in vruntime_eligible() Zhan Xusheng reported running into sporadic a s64 mult overflow in vruntime_eligible(). When constructing a worst case scenario: If you have cgroups, then you can have an entity of weight 2 (per calc_group_shares()), and its vlag should then be bounded by: (slice+TICK_NSEC) * NICE_0_LOAD, which is around 44 bits as per the comment on entity_key(). The other extreme is 100*NICE_0_LOAD, thus you get: {key, weight}[] := { puny: { (slice + TICK_NSEC) * NICE_0_LOAD, 2 }, max: { 0, 100*NICE_0_LOAD }, } The avg_vruntime() would end up being very close to 0 (which is zero_vruntime), so no real help making that more accurate. vruntime_eligible(puny) ends up with: avg = 2 * puny.key (+ 0) load = 2 + 100 * NICE_0_LOAD avg >= puny.key * load And that is: (slice + TICK_NSEC) * NICE_0_LOAD * NICE_0_LOAD * 100, which will overflow s64. Zhan suggested using __builtin_mul_overflow(), however after staring at compiler output for various architectures using godbolt, it seems that using an __int128 multiplication often results in better code. Specifically, a number of architectures already compute the __int128 product to determine the overflow. Eg. arm64 already has the 'smulh' instruction used. By explicitly doing an __int128 multiply, it will emit the 'mul; smulh' pattern, which modern cores can fuse (armv8-a clang-22.1.0). x86_64 has less branches (no OF handling). Since Linux has ARCH_SUPPORTS_INT128 to gate __int128 usage, also provide the __builtin_mul_overflow() variant as a fallback. [peterz: Changelog and __int128 bits] Fixes: 556146ce5e94 ("sched/fair: Avoid overflow in enqueue_entity()") Reported-by: Zhan Xusheng Closes: https://patch.msgid.link/20260415145742.10359-1-zhanxusheng%40xiaomi.com Signed-off-by: Zhan Xusheng Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260505103155.GN3102924%40noisy.programming.kicks-ass.net --- kernel/sched/fair.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 728965851842e1..b91c8b2942290f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -882,11 +882,11 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) * * lag_i >= 0 -> V >= v_i * - * \Sum (v_i - v)*w_i - * V = ------------------ + v + * \Sum (v_i - v0)*w_i + * V = ------------------- + v0 * \Sum w_i * - * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) + * lag_i >= 0 -> \Sum (v_i - v0)*w_i >= (v_i - v0)*(\Sum w_i) * * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due * to the loss in precision caused by the division. @@ -894,7 +894,7 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) { struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->sum_w_vruntime; + s64 key, avg = cfs_rq->sum_w_vruntime; long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { @@ -904,7 +904,36 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) load += weight; } - return avg >= vruntime_op(vruntime, "-", cfs_rq->zero_vruntime) * load; + key = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime); + + /* + * The worst case term for @key includes 'NSEC_TICK * NICE_0_LOAD' + * and @load obviously includes NICE_0_LOAD. NSEC_TICK is around 24 + * bits, while NICE_0_LOAD is 20 on 64bit and 10 otherwise. + * + * This gives that on 64bit the product will be at least 64bit which + * overflows s64, while on 32bit it will only be 44bits and should fit + * comfortably. + */ +#ifdef CONFIG_64BIT +#ifdef CONFIG_ARCH_SUPPORTS_INT128 + /* This often results in simpler code than __builtin_mul_overflow(). */ + return avg >= (__int128)key * load; +#else + s64 rhs; + /* + * On overflow, the sign of key tells us the correct answer: a large + * positive key means vruntime >> V, so not eligible; a large negative + * key means vruntime << V, so eligible. + */ + if (check_mul_overflow(key, load, &rhs)) + return key <= 0; + + return avg >= rhs; +#endif +#else /* 32bit */ + return avg >= key * load; +#endif } int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) From 9f6d929ee2c6f0266edb564bcd2bd47fd6e884a8 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Sun, 3 May 2026 12:45:03 +0200 Subject: [PATCH 448/972] sched/fair: Fix wakeup_preempt_fair() for not waking up task Make sure to only call pick_next_entity() on an non-empty cfs_rq. The assumption that p is always enqueued and not delayed, is only true for wakeup. If p was moved while delayed, pick_next_entity() will dequeue it and the cfs might become empty. Test if there are still queued tasks before trying again to determine if p could be the next one to be picked. There are at least 2 cases: When cfs becomes idle, it tries to pull tasks but if those pulled tasks are delayed, they will be dequeued when attached to cfs. attach_tasks() -> attach_task() -> wakeup_preempt(rq, p, 0); A misfit task running on cfs A triggers a load balance to be pulled on a better cpu, the load balance on cfs B starts an active load balance to pulled the running misfit task. If there is a delayed dequeue task on cfs A, it can be pulled instead of the previously running misfit task. attach_one_task() -> attach_task() -> wakeup_preempt(rq, p, 0); Fixes: ac8e69e69363 ("sched/fair: Fix wakeup_preempt_fair() vs delayed dequeue") Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260503104503.1732682-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b91c8b2942290f..3ebec186f98236 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9174,9 +9174,10 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f /* * Because p is enqueued, nse being null can only mean that we - * dequeued a delayed task. + * dequeued a delayed task. If there are still entities queued in + * cfs, check if the next one will be p. */ - if (!nse) + if (!nse && cfs_rq->nr_queued) goto pick; if (sched_feat(RUN_TO_PARITY)) From 1f7305d87aa23db2579df222eba504a333c2c978 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 5 May 2026 17:52:03 +0100 Subject: [PATCH 449/972] KVM: arm64: Work around C1-Pro erratum 4193714 for protected guests C1-Pro cores with SME have an erratum where TLBI+DSB does not complete all outstanding SME accesses. Instead a DSB needs to be executed on the affected CPUs. The implication is that pages cannot be unmapped from the host Stage 2 and then provided to a protected guest or to the hypervisor. Host SME accesses may still complete after this point. This erratum breaks pKVM's guarantees, and the workaround is hard to implement as EL2 and EL1 share a security state meaning EL1 can mask IPIs sent by EL2, leading to interrupt blackouts. Instead, do this in EL3. This has the advantage of a separate security state, meaning lower EL cannot mask the IPI. It is also simpler for EL3 to know about CPUs that are off or in PSCI's CPU_SUSPEND. Add the needed hook to host_stage2_set_owner_metadata_locked(). This covers the cases where the host loses access to a page: __pkvm_host_donate_guest() __pkvm_guest_unshare_host() host_stage2_set_owner_locked() when owner_id == PKVM_ID_HYP Since pKVM relies on the firmware call for correctness, check for the firmware counterpart during protected KVM initialisation and fail the pKVM initialisation if it is missing. Signed-off-by: James Morse Co-developed-by: Catalin Marinas Signed-off-by: Catalin Marinas Cc: Mark Rutland Cc: Marc Zyngier Cc: Oliver Upton Cc: Will Deacon Cc: Vincent Donnefort Cc: Lorenzo Pieralisi Cc: Sudeep Holla Link: https://patch.msgid.link/20260505165205.2690919-1-catalin.marinas@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arm.c | 21 +++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 23 ++++++++++++++++++++++- include/linux/arm-smccc.h | 6 ++++++ 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 8bb2c7422cc8b0..34c9950884d5e6 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -4,6 +4,7 @@ * Author: Christoffer Dall */ +#include #include #include #include @@ -2638,6 +2639,22 @@ static int init_pkvm_host_sve_state(void) return 0; } +static int pkvm_check_sme_dvmsync_fw_call(void) +{ + struct arm_smccc_res res; + + if (!cpus_have_final_cap(ARM64_WORKAROUND_4193714)) + return 0; + + arm_smccc_1_1_smc(ARM_SMCCC_CPU_WORKAROUND_4193714, &res); + if (res.a0) { + kvm_err("pKVM requires firmware support for C1-Pro erratum 4193714\n"); + return -ENODEV; + } + + return 0; +} + /* * Finalizes the initialization of hyp mode, once everything else is initialized * and the initialziation process cannot fail. @@ -2838,6 +2855,10 @@ static int __init init_hyp_mode(void) if (err) goto out_err; + err = pkvm_check_sme_dvmsync_fw_call(); + if (err) + goto out_err; + err = kvm_hyp_init_protection(hyp_va_bits); if (err) { kvm_err("Failed to init hyp memory protection\n"); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 28a471d1927cd5..a3050e2b65b13d 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -5,6 +5,7 @@ */ #include + #include #include #include @@ -14,6 +15,7 @@ #include +#include #include #include #include @@ -29,6 +31,19 @@ static struct hyp_pool host_s2_pool; static DEFINE_PER_CPU(struct pkvm_hyp_vm *, __current_vm); #define current_vm (*this_cpu_ptr(&__current_vm)) +static void pkvm_sme_dvmsync_fw_call(void) +{ + if (alternative_has_cap_unlikely(ARM64_WORKAROUND_4193714)) { + struct arm_smccc_res res; + + /* + * Ignore the return value. Probing for the workaround + * availability took place in init_hyp_mode(). + */ + hyp_smccc_1_1_smc(ARM_SMCCC_CPU_WORKAROUND_4193714, &res); + } +} + static void guest_lock_component(struct pkvm_hyp_vm *vm) { hyp_spin_lock(&vm->lock); @@ -574,8 +589,14 @@ static int host_stage2_set_owner_metadata_locked(phys_addr_t addr, u64 size, ret = host_stage2_try(kvm_pgtable_stage2_annotate, &host_mmu.pgt, addr, size, &host_s2_pool, KVM_HOST_INVALID_PTE_TYPE_DONATION, annotation); - if (!ret) + if (!ret) { + /* + * After stage2 maintenance has happened, but before the page + * owner has changed. + */ + pkvm_sme_dvmsync_fw_call(); __host_update_page_state(addr, size, PKVM_NOPAGE); + } return ret; } diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 50b47eba7d0152..e7195750d21bb4 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -105,6 +105,12 @@ ARM_SMCCC_SMC_32, \ 0, 0x3fff) +/* C1-Pro erratum 4193714: SME DVMSync early acknowledgement */ +#define ARM_SMCCC_CPU_WORKAROUND_4193714 \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_CPU, 0x10) + #define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ ARM_SMCCC_SMC_32, \ From 8d9b9d985ad3a81c751a6b97edaf1d3c0780af7c Mon Sep 17 00:00:00 2001 From: Wei-Lin Chang Date: Tue, 5 May 2026 15:47:35 +0100 Subject: [PATCH 450/972] KVM: arm64: nv: Consider the DS bit when translating TCR_EL2 When running an nVHE L1, TCR_EL2 is mapped to TCR_EL1. Writes to the register are trapped and written to TCR_EL1 after a translation. Booting an nVHE L1 with 52-bit VA isn't working because the translation was ignoring the DS bit set by the guest, hence causing repeating level 0 faults. Add it in the translation function. Signed-off-by: Wei-Lin Chang Link: https://patch.msgid.link/20260505144735.1496530-1-weilin.chang@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_nested.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 091544e6af442e..dc2957662ff204 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -23,6 +23,7 @@ static inline u64 tcr_el2_ps_to_tcr_el1_ips(u64 tcr_el2) static inline u64 translate_tcr_el2_to_tcr_el1(u64 tcr) { return TCR_EPD1_MASK | /* disable TTBR1_EL1 */ + ((tcr & TCR_EL2_DS) ? TCR_DS : 0) | ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) | tcr_el2_ps_to_tcr_el1_ips(tcr) | (tcr & TCR_EL2_TG0_MASK) | From 9be19df816dea9eb7dfe1661b3690bed6a2cb146 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Tue, 5 May 2026 10:49:13 +0100 Subject: [PATCH 451/972] KVM: arm64: Handle permission faults with guest_memfd gmem_abort() calls kvm_pgtable_stage2_map() to make changes to stage 2. It does this for both relaxing permissions on an existing mapping and to install a missing mapping. kvm_pgtable_stage2_map() doesn't make changes to stage 2 if there is an existing, valid entry and the new entry modifies only the permissions. This is checked in: kvm_pgtable_stage2_map() stage2_map_walk_leaf() stage2_map_walker_try_leaf() stage2_pte_needs_update() and if only the permissions differ, kvm_pgtable_stage2_map() returns -EAGAIN and KVM returns to the guest to replay the instruction. The assumption is that a concurrent fault on a different VCPU already mapped the faulting IPA, and replaying the instruction will either succeed, or cause a permission fault, which should be handled with kvm_pgtable_stage2_relax_perms(). gmem_abort(), on a read or write fault on a system without DIC (instruction cache invalidation required for data to instruction coherence), installs a valid entry with read and write permissions, but without executable permissions. On an execution fault on the same page, gmem_abort() attempts to relax the permissions to allow execution, but calls kvm_pgtable_stage2_map() to change the existing, valid, entry. kvm_pgtable_stage2_map() returns -EAGAIN and KVM resumes execution from the faulting instruction, which leads to an infinite loop of permission faults on the same instruction. Allow the guest to make progress by using kvm_pgtable_stage2_relax_perms() to relax permissions. Fixes: a7b57e099592 ("KVM: arm64: Handle guest_memfd-backed guest page faults") Signed-off-by: Alexandru Elisei Reviewed-by: Fuad Tabba Link: https://patch.msgid.link/20260505094913.75317-1-alexandru.elisei@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/mmu.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index d089c107d9b711..4da9281312eb8f 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1576,21 +1576,24 @@ struct kvm_s2_fault_desc { static int gmem_abort(const struct kvm_s2_fault_desc *s2fd) { bool write_fault, exec_fault; + bool perm_fault = kvm_vcpu_trap_is_permission_fault(s2fd->vcpu); enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt = s2fd->vcpu->arch.hw_mmu->pgt; unsigned long mmu_seq; struct page *page; struct kvm *kvm = s2fd->vcpu->kvm; - void *memcache; + void *memcache = NULL; kvm_pfn_t pfn; gfn_t gfn; int ret; - memcache = get_mmu_memcache(s2fd->vcpu); - ret = topup_mmu_memcache(s2fd->vcpu, memcache); - if (ret) - return ret; + if (!perm_fault) { + memcache = get_mmu_memcache(s2fd->vcpu); + ret = topup_mmu_memcache(s2fd->vcpu, memcache); + if (ret) + return ret; + } if (s2fd->nested) gfn = kvm_s2_trans_output(s2fd->nested) >> PAGE_SHIFT; @@ -1631,9 +1634,19 @@ static int gmem_abort(const struct kvm_s2_fault_desc *s2fd) goto out_unlock; } - ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, s2fd->fault_ipa, PAGE_SIZE, - __pfn_to_phys(pfn), prot, - memcache, flags); + if (perm_fault) { + /* + * Drop the SW bits in favour of those stored in the + * PTE, which will be preserved. + */ + prot &= ~KVM_NV_GUEST_MAP_SZ; + ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, s2fd->fault_ipa, + prot, flags); + } else { + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, s2fd->fault_ipa, PAGE_SIZE, + __pfn_to_phys(pfn), prot, + memcache, flags); + } out_unlock: kvm_release_faultin_page(kvm, page, !!ret, prot & KVM_PGTABLE_PROT_W); From fc240715fc5003538ff530e3cfb985e7769b7171 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Mon, 4 May 2026 13:28:08 +0200 Subject: [PATCH 452/972] KVM: selftests: arm64: Fix steal_time test after UAPI refactoring Fix the following failure to the steal_time test on arm64 by making the timer address known to the guest. ==== Test Assertion Failure ==== steal_time.c:229: !ret pid=18514 tid=18514 errno=22 - Invalid argument 1 0x000000000040252f: check_steal_time_uapi at steal_time.c:229 (discriminator 20) 2 (inlined by) main at steal_time.c:537 (discriminator 20) 3 0x0000ffffa23d621b: ?? ??:0 4 0x0000ffffa23d62fb: ?? ??:0 5 0x0000000000402b6f: _start at ??:? KVM_SET_DEVICE_ATTR failed, rc: -1 errno: 22 (Invalid argument) Fixes: 40351ed924dd ("KVM: selftests: Refactor UAPI tests into dedicated function") Signed-off-by: Sebastian Ott Link: https://patch.msgid.link/20260504112808.21276-1-sebott@redhat.com Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/steal_time.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index 7df2bc8eec0221..76fcdd1fd3cb48 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -220,6 +220,8 @@ static void check_steal_time_uapi(void) }; vcpu_ioctl(vcpu, KVM_HAS_DEVICE_ATTR, &dev); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, 1, 0); + virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, 1); st_ipa = (ulong)ST_GPA_BASE | 1; ret = __vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &dev); From 9a624ea3f26f40c76bd2c7f77cde30659d42efbd Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Thu, 30 Apr 2026 10:37:24 +0000 Subject: [PATCH 453/972] KVM: arm64: Remove potential UB on nvhe tracing clock update Sashiko(locally) reports possiblity of division by zero and out-of-bounds bitwise shift in trace_clock_update(). Although the clock update is untrusted, we should at least have some basic checks to avoid undefined behaviours. Reviewed-by: Vincent Donnefort Signed-off-by: Mostafa Saleh Link: https://patch.msgid.link/20260430103724.2151625-1-smostafa@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/clock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/clock.c b/arch/arm64/kvm/hyp/nvhe/clock.c index 32fc4313fe432a..a7fc61976fd0dc 100644 --- a/arch/arm64/kvm/hyp/nvhe/clock.c +++ b/arch/arm64/kvm/hyp/nvhe/clock.c @@ -35,6 +35,9 @@ void trace_clock_update(u32 mult, u32 shift, u64 epoch_ns, u64 epoch_cyc) struct clock_data *clock = &trace_clock_data; u64 bank = clock->cur ^ 1; + if (!mult || shift >= 64) + return; + clock->data[bank].mult = mult; clock->data[bank].shift = shift; clock->data[bank].epoch_ns = epoch_ns; From 54ea22273eef67196f238263bc79f27da04d2114 Mon Sep 17 00:00:00 2001 From: Steffen Eiden Date: Tue, 28 Apr 2026 18:05:12 +0200 Subject: [PATCH 454/972] MAINTAINERS: Add Steffen as reviewer for KVM/arm64 KVM/arm64 and KVM/s390 will eventually share some code. Add me as a cross-reviewer from the s390 team to arm64 to help to keep both architectures in sync. Signed-off-by: Steffen Eiden Link: https://patch.msgid.link/20260428160527.1378085-16-seiden@linux.ibm.com [maz: rephrase commit message to use future tense, since this is merged ahead of the code] Signed-off-by: Marc Zyngier --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 882214b0e7db53..3fe4ea59199c6b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14052,6 +14052,7 @@ KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64) M: Marc Zyngier M: Oliver Upton R: Joey Gouly +R: Steffen Eiden R: Suzuki K Poulose R: Zenghui Yu L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) From b819db93d73f4593636299e229914052b89e3ef2 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sun, 12 Apr 2026 21:47:42 +0300 Subject: [PATCH 455/972] Bluetooth: SCO: fix sleeping under spinlock in sco_conn_ready sco_conn_ready calls sleeping functions under conn->lock spinlock. The critical section can be reduced: conn->hcon is modified only with hdev->lock held. It is guaranteed to be held in sco_conn_ready, so conn->lock is not needed to guard it. Move taking conn->lock after lock_sock(parent). This also follows the lock ordering lock_sock() > conn->lock elsewhere in the file. Fixes: 27c24fda62b60 ("Bluetooth: switch to lock_sock in SCO") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/sco.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 18826d4b9c0bf8..3a5479538e8587 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -1377,26 +1377,24 @@ static void sco_conn_ready(struct sco_conn *conn) sk->sk_state_change(sk); release_sock(sk); } else { - sco_conn_lock(conn); - - if (!conn->hcon) { - sco_conn_unlock(conn); + if (!conn->hcon) return; - } + + lockdep_assert_held(&conn->hcon->hdev->lock); parent = sco_get_sock_listen(&conn->hcon->src); - if (!parent) { - sco_conn_unlock(conn); + if (!parent) return; - } lock_sock(parent); + sco_conn_lock(conn); + sk = sco_sock_alloc(sock_net(parent), NULL, BTPROTO_SCO, GFP_ATOMIC, 0); if (!sk) { - release_sock(parent); sco_conn_unlock(conn); + release_sock(parent); return; } @@ -1417,9 +1415,9 @@ static void sco_conn_ready(struct sco_conn *conn) /* Wake up parent */ parent->sk_data_ready(parent); - release_sock(parent); - sco_conn_unlock(conn); + + release_sock(parent); } } From 0beddb0c380bed5f5b8e61ddbe14635bb73d0b41 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Sun, 12 Apr 2026 21:29:16 +0100 Subject: [PATCH 456/972] Bluetooth: hci_conn: fix potential UAF in create_big_sync Add hci_conn_valid() check in create_big_sync() to detect stale connections before proceeding with BIG creation. Handle the resulting -ECANCELED in create_big_complete() and re-validate the connection under hci_dev_lock() before dereferencing, matching the pattern used by create_le_conn_complete() and create_pa_complete(). Keep the hci_conn object alive across the async boundary by taking a reference via hci_conn_get() when queueing create_big_sync(), and dropping it in the completion callback. The refcount and the lock are complementary: the refcount keeps the object allocated, while hci_dev_lock() serializes hci_conn_hash_del()'s list_del_rcu() on hdev->conn_hash, as required by hci_conn_del(). hci_conn_put() is called outside hci_dev_unlock() so the final put (which resolves to kfree() via bt_link_release) does not run under hdev->lock, though the release path would be safe either way. Without this, create_big_complete() would unconditionally dereference the conn pointer on error, causing a use-after-free via hci_connect_cfm() and hci_conn_del(). Fixes: eca0ae4aea66 ("Bluetooth: Add initial implementation of BIS connections") Cc: stable@vger.kernel.org Co-developed-by: Luiz Augusto von Dentz Signed-off-by: Luiz Augusto von Dentz Signed-off-by: David Carlier Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 3a059259908615..96e345fcf30362 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2130,6 +2130,9 @@ static int create_big_sync(struct hci_dev *hdev, void *data) u32 flags = 0; int err; + if (!hci_conn_valid(hdev, conn)) + return -ECANCELED; + if (qos->bcast.out.phys == BIT(1)) flags |= MGMT_ADV_FLAG_SEC_2M; @@ -2204,11 +2207,24 @@ static void create_big_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "conn %p", conn); + if (err == -ECANCELED) + goto done; + + hci_dev_lock(hdev); + + if (!hci_conn_valid(hdev, conn)) + goto unlock; + if (err) { bt_dev_err(hdev, "Unable to create BIG: %d", err); hci_connect_cfm(conn, err); hci_conn_del(conn); } + +unlock: + hci_dev_unlock(hdev); +done: + hci_conn_put(conn); } struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, @@ -2336,10 +2352,11 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, BT_BOUND, &data); /* Queue start periodic advertising and create BIG */ - err = hci_cmd_sync_queue(hdev, create_big_sync, conn, + err = hci_cmd_sync_queue(hdev, create_big_sync, hci_conn_get(conn), create_big_complete); if (err < 0) { hci_conn_drop(conn); + hci_conn_put(conn); return ERR_PTR(err); } From 5ddb8014261137cadaf83ab5617a588d80a22586 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 10 Apr 2026 15:29:52 -0400 Subject: [PATCH 457/972] Bluetooth: hci_event: Fix OOB read and infinite loop in hci_le_create_big_complete_evt hci_le_create_big_complete_evt() iterates over BT_BOUND connections for a BIG handle using a while loop, accessing ev->bis_handle[i++] on each iteration. However, there is no check that i stays within ev->num_bis before the array access. When a controller sends a LE_Create_BIG_Complete event with fewer bis_handle entries than there are BT_BOUND connections for that BIG, or with num_bis=0, the loop reads beyond the valid bis_handle[] flex array into adjacent heap memory. Since the out-of-bounds values typically exceed HCI_CONN_HANDLE_MAX (0x0EFF), hci_conn_set_handle() rejects them and the connection remains in BT_BOUND state. The same connection is then found again by hci_conn_hash_lookup_big_state(), creating an infinite loop with hci_dev_lock held. Fix this by terminating the BIG if in case not all BIS could be setup properly. Fixes: a0bfde167b50 ("Bluetooth: ISO: Add support for connecting multiple BISes") Cc: stable@vger.kernel.org Signed-off-by: ZhiTao Ou Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index b2ee6b6a0f5650..1b3b9131affaa3 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -7118,9 +7118,29 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, continue; } + if (ev->num_bis <= i) { + bt_dev_err(hdev, + "Not enough BIS handles for BIG 0x%2.2x", + ev->handle); + ev->status = HCI_ERROR_UNSPECIFIED; + hci_connect_cfm(conn, ev->status); + hci_conn_del(conn); + continue; + } + if (hci_conn_set_handle(conn, - __le16_to_cpu(ev->bis_handle[i++]))) + __le16_to_cpu(ev->bis_handle[i++]))) { + bt_dev_err(hdev, + "Failed to set BIS handle for BIG 0x%2.2x", + ev->handle); + /* Force error so BIG gets terminated as not all BIS + * could be connected. + */ + ev->status = HCI_ERROR_UNSPECIFIED; + hci_connect_cfm(conn, ev->status); + hci_conn_del(conn); continue; + } conn->state = BT_CONNECTED; set_bit(HCI_CONN_BIG_CREATED, &conn->flags); @@ -7129,7 +7149,10 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, hci_iso_setup_path(conn); } - if (!ev->status && !i) + /* If there is an unexpected error or if no BISes have been connected + * for the BIG, terminate it. + */ + if (ev->status == HCI_ERROR_UNSPECIFIED || (!ev->status && !i)) /* If no BISes have been connected for the BIG, * terminate. This is in case all bound connections * have been closed before the BIG creation From 72b8deccff17a7644e0367e1aaf1a36cfb014324 Mon Sep 17 00:00:00 2001 From: Dudu Lu Date: Wed, 15 Apr 2026 17:39:53 +0800 Subject: [PATCH 458/972] Bluetooth: bnep: fix incorrect length parsing in bnep_rx_frame() extension handling In bnep_rx_frame(), the BNEP_FILTER_NET_TYPE_SET and BNEP_FILTER_MULTI_ADDR_SET extension header parsing has two bugs: 1) The 2-byte length field is read with *(u16 *)(skb->data + 1), which performs a native-endian read. The BNEP protocol specifies this field in big-endian (network byte order), and the same file correctly uses get_unaligned_be16() for the identical fields in bnep_ctrl_set_netfilter() and bnep_ctrl_set_mcfilter(). 2) The length is multiplied by 2, but unlike BNEP_SETUP_CONN_REQ where the length byte counts UUID pairs (requiring * 2 for two UUIDs per entry), the filter extension length field already represents the total data size in bytes. This is confirmed by bnep_ctrl_set_netfilter() which reads the same field as a byte count and divides by 4 to get the number of filter entries. The bogus * 2 means skb_pull advances twice as far as it should, either dropping valid data from the next header or causing the pull to fail entirely when the doubled length exceeds the remaining skb. Fix by splitting the pull into two steps: first use skb_pull_data() to safely pull and validate the 3-byte fixed header (ctrl type + length), then pull the variable-length data using the properly decoded length. Fixes: bf8b9a9cb77b ("Bluetooth: bnep: Add support to extended headers of control frames") Signed-off-by: Dudu Lu Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/bnep/core.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index d44987d4515c0b..853c8d7644b558 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -330,11 +330,18 @@ static int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb) goto badframe; break; case BNEP_FILTER_MULTI_ADDR_SET: - case BNEP_FILTER_NET_TYPE_SET: - /* Pull: ctrl type (1 b), len (2 b), data (len bytes) */ - if (!skb_pull(skb, 3 + *(u16 *)(skb->data + 1) * 2)) + case BNEP_FILTER_NET_TYPE_SET: { + u8 *hdr; + + /* Pull ctrl type (1 b) + len (2 b) */ + hdr = skb_pull_data(skb, 3); + if (!hdr) + goto badframe; + /* Pull data (len bytes); length is big-endian */ + if (!skb_pull(skb, get_unaligned_be16(&hdr[1]))) goto badframe; break; + } default: kfree_skb(skb); return 0; From 4f42363c814f28fe3f59847c35acf1ed033bedd4 Mon Sep 17 00:00:00 2001 From: Dudu Lu Date: Wed, 15 Apr 2026 18:43:55 +0800 Subject: [PATCH 459/972] Bluetooth: l2cap: fix MPS check in l2cap_ecred_reconf_req The L2CAP specification states that if more than one channel is being reconfigured, the MPS shall not be decreased. The current check has two issues: 1) The comparison uses >= (greater-than-or-equal), which incorrectly rejects reconfiguration requests where the MPS stays the same. Since the spec says MPS "shall be greater than or equal to the current MPS", only a strict decrease (remote_mps > mps) should be rejected. Keeping the same MPS is valid. 2) The multi-channel guard uses `&& i` (loop index) to approximate "more than one channel", but this incorrectly allows MPS decrease for the first channel (i==0) even when multiple channels are being reconfigured. Replace with `&& num_scid > 1` which correctly checks whether the request covers more than one channel. Fixes: 7accb1c4321a ("Bluetooth: L2CAP: Fix invalid response to L2CAP_ECRED_RECONF_REQ") Signed-off-by: Dudu Lu Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 77dec104a9c367..b15374b951fa02 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -5428,7 +5428,7 @@ static inline int l2cap_ecred_reconf_req(struct l2cap_conn *conn, * configured, the MPS field may be less than the current MPS * of that channel. */ - if (chan[i]->remote_mps >= mps && i) { + if (chan[i]->remote_mps > mps && num_scid > 1) { BT_ERR("chan %p decreased MPS %u -> %u", chan[i], chan[i]->remote_mps, mps); result = L2CAP_RECONF_INVALID_MPS; From 91b5a598b5285da794b72619f31777b62dd336f8 Mon Sep 17 00:00:00 2001 From: Mikhail Gavrilov Date: Wed, 15 Apr 2026 02:52:37 +0500 Subject: [PATCH 460/972] Bluetooth: l2cap: defer conn param update to avoid conn->lock/hdev->lock inversion When a BLE peripheral sends an L2CAP Connection Parameter Update Request the processing path is: process_pending_rx() [takes conn->lock] l2cap_le_sig_channel() l2cap_conn_param_update_req() hci_le_conn_update() [takes hdev->lock] Meanwhile other code paths take the locks in the opposite order: l2cap_chan_connect() [takes hdev->lock] ... mutex_lock(&conn->lock) l2cap_conn_ready() [hdev->lock via hci_cb_list_lock] ... mutex_lock(&conn->lock) This is a classic AB/BA deadlock which lockdep reports as a circular locking dependency when connecting a BLE MIDI keyboard (Carry-On FC-49). Fix this by making hci_le_conn_update() defer the HCI command through hci_cmd_sync_queue() so it no longer needs to take hdev->lock in the caller context. The sync callback uses __hci_cmd_sync_status_sk() to wait for the HCI_EV_LE_CONN_UPDATE_COMPLETE event, then updates the stored connection parameters (hci_conn_params) and notifies userspace (mgmt_new_conn_param) only after the controller has confirmed the update. A reference on hci_conn is held via hci_conn_get()/hci_conn_put() for the lifetime of the queued work to prevent use-after-free, and hci_conn_valid() is checked before proceeding in case the connection was removed while the work was pending. The hci_dev_lock is held across hci_conn_valid() and all conn field accesses to prevent a concurrent disconnect from invalidating the connection mid-use. Fixes: f044eb0524a0 ("Bluetooth: Store latency and supervision timeout in connection params") Signed-off-by: Mikhail Gavrilov Reviewed-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 2 +- net/bluetooth/hci_conn.c | 105 +++++++++++++++++++++++++------ net/bluetooth/l2cap_core.c | 12 +--- 3 files changed, 89 insertions(+), 30 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a7bffb908c1ec9..aa600fbf9a5359 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -2495,7 +2495,7 @@ void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle, bdaddr_t *bdaddr, u8 addr_type); int hci_abort_conn(struct hci_conn *conn, u8 reason); -u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, +void hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, u16 to_multiplier); void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, __u8 ltk[16], __u8 key_size); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 96e345fcf30362..17b46ad6a34964 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -480,40 +480,107 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle) return hci_setup_sync_conn(conn, handle); } -u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, - u16 to_multiplier) +struct le_conn_update_data { + struct hci_conn *conn; + u16 min; + u16 max; + u16 latency; + u16 to_multiplier; +}; + +static int le_conn_update_sync(struct hci_dev *hdev, void *data) { - struct hci_dev *hdev = conn->hdev; + struct le_conn_update_data *d = data; + struct hci_conn *conn = d->conn; struct hci_conn_params *params; struct hci_cp_le_conn_update cp; + u16 timeout; + u8 store_hint; + int err; + /* Verify connection is still alive and read conn fields under + * the same lock to prevent a concurrent disconnect from freeing + * or reusing the connection while we build the HCI command. + */ hci_dev_lock(hdev); - params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); - if (params) { - params->conn_min_interval = min; - params->conn_max_interval = max; - params->conn_latency = latency; - params->supervision_timeout = to_multiplier; + if (!hci_conn_valid(hdev, conn)) { + hci_dev_unlock(hdev); + return -ECANCELED; } - hci_dev_unlock(hdev); - memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); - cp.conn_interval_min = cpu_to_le16(min); - cp.conn_interval_max = cpu_to_le16(max); - cp.conn_latency = cpu_to_le16(latency); - cp.supervision_timeout = cpu_to_le16(to_multiplier); + cp.conn_interval_min = cpu_to_le16(d->min); + cp.conn_interval_max = cpu_to_le16(d->max); + cp.conn_latency = cpu_to_le16(d->latency); + cp.supervision_timeout = cpu_to_le16(d->to_multiplier); cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = cpu_to_le16(0x0000); + timeout = conn->conn_timeout; - hci_send_cmd(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp); + hci_dev_unlock(hdev); - if (params) - return 0x01; + err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CONN_UPDATE, + sizeof(cp), &cp, + HCI_EV_LE_CONN_UPDATE_COMPLETE, + timeout, NULL); + if (err) + return err; - return 0x00; + /* Update stored connection parameters after the controller has + * confirmed the update via the LE Connection Update Complete event. + */ + hci_dev_lock(hdev); + + params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); + if (params) { + params->conn_min_interval = d->min; + params->conn_max_interval = d->max; + params->conn_latency = d->latency; + params->supervision_timeout = d->to_multiplier; + store_hint = 0x01; + } else { + store_hint = 0x00; + } + + hci_dev_unlock(hdev); + + mgmt_new_conn_param(hdev, &conn->dst, conn->dst_type, store_hint, + d->min, d->max, d->latency, d->to_multiplier); + + return 0; +} + +static void le_conn_update_complete(struct hci_dev *hdev, void *data, int err) +{ + struct le_conn_update_data *d = data; + + hci_conn_put(d->conn); + kfree(d); +} + +void hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, + u16 to_multiplier) +{ + struct le_conn_update_data *d; + + d = kzalloc_obj(*d); + if (!d) + return; + + hci_conn_get(conn); + d->conn = conn; + d->min = min; + d->max = max; + d->latency = latency; + d->to_multiplier = to_multiplier; + + if (hci_cmd_sync_queue(conn->hdev, le_conn_update_sync, d, + le_conn_update_complete) < 0) { + hci_conn_put(conn); + kfree(d); + } } void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index b15374b951fa02..7701528f11677c 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -4706,16 +4706,8 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, l2cap_send_cmd(conn, cmd->ident, L2CAP_CONN_PARAM_UPDATE_RSP, sizeof(rsp), &rsp); - if (!err) { - u8 store_hint; - - store_hint = hci_le_conn_update(hcon, min, max, latency, - to_multiplier); - mgmt_new_conn_param(hcon->hdev, &hcon->dst, hcon->dst_type, - store_hint, min, max, latency, - to_multiplier); - - } + if (!err) + hci_le_conn_update(hcon, min, max, latency, to_multiplier); return 0; } From 2ff1a41a912de8517b4482e946dd951b7d80edbf Mon Sep 17 00:00:00 2001 From: Siwei Zhang Date: Wed, 15 Apr 2026 16:51:36 -0400 Subject: [PATCH 461/972] Bluetooth: L2CAP: Fix null-ptr-deref in l2cap_sock_state_change_cb() Add the same NULL guard already present in l2cap_sock_resume_cb() and l2cap_sock_ready_cb(). Fixes: 89bc500e41fc ("Bluetooth: Add state tracking to struct l2cap_chan") Cc: stable@kernel.org Signed-off-by: Siwei Zhang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_sock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 71e8c1b45bcee1..fb3cb70a5a39d0 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1657,6 +1657,9 @@ static void l2cap_sock_state_change_cb(struct l2cap_chan *chan, int state, { struct sock *sk = chan->data; + if (!sk) + return; + sk->sk_state = state; if (err) From 78a88d43dab8d23aeef934ed8ce34d40e6b3d613 Mon Sep 17 00:00:00 2001 From: Siwei Zhang Date: Wed, 15 Apr 2026 16:53:36 -0400 Subject: [PATCH 462/972] Bluetooth: L2CAP: Fix null-ptr-deref in l2cap_sock_get_sndtimeo_cb() Add the same NULL guard already present in l2cap_sock_resume_cb() and l2cap_sock_ready_cb(). Fixes: 8d836d71e222 ("Bluetooth: Access sk_sndtimeo indirectly in l2cap_core.c") Cc: stable@kernel.org Signed-off-by: Siwei Zhang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_sock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index fb3cb70a5a39d0..879c9f90269a87 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1761,6 +1761,9 @@ static long l2cap_sock_get_sndtimeo_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; + if (!sk) + return 0; + return READ_ONCE(sk->sk_sndtimeo); } From 0a120d96166301d7a95be75b52f843837dbd1219 Mon Sep 17 00:00:00 2001 From: Siwei Zhang Date: Wed, 15 Apr 2026 16:49:59 -0400 Subject: [PATCH 463/972] Bluetooth: L2CAP: Fix null-ptr-deref in l2cap_sock_new_connection_cb() Add the same NULL guard already present in l2cap_sock_resume_cb() and l2cap_sock_ready_cb(). Fixes: 80808e431e1e ("Bluetooth: Add l2cap_chan_ops abstraction") Cc: stable@kernel.org Signed-off-by: Siwei Zhang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_sock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 879c9f90269a87..cf590a67d3641c 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1498,6 +1498,9 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan) { struct sock *sk, *parent = chan->data; + if (!parent) + return NULL; + lock_sock(parent); /* Check for backlog size */ From 4e37f6452d586b95c346a9abdd2fb80b67794f39 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sat, 18 Apr 2026 18:41:12 +0300 Subject: [PATCH 464/972] Bluetooth: SCO: hold sk properly in sco_conn_ready sk deref in sco_conn_ready must be done either under conn->lock, or holding a refcount, to avoid concurrent close. conn->sk and parent sk is currently accessed without either, and without checking parent->sk_state: [Task 1] [Task 2] sco_sock_release sco_conn_ready sk = conn->sk lock_sock(sk) conn->sk = NULL lock_sock(sk) release_sock(sk) sco_sock_kill(sk) UAF on sk deref and similarly for access to sco_get_sock_listen() return value. Fix possible UAF by holding sk refcount in sco_conn_ready() and making sco_get_sock_listen() increase refcount. Also recheck after lock_sock that the socket is still valid. Adjust conn->sk locking so it's protected also by lock_sock() of the associated socket if any. Fixes: 27c24fda62b60 ("Bluetooth: switch to lock_sock in SCO") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/sco.c | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 3a5479538e8587..eba44525d41d98 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -472,9 +472,13 @@ static struct sock *sco_get_sock_listen(bdaddr_t *src) sk1 = sk; } + sk = sk ? sk : sk1; + if (sk) + sock_hold(sk); + read_unlock(&sco_sk_list.lock); - return sk ? sk : sk1; + return sk; } static void sco_sock_destruct(struct sock *sk) @@ -515,11 +519,13 @@ static void sco_sock_kill(struct sock *sk) BT_DBG("sk %p state %d", sk, sk->sk_state); /* Sock is dead, so set conn->sk to NULL to avoid possible UAF */ + lock_sock(sk); if (sco_pi(sk)->conn) { sco_conn_lock(sco_pi(sk)->conn); sco_pi(sk)->conn->sk = NULL; sco_conn_unlock(sco_pi(sk)->conn); } + release_sock(sk); /* Kill poor orphan */ bt_sock_unlink(&sco_sk_list, sk); @@ -1365,17 +1371,28 @@ static int sco_sock_release(struct socket *sock) static void sco_conn_ready(struct sco_conn *conn) { - struct sock *parent; - struct sock *sk = conn->sk; + struct sock *parent, *sk; + + sco_conn_lock(conn); + sk = sco_sock_hold(conn); + sco_conn_unlock(conn); BT_DBG("conn %p", conn); if (sk) { lock_sock(sk); - sco_sock_clear_timer(sk); - sk->sk_state = BT_CONNECTED; - sk->sk_state_change(sk); + + /* conn->sk may have become NULL if racing with sk close, but + * due to held hdev->lock, it can't become different sk. + */ + if (conn->sk) { + sco_sock_clear_timer(sk); + sk->sk_state = BT_CONNECTED; + sk->sk_state_change(sk); + } + release_sock(sk); + sock_put(sk); } else { if (!conn->hcon) return; @@ -1390,13 +1407,15 @@ static void sco_conn_ready(struct sco_conn *conn) sco_conn_lock(conn); + /* hdev->lock guarantees conn->sk == NULL still here */ + + if (parent->sk_state != BT_LISTEN) + goto release; + sk = sco_sock_alloc(sock_net(parent), NULL, BTPROTO_SCO, GFP_ATOMIC, 0); - if (!sk) { - sco_conn_unlock(conn); - release_sock(parent); - return; - } + if (!sk) + goto release; sco_sock_init(sk, parent); @@ -1415,9 +1434,10 @@ static void sco_conn_ready(struct sco_conn *conn) /* Wake up parent */ parent->sk_data_ready(parent); +release: sco_conn_unlock(conn); - release_sock(parent); + sock_put(parent); } } From 5917dd39db2bfc8b1b4c6ea8ed99adb4badef707 Mon Sep 17 00:00:00 2001 From: Sai Teja Aluvala Date: Mon, 20 Apr 2026 23:07:35 +0530 Subject: [PATCH 465/972] Bluetooth: btintel_pcie: treat boot stage bit 12 as warning CSR boot stage register bit 12 is documented as a device warning, not a fatal error. Rename the bit definition accordingly and stop including it in btintel_pcie_in_error(). This keeps warning-only boot stage values from being classified as errors while preserving abort-handler state as the actual error condition. Fixes: 190377500fde ("Bluetooth: btintel_pcie: Dump debug registers on error") Signed-off-by: Kiran K Signed-off-by: Sai Teja Aluvala Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btintel_pcie.c | 13 ++++++++++--- drivers/bluetooth/btintel_pcie.h | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/bluetooth/btintel_pcie.c b/drivers/bluetooth/btintel_pcie.c index 2f59c0d6f9ec4c..a3643e67b33fd1 100644 --- a/drivers/bluetooth/btintel_pcie.c +++ b/drivers/bluetooth/btintel_pcie.c @@ -289,6 +289,9 @@ static inline void btintel_pcie_dump_debug_registers(struct hci_dev *hdev) skb_put_data(skb, buf, strlen(buf)); data->boot_stage_cache = reg; + if (reg & BTINTEL_PCIE_CSR_BOOT_STAGE_DEVICE_WARNING) + bt_dev_warn(hdev, "Controller device warning (boot_stage: 0x%8.8x)", reg); + reg = btintel_pcie_rd_reg32(data, BTINTEL_PCIE_CSR_IPC_STATUS_REG); snprintf(buf, sizeof(buf), "ipc status: 0x%8.8x", reg); skb_put_data(skb, buf, strlen(buf)); @@ -880,8 +883,11 @@ static inline bool btintel_pcie_in_lockdown(struct btintel_pcie_data *data) static inline bool btintel_pcie_in_error(struct btintel_pcie_data *data) { - return (data->boot_stage_cache & BTINTEL_PCIE_CSR_BOOT_STAGE_DEVICE_ERR) || - (data->boot_stage_cache & BTINTEL_PCIE_CSR_BOOT_STAGE_ABORT_HANDLER); + if (data->boot_stage_cache & BTINTEL_PCIE_CSR_BOOT_STAGE_DEVICE_WARNING) + bt_dev_warn(data->hdev, "Controller device warning (boot_stage: 0x%8.8x)", + data->boot_stage_cache); + + return data->boot_stage_cache & BTINTEL_PCIE_CSR_BOOT_STAGE_ABORT_HANDLER; } static void btintel_pcie_msix_gp1_handler(struct btintel_pcie_data *data) @@ -914,7 +920,8 @@ static void btintel_pcie_msix_gp0_handler(struct btintel_pcie_data *data) data->img_resp_cache = reg; if (btintel_pcie_in_error(data)) { - bt_dev_err(data->hdev, "Controller in error state"); + bt_dev_err(data->hdev, "Controller in error state (boot_stage: 0x%8.8x)", + data->boot_stage_cache); btintel_pcie_dump_debug_registers(data->hdev); return; } diff --git a/drivers/bluetooth/btintel_pcie.h b/drivers/bluetooth/btintel_pcie.h index 3c7bb708362ded..f922abd1e7d881 100644 --- a/drivers/bluetooth/btintel_pcie.h +++ b/drivers/bluetooth/btintel_pcie.h @@ -48,7 +48,7 @@ #define BTINTEL_PCIE_CSR_BOOT_STAGE_OPFW (BIT(2)) #define BTINTEL_PCIE_CSR_BOOT_STAGE_ROM_LOCKDOWN (BIT(10)) #define BTINTEL_PCIE_CSR_BOOT_STAGE_IML_LOCKDOWN (BIT(11)) -#define BTINTEL_PCIE_CSR_BOOT_STAGE_DEVICE_ERR (BIT(12)) +#define BTINTEL_PCIE_CSR_BOOT_STAGE_DEVICE_WARNING (BIT(12)) #define BTINTEL_PCIE_CSR_BOOT_STAGE_ABORT_HANDLER (BIT(13)) #define BTINTEL_PCIE_CSR_BOOT_STAGE_DEVICE_HALTED (BIT(14)) #define BTINTEL_PCIE_CSR_BOOT_STAGE_MAC_ACCESS_ON (BIT(16)) From 902fe40bce7059722f7ffa1c378e577675cf1918 Mon Sep 17 00:00:00 2001 From: Aurelien DESBRIERES Date: Tue, 21 Apr 2026 15:53:31 +0200 Subject: [PATCH 466/972] Bluetooth: hci_uart: Fix NULL deref in recv callbacks when priv is uninitialized When a fault is injected during hci_uart line discipline setup, the proto open() callback may fail leaving hu->priv as NULL. A subsequent TIOCSTI ioctl can trigger the recv() callback before priv is initialized, causing a NULL pointer dereference. Fix all four affected HCI UART protocol drivers by adding a NULL check on hu->priv at the start of their recv() callbacks: h4, h5, ath and bcsp. Reported-by: syzbot+ff30eeab8e07b37d524e@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=ff30eeab8e07b37d524e Signed-off-by: Aurelien DESBRIERES Assisted-by: Claude:claude-sonnet-4-6 Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_ath.c | 3 +++ drivers/bluetooth/hci_bcsp.c | 3 +++ drivers/bluetooth/hci_h4.c | 3 +++ drivers/bluetooth/hci_h5.c | 3 +++ 4 files changed, 12 insertions(+) diff --git a/drivers/bluetooth/hci_ath.c b/drivers/bluetooth/hci_ath.c index fa679ad0acdfa1..8201fa7f61e848 100644 --- a/drivers/bluetooth/hci_ath.c +++ b/drivers/bluetooth/hci_ath.c @@ -191,6 +191,9 @@ static int ath_recv(struct hci_uart *hu, const void *data, int count) { struct ath_struct *ath = hu->priv; + if (!ath) + return -ENODEV; + ath->rx_skb = h4_recv_buf(hu, ath->rx_skb, data, count, ath_recv_pkts, ARRAY_SIZE(ath_recv_pkts)); if (IS_ERR(ath->rx_skb)) { diff --git a/drivers/bluetooth/hci_bcsp.c b/drivers/bluetooth/hci_bcsp.c index b386f91d8b46d5..db56eead27cebd 100644 --- a/drivers/bluetooth/hci_bcsp.c +++ b/drivers/bluetooth/hci_bcsp.c @@ -585,6 +585,9 @@ static int bcsp_recv(struct hci_uart *hu, const void *data, int count) if (!test_bit(HCI_UART_REGISTERED, &hu->flags)) return -EUNATCH; + if (!bcsp) + return -ENODEV; + BT_DBG("hu %p count %d rx_state %d rx_count %ld", hu, count, bcsp->rx_state, bcsp->rx_count); diff --git a/drivers/bluetooth/hci_h4.c b/drivers/bluetooth/hci_h4.c index a889a66a326f7a..7673727074985c 100644 --- a/drivers/bluetooth/hci_h4.c +++ b/drivers/bluetooth/hci_h4.c @@ -109,6 +109,9 @@ static int h4_recv(struct hci_uart *hu, const void *data, int count) { struct h4_struct *h4 = hu->priv; + if (!h4) + return -ENODEV; + h4->rx_skb = h4_recv_buf(hu, h4->rx_skb, data, count, h4_recv_pkts, ARRAY_SIZE(h4_recv_pkts)); if (IS_ERR(h4->rx_skb)) { diff --git a/drivers/bluetooth/hci_h5.c b/drivers/bluetooth/hci_h5.c index cfdf75dc284751..d3538371821254 100644 --- a/drivers/bluetooth/hci_h5.c +++ b/drivers/bluetooth/hci_h5.c @@ -587,6 +587,9 @@ static int h5_recv(struct hci_uart *hu, const void *data, int count) struct h5 *h5 = hu->priv; const unsigned char *ptr = data; + if (!h5) + return -ENODEV; + BT_DBG("%s pending %zu count %d", hu->hdev->name, h5->rx_pending, count); From ca40d481079c05c6891a14a798c79596fd2d5f0c Mon Sep 17 00:00:00 2001 From: SeungJu Cheon Date: Tue, 21 Apr 2026 11:51:21 +0900 Subject: [PATCH 467/972] Bluetooth: ISO: Fix data-race on dst in iso_sock_connect() iso_sock_connect() copies the destination address into iso_pi(sk)->dst under lock_sock, then releases the lock and reads it back with bacmp() to decide between the CIS and BIS connect paths: lock_sock(sk); bacpy(&iso_pi(sk)->dst, &sa->iso_bdaddr); iso_pi(sk)->dst_type = sa->iso_bdaddr_type; release_sock(sk); if (bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) // <- no lock held This read after release_sock() races with any concurrent write to iso_pi(sk)->dst on the same socket. Fix by reading the destination address directly from the local sockaddr argument (sa->iso_bdaddr) instead of iso_pi(sk)->dst. Since sa is a function-local argument, reading it requires no locking and avoids the race. This patch addresses only the bacmp() race in iso_sock_connect(); other unprotected iso_pi(sk) accesses are fixed separately in the next patch. KCSAN report: BUG: KCSAN: data-race in memcmp+0x39/0xb0 race at unknown origin, with read to 0xffff8f96ea66dde3 of 1 bytes by task 549 on cpu 1: memcmp+0x39/0xb0 iso_sock_connect+0x275/0xb40 __sys_connect_file+0xbd/0xe0 __sys_connect+0xe0/0x110 __x64_sys_connect+0x40/0x50 x64_sys_call+0xcad/0x1c60 do_syscall_64+0x133/0x590 entry_SYSCALL_64_after_hwframe+0x77/0x7f value changed: 0x00 -> 0xee Reported by Kernel Concurrency Sanitizer on: CPU: 1 UID: 0 PID: 549 Comm: iso_race_combin Not tainted 7.0.0-08391-g1d51b370a0f8 #40 PREEMPT(lazy) Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type") Signed-off-by: SeungJu Cheon Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index be145e2736b783..290a1b9a9daa94 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1193,7 +1193,7 @@ static int iso_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, release_sock(sk); - if (bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) + if (bacmp(&sa->iso_bdaddr, BDADDR_ANY)) err = iso_connect_cis(sk); else err = iso_connect_bis(sk); From f958c7805b18e9d69f6b322b231ecee46ec6f331 Mon Sep 17 00:00:00 2001 From: SeungJu Cheon Date: Tue, 21 Apr 2026 11:51:22 +0900 Subject: [PATCH 468/972] Bluetooth: ISO: Fix data-race on iso_pi(sk) in socket and HCI event paths Several iso_pi(sk) fields (qos, qos_user_set, bc_sid, base, base_len, sync_handle, bc_num_bis) are written under lock_sock in iso_sock_setsockopt() and iso_sock_bind(), but read and written under hci_dev_lock only in two other paths: - iso_connect_bis() / iso_connect_cis(), invoked from connect(2), read qos/base/bc_sid and reset qos to default_qos on the qos_user_set validation failure -- all without lock_sock. - iso_connect_ind(), invoked from hci_rx_work, writes sync_handle, bc_sid, qos.bcast.encryption, bc_num_bis, base and base_len on PA_SYNC_ESTABLISHED / PAST_RECEIVED / BIG_INFO_ADV_REPORT / PER_ADV_REPORT events. The BIG_INFO handler additionally passes &iso_pi(sk)->qos together with sync_handle / bc_num_bis / bc_bis to hci_conn_big_create_sync() while setsockopt may be mutating them. Acquire lock_sock around the affected accesses in both paths. The locking order hci_dev_lock -> lock_sock matches the existing iso_conn_big_sync() precedent, whose comment documents the same requirement for hci_conn_big_create_sync(). The HCI connect/bind helpers do not wait for command completion -- they enqueue work via hci_cmd_sync_queue{,_once}() / hci_le_create_cis_pending() and return -- so the added hold time is comparable to iso_conn_big_sync(). KCSAN report: BUG: KCSAN: data-race in iso_connect_cis / iso_sock_setsockopt read to 0xffffa3ae8ce3cdc8 of 1 bytes by task 335 on cpu 0: iso_connect_cis+0x49f/0xa20 iso_sock_connect+0x60e/0xb40 __sys_connect_file+0xbd/0xe0 __sys_connect+0xe0/0x110 __x64_sys_connect+0x40/0x50 x64_sys_call+0xcad/0x1c60 do_syscall_64+0x133/0x590 entry_SYSCALL_64_after_hwframe+0x77/0x7f write to 0xffffa3ae8ce3cdc8 of 60 bytes by task 334 on cpu 1: iso_sock_setsockopt+0x69a/0x930 do_sock_setsockopt+0xc3/0x170 __sys_setsockopt+0xd1/0x130 __x64_sys_setsockopt+0x64/0x80 x64_sys_call+0x1547/0x1c60 do_syscall_64+0x133/0x590 entry_SYSCALL_64_after_hwframe+0x77/0x7f Reported by Kernel Concurrency Sanitizer on: CPU: 1 UID: 0 PID: 334 Comm: iso_setup_race Not tainted 7.0.0-10949-g8541d8f725c6 #44 PREEMPT(lazy) The iso_connect_ind() races were found by inspection. Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type") Signed-off-by: SeungJu Cheon Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 54 +++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 290a1b9a9daa94..7cb2864fe87241 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -347,6 +347,7 @@ static int iso_connect_bis(struct sock *sk) return -EHOSTUNREACH; hci_dev_lock(hdev); + lock_sock(sk); if (!bis_capable(hdev)) { err = -EOPNOTSUPP; @@ -399,13 +400,9 @@ static int iso_connect_bis(struct sock *sk) goto unlock; } - lock_sock(sk); - err = iso_chan_add(conn, sk, NULL); - if (err) { - release_sock(sk); + if (err) goto unlock; - } /* Update source addr of the socket */ bacpy(&iso_pi(sk)->src, &hcon->src); @@ -421,9 +418,8 @@ static int iso_connect_bis(struct sock *sk) iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo)); } - release_sock(sk); - unlock: + release_sock(sk); hci_dev_unlock(hdev); hci_dev_put(hdev); return err; @@ -444,6 +440,7 @@ static int iso_connect_cis(struct sock *sk) return -EHOSTUNREACH; hci_dev_lock(hdev); + lock_sock(sk); if (!cis_central_capable(hdev)) { err = -EOPNOTSUPP; @@ -498,13 +495,9 @@ static int iso_connect_cis(struct sock *sk) goto unlock; } - lock_sock(sk); - err = iso_chan_add(conn, sk, NULL); - if (err) { - release_sock(sk); + if (err) goto unlock; - } /* Update source addr of the socket */ bacpy(&iso_pi(sk)->src, &hcon->src); @@ -520,9 +513,8 @@ static int iso_connect_cis(struct sock *sk) iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo)); } - release_sock(sk); - unlock: + release_sock(sk); hci_dev_unlock(hdev); hci_dev_put(hdev); return err; @@ -2256,8 +2248,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sid, ev1); if (sk && !ev1->status) { + lock_sock(sk); iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle); iso_pi(sk)->bc_sid = ev1->sid; + release_sock(sk); } goto done; @@ -2268,8 +2262,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sid_past, ev1a); if (sk && !ev1a->status) { + lock_sock(sk); iso_pi(sk)->sync_handle = le16_to_cpu(ev1a->sync_handle); iso_pi(sk)->bc_sid = ev1a->sid; + release_sock(sk); } goto done; @@ -2296,27 +2292,35 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) ev2); if (sk) { - int err; - struct hci_conn *hcon = iso_pi(sk)->conn->hcon; + int err = 0; + bool big_sync; + struct hci_conn *hcon; + lock_sock(sk); + + hcon = iso_pi(sk)->conn->hcon; iso_pi(sk)->qos.bcast.encryption = ev2->encryption; if (ev2->num_bis < iso_pi(sk)->bc_num_bis) iso_pi(sk)->bc_num_bis = ev2->num_bis; - if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) && - !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { + big_sync = !test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) && + !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags); + + if (big_sync) err = hci_conn_big_create_sync(hdev, hcon, &iso_pi(sk)->qos, iso_pi(sk)->sync_handle, iso_pi(sk)->bc_num_bis, iso_pi(sk)->bc_bis); - if (err) { - bt_dev_err(hdev, "hci_le_big_create_sync: %d", - err); - sock_put(sk); - sk = NULL; - } + + release_sock(sk); + + if (big_sync && err) { + bt_dev_err(hdev, "hci_le_big_create_sync: %d", + err); + sock_put(sk); + sk = NULL; } } @@ -2370,8 +2374,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) if (!base || base_len > BASE_MAX_LENGTH) goto done; + lock_sock(sk); memcpy(iso_pi(sk)->base, base, base_len); iso_pi(sk)->base_len = base_len; + release_sock(sk); } else { /* This is a PA data fragment. Keep pa_data_len set to 0 * until all data has been reassembled. From 634a4408c0615c523cf7531790f4f14a422b9206 Mon Sep 17 00:00:00 2001 From: Tristan Madani Date: Tue, 21 Apr 2026 11:14:54 +0000 Subject: [PATCH 469/972] Bluetooth: btmtk: validate WMT event SKB length before struct access btmtk_usb_hci_wmt_sync() casts the WMT event response SKB data to struct btmtk_hci_wmt_evt (7 bytes) and struct btmtk_hci_wmt_evt_funcc (9 bytes) without first checking that the SKB contains enough data. A short firmware response causes out-of-bounds reads from SKB tailroom. Use skb_pull_data() to validate and advance past the base WMT event header. For the FUNC_CTRL case, pull the additional status field bytes before accessing them. Fixes: d019930b0049 ("Bluetooth: btmtk: move btusb_mtk_hci_wmt_sync to btmtk.c") Cc: stable@vger.kernel.org Signed-off-by: Tristan Madani Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btmtk.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/bluetooth/btmtk.c b/drivers/bluetooth/btmtk.c index 6fb6ca2748086e..f70c1b0f899035 100644 --- a/drivers/bluetooth/btmtk.c +++ b/drivers/bluetooth/btmtk.c @@ -695,8 +695,13 @@ static int btmtk_usb_hci_wmt_sync(struct hci_dev *hdev, if (data->evt_skb == NULL) goto err_free_wc; - /* Parse and handle the return WMT event */ - wmt_evt = (struct btmtk_hci_wmt_evt *)data->evt_skb->data; + wmt_evt = skb_pull_data(data->evt_skb, sizeof(*wmt_evt)); + if (!wmt_evt) { + bt_dev_err(hdev, "WMT event too short (%u bytes)", + data->evt_skb->len); + err = -EINVAL; + goto err_free_skb; + } if (wmt_evt->whdr.op != hdr->op) { bt_dev_err(hdev, "Wrong op received %d expected %d", wmt_evt->whdr.op, hdr->op); @@ -712,6 +717,12 @@ static int btmtk_usb_hci_wmt_sync(struct hci_dev *hdev, status = BTMTK_WMT_PATCH_DONE; break; case BTMTK_WMT_FUNC_CTRL: + if (!skb_pull_data(data->evt_skb, + sizeof(wmt_evt_funcc->status))) { + err = -EINVAL; + goto err_free_skb; + } + wmt_evt_funcc = (struct btmtk_hci_wmt_evt_funcc *)wmt_evt; if (be16_to_cpu(wmt_evt_funcc->status) == 0x404) status = BTMTK_WMT_ON_DONE; From 21bd244b6de5d2fe1063c23acc93fbdd2b20d112 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Tue, 21 Apr 2026 13:08:44 -0400 Subject: [PATCH 470/972] Bluetooth: virtio_bt: clamp rx length before skb_put virtbt_rx_work() calls skb_put(skb, len) where len comes directly from virtqueue_get_buf() with no validation against the buffer we posted to the device. The RX skb is allocated in virtbt_add_inbuf() and exposed to virtio as exactly 1000 bytes via sg_init_one(). Checking len against skb_tailroom(skb) is not sufficient because alloc_skb() can leave more tailroom than the 1000 bytes actually handed to the device. A malicious or buggy backend can therefore report used.len between 1001 and skb_tailroom(skb), causing skb_put() to include uninitialized kernel heap bytes that were never written by the device. The same path also accepts len == 0, in which case skb_put(skb, 0) leaves the skb empty but virtbt_rx_handle() still reads the pkt_type byte from skb->data, consuming uninitialized memory. Define VIRTBT_RX_BUF_SIZE once and reuse it in alloc_skb() and sg_init_one(), and gate virtbt_rx_work() on that same constant so the bound checked matches the buffer actually exposed to the device. Reject used.len == 0 in the same gate so an empty completion can no longer reach virtbt_rx_handle(). Use bt_dev_err_ratelimited() because the length value comes from an untrusted backend that can otherwise flood the kernel log. Same class of bug as commit c04db81cd028 ("net/9p: Fix buffer overflow in USB transport layer"), which hardened the USB 9p transport against unchecked device-reported length. Fixes: 160fbcf3bfb9 ("Bluetooth: virtio_bt: Use skb_put to set length") Cc: stable@vger.kernel.org Cc: Soenke Huster Signed-off-by: Michael Bommarito Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/virtio_bt.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/bluetooth/virtio_bt.c b/drivers/bluetooth/virtio_bt.c index 76d61af8a275e9..2c5c39356a1c81 100644 --- a/drivers/bluetooth/virtio_bt.c +++ b/drivers/bluetooth/virtio_bt.c @@ -12,6 +12,7 @@ #include #define VERSION "0.1" +#define VIRTBT_RX_BUF_SIZE 1000 enum { VIRTBT_VQ_TX, @@ -33,11 +34,11 @@ static int virtbt_add_inbuf(struct virtio_bluetooth *vbt) struct sk_buff *skb; int err; - skb = alloc_skb(1000, GFP_KERNEL); + skb = alloc_skb(VIRTBT_RX_BUF_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; - sg_init_one(sg, skb->data, 1000); + sg_init_one(sg, skb->data, VIRTBT_RX_BUF_SIZE); err = virtqueue_add_inbuf(vq, sg, 1, skb, GFP_KERNEL); if (err < 0) { @@ -227,8 +228,15 @@ static void virtbt_rx_work(struct work_struct *work) if (!skb) return; - skb_put(skb, len); - virtbt_rx_handle(vbt, skb); + if (!len || len > VIRTBT_RX_BUF_SIZE) { + bt_dev_err_ratelimited(vbt->hdev, + "rx reply len %u outside [1, %u]\n", + len, VIRTBT_RX_BUF_SIZE); + kfree_skb(skb); + } else { + skb_put(skb, len); + virtbt_rx_handle(vbt, skb); + } if (virtbt_add_inbuf(vbt) < 0) return; From daf23014e5d975e72ea9c02b5160d3fcf070ea47 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Tue, 21 Apr 2026 13:08:45 -0400 Subject: [PATCH 471/972] Bluetooth: virtio_bt: validate rx pkt_type header length virtbt_rx_handle() reads the leading pkt_type byte from the RX skb and forwards the remainder to hci_recv_frame() for every event/ACL/SCO/ISO type, without checking that the remaining payload is at least the fixed HCI header for that type. After the preceding patch bounds the backend-supplied used.len to [1, VIRTBT_RX_BUF_SIZE], a one-byte completion still reaches hci_recv_frame() with skb->len already pulled to 0. If the byte happened to be HCI_ACLDATA_PKT, the ACL-vs-ISO classification fast-path in hci_dev_classify_pkt_type() dereferences hci_acl_hdr(skb)->handle whenever the HCI device has an active CIS_LINK, BIS_LINK, or PA_LINK connection, reading two bytes of uninitialized RX-buffer data. The same hazard exists for every packet type the driver accepts because none of the switch cases in virtbt_rx_handle() check skb->len against the per-type minimum HCI header size before handing the frame to the core. After stripping pkt_type, require skb->len to cover the fixed header size for the selected type (event 2, ACL 4, SCO 3, ISO 4) before calling hci_recv_frame(); drop ratelimited otherwise. Unknown pkt_type values still take the original kfree_skb() default path. Use bt_dev_err_ratelimited() because both the length and pkt_type values come from an untrusted backend that can otherwise flood the kernel log. Fixes: 160fbcf3bfb9 ("Bluetooth: virtio_bt: Use skb_put to set length") Cc: stable@vger.kernel.org Cc: Soenke Huster Signed-off-by: Michael Bommarito Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/virtio_bt.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/bluetooth/virtio_bt.c b/drivers/bluetooth/virtio_bt.c index 2c5c39356a1c81..140ab55c9fc5a9 100644 --- a/drivers/bluetooth/virtio_bt.c +++ b/drivers/bluetooth/virtio_bt.c @@ -198,6 +198,7 @@ static int virtbt_shutdown_generic(struct hci_dev *hdev) static void virtbt_rx_handle(struct virtio_bluetooth *vbt, struct sk_buff *skb) { + size_t min_hdr; __u8 pkt_type; pkt_type = *((__u8 *) skb->data); @@ -205,16 +206,32 @@ static void virtbt_rx_handle(struct virtio_bluetooth *vbt, struct sk_buff *skb) switch (pkt_type) { case HCI_EVENT_PKT: + min_hdr = sizeof(struct hci_event_hdr); + break; case HCI_ACLDATA_PKT: + min_hdr = sizeof(struct hci_acl_hdr); + break; case HCI_SCODATA_PKT: + min_hdr = sizeof(struct hci_sco_hdr); + break; case HCI_ISODATA_PKT: - hci_skb_pkt_type(skb) = pkt_type; - hci_recv_frame(vbt->hdev, skb); + min_hdr = sizeof(struct hci_iso_hdr); break; default: kfree_skb(skb); - break; + return; } + + if (skb->len < min_hdr) { + bt_dev_err_ratelimited(vbt->hdev, + "rx pkt_type 0x%02x payload %u < hdr %zu\n", + pkt_type, skb->len, min_hdr); + kfree_skb(skb); + return; + } + + hci_skb_pkt_type(skb) = pkt_type; + hci_recv_frame(vbt->hdev, skb); } static void virtbt_rx_work(struct work_struct *work) From 8f59d17b18a78fdfdbb67d693b3d3eb03db184e0 Mon Sep 17 00:00:00 2001 From: Pengpeng Hou Date: Thu, 23 Apr 2026 23:31:00 +0800 Subject: [PATCH 472/972] Bluetooth: RFCOMM: pull credit byte with skb_pull_data() rfcomm_recv_data() treats the first payload byte as a credit field when the UIH frame carries PF and credit-based flow control is enabled. After the header has been stripped, the PF/CFC path consumes that byte with a direct skb->data dereference followed by skb_pull(). A malformed short frame can reach this path without a byte available. Use skb_pull_data() so the length check and pull happen together before the returned credit byte is consumed. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Pengpeng Hou Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/rfcomm/core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 611a9a94151ecf..d11bd5337d573e 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -1715,9 +1715,12 @@ static int rfcomm_recv_data(struct rfcomm_session *s, u8 dlci, int pf, struct sk } if (pf && d->cfc) { - u8 credits = *(u8 *) skb->data; skb_pull(skb, 1); + u8 *credits = skb_pull_data(skb, 1); - d->tx_credits += credits; + if (!credits) + goto drop; + + d->tx_credits += *credits; if (d->tx_credits) clear_bit(RFCOMM_TX_THROTTLED, &d->flags); } From 72d97cae2a83cecf6f47208646675ecd066d0a3e Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 29 Apr 2026 15:40:46 +0200 Subject: [PATCH 473/972] Bluetooth: hci_event: fix memset typo hci_le_big_sync_established_evt() currently does: conn->num_bis = 0; memset(conn->bis, 0, sizeof(conn->num_bis)); sizeof(conn->num_bis) is wrong - it would make sense to either use conn->num_bis (before setting that to 0) or sizeof(conn->bis). Fix it by using sizeof(conn->bis), the least intrusive change. Luckily, nothing actually depends on this memset() working properly: Nothing seems to ever read from conn->bis beyond conn->num_bis, and when conn->num_bis is increased, the corresponding elements of conn->bis are initialized. So I think this line could also just be removed. This is a purely theoretical fix and should have no impact on actual behavior. Fixes: 42ecf1947135 ("Bluetooth: ISO: Do not emit LE BIG Create Sync if previous is pending") Signed-off-by: Jann Horn Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 1b3b9131affaa3..eea2f810aafab3 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -7191,7 +7191,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, clear_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags); conn->num_bis = 0; - memset(conn->bis, 0, sizeof(conn->num_bis)); + memset(conn->bis, 0, sizeof(conn->bis)); for (i = 0; i < ev->num_bis; i++) { u16 handle = le16_to_cpu(ev->bis[i]); From c5d415596cb6fbdf6334b06cc87a1a5a268d8725 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Sat, 2 May 2026 12:43:03 -0400 Subject: [PATCH 474/972] Bluetooth: HIDP: serialise l2cap_unregister_user via hidp_session_sem Commit dbf666e4fc9b ("Bluetooth: HIDP: Fix possible UAF") made hidp_session_remove() drop the L2CAP reference and set session->conn = NULL once the session is considered removed, and added a bare if (session->conn) guard around the kthread-exit l2cap_unregister_user() call in hidp_session_thread(). The sibling ioctl site in hidp_connection_del() still reads session->conn unlocked and unguarded, and the kthread-exit guard itself is a lockless double-read. hidp_session_find() drops hidp_session_sem before returning, so hidp_session_remove() can null session->conn between the lookup and the call in hidp_connection_del(). Worse, since commit 752a6c9596dd ("Bluetooth: L2CAP: Fix use-after-free in l2cap_unregister_user") takes mutex_lock(&conn->lock) inside l2cap_unregister_user(), a stale non-NULL snapshot also UAFs on conn->lock. v1 only added an if (session->conn) guard at the ioctl site, which doesn't address either race; Luiz suggested snapshotting session->conn under the sem and clearing it before the call. Taking hidp_session_sem across l2cap_unregister_user() would be wrong: l2cap_conn_del() already establishes the lock order conn->lock -> hidp_session_sem via l2cap_unregister_all_users() -> user->remove == hidp_session_remove(), so taking hidp_session_sem before conn->lock would AB/BA deadlock. Factor a helper hidp_session_unregister_conn() that under down_write(&hidp_session_sem) snapshots session->conn and clears the member, then outside the sem calls l2cap_unregister_user() and l2cap_conn_put() on the snapshot. Call it from both hidp_connection_del() and hidp_session_thread()'s exit path. At most one consumer wins the write-sem; later callers observe session->conn == NULL and skip the unregister and put, so the reference hidp_session_new() took via l2cap_conn_get() is consumed exactly once. session_free() already tolerates a NULL session->conn. Fixes: dbf666e4fc9b ("Bluetooth: HIDP: Fix possible UAF") Suggested-by: Luiz Augusto von Dentz Link: https://lore.kernel.org/all/20260422011437.176643-1-michael.bommarito@gmail.com/ Signed-off-by: Michael Bommarito Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hidp/core.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 7bcf8c5ceaeedc..976f91eeb745e4 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -1035,6 +1035,28 @@ static struct hidp_session *hidp_session_find(const bdaddr_t *bdaddr) return session; } +/* + * Consume session->conn: clear the member under hidp_session_sem, then + * l2cap_unregister_user() and l2cap_conn_put() the snapshot outside the + * sem. At most one caller wins; later callers see NULL and skip. The + * reference is the one hidp_session_new() took via l2cap_conn_get(). + */ +static void hidp_session_unregister_conn(struct hidp_session *session) +{ + struct l2cap_conn *conn; + + down_write(&hidp_session_sem); + conn = session->conn; + if (conn) + session->conn = NULL; + up_write(&hidp_session_sem); + + if (conn) { + l2cap_unregister_user(conn, &session->user); + l2cap_conn_put(conn); + } +} + /* * Start session synchronously * This starts a session thread and waits until initialization @@ -1311,8 +1333,7 @@ static int hidp_session_thread(void *arg) * Instead, this call has the same semantics as if user-space tried to * delete the session. */ - if (session->conn) - l2cap_unregister_user(session->conn, &session->user); + hidp_session_unregister_conn(session); hidp_session_put(session); @@ -1418,7 +1439,7 @@ int hidp_connection_del(struct hidp_conndel_req *req) HIDP_CTRL_VIRTUAL_CABLE_UNPLUG, NULL, 0); else - l2cap_unregister_user(session->conn, &session->user); + hidp_session_unregister_conn(session); hidp_session_put(session); From 0e1368a28dd5231ae0dbe240dfe0ff2657de5647 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 May 2026 17:22:05 -0700 Subject: [PATCH 475/972] selftests: drv-net: fix sort order of makefile and config Recent changes added configs and tests in the wrong spot. Link: https://lore.kernel.org/20260506170435.34984dfc@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/Makefile | 2 +- tools/testing/selftests/drivers/net/hw/config | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 3b6ff4708005d9..82809d5b24780c 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -30,8 +30,8 @@ TEST_PROGS = \ gro_hw.py \ hw_stats_l3.sh \ hw_stats_l3_gre.sh \ - ipsec_vxlan.py \ iou-zcrx.py \ + ipsec_vxlan.py \ irq.py \ loopback.sh \ nic_timestamp.py \ diff --git a/tools/testing/selftests/drivers/net/hw/config b/tools/testing/selftests/drivers/net/hw/config index ae0168c2bbe6fa..8c132ace2b8de1 100644 --- a/tools/testing/selftests/drivers/net/hw/config +++ b/tools/testing/selftests/drivers/net/hw/config @@ -3,6 +3,10 @@ CONFIG_FAIL_FUNCTION=y CONFIG_FAULT_INJECTION=y CONFIG_FAULT_INJECTION_DEBUG_FS=y CONFIG_FUNCTION_ERROR_INJECTION=y +CONFIG_INET6_ESP=y +CONFIG_INET6_ESP_OFFLOAD=y +CONFIG_INET_ESP=y +CONFIG_INET_ESP_OFFLOAD=y CONFIG_IO_URING=y CONFIG_IPV6=y CONFIG_IPV6_GRE=y @@ -12,10 +16,6 @@ CONFIG_NET_IPGRE=y CONFIG_NET_IPGRE_DEMUX=y CONFIG_NETKIT=y CONFIG_NET_SCH_INGRESS=y -CONFIG_INET6_ESP=y -CONFIG_INET6_ESP_OFFLOAD=y -CONFIG_INET_ESP=y -CONFIG_INET_ESP_OFFLOAD=y CONFIG_UDMABUF=y CONFIG_VXLAN=y CONFIG_XFRM_USER=y From 7aaa8f5e45a92678256c1e17f1fa2c2f45c61dd1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 May 2026 13:00:56 +0000 Subject: [PATCH 476/972] ipv6: fix potential UAF caused by ip6_forward_proxy_check() ip6_forward_proxy_check() calls pskb_may_pull() which might re-allocate skb->head. Reload ipv6_hdr() after the pskb_may_pull() call to avoid using the freed memory. Fixes: e21e0b5f19ac ("[IPV6] NDISC: Handle NDP messages to proxied addresses.") Reported-by: Damiano Melotti Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260505130056.2927197-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_output.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1f2a33fbed6e63..c14adcdd43960d 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -468,6 +468,7 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) default: break; } + hdr = ipv6_hdr(skb); } /* @@ -582,6 +583,8 @@ int ip6_forward(struct sk_buff *skb) if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) { int proxied = ip6_forward_proxy_check(skb); + + hdr = ipv6_hdr(skb); if (proxied > 0) { /* It's tempting to decrease the hop limit * here by 1, as we do at the end of the From 7ce3f1bedaac88880594720ba0f687da3bd7fc8a Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Tue, 5 May 2026 03:42:23 -0700 Subject: [PATCH 477/972] netdevsim: psp: only call nsim_psp_uninit() on PFs VFs go through nsim_init_netdevsim_vf() which never calls nsim_psp_init(), so ns->psp.dev stays NULL. nsim_psp_uninit() guards with !IS_ERR(ns->psp.dev), so destroying a VF reaches psp_dev_unregister(NULL) and dereferences NULL on the first mutex_lock(&psd->lock): BUG: kernel NULL pointer dereference, address: 0000000000000020 RIP: 0010:mutex_lock+0x1c/0x30 Call Trace: psp_dev_unregister+0x2a/0x1a0 nsim_psp_uninit+0x1f/0x40 [netdevsim] nsim_destroy+0x61/0x1e0 [netdevsim] __nsim_dev_port_del+0x47/0x90 [netdevsim] nsim_drv_configure_vfs+0xc9/0x130 [netdevsim] nsim_bus_dev_numvfs_store+0x79/0xb0 [netdevsim] Gate nsim_psp_uninit() on nsim_dev_port_is_pf(), matching the pattern already used for nsim_exit_netdevsim() and the bpf/ipsec/macsec/queue teardowns. Reproducer: modprobe netdevsim echo "10 1" > /sys/bus/netdevsim/new_device echo 1 > /sys/bus/netdevsim/devices/netdevsim10/sriov_numvfs devlink dev eswitch set netdevsim/netdevsim10 mode switchdev echo 0 > /sys/bus/netdevsim/devices/netdevsim10/sriov_numvfs Fixes: f857478d6206 ("netdevsim: a basic test PSP implementation") Signed-off-by: Daniel Zahka Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260505-psd-rcu-v1-1-a8f69ec1ab96@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/netdev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index a05af192caf325..a750768912b5a1 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -1182,7 +1182,8 @@ void nsim_destroy(struct netdevsim *ns) unregister_netdevice_notifier_dev_net(ns->netdev, &ns->nb, &ns->nn); - nsim_psp_uninit(ns); + if (nsim_dev_port_is_pf(ns->nsim_dev_port)) + nsim_psp_uninit(ns); rtnl_lock(); peer = rtnl_dereference(ns->peer); From 24c96a42006ee27a078ec8c631c906dea8a3ca6d Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Tue, 5 May 2026 03:42:24 -0700 Subject: [PATCH 478/972] netdevsim: psp: serialize calls to nsim_psp_uninit() The debugfs write handler, nsim_psp_rereg_write(), can race against nsim_destroy() and against itself, causing nsim_psp_uninit() to run more than once concurrently. Two complementary changes serialize all callers: 1. Delete the psp_rereg debugfs file from nsim_psp_uninit() before doing the actual teardown. debugfs_remove() drains any in-flight writers and prevents new ones from starting. 2. Add a mutex around the body of nsim_psp_rereg_write() so that two concurrent userspace writers cannot both enter the teardown path at once. The teardown work itself is moved into a new __nsim_psp_uninit() that the rereg handler calls under the mutex, while the public nsim_psp_uninit() wraps it with the debugfs_remove()/mutex_destroy() pair so nsim_destroy() doesn't have to know about the psp internals. Fixes: f857478d6206 ("netdevsim: a basic test PSP implementation") Signed-off-by: Daniel Zahka Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260505-psd-rcu-v1-2-a8f69ec1ab96@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/netdevsim.h | 2 ++ drivers/net/netdevsim/psp.c | 17 ++++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index 7e129dddbbe7f9..e373ffc26b0c0f 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -121,6 +121,8 @@ struct netdevsim { u64_stats_t tx_bytes; struct u64_stats_sync syncp; struct psp_dev *dev; + struct dentry *rereg; + struct mutex rereg_lock; u32 spi; u32 assoc_cnt; } psp; diff --git a/drivers/net/netdevsim/psp.c b/drivers/net/netdevsim/psp.c index 0b4d717253b080..86d84b7e566b28 100644 --- a/drivers/net/netdevsim/psp.c +++ b/drivers/net/netdevsim/psp.c @@ -209,13 +209,20 @@ static struct psp_dev_caps nsim_psp_caps = { .assoc_drv_spc = sizeof(void *), }; -void nsim_psp_uninit(struct netdevsim *ns) +static void __nsim_psp_uninit(struct netdevsim *ns) { if (!IS_ERR(ns->psp.dev)) psp_dev_unregister(ns->psp.dev); WARN_ON(ns->psp.assoc_cnt); } +void nsim_psp_uninit(struct netdevsim *ns) +{ + debugfs_remove(ns->psp.rereg); + mutex_destroy(&ns->psp.rereg_lock); + __nsim_psp_uninit(ns); +} + static ssize_t nsim_psp_rereg_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) @@ -223,11 +230,13 @@ nsim_psp_rereg_write(struct file *file, const char __user *data, size_t count, struct netdevsim *ns = file->private_data; int err; - nsim_psp_uninit(ns); + mutex_lock(&ns->psp.rereg_lock); + __nsim_psp_uninit(ns); ns->psp.dev = psp_dev_create(ns->netdev, &nsim_psp_ops, &nsim_psp_caps, ns); err = PTR_ERR_OR_ZERO(ns->psp.dev); + mutex_unlock(&ns->psp.rereg_lock); return err ?: count; } @@ -249,6 +258,8 @@ int nsim_psp_init(struct netdevsim *ns) if (err) return err; - debugfs_create_file("psp_rereg", 0200, ddir, ns, &nsim_psp_rereg_fops); + mutex_init(&ns->psp.rereg_lock); + ns->psp.rereg = debugfs_create_file("psp_rereg", 0200, ddir, ns, + &nsim_psp_rereg_fops); return 0; } From 07bdec3fc737aac7f4c273aafa803d353174c43e Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Tue, 5 May 2026 03:42:25 -0700 Subject: [PATCH 479/972] netdevsim: psp: rcu protect psp_dev reference There are two issues with the way psp_dev is used in nsim_do_psp(): 1. There is no check for IS_ERR() on the peers psp_dev, before dereferencing. 2. The refcount on this psp_dev can be dropped by nsim_psp_rereg_write() To fix this, we can make netdevsim's reference to its psp_dev an rcu reference, and then nsim_do_psp() can read the fields it needs from an rcu critical section. Fixes: f857478d6206 ("netdevsim: a basic test PSP implementation") Signed-off-by: Daniel Zahka Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260505-psd-rcu-v1-3-a8f69ec1ab96@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/netdevsim.h | 2 +- drivers/net/netdevsim/psp.c | 54 ++++++++++++++++++++----------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index e373ffc26b0c0f..d909c4160ea1fe 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -120,7 +120,7 @@ struct netdevsim { u64_stats_t tx_packets; u64_stats_t tx_bytes; struct u64_stats_sync syncp; - struct psp_dev *dev; + struct psp_dev __rcu *dev; struct dentry *rereg; struct mutex rereg_lock; u32 spi; diff --git a/drivers/net/netdevsim/psp.c b/drivers/net/netdevsim/psp.c index 86d84b7e566b28..6936ecb8173e20 100644 --- a/drivers/net/netdevsim/psp.c +++ b/drivers/net/netdevsim/psp.c @@ -19,6 +19,7 @@ nsim_do_psp(struct sk_buff *skb, struct netdevsim *ns, struct netdevsim *peer_ns, struct skb_ext **psp_ext) { enum skb_drop_reason rc = 0; + struct psp_dev *peer_psd; struct psp_assoc *pas; struct net *net; void **ptr; @@ -48,7 +49,8 @@ nsim_do_psp(struct sk_buff *skb, struct netdevsim *ns, } /* Now pretend we just received this frame */ - if (peer_ns->psp.dev->config.versions & (1 << pas->version)) { + peer_psd = rcu_dereference(peer_ns->psp.dev); + if (peer_psd && peer_psd->config.versions & (1 << pas->version)) { bool strip_icv = false; u8 generation; @@ -61,8 +63,7 @@ nsim_do_psp(struct sk_buff *skb, struct netdevsim *ns, skb_ext_reset(skb); skb->mac_len = ETH_HLEN; - if (psp_dev_rcv(skb, peer_ns->psp.dev->id, generation, - strip_icv)) { + if (psp_dev_rcv(skb, peer_psd->id, generation, strip_icv)) { rc = SKB_DROP_REASON_PSP_OUTPUT; goto out_unlock; } @@ -209,10 +210,18 @@ static struct psp_dev_caps nsim_psp_caps = { .assoc_drv_spc = sizeof(void *), }; -static void __nsim_psp_uninit(struct netdevsim *ns) +static void __nsim_psp_uninit(struct netdevsim *ns, bool teardown) { - if (!IS_ERR(ns->psp.dev)) - psp_dev_unregister(ns->psp.dev); + struct psp_dev *psd; + + psd = rcu_dereference_protected(ns->psp.dev, + teardown || + lockdep_is_held(&ns->psp.rereg_lock)); + if (psd) { + rcu_assign_pointer(ns->psp.dev, NULL); + synchronize_rcu(); + psp_dev_unregister(psd); + } WARN_ON(ns->psp.assoc_cnt); } @@ -220,7 +229,7 @@ void nsim_psp_uninit(struct netdevsim *ns) { debugfs_remove(ns->psp.rereg); mutex_destroy(&ns->psp.rereg_lock); - __nsim_psp_uninit(ns); + __nsim_psp_uninit(ns, true); } static ssize_t @@ -228,16 +237,23 @@ nsim_psp_rereg_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) { struct netdevsim *ns = file->private_data; - int err; + struct psp_dev *psd; + ssize_t ret; mutex_lock(&ns->psp.rereg_lock); - __nsim_psp_uninit(ns); + __nsim_psp_uninit(ns, false); + + psd = psp_dev_create(ns->netdev, &nsim_psp_ops, &nsim_psp_caps, ns); + if (IS_ERR(psd)) { + ret = PTR_ERR(psd); + goto out; + } - ns->psp.dev = psp_dev_create(ns->netdev, &nsim_psp_ops, - &nsim_psp_caps, ns); - err = PTR_ERR_OR_ZERO(ns->psp.dev); + rcu_assign_pointer(ns->psp.dev, psd); + ret = count; +out: mutex_unlock(&ns->psp.rereg_lock); - return err ?: count; + return ret; } static const struct file_operations nsim_psp_rereg_fops = { @@ -250,13 +266,13 @@ static const struct file_operations nsim_psp_rereg_fops = { int nsim_psp_init(struct netdevsim *ns) { struct dentry *ddir = ns->nsim_dev_port->ddir; - int err; + struct psp_dev *psd; + + psd = psp_dev_create(ns->netdev, &nsim_psp_ops, &nsim_psp_caps, ns); + if (IS_ERR(psd)) + return PTR_ERR(psd); - ns->psp.dev = psp_dev_create(ns->netdev, &nsim_psp_ops, - &nsim_psp_caps, ns); - err = PTR_ERR_OR_ZERO(ns->psp.dev); - if (err) - return err; + rcu_assign_pointer(ns->psp.dev, psd); mutex_init(&ns->psp.rereg_lock); ns->psp.rereg = debugfs_create_file("psp_rereg", 0200, ddir, ns, From 701ea57feaabdea403cf299ee5cd0445083bc0ac Mon Sep 17 00:00:00 2001 From: Shitalkumar Gandhi Date: Tue, 5 May 2026 18:02:36 +0530 Subject: [PATCH 480/972] net: rtsn: fix mdio_node leak in rtsn_mdio_alloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit of_get_child_by_name() takes a reference. The rtsn_reset() and rtsn_change_mode() failure paths jump to out_free_bus and leak mdio_node. Add out_put_node to drop it before falling through. Fixes: b0d3969d2b4d ("net: ethernet: rtsn: Add support for Renesas Ethernet-TSN") Signed-off-by: Shitalkumar Gandhi Reviewed-by: Geert Uytterhoeven Reviewed-by: Andrew Lunn Reviewed-by: Niklas Söderlund Link: https://patch.msgid.link/20260505123236.406000-1-shitalkumar.gandhi@cambiumnetworks.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rtsn.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/renesas/rtsn.c b/drivers/net/ethernet/renesas/rtsn.c index 03a2669f051879..ee8381b60b8d49 100644 --- a/drivers/net/ethernet/renesas/rtsn.c +++ b/drivers/net/ethernet/renesas/rtsn.c @@ -797,11 +797,11 @@ static int rtsn_mdio_alloc(struct rtsn_private *priv) /* Enter config mode before registering the MDIO bus */ ret = rtsn_reset(priv); if (ret) - goto out_free_bus; + goto out_put_node; ret = rtsn_change_mode(priv, OCR_OPC_CONFIG); if (ret) - goto out_free_bus; + goto out_put_node; rtsn_modify(priv, MPIC, MPIC_PSMCS_MASK | MPIC_PSMHT_MASK, MPIC_PSMCS_DEFAULT | MPIC_PSMHT_DEFAULT); @@ -824,6 +824,8 @@ static int rtsn_mdio_alloc(struct rtsn_private *priv) return 0; +out_put_node: + of_node_put(mdio_node); out_free_bus: mdiobus_free(mii); return ret; From 67ef49047d312be692c8c439145f4514174e517f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 May 2026 13:32:33 +0000 Subject: [PATCH 481/972] inetpeer: add a missing read_seqretry() in inet_getpeer() When performing a lockless lookup over the inet_peer rbtree, if a matching node is found, inet_getpeer() returns it immediately without validating the seqlock sequence. This missing check introduces a race condition: Trigger Path: When a host receives an incoming fragmented IPv4 packet, ip4_frag_init() (in net/ipv4/ip_fragment.c) calls inet_getpeer_v4() to track the peer. The Race: If the packet is from a new source IP, CPU A acquires the write_seqlock, allocates a new inet_peer node (p), sets its IP address (daddr), and links it to the rbtree (rb_link_node). Uninitialized Access: Due to the lack of memory barriers between rb_link_node and the initialization of the rest of the struct (like refcount_set(&p->refcnt, 1)), CPU A can make the node visible to readers before its refcnt is initialized. This is especially true on weakly-ordered architectures like ARM64 where the CPU can reorder the memory stores. Lockless Reader: Concurrently, CPU B processes a second fragmented packet from the same source IP. CPU B does a lockless lookup, finds the newly inserted node, and returns it immediately. Use-After-Free (UAF): CPU B reads p->refcnt as uninitialized garbage (left over from previous kmalloc-128/192 allocations). If the garbage is > 0, refcount_inc_not_zero(&p->refcnt) succeeds. CPU A then executes refcount_set(&p->refcnt, 1), overwriting CPU B's increment. When CPU B finishes with the fragment queue, it calls inet_putpeer(), which drops the refcount to 0 and frees the node via RCU. The node is now freed but remains linked in the rbtree, resulting in a Use-After-Free in the rbtree. Fixes: b145425f269a ("inetpeer: remove AVL implementation in favor of RB tree") Reported-by: Damiano Melotti Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260505133233.3039575-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inetpeer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index d8083b9033c272..5b957a831e7c39 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -179,7 +179,8 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, seq = read_seqbegin(&base->lock); p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); - if (p) + /* Make sure tree was not modified during our lookup. */ + if (p && !read_seqretry(&base->lock, seq)) return p; /* retry an exact lookup, taking the lock before. From 770b136ff9bf3e319d19875da59c4f7f4853da3a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 May 2026 09:11:33 +0000 Subject: [PATCH 482/972] net/sched: sch_sfq: annotate data-races from sfq_dump_class_stats() sfq_dump_class_stats() runs locklessly, add needed READ_ONCE() and WRITE_ONCE() annotations. Fixes: edb09eb17ed8 ("net: sched: do not acquire qdisc spinlock in qdisc/class stats dump") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260505091133.2452510-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_sfq.c | 48 +++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index c3f3181dba5424..f39822babf88be 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -225,7 +225,8 @@ static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x) sfq_unlink(q, x, n, p); - d = q->slots[x].qlen--; + d = q->slots[x].qlen; + WRITE_ONCE(q->slots[x].qlen, d - 1); if (n == p && q->cur_depth == d) q->cur_depth--; sfq_link(q, x); @@ -238,7 +239,8 @@ static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x) sfq_unlink(q, x, n, p); - d = ++q->slots[x].qlen; + d = q->slots[x].qlen + 1; + WRITE_ONCE(q->slots[x].qlen, d); if (q->cur_depth < d) q->cur_depth = d; sfq_link(q, x); @@ -298,7 +300,7 @@ static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free) drop: skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot); len = qdisc_pkt_len(skb); - slot->backlog -= len; + WRITE_ONCE(slot->backlog, slot->backlog - len); sfq_dec(q, x); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); @@ -314,7 +316,7 @@ static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free) q->tail = NULL; /* no more active slots */ else q->tail->next = slot->next; - q->ht[slot->hash] = SFQ_EMPTY_SLOT; + WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT); goto drop; } @@ -364,10 +366,10 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) x = q->dep[0].next; /* get a free slot */ if (x >= SFQ_MAX_FLOWS) return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_MAXFLOWS); - q->ht[hash] = x; + WRITE_ONCE(q->ht[hash], x); slot = &q->slots[x]; slot->hash = hash; - slot->backlog = 0; /* should already be 0 anyway... */ + WRITE_ONCE(slot->backlog, 0); /* should already be 0 anyway... */ red_set_vars(&slot->vars); goto enqueue; } @@ -426,7 +428,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) head = slot_dequeue_head(slot); delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb); sch->qstats.backlog -= delta; - slot->backlog -= delta; + WRITE_ONCE(slot->backlog, slot->backlog - delta); qdisc_drop_reason(head, sch, to_free, QDISC_DROP_FLOW_LIMIT); slot_queue_add(slot, skb); @@ -436,7 +438,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) enqueue: qdisc_qstats_backlog_inc(sch, skb); - slot->backlog += qdisc_pkt_len(skb); + WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb)); slot_queue_add(slot, skb); sfq_inc(q, x); if (slot->qlen == 1) { /* The flow is new */ @@ -452,7 +454,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) */ q->tail = slot; /* We could use a bigger initial quantum for new flows */ - slot->allot = q->quantum; + WRITE_ONCE(slot->allot, q->quantum); } if (++sch->q.qlen <= q->limit) return NET_XMIT_SUCCESS; @@ -489,7 +491,7 @@ sfq_dequeue(struct Qdisc *sch) slot = &q->slots[a]; if (slot->allot <= 0) { q->tail = slot; - slot->allot += q->quantum; + WRITE_ONCE(slot->allot, slot->allot + q->quantum); goto next_slot; } skb = slot_dequeue_head(slot); @@ -497,10 +499,10 @@ sfq_dequeue(struct Qdisc *sch) qdisc_bstats_update(sch, skb); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); - slot->backlog -= qdisc_pkt_len(skb); + WRITE_ONCE(slot->backlog, slot->backlog - qdisc_pkt_len(skb)); /* Is the slot empty? */ if (slot->qlen == 0) { - q->ht[slot->hash] = SFQ_EMPTY_SLOT; + WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT); next_a = slot->next; if (a == next_a) { q->tail = NULL; /* no more active slots */ @@ -508,7 +510,7 @@ sfq_dequeue(struct Qdisc *sch) } q->tail->next = next_a; } else { - slot->allot -= qdisc_pkt_len(skb); + WRITE_ONCE(slot->allot, slot->allot - qdisc_pkt_len(skb)); } return skb; } @@ -549,9 +551,9 @@ static void sfq_rehash(struct Qdisc *sch) sfq_dec(q, i); __skb_queue_tail(&list, skb); } - slot->backlog = 0; + WRITE_ONCE(slot->backlog, 0); red_set_vars(&slot->vars); - q->ht[slot->hash] = SFQ_EMPTY_SLOT; + WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT); } q->tail = NULL; @@ -570,7 +572,7 @@ static void sfq_rehash(struct Qdisc *sch) dropped++; continue; } - q->ht[hash] = x; + WRITE_ONCE(q->ht[hash], x); slot = &q->slots[x]; slot->hash = hash; } @@ -581,7 +583,7 @@ static void sfq_rehash(struct Qdisc *sch) slot->vars.qavg = red_calc_qavg(q->red_parms, &slot->vars, slot->backlog); - slot->backlog += qdisc_pkt_len(skb); + WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb)); sfq_inc(q, x); if (slot->qlen == 1) { /* The flow is new */ if (q->tail == NULL) { /* It is the first flow */ @@ -591,7 +593,7 @@ static void sfq_rehash(struct Qdisc *sch) q->tail->next = x; } q->tail = slot; - slot->allot = q->quantum; + WRITE_ONCE(slot->allot, q->quantum); } } sch->q.qlen -= dropped; @@ -905,16 +907,16 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct sfq_sched_data *q = qdisc_priv(sch); - sfq_index idx = q->ht[cl - 1]; + sfq_index idx = READ_ONCE(q->ht[cl - 1]); struct gnet_stats_queue qs = { 0 }; struct tc_sfq_xstats xstats = { 0 }; if (idx != SFQ_EMPTY_SLOT) { const struct sfq_slot *slot = &q->slots[idx]; - xstats.allot = slot->allot; - qs.qlen = slot->qlen; - qs.backlog = slot->backlog; + xstats.allot = READ_ONCE(slot->allot); + qs.qlen = READ_ONCE(slot->qlen); + qs.backlog = READ_ONCE(slot->backlog); } if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) return -1; @@ -930,7 +932,7 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) return; for (i = 0; i < q->divisor; i++) { - if (q->ht[i] == SFQ_EMPTY_SLOT) { + if (READ_ONCE(q->ht[i]) == SFQ_EMPTY_SLOT) { arg->count++; continue; } From c8f7244c8cccaaed4e6c9fe4b8a07e101d0423e5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 May 2026 15:39:27 +0000 Subject: [PATCH 483/972] tcp: tcp_child_process() related UAF tcp_child_process( .. child ...) currently calls sock_put(child). Unfortunately @child (named @nsk in callers) can be used after this point to send a RST packet. To fix this UAF, I remove the sock_put() from tcp_child_process() and let the callers handle this after it is safe. Remove @rsk variable in tcp_v4_do_rcv() and change tcp_v6_do_rcv() so that both functions look the same. Fixes: cfb6eeb4c860 ("[TCP]: MD5 Signature Option (RFC2385) support.") Reported-by: Damiano Melotti Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260505153927.3435532-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 14 ++++++-------- net/ipv4/tcp_minisocks.c | 2 +- net/ipv6/tcp_ipv6.c | 13 ++++++++----- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8fc24c3743c5f9..c0526cc0398049 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1827,7 +1827,6 @@ INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason; - struct sock *rsk; reason = psp_sk_rx_policy_check(sk, skb); if (reason) @@ -1863,24 +1862,21 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; if (nsk != sk) { reason = tcp_child_process(sk, nsk, skb); - if (reason) { - rsk = nsk; + sock_put(nsk); + if (reason) goto reset; - } return 0; } } else sock_rps_save_rxhash(sk, skb); reason = tcp_rcv_state_process(sk, skb); - if (reason) { - rsk = sk; + if (reason) goto reset; - } return 0; reset: - tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); + tcp_v4_send_reset(sk, skb, sk_rst_convert_drop_reason(reason)); discard: sk_skb_reason_drop(sk, skb, reason); /* Be careful here. If this function gets more complicated and @@ -2193,8 +2189,10 @@ int tcp_v4_rcv(struct sk_buff *skb) rst_reason = sk_rst_convert_drop_reason(drop_reason); tcp_v4_send_reset(nsk, skb, rst_reason); + sock_put(nsk); goto discard_and_relse; } + sock_put(nsk); sock_put(sk); return 0; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 199f0b579e89cf..e6092c3ac840bd 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -1012,6 +1012,6 @@ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, } bh_unlock_sock(child); - sock_put(child); + return reason; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2c3f7a739709d7..51583aef0643e9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1617,12 +1617,13 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (sk->sk_state == TCP_LISTEN) { struct sock *nsk = tcp_v6_cookie_check(sk, skb); + if (!nsk) + return 0; if (nsk != sk) { - if (nsk) { - reason = tcp_child_process(sk, nsk, skb); - if (reason) - goto reset; - } + reason = tcp_child_process(sk, nsk, skb); + sock_put(nsk); + if (reason) + goto reset; return 0; } } else @@ -1827,8 +1828,10 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) rst_reason = sk_rst_convert_drop_reason(drop_reason); tcp_v6_send_reset(nsk, skb, rst_reason); + sock_put(nsk); goto discard_and_relse; } + sock_put(nsk); sock_put(sk); return 0; } From b12014d2d36eaed4e4bec5f1ac7e91110eeb100d Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:49 +0200 Subject: [PATCH 484/972] mptcp: pm: kernel: correctly retransmit ADD_ADDR ID 0 When adding the ADD_ADDR to the list, the address including the IP, port and ID are copied. On the other hand, when the endpoint corresponds to the one from the initial subflow, the ID is set to 0, as specified by the MPTCP protocol. The issue is that the ID was reset after having copied the ID in the ADD_ADDR entry. So the retransmission was done, but using a different ID than the initial one. Fixes: 8b8ed1b429f8 ("mptcp: pm: reuse ID 0 after delete and re-add") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-1-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_kernel.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index c9f1e5af3cd3ec..fc818b63752e37 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -347,6 +347,8 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) /* check first for announce */ if (msk->pm.add_addr_signaled < endp_signal_max) { + u8 endp_id; + /* due to racing events on both ends we can reach here while * previous add address is still running: if we invoke now * mptcp_pm_announce_addr(), that will fail and the @@ -360,19 +362,20 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) if (!select_signal_address(pernet, msk, &local)) goto subflow; + /* Special case for ID0: set the correct ID */ + endp_id = local.addr.id; + if (endp_id == msk->mpc_endpoint_id) + local.addr.id = 0; + /* If the alloc fails, we are on memory pressure, not worth * continuing, and trying to create subflows. */ if (!mptcp_pm_alloc_anno_list(msk, &local.addr)) return; - __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); + __clear_bit(endp_id, msk->pm.id_avail_bitmap); msk->pm.add_addr_signaled++; - /* Special case for ID0: set the correct ID */ - if (local.addr.id == msk->mpc_endpoint_id) - local.addr.id = 0; - mptcp_pm_announce_addr(msk, &local.addr, false); mptcp_pm_addr_send_ack(msk); From 03f324f3f1f7619a47b9c91282cb12775ab0a2f1 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:50 +0200 Subject: [PATCH 485/972] mptcp: pm: ADD_ADDR rtx: allow ID 0 ADD_ADDR can be sent for the ID 0, which corresponds to the local address and port linked to the initial subflow. Indeed, this address could be removed, and re-added later on, e.g. what is done in the "delete re-add signal" MPTCP Join selftests. So no reason to ignore it. Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-2-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 57a45669040679..5056eb8db24e05 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -337,9 +337,6 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (inet_sk_state_load(sk) == TCP_CLOSE) return; - if (!entry->addr.id) - return; - if (mptcp_pm_should_add_signal_addr(msk)) { sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); goto out; From 5cd6e0ad79d2615264f63929f8b457ad97ae550d Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:51 +0200 Subject: [PATCH 486/972] mptcp: pm: ADD_ADDR rtx: fix potential data-race This mptcp_pm_add_timer() helper is executed as a timer callback in softirq context. To avoid any data races, the socket lock needs to be held with bh_lock_sock(). If the socket is in use, retry again soon after, similar to what is done with the keepalive timer. Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-3-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 5056eb8db24e05..3912128d9b8668 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -337,6 +337,13 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (inet_sk_state_load(sk) == TCP_CLOSE) return; + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + sk_reset_timer(sk, timer, jiffies + HZ / 20); + goto out; + } + if (mptcp_pm_should_add_signal_addr(msk)) { sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); goto out; @@ -365,6 +372,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) mptcp_pm_subflow_established(msk); out: + bh_unlock_sock(sk); __sock_put(sk); } From 9634cb35af17019baec21ca648516ce376fa10e6 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:52 +0200 Subject: [PATCH 487/972] mptcp: pm: ADD_ADDR rtx: always decrease sk refcount When an ADD_ADDR is retransmitted, the sk is held in sk_reset_timer(). It should then be released in all cases at the end. Some (unlikely) checks were returning directly instead of calling sock_put() to decrease the refcount. Jump to a new 'exit' label to call __sock_put() (which will become sock_put() in the next commit) to fix this potential leak. While at it, drop the '!msk' check which cannot happen because it is never reset, and explicitly mark the remaining one as "unlikely". Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-4-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 3912128d9b8668..2a01bf1b5bfdad 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -331,11 +331,8 @@ static void mptcp_pm_add_timer(struct timer_list *timer) pr_debug("msk=%p\n", msk); - if (!msk) - return; - - if (inet_sk_state_load(sk) == TCP_CLOSE) - return; + if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) + goto exit; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -373,6 +370,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) out: bh_unlock_sock(sk); +exit: __sock_put(sk); } From b7b9a461569734d33d3259d58d2507adfac107ed Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:53 +0200 Subject: [PATCH 488/972] mptcp: pm: ADD_ADDR rtx: free sk if last When an ADD_ADDR is retransmitted, the sk is held in sk_reset_timer(), and released at the end. If at that moment, it was the last reference being held, the sk would not be freed. sock_put() should then be called instead of __sock_put(). But that's not enough: if it is the last reference, sock_put() will call sk_free(), which will end up calling sk_stop_timer_sync() on the same timer, and waiting indefinitely to finish. So it is needed to mark that the timer is done at the end of the timer handler when it has not been rescheduled, not to call sk_stop_timer_sync() on "itself". Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-5-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 2a01bf1b5bfdad..8899327e59a1da 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -16,6 +16,7 @@ struct mptcp_pm_add_entry { struct list_head list; struct mptcp_addr_info addr; u8 retrans_times; + bool timer_done; struct timer_list add_timer; struct mptcp_sock *sock; struct rcu_head rcu; @@ -327,22 +328,22 @@ static void mptcp_pm_add_timer(struct timer_list *timer) add_timer); struct mptcp_sock *msk = entry->sock; struct sock *sk = (struct sock *)msk; - unsigned int timeout; + unsigned int timeout = 0; pr_debug("msk=%p\n", msk); + bh_lock_sock(sk); if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) - goto exit; + goto out; - bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ - sk_reset_timer(sk, timer, jiffies + HZ / 20); + timeout = HZ / 20; goto out; } if (mptcp_pm_should_add_signal_addr(msk)) { - sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); + timeout = TCP_RTO_MAX / 8; goto out; } @@ -360,8 +361,9 @@ static void mptcp_pm_add_timer(struct timer_list *timer) } if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) - sk_reset_timer(sk, timer, - jiffies + (timeout << entry->retrans_times)); + timeout <<= entry->retrans_times; + else + timeout = 0; spin_unlock_bh(&msk->pm.lock); @@ -369,9 +371,13 @@ static void mptcp_pm_add_timer(struct timer_list *timer) mptcp_pm_subflow_established(msk); out: + if (timeout) + sk_reset_timer(sk, timer, jiffies + timeout); + else + /* if sock_put calls sk_free: avoid waiting for this timer */ + entry->timer_done = true; bh_unlock_sock(sk); -exit: - __sock_put(sk); + sock_put(sk); } struct mptcp_pm_add_entry * @@ -434,6 +440,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); reset_timer: + add_entry->timer_done = false; timeout = mptcp_adjust_add_addr_timeout(msk); if (timeout) sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout); @@ -454,7 +461,8 @@ static void mptcp_pm_free_anno_list(struct mptcp_sock *msk) spin_unlock_bh(&msk->pm.lock); list_for_each_entry_safe(entry, tmp, &free_list, list) { - sk_stop_timer_sync(sk, &entry->add_timer); + if (!entry->timer_done) + sk_stop_timer_sync(sk, &entry->add_timer); kfree_rcu(entry, rcu); } } From 3cf12492891c4b5ff54dda404a2de4ec54c9e1b5 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:54 +0200 Subject: [PATCH 489/972] mptcp: pm: ADD_ADDR rtx: resched blocked ADD_ADDR quicker When an ADD_ADDR needs to be retransmitted and another one has already been prepared -- e.g. multiple ADD_ADDRs have been sent in a row and need to be retransmitted later -- this additional retransmission will need to wait. In this case, the timer was reset to TCP_RTO_MAX / 8, which is ~15 seconds. This delay is unnecessary long: it should just be rescheduled at the next opportunity, e.g. after the retransmission timeout. Without this modification, some issues can be seen from time to time in the selftests when multiple ADD_ADDRs are sent, and the host takes time to process them, e.g. the "signal addresses, ADD_ADDR timeout" MPTCP Join selftest, especially with a debug kernel config. Note that on older kernels, 'timeout' is not available. It should be enough to replace it by one second (HZ). Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-6-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 8899327e59a1da..29d1bb6a69cffe 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -342,13 +342,8 @@ static void mptcp_pm_add_timer(struct timer_list *timer) goto out; } - if (mptcp_pm_should_add_signal_addr(msk)) { - timeout = TCP_RTO_MAX / 8; - goto out; - } - timeout = mptcp_adjust_add_addr_timeout(msk); - if (!timeout) + if (!timeout || mptcp_pm_should_add_signal_addr(msk)) goto out; spin_lock_bh(&msk->pm.lock); From c6d395e2de1306b5fef0344a3c3835fbbfaa18be Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:55 +0200 Subject: [PATCH 490/972] mptcp: pm: ADD_ADDR rtx: skip inactive subflows When looking at the maximum RTO amongst the subflows, inactive subflows were taken into account: that includes stale ones, and the initial one if it has been already been closed. Unusable subflows are now simply skipped. Stale ones are used as an alternative: if there are only stale ones, to take their maximum RTO and avoid to eventually fallback to net.mptcp.add_addr_timeout, which is set to 2 minutes by default. Fixes: 30549eebc4d8 ("mptcp: make ADD_ADDR retransmission timeout adaptive") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-7-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 29d1bb6a69cffe..8a5dba7fe66eca 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -306,18 +306,28 @@ static unsigned int mptcp_adjust_add_addr_timeout(struct mptcp_sock *msk) const struct net *net = sock_net((struct sock *)msk); unsigned int rto = mptcp_get_add_addr_timeout(net); struct mptcp_subflow_context *subflow; - unsigned int max = 0; + unsigned int max = 0, max_stale = 0; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct inet_connection_sock *icsk = inet_csk(ssk); - if (icsk->icsk_rto > max) + if (!__mptcp_subflow_active(subflow)) + continue; + + if (unlikely(subflow->stale)) { + if (icsk->icsk_rto > max_stale) + max_stale = icsk->icsk_rto; + } else if (icsk->icsk_rto > max) { max = icsk->icsk_rto; + } } - if (max && max < rto) - rto = max; + if (max) + return min(max, rto); + + if (max_stale) + return min(max_stale, rto); return rto; } From 62a9b19dce77e72426f049fb99b9d1d032b9a8ea Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:56 +0200 Subject: [PATCH 491/972] mptcp: pm: ADD_ADDR rtx: return early if no retrans No need to iterate over all subflows if there is no retransmission needed. Exit early in this case then. Fixes: 30549eebc4d8 ("mptcp: make ADD_ADDR retransmission timeout adaptive") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-8-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 8a5dba7fe66eca..4a6e5ab30d809d 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -308,6 +308,9 @@ static unsigned int mptcp_adjust_add_addr_timeout(struct mptcp_sock *msk) struct mptcp_subflow_context *subflow; unsigned int max = 0, max_stale = 0; + if (!rto) + return 0; + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct inet_connection_sock *icsk = inet_csk(ssk); From 166b78344031bf7ac9f55cb5282776cfd85f220e Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:57 +0200 Subject: [PATCH 492/972] mptcp: pm: prio: skip closed subflows When sending an MP_PRIO, closed subflows need to be skipped. This fixes the case where the initial subflow got closed, re-opened later, then an MP_PRIO is needed for the same local address. Note that explicit MP_PRIO cannot be sent during the 3WHS, so it is fine to use __mptcp_subflow_active(). Fixes: 067065422fcd ("mptcp: add the outgoing MP_PRIO support") Cc: stable@vger.kernel.org Fixes: b29fcfb54cd7 ("mptcp: full disconnect implementation") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-9-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 4a6e5ab30d809d..3c152bf66cd5ac 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -284,6 +284,9 @@ int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct mptcp_addr_info local, remote; + if (!__mptcp_subflow_active(subflow)) + continue; + mptcp_local_address((struct sock_common *)ssk, &local); if (!mptcp_addresses_equal(&local, addr, addr->port)) continue; From 65db7b27b90e2ea8d4966935aa9a50b6a60c31ac Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:58 +0200 Subject: [PATCH 493/972] selftests: mptcp: check output: catch cmd errors Using '${?}' inside the if-statement to check the returned value from the command that was evaluated as part of the if-statement is not correct: here, '${?}' will be linked to the previous instruction, not the one that is expected here (${cmd}). Instead, simply mark the error, except if an error is expected. If that's the case, 1 can be passed as the 4th argument of this helper. Three checks from pm_netlink.sh expect an error. While at it, improve the error message when the command unexpectedly fails or succeeds. Note that we could expect a specific returned value, but the checks currently expecting an error can be used with 'ip mptcp' or 'pm_nl_ctl', and these two tools don't return the same error code. Fixes: 2d0c1d27ea4e ("selftests: mptcp: add mptcp_lib_check_output helper") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-10-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_lib.sh | 16 ++++++++++------ tools/testing/selftests/net/mptcp/pm_netlink.sh | 10 ++++++---- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 5fea7e7df628c8..989a5975dcea62 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -474,20 +474,24 @@ mptcp_lib_wait_local_port_listen() { wait_local_port_listen "${@}" "tcp" } +# $1: error file, $2: cmd, $3: expected msg, [$4: expected error] mptcp_lib_check_output() { local err="${1}" local cmd="${2}" local expected="${3}" + local exp_error="${4:-0}" local cmd_ret=0 local out - if ! out=$(${cmd} 2>"${err}"); then - cmd_ret=${?} - fi + out=$(${cmd} 2>"${err}") || cmd_ret=1 - if [ ${cmd_ret} -ne 0 ]; then - mptcp_lib_pr_fail "command execution '${cmd}' stderr" - cat "${err}" + if [ "${cmd_ret}" != "${exp_error}" ]; then + mptcp_lib_pr_fail "unexpected returned code for '${cmd}', info:" + if [ "${exp_error}" = 0 ]; then + cat "${err}" + else + echo "${out}" + fi return 2 elif [ "${out}" = "${expected}" ]; then return 0 diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 123d9d7a0278cd..b69f30fcb91e96 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -122,10 +122,12 @@ check() local cmd="$1" local expected="$2" local msg="$3" + local exp_error="$4" local rc=0 mptcp_lib_print_title "$msg" - mptcp_lib_check_output "${err}" "${cmd}" "${expected}" || rc=${?} + mptcp_lib_check_output "${err}" "${cmd}" "${expected}" "${exp_error}" || + rc=${?} if [ ${rc} -eq 2 ]; then mptcp_lib_result_fail "${msg} # error ${rc}" ret=${KSFT_FAIL} @@ -158,13 +160,13 @@ check "show_endpoints" \ "3,10.0.1.3,signal backup")" "dump addrs" del_endpoint 2 -check "get_endpoint 2" "" "simple del addr" +check "get_endpoint 2" "" "simple del addr" 1 check "show_endpoints" \ "$(format_endpoints "1,10.0.1.1" \ "3,10.0.1.3,signal backup")" "dump addrs after del" add_endpoint 10.0.1.3 2>/dev/null -check "get_endpoint 4" "" "duplicate addr" +check "get_endpoint 4" "" "duplicate addr" 1 add_endpoint 10.0.1.4 flags signal check "get_endpoint 4" "$(format_endpoints "4,10.0.1.4,signal")" "id addr increment" @@ -173,7 +175,7 @@ for i in $(seq 5 9); do add_endpoint "10.0.1.${i}" flags signal >/dev/null 2>&1 done check "get_endpoint 9" "$(format_endpoints "9,10.0.1.9,signal")" "hard addr limit" -check "get_endpoint 10" "" "above hard addr limit" +check "get_endpoint 10" "" "above hard addr limit" 1 del_endpoint 9 for i in $(seq 10 255); do From 53705ddfa18408f8e1f064331b6387509fa19f7f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:59 +0200 Subject: [PATCH 494/972] selftests: mptcp: pm: restrict 'unknown' check to pm_nl_ctl When pm_netlink.sh is executed with '-i', 'ip mptcp' is used instead of 'pm_nl_ctl'. IPRoute2 doesn't support the 'unknown' flag, which has only been added to 'pm_nl_ctl' for this specific check: to ensure that the kernel ignores such unsupported flag. No reason to add this flag to 'ip mptcp'. Then, this check should be skipped when 'ip mptcp' is used. Fixes: 0cef6fcac24d ("selftests: mptcp: ip_mptcp option for more scripts") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-11-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/pm_netlink.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index b69f30fcb91e96..04594dfc22b134 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -194,9 +194,13 @@ check "show_endpoints" \ flush_endpoint check "show_endpoints" "" "flush addrs" -add_endpoint 10.0.1.1 flags unknown -check "show_endpoints" "$(format_endpoints "1,10.0.1.1")" "ignore unknown flags" -flush_endpoint +# "unknown" flag is only supported by pm_nl_ctl +if ! mptcp_lib_is_ip_mptcp; then + add_endpoint 10.0.1.1 flags unknown + check "show_endpoints" "$(format_endpoints "1,10.0.1.1")" \ + "ignore unknown flags" + flush_endpoint +fi set_limits 9 1 2>/dev/null check "get_limits" "${default_limits}" "rcv addrs above hard limit" From b266bacba796ff5c4dcd2ae2fc08aacf7ab39153 Mon Sep 17 00:00:00 2001 From: Andreas Haarmann-Thiemann Date: Tue, 5 May 2026 23:52:17 +0200 Subject: [PATCH 495/972] net: ethernet: cortina: Drop half-assembled SKB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In gmac_rx() (drivers/net/ethernet/cortina/gemini.c), when gmac_get_queue_page() returns NULL for the second page of a multi-page fragment, the driver logs an error and continues — but does not free the partially assembled skb that was being assembled via napi_build_skb() / napi_get_frags(). Free the in-progress partially assembled skb via napi_free_frags() and increase the number of dropped frames appropriately and assign the skb pointer NULL to make sure it is not lingering around, matching the pattern already used elsewhere in the driver. Fixes: 4d5ae32f5e1e ("net: ethernet: Add a driver for Gemini gigabit ethernet") Signed-off-by: Andreas Haarmann-Thiemann Signed-off-by: Linus Walleij Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20260505-gemini-ethernet-fix-v2-1-997c31d06079@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cortina/gemini.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 4824232f489072..065cbbf52686cb 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -1491,6 +1491,11 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) gpage = gmac_get_queue_page(geth, port, mapping + PAGE_SIZE); if (!gpage) { dev_err(geth->dev, "could not find mapping\n"); + if (skb) { + napi_free_frags(&port->napi); + port->stats.rx_dropped++; + skb = NULL; + } continue; } page = gpage->page; From 5772f6535227ebd104065d80afa8ed3478d34c5c Mon Sep 17 00:00:00 2001 From: David Gow Date: Thu, 16 Apr 2026 14:57:43 +0800 Subject: [PATCH 496/972] x86/boot/e820: Re-enable BIOS fallback if e820 table is empty In commit: 157266edcc56 ("x86/boot/e820: Simplify append_e820_table() and remove restriction on single-entry tables") the check on the number of entries in the e820 table was removed. The intention was to support single-entry maps, but by removing the check entirely, we also skip the fallback (to, e.g., the BIOS 88h function). This means that if no E820 map is passed in from the bootloader (which is the case on some bootloaders, like linld), we end up with an empty memory map, and the kernel fails to boot (either by deadlocking on OOM, or by failing to allocate the real mode trampoline, or similar). Re-instate the check in append_e820_table(), but only check that nr_entries is non-zero. This allows e820__memory_setup_default() to fall back to other memory size sources, and doesn't affect e820__memory_setup_extended(), as the latter ignores the return value from append_e820_table(). In doing so, we also update the return values to be proper error codes, with -ENOENT for this case (there are no entries), and -EINVAL for the case where an entry appears invalid. Given none of the callers check the actual value -- just whether it's nonzero -- this is largely aesthetic in practice. Tested against linld, and the kernel boots again fine. [ mingo: Readability edits to the comment and the changelog. ] Fixes: 157266edcc56 ("x86/boot/e820: Simplify append_e820_table() and remove restriction on single-entry tables") Signed-off-by: David Gow Signed-off-by: Ingo Molnar Reviewed-by: Andy Shevchenko Cc: stable@vger.kernel.org Cc: Arnd Bergmann Cc: "H. Peter Anvin" Link: https://patch.msgid.link/20260416065746.1896647-1-david@davidgow.net --- arch/x86/kernel/e820.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 2a999275893369..eb72537bc0b195 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -450,6 +450,10 @@ __init static int append_e820_table(struct boot_e820_entry *entries, u32 nr_entr { struct boot_e820_entry *entry = entries; + /* If there aren't any entries, we'll want to fall back to another source: */ + if (!nr_entries) + return -ENOENT; + while (nr_entries) { u64 start = entry->addr; u64 size = entry->size; @@ -458,7 +462,7 @@ __init static int append_e820_table(struct boot_e820_entry *entries, u32 nr_entr /* Ignore the remaining entries on 64-bit overflow: */ if (start > end && likely(size)) - return -1; + return -EINVAL; e820__range_add(start, size, type); From b15838b03cd0c6cf35651cfde62d17f14bb1d566 Mon Sep 17 00:00:00 2001 From: Myeonghun Pak Date: Fri, 24 Apr 2026 21:34:28 +0900 Subject: [PATCH 497/972] drm/bochs: Drop manual put on probe error path bochs_pci_probe() allocates the DRM device with devm_drm_dev_alloc(), which registers a devres action to drop the initial DRM device reference on driver detach or probe failure. The error path currently calls drm_dev_put() manually. If probe then returns an error, devres will run the registered release action and put the same device again, after the first put may already have released it. Return the probe error directly and let devres own the final put. Signed-off-by: Myeonghun Pak Fixes: 04826f588682 ("drm/bochs: Allocate DRM device in struct bochs_device") Signed-off-by: Thomas Zimmermann Reviewed-by: Thomas Zimmermann Link: https://patch.msgid.link/20260424123506.32275-1-mhun512@gmail.com --- drivers/gpu/drm/tiny/bochs.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/tiny/bochs.c b/drivers/gpu/drm/tiny/bochs.c index 222e4ae1abbd14..5d8dc5efec776e 100644 --- a/drivers/gpu/drm/tiny/bochs.c +++ b/drivers/gpu/drm/tiny/bochs.c @@ -761,25 +761,21 @@ static int bochs_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent ret = pcim_enable_device(pdev); if (ret) - goto err_free_dev; + return ret; pci_set_drvdata(pdev, dev); ret = bochs_load(bochs); if (ret) - goto err_free_dev; + return ret; ret = drm_dev_register(dev, 0); if (ret) - goto err_free_dev; + return ret; drm_client_setup(dev, NULL); return ret; - -err_free_dev: - drm_dev_put(dev); - return ret; } static void bochs_pci_remove(struct pci_dev *pdev) From 3051cd060fa496df42954291fa2306ed2eab4ecc Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Wed, 6 May 2026 01:55:11 +0800 Subject: [PATCH 498/972] i2c: smbus: reject oversized block transfers in the common path The SMBus block transfer length data->block[0] is validated in i2c_smbus_xfer_emulated() but that check runs too late for tracepoints and is skipped entirely when the adapter provides a native smbus_xfer implementation. This allows user-controlled oversized block lengths to reach tracepoint memcpy calls and driver callbacks unchecked. Add an early validation in __i2c_smbus_xfer() that rejects block transfers whose caller-supplied length is zero or exceeds I2C_SMBUS_BLOCK_MAX before any tracepoint fires or driver callback runs. data->block[0] is filled in by the device on SMBus block reads, so the check is scoped to operations where the length is actually supplied by the caller. This is consistent with the existing -EINVAL convention in the emulated path and protects all downstream consumers at once: the smbus_write tracepoint, all native smbus_xfer driver implementations, and the emulated path. Two distinct bugs are fixed by this change: Bug 1: smbus_write tracepoint OOB (include/trace/events/smbus.h) trace_smbus_write() fires before any validation and copies data->block[0]+1 bytes into a 34-byte event buffer. With block[0]=0xfe the tracepoint copies 255 bytes, overflowing by 221. BUG: KASAN: stack-out-of-bounds in trace_event_raw_event_smbus_write+0x27c/0x530 Read of size 255 at addr ffff88800d98fcf8 by task poc_smbus/91 Call Trace: __asan_memcpy+0x23/0x80 trace_event_raw_event_smbus_write+0x27c/0x530 __i2c_smbus_xfer+0x43a/0xa40 i2c_smbus_xfer+0x19e/0x340 i2cdev_ioctl_smbus+0x38f/0x7f0 i2cdev_ioctl+0x35e/0x680 __x64_sys_ioctl+0x147/0x1e0 do_syscall_64+0xcf/0x15a0 entry_SYSCALL_64_after_hwframe+0x76/0x7e Bug 2: i2c-stub I2C_SMBUS_I2C_BLOCK_DATA OOB (drivers/i2c/i2c-stub.c) stub_xfer() implements .smbus_xfer directly and only clamps block[0] against 256-command, not I2C_SMBUS_BLOCK_MAX. With block[0]=0xff and command=0 the loop accesses block[1+i] for i up to 254, far past the 34-byte union. UBSAN: array-index-out-of-bounds in drivers/i2c/i2c-stub.c:223:44 index 34 is out of range for type '__u8 [34]' Call Trace: __ubsan_handle_out_of_bounds+0xd7/0x120 stub_xfer+0x1971/0x198f [i2c_stub] __i2c_smbus_xfer+0x306/0xa40 i2c_smbus_xfer+0x19e/0x340 i2cdev_ioctl_smbus+0x38f/0x7f0 i2cdev_ioctl+0x35e/0x680 __x64_sys_ioctl+0x147/0x1e0 do_syscall_64+0xcf/0x15a0 entry_SYSCALL_64_after_hwframe+0x76/0x7e Both traces reproduced on v7.0-rc6+i2c/for-current with KASAN+UBSAN. Fixes: 8a325997d95d ("i2c: Add message transfer tracepoints for SMBUS [ver #2]") Fixes: 4710317891e4 ("i2c-stub: Implement I2C block support") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-smbus.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/i2c/i2c-core-smbus.c b/drivers/i2c/i2c-core-smbus.c index 71eb1ef56f0c34..ad6acb5ebadc32 100644 --- a/drivers/i2c/i2c-core-smbus.c +++ b/drivers/i2c/i2c-core-smbus.c @@ -566,6 +566,18 @@ s32 __i2c_smbus_xfer(struct i2c_adapter *adapter, u16 addr, if (res) return res; + /* Reject invalid caller-supplied block lengths before any + * tracepoint or native smbus_xfer callback runs. + */ + if (data && + (protocol == I2C_SMBUS_I2C_BLOCK_DATA || + protocol == I2C_SMBUS_BLOCK_PROC_CALL || + (protocol == I2C_SMBUS_BLOCK_DATA && + read_write == I2C_SMBUS_WRITE)) && + (data->block[0] == 0 || + data->block[0] > I2C_SMBUS_BLOCK_MAX)) + return -EINVAL; + /* If enabled, the following two tracepoints are conditional on * read_write and protocol. */ From 593dfd40a94ca0ab20297ea4629d94268deed0ed Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Mon, 4 May 2026 18:42:11 -0700 Subject: [PATCH 499/972] eth: fbnic: fix double-free of PCS on phylink creation failure fbnic_phylink_create() stores the newly allocated PCS in fbn->pcs and then calls phylink_create(). When phylink_create() fails, the error path correctly destroys the PCS via xpcs_destroy_pcs(), but the caller, fbnic_netdev_alloc(), responds by invoking fbnic_netdev_free() which calls fbnic_phylink_destroy(). That function finds fbn->pcs non-NULL and calls xpcs_destroy_pcs() a second time on the already-freed object, triggering a refcount underflow use-after-free: [ 1.934973] fbnic 0000:01:00.0: Failed to create Phylink interface, err: -22 [ 1.935103] ------------[ cut here ]------------ [ 1.935179] refcount_t: underflow; use-after-free. [ 1.935252] WARNING: lib/refcount.c:28 at refcount_warn_saturate+0x59/0x90, CPU#0: swapper/0/1 [ 1.935389] Modules linked in: [ 1.935484] CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 7.0.0-virtme-04244-g1f5ffc672165-dirty #1 PREEMPT(lazy) [ 1.935661] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 1.935826] RIP: 0010:refcount_warn_saturate+0x59/0x90 [ 1.935931] Code: 44 48 8d 3d 49 f9 a7 01 67 48 0f b9 3a e9 bf 1e 96 00 48 8d 3d 48 f9 a7 01 67 48 0f b9 3a c3 cc cc cc cc 48 8d 3d 47 f9 a7 01 <67> 48 0f b9 3a c3 cc cc cc cc 48 8d 3d 46 f9 a7 01 67 48 0f b9 3a [ 1.936274] RSP: 0000:ffffd0d440013c58 EFLAGS: 00010246 [ 1.936376] RAX: 0000000000000000 RBX: ffff8f39c188c278 RCX: 000000000000002b [ 1.936524] RDX: ffff8f39c004f000 RSI: 0000000000000003 RDI: ffffffff96abab00 [ 1.936692] RBP: ffff8f39c188c240 R08: ffffffff96988e88 R09: 00000000ffffdfff [ 1.936835] R10: ffffffff96878ea0 R11: 0000000000000187 R12: 0000000000000000 [ 1.936970] R13: ffff8f39c0cef0c8 R14: ffff8f39c1ac01c0 R15: 0000000000000000 [ 1.937114] FS: 0000000000000000(0000) GS:ffff8f3ba08b4000(0000) knlGS:0000000000000000 [ 1.937273] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1.937382] CR2: ffff8f3b3ffff000 CR3: 0000000172642001 CR4: 0000000000372ef0 [ 1.937540] Call Trace: [ 1.937619] [ 1.937698] xpcs_destroy_pcs+0x25/0x40 [ 1.937783] fbnic_netdev_alloc+0x1e5/0x200 [ 1.937859] fbnic_probe+0x230/0x370 [ 1.937939] local_pci_probe+0x3e/0x90 [ 1.938013] pci_device_probe+0xbb/0x1e0 [ 1.938091] ? sysfs_do_create_link_sd+0x6d/0xe0 [ 1.938188] really_probe+0xc1/0x2b0 [ 1.938282] __driver_probe_device+0x73/0x120 [ 1.938371] driver_probe_device+0x1e/0xe0 [ 1.938466] __driver_attach+0x8d/0x190 [ 1.938560] ? __pfx___driver_attach+0x10/0x10 [ 1.938663] bus_for_each_dev+0x7b/0xd0 [ 1.938758] bus_add_driver+0xe8/0x210 [ 1.938854] driver_register+0x60/0x120 [ 1.938929] ? __pfx_fbnic_init_module+0x10/0x10 [ 1.939026] fbnic_init_module+0x25/0x60 [ 1.939109] do_one_initcall+0x49/0x220 [ 1.939202] ? rdinit_setup+0x20/0x40 [ 1.939304] kernel_init_freeable+0x1b0/0x310 [ 1.939449] ? __pfx_kernel_init+0x10/0x10 [ 1.939560] kernel_init+0x1a/0x1c0 [ 1.939640] ret_from_fork+0x1ed/0x240 [ 1.939730] ? __pfx_kernel_init+0x10/0x10 [ 1.939805] ret_from_fork_asm+0x1a/0x30 [ 1.939886] [ 1.939927] ---[ end trace 0000000000000000 ]--- [ 1.940184] fbnic 0000:01:00.0: Netdev allocation failed Instead of calling fbnic_phylink_destroy(), the prior initialization of netdev should just be unrolled with free_netdev() and clearing fbd->netdev. Clearing fbd->netdev to NULL avoids UAF in init_failure_mode where callers guard by checking !fbd->netdev, such as fbnic_mdio_read_pmd(). These callers remain active even after a failed probe, so fdb->netdev still needs to be cleared. Fixes: d0fe7104c795 ("fbnic: Replace use of internal PCS w/ Designware XPCS") Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260504-fbnic-pcs-fix-v2-1-de45192821d9@meta.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/meta/fbnic/fbnic_netdev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c index c406a3b56b37fd..4dea2bb58d2fca 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c @@ -826,7 +826,8 @@ struct net_device *fbnic_netdev_alloc(struct fbnic_dev *fbd) netif_tx_stop_all_queues(netdev); if (fbnic_phylink_create(netdev)) { - fbnic_netdev_free(fbd); + free_netdev(netdev); + fbd->netdev = NULL; return NULL; } From 91892231ae5e638326e7eaa0174de86fac9aa5fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A1mon=20van=20Raaij?= Date: Wed, 6 May 2026 20:31:18 +0200 Subject: [PATCH 500/972] ALSA: hda/realtek: Add codec SSID quirk for Lenovo Yoga Pro 9 16IMH9 (17aa:38d5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some Lenovo Yoga Pro 9 16IMH9 units carry codec SSID 17aa:38d5 instead of 17aa:38d6, which was added in commit 56722cfbb78d ("ALSA: hda/realtek: Add codec SSID quirk for Lenovo Yoga Pro 9 16IMH9"). The corresponding firmware blob TAS2XXX38D5.bin already ships in linux-firmware, and the hardware is otherwise identical: same PCI subsystem ID 17aa:3811 shared with the Legion S7 15IMH05, same TI TAS2781 amplifiers behind ACPI HID TIAS2781, same ALC287_FIXUP_TAS2781_I2C requirement. Add a second HDA_CODEC_QUIRK entry directly above the existing 17aa:38d6 entry so both variants resolve to the correct fixup. Reported and verified on hardware by GitHub user 0xEthamin. Link: https://github.com/ramonvanraaij/yoga9-tas2781-hda/issues/1 Signed-off-by: Rámon van Raaij Link: https://patch.msgid.link/20260506183118.patch1-ramon@vanraaij.eu Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 11d0ea8ed8598a..55bb98e2e55a01 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7678,6 +7678,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { /* Yoga Pro 9 16IMH9 shares PCI SSID 17aa:3811 with Legion S7 15IMH05; * use codec SSID to distinguish them */ + HDA_CODEC_QUIRK(0x17aa, 0x38d5, "Lenovo Yoga Pro 9 16IMH9", ALC287_FIXUP_TAS2781_I2C), HDA_CODEC_QUIRK(0x17aa, 0x38d6, "Lenovo Yoga Pro 9 16IMH9", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x3811, "Legion S7 15IMH05", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3813, "Legion 7i 15IMHG05", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), From d6854daa67be623860f4e1873fd3d3c275aba4ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Thu, 7 May 2026 00:40:51 -0300 Subject: [PATCH 501/972] ALSA: usb-audio: Bound MIDI endpoint descriptor scans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit snd_usbmidi_get_ms_info() validates the internal MIDIStreaming endpoint descriptor size before using baAssocJackID[], but the descriptor walker can still return a class-specific endpoint descriptor whose bLength exceeds the remaining bytes in the endpoint-extra scan. That leaves later flexible-array reads bounded by bLength, but not by the remaining bytes in the endpoint-extra scan. Stop walking when bLength is zero or extends past the remaining endpoint-extra scan. Fixes: 5c6cd7021a05 ("ALSA: usb-audio: Fix case when USB MIDI interface has more than one extra endpoint descriptor") Cc: stable@vger.kernel.org Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260507-usb-midi-endpoint-scan-bounds-v1-1-329d7348160e@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/midi.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sound/usb/midi.c b/sound/usb/midi.c index 0a5b8941ebdaaf..d87e3f357cf71b 100644 --- a/sound/usb/midi.c +++ b/sound/usb/midi.c @@ -1951,15 +1951,17 @@ static struct usb_ms_endpoint_descriptor *find_usb_ms_endpoint_descriptor( while (extralen > 3) { struct usb_ms_endpoint_descriptor *ms_ep = (struct usb_ms_endpoint_descriptor *)extra; + int length = ms_ep->bLength; - if (ms_ep->bLength > 3 && + if (!length || length > extralen) + break; + + if (length > 3 && ms_ep->bDescriptorType == USB_DT_CS_ENDPOINT && ms_ep->bDescriptorSubtype == UAC_MS_GENERAL) return ms_ep; - if (!extra[0]) - break; - extralen -= extra[0]; - extra += extra[0]; + extralen -= length; + extra += length; } return NULL; } From 918be519c7876329e1b6e2ea1c59f0b75e792dca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Thu, 7 May 2026 00:40:52 -0300 Subject: [PATCH 502/972] ALSA: usb-audio: Bound MIDI 2.0 endpoint descriptor scans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The USB MIDI 2.0 endpoint parser has the same descriptor walking pattern as the legacy MIDI parser. It validates bLength against bNumGrpTrmBlock before reading baAssoGrpTrmBlkID[], but not against the remaining bytes in the endpoint-extra scan. A malformed device can therefore make later baAssoGrpTrmBlkID[] reads consume bytes past the walked descriptor. Reject zero-length and overlong descriptors while walking endpoint extras. Fixes: ff49d1df79ae ("ALSA: usb-audio: USB MIDI 2.0 UMP support") Cc: stable@vger.kernel.org Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260507-usb-midi-endpoint-scan-bounds-v1-2-329d7348160e@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/midi2.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sound/usb/midi2.c b/sound/usb/midi2.c index 2785600d23124a..04aeb9052f139e 100644 --- a/sound/usb/midi2.c +++ b/sound/usb/midi2.c @@ -496,15 +496,17 @@ static void *find_usb_ms_endpoint_descriptor(struct usb_host_endpoint *hostep, while (extralen > 3) { struct usb_ms_endpoint_descriptor *ms_ep = (struct usb_ms_endpoint_descriptor *)extra; + int length = ms_ep->bLength; - if (ms_ep->bLength > 3 && + if (!length || length > extralen) + break; + + if (length > 3 && ms_ep->bDescriptorType == USB_DT_CS_ENDPOINT && ms_ep->bDescriptorSubtype == subtype) return ms_ep; - if (!extra[0]) - break; - extralen -= extra[0]; - extra += extra[0]; + extralen -= length; + extra += length; } return NULL; } From 72d52bac023b376b73277804b24315ea2a49ad1e Mon Sep 17 00:00:00 2001 From: Ayaan Mirza Baig Date: Sat, 18 Apr 2026 00:46:13 +0000 Subject: [PATCH 503/972] platform/x86: samsung-galaxybook: Refactor camera lens cover input device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename the camera_lens_cover_switch input device to a generic input device which can be used for multiple input events. Move input device allocation and registration into a dedicated galaxybook_input_init() helper which is called early in probe so that the device is available to all features. No functional change. Signed-off-by: Ayaan Mirza Baig Link: https://patch.msgid.link/20260418004613.93981-2-ayaanmirzabaig85@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/samsung-galaxybook.c | 46 ++++++++++++----------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/drivers/platform/x86/samsung-galaxybook.c b/drivers/platform/x86/samsung-galaxybook.c index 755cb82bdb606b..a51ad6b03164ed 100644 --- a/drivers/platform/x86/samsung-galaxybook.c +++ b/drivers/platform/x86/samsung-galaxybook.c @@ -53,7 +53,7 @@ struct samsung_galaxybook { void *i8042_filter_ptr; struct work_struct block_recording_hotkey_work; - struct input_dev *camera_lens_cover_switch; + struct input_dev *input; struct acpi_battery_hook battery_hook; @@ -859,13 +859,28 @@ static int block_recording_acpi_set(struct samsung_galaxybook *galaxybook, const if (err) return err; - input_report_switch(galaxybook->camera_lens_cover_switch, + input_report_switch(galaxybook->input, SW_CAMERA_LENS_COVER, value ? 1 : 0); - input_sync(galaxybook->camera_lens_cover_switch); + input_sync(galaxybook->input); return 0; } +static int galaxybook_input_init(struct samsung_galaxybook *galaxybook) +{ + galaxybook->input = devm_input_allocate_device(&galaxybook->platform->dev); + if (!galaxybook->input) + return -ENOMEM; + + galaxybook->input->name = "Samsung Galaxy Book Camera Lens Cover"; + galaxybook->input->phys = DRIVER_NAME "/input0"; + galaxybook->input->id.bustype = BUS_HOST; + + input_set_capability(galaxybook->input, EV_SW, SW_CAMERA_LENS_COVER); + + return input_register_device(galaxybook->input); +} + static int galaxybook_block_recording_init(struct samsung_galaxybook *galaxybook) { bool value; @@ -887,24 +902,8 @@ static int galaxybook_block_recording_init(struct samsung_galaxybook *galaxybook return GB_NOT_SUPPORTED; } - galaxybook->camera_lens_cover_switch = - devm_input_allocate_device(&galaxybook->platform->dev); - if (!galaxybook->camera_lens_cover_switch) - return -ENOMEM; - - galaxybook->camera_lens_cover_switch->name = "Samsung Galaxy Book Camera Lens Cover"; - galaxybook->camera_lens_cover_switch->phys = DRIVER_NAME "/input0"; - galaxybook->camera_lens_cover_switch->id.bustype = BUS_HOST; - - input_set_capability(galaxybook->camera_lens_cover_switch, EV_SW, SW_CAMERA_LENS_COVER); - - err = input_register_device(galaxybook->camera_lens_cover_switch); - if (err) - return err; - - input_report_switch(galaxybook->camera_lens_cover_switch, - SW_CAMERA_LENS_COVER, value ? 1 : 0); - input_sync(galaxybook->camera_lens_cover_switch); + input_report_switch(galaxybook->input, SW_CAMERA_LENS_COVER, value ? 1 : 0); + input_sync(galaxybook->input); return 0; } @@ -1392,6 +1391,11 @@ static int galaxybook_probe(struct platform_device *pdev) return dev_err_probe(&galaxybook->platform->dev, err, "failed to initialize kbd_backlight\n"); + err = galaxybook_input_init(galaxybook); + if (err) + return dev_err_probe(&galaxybook->platform->dev, err, + "failed to initialize input device\n"); + err = galaxybook_fw_attrs_init(galaxybook); if (err) return dev_err_probe(&galaxybook->platform->dev, err, From 90dc96c61be35a1f81b56dfc5dcb80d50debbf89 Mon Sep 17 00:00:00 2001 From: Ayaan Mirza Baig Date: Sat, 18 Apr 2026 00:46:14 +0000 Subject: [PATCH 504/972] platform/x86: samsung-galaxybook: Handle ACPI hotkey notifications MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Samsung Galaxy Book 5 (SAM0430), the keyboard backlight, microphone mute, and camera block hotkeys do not generate i8042 scancodes. Instead they arrive as ACPI notifications 0x7d, 0x6e, and 0x6f respectively, all of which previously fell through to the default "unknown" warning in galaxybook_acpi_notify(). Add handling for these three events: - 0x7d (Fn+F9, keyboard backlight): schedule the existing kbd_backlight_hotkey_work which cycles brightness. - 0x6e (Fn+F10, microphone mute): emit KEY_MICMUTE via the driver's input device. - 0x6f (Fn+F11, camera block): if block_recording is active use the existing block_recording_hotkey_work; otherwise emit a toggle of SW_CAMERA_LENS_COVER via the driver's input device on models where the block_recording ACPI feature is not supported. Tested on Samsung Galaxy Book 5 (SAM0430) and Samsung Galaxy Book2 Pro (SAM0429). Signed-off-by: Ayaan Mirza Baig Co-developed-by: Joshua Grisham Signed-off-by: Joshua Grisham Link: https://patch.msgid.link/20260418004613.93981-3-ayaanmirzabaig85@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/samsung-galaxybook.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/platform/x86/samsung-galaxybook.c b/drivers/platform/x86/samsung-galaxybook.c index a51ad6b03164ed..6382af0b106c06 100644 --- a/drivers/platform/x86/samsung-galaxybook.c +++ b/drivers/platform/x86/samsung-galaxybook.c @@ -197,6 +197,9 @@ static const guid_t performance_mode_guid = #define GB_ACPI_NOTIFY_DEVICE_ON_TABLE 0x6c #define GB_ACPI_NOTIFY_DEVICE_OFF_TABLE 0x6d #define GB_ACPI_NOTIFY_HOTKEY_PERFORMANCE_MODE 0x70 +#define GB_ACPI_NOTIFY_HOTKEY_KBD_BACKLIGHT 0x7d +#define GB_ACPI_NOTIFY_HOTKEY_MICMUTE 0x6e +#define GB_ACPI_NOTIFY_HOTKEY_CAMERA 0x6f #define GB_KEY_KBD_BACKLIGHT_KEYDOWN 0x2c #define GB_KEY_KBD_BACKLIGHT_KEYUP 0xac @@ -876,6 +879,7 @@ static int galaxybook_input_init(struct samsung_galaxybook *galaxybook) galaxybook->input->phys = DRIVER_NAME "/input0"; galaxybook->input->id.bustype = BUS_HOST; + input_set_capability(galaxybook->input, EV_KEY, KEY_MICMUTE); input_set_capability(galaxybook->input, EV_SW, SW_CAMERA_LENS_COVER); return input_register_device(galaxybook->input); @@ -1259,6 +1263,25 @@ static void galaxybook_acpi_notify(acpi_handle handle, u32 event, void *data) if (galaxybook->has_performance_mode) platform_profile_cycle(); break; + case GB_ACPI_NOTIFY_HOTKEY_KBD_BACKLIGHT: + if (galaxybook->has_kbd_backlight) + schedule_work(&galaxybook->kbd_backlight_hotkey_work); + break; + case GB_ACPI_NOTIFY_HOTKEY_MICMUTE: + input_report_key(galaxybook->input, KEY_MICMUTE, 1); + input_sync(galaxybook->input); + input_report_key(galaxybook->input, KEY_MICMUTE, 0); + input_sync(galaxybook->input); + break; + case GB_ACPI_NOTIFY_HOTKEY_CAMERA: + if (galaxybook->has_block_recording) { + schedule_work(&galaxybook->block_recording_hotkey_work); + } else { + input_report_switch(galaxybook->input, SW_CAMERA_LENS_COVER, + !test_bit(SW_CAMERA_LENS_COVER, galaxybook->input->sw)); + input_sync(galaxybook->input); + } + break; default: dev_warn(&galaxybook->platform->dev, "unknown ACPI notification event: 0x%x\n", event); From ad3bff944c0f4f2e913298a9664391af32f87491 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 30 Apr 2026 08:11:01 -0700 Subject: [PATCH 505/972] platform/x86: intel: Move debugfs register before creating devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is possible that the driver handling device is enumerated before registering debugfs. If the driver wants to access debugfs by calling tpmi_get_debugfs_dir(), this will return error in this case. Hence register debugfs before creating devices. Fixes: 811f67c51636 ("platform/x86/intel/tpmi: Add new auxiliary driver for performance limits") Signed-off-by: Srinivas Pandruvada Cc: Stable@vger.kernel.org Link: https://patch.msgid.link/20260430151103.1549733-2-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vsec_tpmi.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/intel/vsec_tpmi.c b/drivers/platform/x86/intel/vsec_tpmi.c index 7fc6ff8d104063..a38014e81e8523 100644 --- a/drivers/platform/x86/intel/vsec_tpmi.c +++ b/drivers/platform/x86/intel/vsec_tpmi.c @@ -817,10 +817,6 @@ static int intel_vsec_tpmi_init(struct auxiliary_device *auxdev) auxiliary_set_drvdata(auxdev, tpmi_info); - ret = tpmi_create_devices(tpmi_info); - if (ret) - return ret; - /* * Allow debugfs when security policy allows. Everything this debugfs * interface provides, can also be done via /dev/mem access. If @@ -830,6 +826,12 @@ static int intel_vsec_tpmi_init(struct auxiliary_device *auxdev) if (!security_locked_down(LOCKDOWN_DEV_MEM) && capable(CAP_SYS_RAWIO)) tpmi_dbgfs_register(tpmi_info); + ret = tpmi_create_devices(tpmi_info); + if (ret) { + debugfs_remove_recursive(tpmi_info->dbgfs_dir); + return ret; + } + return 0; } From 57c347a2e2473bfb5c1f1132a3209c55efbe640b Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 30 Apr 2026 08:11:02 -0700 Subject: [PATCH 506/972] platform/x86: intel: Add notifiers support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some cases a driver using services of vsec_tpmi driver requires some processing before vsec_tpmi exits. For example a children using debugfs can't use debugfs as this will be deleted by the vsec_tpmi driver. This is the case when unbind using PCI driver interface. In this case the remove callback of vsec_tpmi driver is called first, then remove callback of its children. Add support of blocking chain notifiers support. Notify on successful probe and before clean up in the remove callback. Fixes: 811f67c51636 ("platform/x86/intel/tpmi: Add new auxiliary driver for performance limits") Signed-off-by: Srinivas Pandruvada Cc: Stable@vger.kernel.org Link: https://patch.msgid.link/20260430151103.1549733-3-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vsec_tpmi.c | 19 +++++++++++++++++++ include/linux/intel_tpmi.h | 6 ++++++ 2 files changed, 25 insertions(+) diff --git a/drivers/platform/x86/intel/vsec_tpmi.c b/drivers/platform/x86/intel/vsec_tpmi.c index a38014e81e8523..16fd7aa41f20e2 100644 --- a/drivers/platform/x86/intel/vsec_tpmi.c +++ b/drivers/platform/x86/intel/vsec_tpmi.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -188,6 +189,20 @@ struct tpmi_feature_state { /* Used during auxbus device creation */ static DEFINE_IDA(intel_vsec_tpmi_ida); +static BLOCKING_NOTIFIER_HEAD(tpmi_notify_list); + +int tpmi_register_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&tpmi_notify_list, nb); +} +EXPORT_SYMBOL_NS_GPL(tpmi_register_notifier, "INTEL_TPMI"); + +int tpmi_unregister_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&tpmi_notify_list, nb); +} +EXPORT_SYMBOL_NS_GPL(tpmi_unregister_notifier, "INTEL_TPMI"); + struct oobmsm_plat_info *tpmi_get_platform_data(struct auxiliary_device *auxdev) { struct intel_vsec_device *vsec_dev = auxdev_to_ivdev(auxdev); @@ -832,6 +847,8 @@ static int intel_vsec_tpmi_init(struct auxiliary_device *auxdev) return ret; } + blocking_notifier_call_chain(&tpmi_notify_list, TPMI_CORE_INIT, auxdev); + return 0; } @@ -845,6 +862,8 @@ static void tpmi_remove(struct auxiliary_device *auxdev) { struct intel_tpmi_info *tpmi_info = auxiliary_get_drvdata(auxdev); + blocking_notifier_call_chain(&tpmi_notify_list, TPMI_CORE_EXIT, auxdev); + debugfs_remove_recursive(tpmi_info->dbgfs_dir); } diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h index 94c06bf214fb6f..15f02422e9ca5e 100644 --- a/include/linux/intel_tpmi.h +++ b/include/linux/intel_tpmi.h @@ -28,6 +28,12 @@ enum intel_tpmi_id { TPMI_INFO_ID = 0x81, /* Special ID for PCI BDF and Package ID information */ }; +#define TPMI_CORE_INIT 0 +#define TPMI_CORE_EXIT 1 + +int tpmi_register_notifier(struct notifier_block *nb); +int tpmi_unregister_notifier(struct notifier_block *nb); + struct oobmsm_plat_info *tpmi_get_platform_data(struct auxiliary_device *auxdev); struct resource *tpmi_get_resource_at_index(struct auxiliary_device *auxdev, int index); int tpmi_get_resource_count(struct auxiliary_device *auxdev); From 14473e8c4e97d51eff9b2f384ae696f7a32f182b Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 30 Apr 2026 08:11:03 -0700 Subject: [PATCH 507/972] platform/x86/intel/tpmi/plr: Prevent fault during unbind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This driver faults when intel vsec driver is unbound from PCI driver interface. For example: echo 0000:00:03.1 > /sys/bus/pci/drivers/intel_vsec/unbind This is caused by accessing plr->dbgfs_dir after vsec_tpmi driver is removed. Here vsec_tpmi driver is the parent. On unbind, the parent device remove callback is called first which here will remove debugfs interface. Hence plr->dbgfs_dir is no longer valid. Register notifier for TPMI_CORE_EXIT and make this pointer to NULL, so that debugfs_remove_recursive() is not called with bad plr->dbgfs_dir pointer. After notifier is returned the vsec_tpmi driver will call remove debugfs by calling debugfs_remove_recursive(). Fixes: 811f67c51636 ("platform/x86/intel/tpmi: Add new auxiliary driver for performance limits") Signed-off-by: Srinivas Pandruvada Cc: Stable@vger.kernel.org Link: https://patch.msgid.link/20260430151103.1549733-4-srinivas.pandruvada@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/plr_tpmi.c | 45 +++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/intel/plr_tpmi.c b/drivers/platform/x86/intel/plr_tpmi.c index 05727169f49c14..8faecc311038f9 100644 --- a/drivers/platform/x86/intel/plr_tpmi.c +++ b/drivers/platform/x86/intel/plr_tpmi.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -60,6 +61,8 @@ struct tpmi_plr { struct tpmi_plr_die *die_info; int num_dies; struct auxiliary_device *auxdev; + struct notifier_block nb; + struct mutex lock; /* Protect access to dbgfs_dir */ }; static const char * const plr_coarse_reasons[] = { @@ -255,6 +258,30 @@ static ssize_t plr_status_write(struct file *filp, const char __user *ubuf, } DEFINE_SHOW_STORE_ATTRIBUTE(plr_status); +static int intel_plr_notify(struct notifier_block *self, unsigned long action, void *data) +{ + struct tpmi_plr *plr = container_of(self, struct tpmi_plr, nb); + + if (action == TPMI_CORE_EXIT) { + guard(mutex)(&plr->lock); + plr->dbgfs_dir = NULL; + } + + return NOTIFY_DONE; +} + +static int intel_plr_register_notifier(struct notifier_block *nb) +{ + nb->notifier_call = intel_plr_notify; + nb->priority = 0; + return tpmi_register_notifier(nb); +} + +static void intel_plr_unregister_notifier(struct notifier_block *nb) +{ + tpmi_unregister_notifier(nb); +} + static int intel_plr_probe(struct auxiliary_device *auxdev, const struct auxiliary_device_id *id) { struct oobmsm_plat_info *plat_info; @@ -282,10 +309,18 @@ static int intel_plr_probe(struct auxiliary_device *auxdev, const struct auxilia if (!plr) return -ENOMEM; + err = devm_mutex_init(&auxdev->dev, &plr->lock); + if (err) + return err; + + intel_plr_register_notifier(&plr->nb); + plr->die_info = devm_kcalloc(&auxdev->dev, num_resources, sizeof(*plr->die_info), GFP_KERNEL); - if (!plr->die_info) - return -ENOMEM; + if (!plr->die_info) { + err = -ENOMEM; + goto err_notify; + } plr->num_dies = num_resources; plr->dbgfs_dir = debugfs_create_dir("plr", dentry); @@ -326,6 +361,9 @@ static int intel_plr_probe(struct auxiliary_device *auxdev, const struct auxilia err: debugfs_remove_recursive(plr->dbgfs_dir); +err_notify: + intel_plr_unregister_notifier(&plr->nb); + return err; } @@ -333,6 +371,9 @@ static void intel_plr_remove(struct auxiliary_device *auxdev) { struct tpmi_plr *plr = auxiliary_get_drvdata(auxdev); + intel_plr_unregister_notifier(&plr->nb); + + guard(mutex)(&plr->lock); debugfs_remove_recursive(plr->dbgfs_dir); } From d7396a72eae795d7f968fb451237b6ac1616d712 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 1 May 2026 12:21:44 +0100 Subject: [PATCH 508/972] KVM: arm64: Make EL2 exception entry and exit context-synchronization events SCTLR_EL2.EIS and SCTLR_EL2.EOS control whether exception entry and exit at EL2 are Context Synchronisation Events (CSEs). Per ARM DDI 0487 M.b D24.2.175 (p. D24-9754): - !FEAT_ExS: the bit is RES1, so the entry/exit is unconditionally a CSE. - FEAT_ExS: the reset value is architecturally UNKNOWN; software must set the bit to make the entry/exit a CSE. INIT_SCTLR_EL2_MMU_ON in arch/arm64/include/asm/sysreg.h sets neither bit. KVM/arm64 hot paths rely on ERET from EL2 being a CSE, and on synchronous EL1->EL2 entry being a CSE, to elide explicit ISBs after MSRs to context-switching system registers (HCR_EL2, ZCR_EL2, ptrauth keys, etc.). On FEAT_ExS hardware those reliances are not architecturally backed unless EOS=1 (and, for entry, EIS=1). Until commit 0a35bd285f43 ("arm64: Convert SCTLR_EL2 to sysreg infrastructure"), SCTLR_EL2_RES1 was a hand-rolled mask that included BIT(11) (EOS) and BIT(22) (EIS), so INIT_SCTLR_EL2_MMU_ON was setting both unconditionally. The conversion made SCTLR_EL2_RES1 auto-generated; because the sysreg tooling only models unconditionally-RES1 fields and EIS/EOS are RES1 only when FEAT_ExS is absent, the auto-generated mask is UL(0). The seven other bits dropped from the old mask (positions 4, 5, 16, 18, 23, 28, 29) are unconditionally RES1 in the E2H=0 SCTLR_EL2 layout per DDI 0487 M.b D24.2.175, so dropping them is harmless. EIS and EOS are the only bits whose semantics changed for FEAT_ExS hardware and where the kernel relies on the value being 1. Make the guarantee explicit: include SCTLR_ELx_EIS | SCTLR_ELx_EOS in INIT_SCTLR_EL2_MMU_ON so that EL2 exception entry and exit are unconditionally CSEs regardless of whether FEAT_ExS is implemented. This matches the pairing in arch/arm64/kvm/config.c which treats EIS and EOS together as RES1 under !FEAT_ExS. Fixes: 0a35bd285f43 ("arm64: Convert SCTLR_EL2 to sysreg infrastructure") Reviewed-by: Yuan Yao Assisted-by: Gemini:gemini-3.1-pro review-prompts Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260501112149.2824881-2-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/sysreg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 736561480f3650..7aa08d59d49448 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -844,7 +844,7 @@ #define INIT_SCTLR_EL2_MMU_ON \ (SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_ELx_I | \ SCTLR_ELx_IESB | SCTLR_ELx_WXN | ENDIAN_SET_EL2 | \ - SCTLR_ELx_ITFSB | SCTLR_EL2_RES1) + SCTLR_ELx_ITFSB | SCTLR_ELx_EIS | SCTLR_ELx_EOS | SCTLR_EL2_RES1) #define INIT_SCTLR_EL2_MMU_OFF \ (SCTLR_EL2_RES1 | ENDIAN_SET_EL2) From 300fac4cc266b7782d88602b6b6a7faf31ce6405 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 1 May 2026 12:21:45 +0100 Subject: [PATCH 509/972] KVM: arm64: Guard against NULL vcpu on VHE hyp panic path On VHE, __hyp_call_panic() unconditionally calls __deactivate_traps(vcpu) on the vcpu pointer read from host_ctxt->__hyp_running_vcpu. That pointer is cleared after every guest exit (and is never set when no guest is running), so an unexpected EL2 exception landing in _guest_exit_panic, e.g. via the el2t*_invalid / el2h_irq_invalid vectors - reaches this function with vcpu == NULL. __deactivate_traps() then dereferences vcpu via ___deactivate_traps() -> vserror_state_is_nested() -> vcpu_has_nv() -> vcpu->arch.features, faulting inside the panic handler and obscuring the original failure. The nVHE counterpart (hyp_panic() in arch/arm64/kvm/hyp/nvhe/switch.c) already guards its vcpu-using cleanup with "if (vcpu)"; mirror that here. sysreg_restore_host_state_vhe() does not depend on vcpu and continues to run unconditionally, preserving panic forensics. The trailing panic("...VCPU:%p", vcpu) prints "(null)" safely via printk's %p handling. Fixes: 6a0259ed29bb ("KVM: arm64: Remove hyp_panic arguments") Assisted-by: Gemini:gemini-3.1-pro review-prompts Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260501112149.2824881-3-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/vhe/switch.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 9db3f11a4754d6..1e8995add14fa3 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -663,7 +663,8 @@ static void __noreturn __hyp_call_panic(u64 spsr, u64 elr, u64 par) host_ctxt = host_data_ptr(host_ctxt); vcpu = host_ctxt->__hyp_running_vcpu; - __deactivate_traps(vcpu); + if (vcpu) + __deactivate_traps(vcpu); sysreg_restore_host_state_vhe(host_ctxt); panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n", From d4d215e5b81ba5acb17752cab12c514a8062bada Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 1 May 2026 12:21:46 +0100 Subject: [PATCH 510/972] KVM: arm64: Fix __deactivate_fgt macro parameter typo __deactivate_fgt() declares its first parameter as "htcxt" but the body references "hctxt". The parameter is unused; the macro silently captures "hctxt" from the enclosing scope. Both existing callers (__deactivate_traps_hfgxtr() and __deactivate_traps_ich_hfgxtr()) happen to define a local "struct kvm_cpu_context *hctxt", so the macro works by coincidence. A future caller without an "hctxt" local in scope, or naming it differently, would compile but bind to the wrong context. Align the parameter name with the sibling __activate_fgt() macro. The "vcpu" parameter remains unused in the body, kept for API symmetry with __activate_fgt() (which uses it). Fixes: f5a5a406b4b8 ("KVM: arm64: Propagate and handle Fine-Grained UNDEF bits") Assisted-by: Gemini:gemini-3.1-pro review-prompts Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260501112149.2824881-4-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 98b2976837b116..bf0eb5e434274a 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -245,7 +245,7 @@ static inline void __activate_traps_ich_hfgxtr(struct kvm_vcpu *vcpu) __activate_fgt(hctxt, vcpu, ICH_HFGITR_EL2); } -#define __deactivate_fgt(htcxt, vcpu, reg) \ +#define __deactivate_fgt(hctxt, vcpu, reg) \ do { \ write_sysreg_s(ctxt_sys_reg(hctxt, reg), \ SYS_ ## reg); \ From 5130d450d1488e62e1b5310f41910a3c7320e827 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 1 May 2026 12:21:47 +0100 Subject: [PATCH 511/972] KVM: arm64: Seed pkvm_ownership_selftest vcpu memcache The hypercall handlers call pkvm_refill_memcache() to top up the hyp_vcpu memcache before invoking __pkvm_host_{share,donate}_guest(). pkvm_ownership_selftest invokes those functions directly with a static selftest_vcpu that has an empty memcache. Seed selftest_vcpu's memcache from the prepopulated selftest pages, leaving the remainder for selftest_vm.pool. Required by the memcache-sufficiency pre-check added in the following patches. Assisted-by: Gemini:gemini-3.1-pro review-prompts Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260501112149.2824881-5-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index e7496eb8562897..eb1c10120f9f58 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -752,16 +752,30 @@ static struct pkvm_hyp_vcpu selftest_vcpu = { struct pkvm_hyp_vcpu *init_selftest_vm(void *virt) { struct hyp_page *p = hyp_virt_to_page(virt); + unsigned long min_pages, seeded = 0; int i; selftest_vm.kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; WARN_ON(kvm_guest_prepare_stage2(&selftest_vm, virt)); + /* + * Mirror pkvm_refill_memcache() for the share/donate pre-checks; + * the selftest invokes those functions directly and would + * otherwise see an empty memcache. + */ + min_pages = kvm_mmu_cache_min_pages(&selftest_vm.kvm.arch.mmu); + for (i = 0; i < pkvm_selftest_pages(); i++) { if (p[i].refcount) continue; p[i].refcount = 1; - hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i])); + if (seeded < min_pages) { + push_hyp_memcache(&selftest_vcpu.vcpu.arch.pkvm_memcache, + hyp_page_to_virt(&p[i]), hyp_virt_to_phys); + seeded++; + } else { + hyp_put_page(&selftest_vm.pool, hyp_page_to_virt(&p[i])); + } } selftest_vm.kvm.arch.pkvm.handle = __pkvm_reserve_vm(); From 8234409ffb656970e2f5b29e416f041419980bef Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 1 May 2026 12:21:48 +0100 Subject: [PATCH 512/972] KVM: arm64: Pre-check vcpu memcache for host->guest share __pkvm_host_share_guest() ends with kvm_pgtable_stage2_map() to install the guest stage-2 mapping, after a forward pass that mutates the host vmemmap (sets PKVM_PAGE_SHARED_OWNED and increments host_share_guest_count) for every page in the range. The map's return value is wrapped in WARN_ON() and otherwise discarded, asserting that the call cannot fail. WARN_ON() at nVHE EL2 panics, so this assertion is only correct if the call genuinely cannot fail. kvm_pgtable_stage2_map() can fail with -ENOMEM when the stage-2 walker exhausts the caller's memcache, and the host controls the vcpu memcache via the topup interface, so an under-provisioned share request would otherwise turn a recoverable -ENOMEM into a fatal hyp panic. Bound the worst-case walker allocation in the existing pre-check pass so that kvm_pgtable_stage2_map() cannot fail at the call site, using kvm_mmu_cache_min_pages() -- the same bound host EL1 uses for its own stage-2 maps. If the vcpu memcache holds fewer pages, return -ENOMEM before any state mutation. Fixes: d0bd3e6570ae ("KVM: arm64: Introduce __pkvm_host_share_guest()") Assisted-by: Gemini:gemini-3.1-pro review-prompts Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260501112149.2824881-6-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index a3050e2b65b13d..ffb123fc1145d3 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1390,6 +1390,22 @@ int __pkvm_host_reclaim_page_guest(u64 gfn, struct pkvm_hyp_vm *vm) return ret && ret != -EHWPOISON ? ret : 0; } +/* + * share/donate install at most one stage-2 leaf (PAGE_SIZE, or one + * KVM_PGTABLE_LAST_LEVEL - 1 block for share). kvm_mmu_cache_min_pages() + * bounds the worst-case allocation: exact for the PAGE_SIZE leaf, + * conservative by one for the block. + */ +static int __guest_check_pgtable_memcache(struct pkvm_hyp_vcpu *vcpu) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + + if (vcpu->vcpu.arch.pkvm_memcache.nr_pages < kvm_mmu_cache_min_pages(vm->pgt.mmu)) + return -ENOMEM; + + return 0; +} + int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu) { struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); @@ -1474,6 +1490,10 @@ int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu } } + ret = __guest_check_pgtable_memcache(vcpu); + if (ret) + goto unlock; + for_each_hyp_page(page, phys, size) { set_host_state(page, PKVM_PAGE_SHARED_OWNED); page->host_share_guest_count++; From effc0a39b8e0f30670fe24f51e44329d4324e566 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 1 May 2026 12:21:49 +0100 Subject: [PATCH 513/972] KVM: arm64: Pre-check vcpu memcache for host->guest donate __pkvm_host_donate_guest() flips the host stage-2 PTE for the donated page to a non-valid annotation via host_stage2_set_owner_metadata_locked() and then calls kvm_pgtable_stage2_map() to install the matching guest stage-2 mapping. The map's return value is wrapped in WARN_ON() and otherwise discarded, asserting that the call cannot fail. WARN_ON() at nVHE EL2 panics, so this assertion is only correct if the call genuinely cannot fail. kvm_pgtable_stage2_map() can fail with -ENOMEM even at PAGE_SIZE granularity: the donate path verifies PKVM_NOPAGE for the guest IPA before the map, so the walker must allocate fresh page-table pages from the vcpu memcache, and the host controls the vcpu memcache via the topup interface. An under-provisioned donation request would otherwise turn a recoverable -ENOMEM into a fatal hyp panic. Bound the worst-case walker allocation alongside the existing __host_check_page_state_range() / __guest_check_page_state_range() pre-checks, using the helper introduced for host->guest share. If the vcpu memcache holds fewer pages than kvm_mmu_cache_min_pages(), return -ENOMEM before any state mutation. Fixes: 1e579adca177 ("KVM: arm64: Introduce __pkvm_host_donate_guest()") Assisted-by: Gemini:gemini-3.1-pro review-prompts Signed-off-by: Fuad Tabba Link: https://patch.msgid.link/20260501112149.2824881-7-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index ffb123fc1145d3..25f04629014e61 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1425,6 +1425,10 @@ int __pkvm_host_donate_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu) if (ret) goto unlock; + ret = __guest_check_pgtable_memcache(vcpu); + if (ret) + goto unlock; + meta = host_stage2_encode_gfn_meta(vm, gfn); WARN_ON(host_stage2_set_owner_metadata_locked(phys, PAGE_SIZE, PKVM_ID_GUEST, meta)); From 74570e12b4705ea11dcdfbfbd0a0b0fdaeff3059 Mon Sep 17 00:00:00 2001 From: Gyeyoung Baek Date: Sun, 19 Apr 2026 16:17:15 +0900 Subject: [PATCH 514/972] accel/rocket: Fix prep_bo ioctl leaking positive return from dma_resv_wait_timeout() dma_resv_wait_timeout() returns a positive 'remaining jiffies' value on success, 0 on timeout, and -errno on failure. rocket_ioctl_prep_bo() returns this 'long' result from an int-typed ioctl handler, so positive values reach userspace as bogus errors. Explicitly set ret to 0 on the success path. Fixes: 525ad89dd904 ("accel/rocket: Add IOCTLs for synchronizing memory accesses") Cc: stable@vger.kernel.org Signed-off-by: Gyeyoung Baek Reviewed-by: Tomeu Vizoso Link: https://patch.msgid.link/c0ebf83b345721701b22d8f5bc41c52c0ecf5e16.1776581974.git.gye976@gmail.com Signed-off-by: Steven Price --- drivers/accel/rocket/rocket_gem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/accel/rocket/rocket_gem.c b/drivers/accel/rocket/rocket_gem.c index b6a385d2edfc30..c8084719208a2a 100644 --- a/drivers/accel/rocket/rocket_gem.c +++ b/drivers/accel/rocket/rocket_gem.c @@ -145,6 +145,8 @@ int rocket_ioctl_prep_bo(struct drm_device *dev, void *data, struct drm_file *fi ret = dma_resv_wait_timeout(gem_obj->resv, DMA_RESV_USAGE_WRITE, true, timeout); if (!ret) ret = timeout ? -ETIMEDOUT : -EBUSY; + else if (ret > 0) + ret = 0; shmem_obj = &to_rocket_bo(gem_obj)->base; From 459d75523b71c0ec254d153d8850d0b7008af396 Mon Sep 17 00:00:00 2001 From: Gyeyoung Baek Date: Sun, 19 Apr 2026 16:17:16 +0900 Subject: [PATCH 515/972] drm/panfrost: Fix wait_bo ioctl leaking positive return from dma_resv_wait_timeout() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dma_resv_wait_timeout() returns a positive 'remaining jiffies' value on success, 0 on timeout, and -errno on failure. panfrost_ioctl_wait_bo() returns this 'long' result from an int-typed ioctl handler, so positive values reach userspace as bogus errors. Explicitly set ret to 0 on the success path. Fixes: f3ba91228e8e ("drm/panfrost: Add initial panfrost driver") Cc: stable@vger.kernel.org Signed-off-by: Gyeyoung Baek Reviewed-by: Adrián Larumbe Reviewed-by: Boris Brezillon Reviewed-by: Steven Price Link: https://patch.msgid.link/fe33f82fded7be1c18e2e0eb2db451d5a738cf39.1776581974.git.gye976@gmail.com Signed-off-by: Steven Price --- drivers/gpu/drm/panfrost/panfrost_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c index 711f5101aa04ce..074c0995ddc26c 100644 --- a/drivers/gpu/drm/panfrost/panfrost_drv.c +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c @@ -390,6 +390,8 @@ panfrost_ioctl_wait_bo(struct drm_device *dev, void *data, true, timeout); if (!ret) ret = timeout ? -ETIMEDOUT : -EBUSY; + else if (ret > 0) + ret = 0; drm_gem_object_put(gem_obj); From a59e45221df82e8a6246c617615c1ccc12e3545d Mon Sep 17 00:00:00 2001 From: Haichen Feng <2806891994@qq.com> Date: Thu, 7 May 2026 22:02:42 +0800 Subject: [PATCH 516/972] platform/x86: hp-wmi: Add support for Victus 16-r0xxx (8BC2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HP Victus 16-r0xxx (board ID: 8BC2) has the same WMI as other Victus S boards, but requires quirks for correctly switching thermal profile. Add the DMI board name to victus_s_thermal_profile_boards[] table and map it to omen_v1_thermal_params. Testing on board 8BC2 confirmed that platform profile is registered successfully and fan RPMs are readable and controllable. Signed-off-by: Haichen Feng <2806891994@qq.com> Link: https://patch.msgid.link/tencent_8E29805D8DC7B6005244C3433C62DD9DF606@qq.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/hp/hp-wmi.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/platform/x86/hp/hp-wmi.c b/drivers/platform/x86/hp/hp-wmi.c index 24c151289dd3dd..6950bec2a9d80d 100644 --- a/drivers/platform/x86/hp/hp-wmi.c +++ b/drivers/platform/x86/hp/hp-wmi.c @@ -205,6 +205,10 @@ static const struct dmi_system_id victus_s_thermal_profile_boards[] __initconst .matches = { DMI_MATCH(DMI_BOARD_NAME, "8BBE") }, .driver_data = (void *)&victus_s_thermal_params, }, + { + .matches = { DMI_MATCH(DMI_BOARD_NAME, "8BC2") }, + .driver_data = (void *)&omen_v1_thermal_params, + }, { .matches = { DMI_MATCH(DMI_BOARD_NAME, "8BCA") }, .driver_data = (void *)&omen_v1_thermal_params, From 08f566e8f83bb70f04ad5aba5be352c490a01c8a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 5 May 2026 15:21:53 +0200 Subject: [PATCH 517/972] veth: fix OOB txq access in veth_poll() with asymmetric queue counts XDP redirect into a veth device (via bpf_redirect()) calls veth_xdp_xmit(), which enqueues frames into the peer's ptr_ring using smp_processor_id() % peer->real_num_rx_queues as the ring index. With an asymmetric veth pair where the peer has fewer TX queues than RX queues, that index can exceed peer->real_num_tx_queues. veth_poll() then resolves peer_txq for the ring via: peer_txq = peer_dev ? netdev_get_tx_queue(peer_dev, queue_idx) : NULL; where queue_idx = rq->xdp_rxq.queue_index. When queue_idx exceeds peer_dev->real_num_tx_queues this is an out-of-bounds (OOB) access into the peer's netdev_queue array, triggering DEBUG_NET_WARN_ON_ONCE in netdev_get_tx_queue(). The normal ndo_start_xmit path is not affected: the stack clamps skb->queue_mapping via netdev_cap_txqueue() before invoking ndo_start_xmit, so rxq in veth_xmit() never exceeds real_num_tx_queues. Fix veth_poll() by clamping: only dereference peer_txq when queue_idx is within bounds, otherwise set it to NULL. The out-of-range rings are fed exclusively via XDP redirect (veth_xdp_xmit), never via ndo_start_xmit (veth_xmit), so the peer txq was never stopped and there is nothing to wake; NULL is the correct fallback. Reported-by: Sashiko Closes: https://lore.kernel.org/all/20260502071828.616C3C19425@smtp.kernel.org/ Fixes: dc82a33297fc ("veth: apply qdisc backpressure on full ptr_ring to reduce TX drops") Signed-off-by: Jesper Dangaard Brouer Link: https://patch.msgid.link/20260505132159.241305-2-hawk@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/veth.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index e35df717e65e28..0cfb19b760dd54 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -972,7 +972,8 @@ static int veth_poll(struct napi_struct *napi, int budget) /* NAPI functions as RCU section */ peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held()); - peer_txq = peer_dev ? netdev_get_tx_queue(peer_dev, queue_idx) : NULL; + peer_txq = (peer_dev && queue_idx < peer_dev->real_num_tx_queues) ? + netdev_get_tx_queue(peer_dev, queue_idx) : NULL; xdp_set_return_frame_no_direct(); done = veth_xdp_rcv(rq, budget, &bq, &stats); From aa2fbece1b07954ef26488c800d126a36a8ab93e Mon Sep 17 00:00:00 2001 From: Shuhao Fu Date: Tue, 28 Apr 2026 16:01:39 +0800 Subject: [PATCH 518/972] ALSA: hda: cs35l56: Put ACPI device after setting companion acpi_dev_get_first_match_dev() returns a refcounted ACPI device and callers are expected to balance it with acpi_dev_put(). When no companion is already attached, cs35l56_hda_read_acpi() looks up an ACPI device and sets it with ACPI_COMPANION_SET(), but leaves the lookup reference held. ACPI_COMPANION_SET() does not take ownership of that reference, so drop it with acpi_dev_put() after attaching the companion. Fixes: 73cfbfa9caea ("ALSA: hda/cs35l56: Add driver for Cirrus Logic CS35L56 amplifier") Signed-off-by: Shuhao Fu Tested-by: Simon Trimmer Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20260428080139.GA1649104@chcpu16 --- sound/hda/codecs/side-codecs/cs35l56_hda.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/side-codecs/cs35l56_hda.c b/sound/hda/codecs/side-codecs/cs35l56_hda.c index 4c8d01799931c8..cdbc576569efee 100644 --- a/sound/hda/codecs/side-codecs/cs35l56_hda.c +++ b/sound/hda/codecs/side-codecs/cs35l56_hda.c @@ -1041,6 +1041,7 @@ static int cs35l56_hda_read_acpi(struct cs35l56_hda *cs35l56, int hid, int id) return -ENODEV; } ACPI_COMPANION_SET(cs35l56->base.dev, adev); + acpi_dev_put(adev); } /* Initialize things that could be overwritten by a fixup */ From fca7401fe37f7abc6e54147ea560f37279231137 Mon Sep 17 00:00:00 2001 From: Shuhao Fu Date: Tue, 28 Apr 2026 16:12:38 +0800 Subject: [PATCH 519/972] ALSA: hda: cs35l41: Put ACPI device on missing physical node acpi_dev_get_first_match_dev() returns a refcounted ACPI device and callers must balance it with acpi_dev_put(). cs35l41_hda_read_acpi() stores the returned ACPI device in cs35l41->dacpi. That reference is normally released by the later probe cleanup or the remove path, but the NULL-check on physdev exits before either of those paths can run. Drop the lookup reference before returning -ENODEV. Fixes: c34b04cc6178 ("ALSA: hda: cs35l41: Fix NULL pointer dereference in cs35l41_hda_read_acpi()") Signed-off-by: Shuhao Fu Tested-by: Simon Trimmer Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20260428081238.GA1659932@chcpu16 --- sound/hda/codecs/side-codecs/cs35l41_hda.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sound/hda/codecs/side-codecs/cs35l41_hda.c b/sound/hda/codecs/side-codecs/cs35l41_hda.c index b64890006bb701..acfccc848f82d8 100644 --- a/sound/hda/codecs/side-codecs/cs35l41_hda.c +++ b/sound/hda/codecs/side-codecs/cs35l41_hda.c @@ -1896,8 +1896,10 @@ static int cs35l41_hda_read_acpi(struct cs35l41_hda *cs35l41, const char *hid, i cs35l41->dacpi = adev; physdev = get_device(acpi_get_first_physical_node(adev)); - if (!physdev) + if (!physdev) { + acpi_dev_put(adev); return -ENODEV; + } sub = acpi_get_subsystem_id(ACPI_HANDLE(physdev)); if (IS_ERR(sub)) From d119775f2bad827edc28071c061fdd4a91f889a5 Mon Sep 17 00:00:00 2001 From: Jiexun Wang Date: Wed, 6 May 2026 22:08:23 +0800 Subject: [PATCH 520/972] af_unix: Reject SIOCATMARK on non-stream sockets SIOCATMARK reports whether the receive queue is at the urgent mark for MSG_OOB. In AF_UNIX, MSG_OOB is supported only for SOCK_STREAM sockets. SOCK_DGRAM and SOCK_SEQPACKET reject MSG_OOB in sendmsg() and recvmsg(), so they should not support SIOCATMARK either. Return -EOPNOTSUPP for non-stream sockets before checking the receive queue. Fixes: 314001f0bf92 ("af_unix: Add OOB support") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Suggested-by: Kuniyuki Iwashima Signed-off-by: Jiexun Wang Signed-off-by: Ren Wei Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260506140825.2987635-1-n05ec@lzu.edu.cn Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e2d787ca3e743d..1cbf36ea043bd5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3323,6 +3323,9 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) struct sk_buff *skb; int answ = 0; + if (sk->sk_type != SOCK_STREAM) + return -EOPNOTSUPP; + mutex_lock(&u->iolock); skb = skb_peek(&sk->sk_receive_queue); From 9032f7676935a13fd402608223d326c5f62da9c0 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Wed, 6 May 2026 09:41:05 +0800 Subject: [PATCH 521/972] net/smc: fix missing sk_err when TCP handshake fails In smc_connect_work(), when the underlying TCP handshake fails, the error code (rc) must be propagated to sk_err to ensure userspace can correctly retrieve the error status via SO_ERROR. Currently, the code only handles a restricted set of error codes (e.g., EPIPE, ECONNREFUSED). If other errors occurs, such as EHOSTUNREACH, sk_err remains unset (zero). This affects applications that rely on SO_ERROR to determine connect outcome. For example, higher versions of Go's netpoller treats SO_ERROR == 0 combined with a failed getpeername() as a spurious wakeup and re-enters epoll_wait(). Under ET mode, no further edge will be generated since the socket is already in a terminal state, causing the connect to hang indefinitely or until a user-specified timeout, if one is set. Fixes: 50717a37db03 ("net/smc: nonblocking connect rework") Signed-off-by: D. Wythe Reviewed-by: Dust Li Link: https://patch.msgid.link/20260506014105.27093-1-alibuda@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/smc/af_smc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 1a565095376aab..185dbed7de5d6c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1628,12 +1628,8 @@ static void smc_connect_work(struct work_struct *work) lock_sock(&smc->sk); if (rc != 0 || smc->sk.sk_err) { smc->sk.sk_state = SMC_CLOSED; - if (rc == -EPIPE || rc == -EAGAIN) - smc->sk.sk_err = EPIPE; - else if (rc == -ECONNREFUSED) - smc->sk.sk_err = ECONNREFUSED; - else if (signal_pending(current)) - smc->sk.sk_err = -sock_intr_errno(timeo); + if (!smc->sk.sk_err) + smc->sk.sk_err = (rc == -EAGAIN) ? EPIPE : -rc; sock_put(&smc->sk); /* passive closing */ goto out; } From 32cd651d14fc72a93703ea2384cb5cd8998523a8 Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Tue, 5 May 2026 10:39:26 -0700 Subject: [PATCH 522/972] net: phy: broadcom: Save PHY counters during suspend The PHY counters can be lost if the PHY is reset during suspend. We need to save the values into the shadow counters or the accounting will be incorrect over multiple suspend and resume cycles. Fixes: 820ee17b8d3b ("net: phy: broadcom: Add support code for reading PHY counters") Signed-off-by: Justin Chen Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20260505173926.2870069-1-justin.chen@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/bcm-phy-lib.c | 9 +++++++++ drivers/net/phy/bcm-phy-lib.h | 1 + drivers/net/phy/bcm7xxx.c | 14 ++++++++++++++ drivers/net/phy/broadcom.c | 5 +++++ 4 files changed, 29 insertions(+) diff --git a/drivers/net/phy/bcm-phy-lib.c b/drivers/net/phy/bcm-phy-lib.c index 5198d66dbbc048..b64beade8dd933 100644 --- a/drivers/net/phy/bcm-phy-lib.c +++ b/drivers/net/phy/bcm-phy-lib.c @@ -563,6 +563,15 @@ void bcm_phy_get_stats(struct phy_device *phydev, u64 *shadow, } EXPORT_SYMBOL_GPL(bcm_phy_get_stats); +void bcm_phy_update_stats_shadow(struct phy_device *phydev, u64 *shadow) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(bcm_phy_hw_stats); i++) + bcm_phy_get_stat(phydev, shadow, i); +} +EXPORT_SYMBOL_GPL(bcm_phy_update_stats_shadow); + void bcm_phy_r_rc_cal_reset(struct phy_device *phydev) { /* Reset R_CAL/RC_CAL Engine */ diff --git a/drivers/net/phy/bcm-phy-lib.h b/drivers/net/phy/bcm-phy-lib.h index bceddbc860eb28..bba94ce9619543 100644 --- a/drivers/net/phy/bcm-phy-lib.h +++ b/drivers/net/phy/bcm-phy-lib.h @@ -85,6 +85,7 @@ int bcm_phy_get_sset_count(struct phy_device *phydev); void bcm_phy_get_strings(struct phy_device *phydev, u8 *data); void bcm_phy_get_stats(struct phy_device *phydev, u64 *shadow, struct ethtool_stats *stats, u64 *data); +void bcm_phy_update_stats_shadow(struct phy_device *phydev, u64 *shadow); void bcm_phy_r_rc_cal_reset(struct phy_device *phydev); int bcm_phy_28nm_a0b0_afe_config_init(struct phy_device *phydev); int bcm_phy_enable_jumbo(struct phy_device *phydev); diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c index 00e8fa14aa773a..71a163f62c0eb6 100644 --- a/drivers/net/phy/bcm7xxx.c +++ b/drivers/net/phy/bcm7xxx.c @@ -807,6 +807,17 @@ static void bcm7xxx_28nm_get_phy_stats(struct phy_device *phydev, bcm_phy_get_stats(phydev, priv->stats, stats, data); } +static int bcm7xxx_28nm_suspend(struct phy_device *phydev) +{ + struct bcm7xxx_phy_priv *priv = phydev->priv; + + mutex_lock(&phydev->lock); + bcm_phy_update_stats_shadow(phydev, priv->stats); + mutex_unlock(&phydev->lock); + + return genphy_suspend(phydev); +} + static int bcm7xxx_28nm_probe(struct phy_device *phydev) { struct bcm7xxx_phy_priv *priv; @@ -849,6 +860,7 @@ static int bcm7xxx_28nm_probe(struct phy_device *phydev) .flags = PHY_IS_INTERNAL, \ .config_init = bcm7xxx_28nm_config_init, \ .resume = bcm7xxx_28nm_resume, \ + .suspend = bcm7xxx_28nm_suspend, \ .get_tunable = bcm7xxx_28nm_get_tunable, \ .set_tunable = bcm7xxx_28nm_set_tunable, \ .get_sset_count = bcm_phy_get_sset_count, \ @@ -866,6 +878,7 @@ static int bcm7xxx_28nm_probe(struct phy_device *phydev) .flags = PHY_IS_INTERNAL, \ .config_init = bcm7xxx_28nm_ephy_config_init, \ .resume = bcm7xxx_28nm_ephy_resume, \ + .suspend = bcm7xxx_28nm_suspend, \ .get_sset_count = bcm_phy_get_sset_count, \ .get_strings = bcm_phy_get_strings, \ .get_stats = bcm7xxx_28nm_get_phy_stats, \ @@ -902,6 +915,7 @@ static int bcm7xxx_28nm_probe(struct phy_device *phydev) .config_aneg = genphy_config_aneg, \ .read_status = genphy_read_status, \ .resume = bcm7xxx_16nm_ephy_resume, \ + .suspend = bcm7xxx_28nm_suspend, \ } static struct phy_driver bcm7xxx_driver[] = { diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index bf0c6a04481ee2..d1a4edb34ad2ef 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -592,8 +592,13 @@ static int bcm54xx_set_wakeup_irq(struct phy_device *phydev, bool state) static int bcm54xx_suspend(struct phy_device *phydev) { + struct bcm54xx_phy_priv *priv = phydev->priv; int ret = 0; + mutex_lock(&phydev->lock); + bcm_phy_update_stats_shadow(phydev, priv->stats); + mutex_unlock(&phydev->lock); + bcm54xx_ptp_stop(phydev); /* Acknowledge any Wake-on-LAN interrupt prior to suspend */ From 019c892e46544af0ae94ec833f79aa903c837666 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 May 2026 06:59:53 +0000 Subject: [PATCH 523/972] ipmr: Call ipmr_fib_lookup() under RCU. Yi Lai reported RCU splat in reg_vif_xmit() below. [0] When CONFIG_IP_MROUTE_MULTIPLE_TABLES=n, ipmr_fib_lookup() uses rcu_dereference() without explicit rcu_read_lock(). Although rcu_read_lock_bh() is already held by the caller __dev_queue_xmit(), lockdep requires explicit rcu_read_lock() for rcu_dereference(). Let's move up rcu_read_lock() in reg_vif_xmit() to cover ipmr_fib_lookup(). [0]: WARNING: suspicious RCU usage 7.1.0-rc2-next-20260504-9d0d467c3572 #1 Not tainted ----------------------------- net/ipv4/ipmr.c:329 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 2 locks held by syz.2.17/1779: #0: ffffffff87896440 (rcu_read_lock_bh){....}-{1:3}, at: local_bh_disable include/linux/bottom_half.h:20 [inline] #0: ffffffff87896440 (rcu_read_lock_bh){....}-{1:3}, at: rcu_read_lock_bh include/linux/rcupdate.h:891 [inline] #0: ffffffff87896440 (rcu_read_lock_bh){....}-{1:3}, at: __dev_queue_xmit+0x239/0x4140 net/core/dev.c:4792 #1: ffff88801a199d18 (_xmit_PIMREG#2){+...}-{3:3}, at: spin_lock include/linux/spinlock.h:342 [inline] #1: ffff88801a199d18 (_xmit_PIMREG#2){+...}-{3:3}, at: __netif_tx_lock include/linux/netdevice.h:4795 [inline] #1: ffff88801a199d18 (_xmit_PIMREG#2){+...}-{3:3}, at: __dev_queue_xmit+0x1d5d/0x4140 net/core/dev.c:4865 stack backtrace: CPU: 1 UID: 0 PID: 1779 Comm: syz.2.17 Not tainted 7.1.0-rc2-next-20260504-9d0d467c3572 #1 PREEMPT(lazy) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x121/0x150 lib/dump_stack.c:120 dump_stack+0x19/0x20 lib/dump_stack.c:129 lockdep_rcu_suspicious+0x15b/0x1f0 kernel/locking/lockdep.c:6878 ipmr_fib_lookup net/ipv4/ipmr.c:329 [inline] reg_vif_xmit+0x2ee/0x3c0 net/ipv4/ipmr.c:540 __netdev_start_xmit include/linux/netdevice.h:5382 [inline] netdev_start_xmit include/linux/netdevice.h:5391 [inline] xmit_one net/core/dev.c:3889 [inline] dev_hard_start_xmit+0x170/0x700 net/core/dev.c:3905 __dev_queue_xmit+0x1df1/0x4140 net/core/dev.c:4871 dev_queue_xmit include/linux/netdevice.h:3423 [inline] packet_xmit+0x252/0x370 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3082 [inline] packet_sendmsg+0x39ad/0x5650 net/packet/af_packet.c:3114 sock_sendmsg_nosec net/socket.c:797 [inline] __sock_sendmsg net/socket.c:812 [inline] ____sys_sendmsg+0xa21/0xba0 net/socket.c:2716 ___sys_sendmsg+0x121/0x1c0 net/socket.c:2770 __sys_sendmsg+0x177/0x220 net/socket.c:2802 __do_sys_sendmsg net/socket.c:2807 [inline] __se_sys_sendmsg net/socket.c:2805 [inline] __x64_sys_sendmsg+0x80/0xc0 net/socket.c:2805 x64_sys_call+0x1d9c/0x21c0 arch/x86/include/generated/asm/syscalls_64.h:47 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xc1/0x1020 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f37e563ee5d Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 93 af 1b 00 f7 d8 64 89 01 48 RSP: 002b:00007ffe5caa7fa8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00000000005c5fa0 RCX: 00007f37e563ee5d RDX: 0000000000000000 RSI: 00002000000012c0 RDI: 0000000000000004 RBP: 00000000005c5fa0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00000000005c5fac R15: 00000000005c5fa0 Fixes: b3b6babf4751 ("ipmr: Free mr_table after RCU grace period.") Reported-by: syzkaller Reported-by: Yi Lai Closes: https://lore.kernel.org/netdev/afrY34dLXNUboevf@ly-workstation/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260506065955.1695753-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 05fb6eefe0beb3..2628cd3a93a684 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -537,15 +537,16 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) }; int err; + rcu_read_lock(); err = ipmr_fib_lookup(net, &fl4, &mrt); if (err < 0) { + rcu_read_unlock(); kfree_skb(skb); return err; } DEV_STATS_ADD(dev, tx_bytes, skb->len); DEV_STATS_INC(dev, tx_packets); - rcu_read_lock(); /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */ ipmr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num), From ecddc523cfdb85b3e132f13e293224ebfdfab564 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 May 2026 07:04:42 +0000 Subject: [PATCH 524/972] tcp: Fix dst leak in tcp_v6_connect(). If a socket is bound to a wildcard address, tcp_v[46]_connect() updates it with a non-wildcard address based on the route lookup. After bhash2 was introduced in the cited commit, we must call inet_bhash2_update_saddr() to update the bhash2 entry as well. If inet_bhash2_update_saddr() fails, we must release the refcount for dst by ip_route_connect() or ip6_dst_lookup_flow(). While tcp_v4_connect() calls ip_rt_put() in the error path, tcp_v6_connect() does not call dst_release(). Let's call dst_release() when inet_bhash2_update_saddr() fails in tcp_v6_connect(). Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address") Reported-by: Damiano Melotti Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260506070443.1699879-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/tcp_ipv6.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 51583aef0643e9..d13d49bfef1945 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -288,8 +288,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, saddr = &fl6->saddr; err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); - if (err) + if (err) { + dst_release(dst); goto failure; + } } /* set the source address */ From dedf6c90386d99b878763c183a08b61d3ce4824e Mon Sep 17 00:00:00 2001 From: Joey Lu Date: Wed, 6 May 2026 16:46:13 +0800 Subject: [PATCH 525/972] net: stmmac: dwmac-nuvoton: fix NULL pointer dereference in nvt_set_phy_intf_sel() priv->dev was never initialized after devm_kzalloc() allocates the private data structure. When nvt_set_phy_intf_sel() is later invoked via the phylink interface_select callback, it calls nvt_gmac_get_delay(priv->dev, ...) which dereferences the NULL pointer. Fix this by assigning priv->dev = dev immediately after allocation. Fixes: 4d7c557f58ef ("net: stmmac: dwmac-nuvoton: Add dwmac glue for Nuvoton MA35 family") Signed-off-by: Joey Lu Link: https://patch.msgid.link/20260506084614.192894-2-a0987203069@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-nuvoton.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-nuvoton.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-nuvoton.c index e2240b68ad98d6..2ab6ecac64224b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-nuvoton.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-nuvoton.c @@ -100,6 +100,8 @@ static int nvt_gmac_probe(struct platform_device *pdev) if (!priv) return dev_err_probe(dev, -ENOMEM, "Failed to allocate private data\n"); + priv->dev = dev; + priv->regmap = syscon_regmap_lookup_by_phandle_args(dev->of_node, "nuvoton,sys", 1, &priv->macid); if (IS_ERR(priv->regmap)) From b131dc93f7bf1b1461f5bde0c06c4c2384aa5b58 Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Wed, 6 May 2026 09:25:38 +0200 Subject: [PATCH 526/972] net: sparx5: fix wrong chip ids for TSN SKUs The TSN SKUs in enum spx5_target_chiptype have incorrect IDs: SPX5_TARGET_CT_7546TSN = 0x47546, SPX5_TARGET_CT_7549TSN = 0x47549, SPX5_TARGET_CT_7552TSN = 0x47552, SPX5_TARGET_CT_7556TSN = 0x47556, SPX5_TARGET_CT_7558TSN = 0x47558, The value read back from the chip is GCB_CHIP_ID_PART_ID, which is a GENMASK(27, 12) field, i.e. at most 16 bits wide. It can never match these IDs, so probing a TSN part fails with a "Target not supported" error. Fix the enum to use the actual 16-bit part IDs returned by the hardware: 0x0546, 0x0549, 0x0552, 0x0556 and 0x0558. Reported-by: Andrew Lunn Fixes: 3cfa11bac9bb ("net: sparx5: add the basic sparx5 driver") Signed-off-by: Daniel Machon Link: https://patch.msgid.link/20260506-misc-fixes-sparx5-lan969x-v2-3-fb236aa96908@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/sparx5/sparx5_main.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h index 6a745bb71b5c53..eb57b86fbe22fa 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h @@ -31,11 +31,11 @@ enum spx5_target_chiptype { SPX5_TARGET_CT_7552 = 0x7552, /* SparX-5-128 Enterprise */ SPX5_TARGET_CT_7556 = 0x7556, /* SparX-5-160 Enterprise */ SPX5_TARGET_CT_7558 = 0x7558, /* SparX-5-200 Enterprise */ - SPX5_TARGET_CT_7546TSN = 0x47546, /* SparX-5-64i Industrial */ - SPX5_TARGET_CT_7549TSN = 0x47549, /* SparX-5-90i Industrial */ - SPX5_TARGET_CT_7552TSN = 0x47552, /* SparX-5-128i Industrial */ - SPX5_TARGET_CT_7556TSN = 0x47556, /* SparX-5-160i Industrial */ - SPX5_TARGET_CT_7558TSN = 0x47558, /* SparX-5-200i Industrial */ + SPX5_TARGET_CT_7546TSN = 0x0546, /* SparX-5-64i Industrial */ + SPX5_TARGET_CT_7549TSN = 0x0549, /* SparX-5-90i Industrial */ + SPX5_TARGET_CT_7552TSN = 0x0552, /* SparX-5-128i Industrial */ + SPX5_TARGET_CT_7556TSN = 0x0556, /* SparX-5-160i Industrial */ + SPX5_TARGET_CT_7558TSN = 0x0558, /* SparX-5-200i Industrial */ SPX5_TARGET_CT_LAN9694 = 0x9694, /* lan969x-40 */ SPX5_TARGET_CT_LAN9691VAO = 0x9691, /* lan969x-40-VAO */ SPX5_TARGET_CT_LAN9694TSN = 0x9695, /* lan969x-40-TSN */ From 41ae14071cd7f6a7770e2fe1f8a0859d4c2c6ba4 Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Wed, 6 May 2026 09:25:39 +0200 Subject: [PATCH 527/972] net: sparx5: configure serdes for 1000BASE-X in sparx5_port_init() sparx5_port_init() only invokes sparx5_serdes_set() and the associated shadow-device enable and low-speed device switch for SGMII and QSGMII. On any port with a high-speed primary device (DEV5G/DEV10G/DEV25G) configured for 1000BASE-X the serdes is therefore left uninitialized, the DEV2G5 shadow is never enabled, and the port stays pointed at its high-speed device rather than the DEV2G5. The PCS1G block looks healthy in isolation, but no frames reach the link partner. Add 1000BASE-X to the check so the same three steps run. Note: the same issue might apply to 2500BASE-X, but that will, eventually, be addressed in a separate commit. Reported-by: Andrew Lunn Fixes: 946e7fd5053a ("net: sparx5: add port module support") Signed-off-by: Daniel Machon Link: https://patch.msgid.link/20260506-misc-fixes-sparx5-lan969x-v2-4-fb236aa96908@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/sparx5/sparx5_port.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_port.c b/drivers/net/ethernet/microchip/sparx5/sparx5_port.c index 04bc8fffaf961e..62c49893de3c37 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_port.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_port.c @@ -1128,7 +1128,8 @@ int sparx5_port_init(struct sparx5 *sparx5, DEV2G5_PCS1G_SD_CFG(port->portno)); if (conf->portmode == PHY_INTERFACE_MODE_QSGMII || - conf->portmode == PHY_INTERFACE_MODE_SGMII) { + conf->portmode == PHY_INTERFACE_MODE_SGMII || + conf->portmode == PHY_INTERFACE_MODE_1000BASEX) { err = sparx5_serdes_set(sparx5, port, conf); if (err) return err; From 5be7a0cef3229fb3b63a07c0d289daf752545424 Mon Sep 17 00:00:00 2001 From: Piyush Sachdeva Date: Thu, 7 May 2026 22:22:13 +0530 Subject: [PATCH 528/972] smb: client: Use FullSessionKey for AES-256 encryption key derivation When Kerberos authentication is used with AES-256 encryption (AES-256-CCM or AES-256-GCM), the SMB3 encryption and decryption keys must be derived using the full session key (Session.FullSessionKey) rather than just the first 16 bytes (Session.SessionKey). Per MS-SMB2 section 3.2.5.3.1, when Connection.Dialect is "3.1.1" and Connection.CipherId is AES-256-CCM or AES-256-GCM, Session.FullSessionKey must be set to the full cryptographic key from the GSS authentication context. The encryption and decryption key derivation (SMBC2SCipherKey, SMBS2CCipherKey) must use this FullSessionKey as the KDF input. The signing key derivation continues to use Session.SessionKey (first 16 bytes) in all cases. Previously, generate_key() hardcoded SMB2_NTLMV2_SESSKEY_SIZE (16) as the HMAC-SHA256 key input length for all derivations. When Kerberos with AES-256 provides a 32-byte session key, the KDF for encryption/decryption was using only the first 16 bytes, producing keys that did not match the server's, causing mount failures with sec=krb5 and require_gcm_256=1. Add a full_key_size parameter to generate_key() and pass the appropriate size from generate_smb3signingkey(): - Signing: always SMB2_NTLMV2_SESSKEY_SIZE (16 bytes) - Encryption/Decryption: ses->auth_key.len when AES-256, otherwise 16 Also fix cifs_dump_full_key() to report the actual session key length for AES-256 instead of hardcoded CIFS_SESS_KEY_SIZE, so that userspace tools like Wireshark receive the correct key for decryption. Cc: Reviewed-by: Bharath SM Signed-off-by: Piyush Sachdeva Signed-off-by: Piyush Sachdeva Signed-off-by: Steve French --- fs/smb/client/ioctl.c | 2 +- fs/smb/client/smb2transport.c | 35 ++++++++++++++++++++++++++--------- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index 9afab3237e54c3..17408bb8ab65bf 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -296,7 +296,7 @@ static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug break; case SMB2_ENCRYPTION_AES256_CCM: case SMB2_ENCRYPTION_AES256_GCM: - out.session_key_length = CIFS_SESS_KEY_SIZE; + out.session_key_length = ses->auth_key.len; out.server_in_key_length = out.server_out_key_length = SMB3_GCM256_CRYPTKEY_SIZE; break; default: diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index 41009039b4cbe6..e8eeff9e50d67f 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -251,7 +251,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) } static void generate_key(struct cifs_ses *ses, struct kvec label, - struct kvec context, __u8 *key, unsigned int key_size) + struct kvec context, __u8 *key, unsigned int key_size, + unsigned int full_key_size) { unsigned char zero = 0x0; __u8 i[4] = {0, 0, 0, 1}; @@ -265,7 +266,7 @@ static void generate_key(struct cifs_ses *ses, struct kvec label, memset(key, 0x0, key_size); hmac_sha256_init_usingrawkey(&hmac_ctx, ses->auth_key.response, - SMB2_NTLMV2_SESSKEY_SIZE); + full_key_size); hmac_sha256_update(&hmac_ctx, i, 4); hmac_sha256_update(&hmac_ctx, label.iov_base, label.iov_len); hmac_sha256_update(&hmac_ctx, &zero, 1); @@ -298,6 +299,7 @@ generate_smb3signingkey(struct cifs_ses *ses, struct TCP_Server_Info *server, const struct derivation_triplet *ptriplet) { + unsigned int full_key_size = SMB2_NTLMV2_SESSKEY_SIZE; bool is_binding = false; int chan_index = 0; @@ -330,12 +332,24 @@ generate_smb3signingkey(struct cifs_ses *ses, if (is_binding) { generate_key(ses, ptriplet->signing.label, ptriplet->signing.context, - ses->chans[chan_index].signkey, - SMB3_SIGN_KEY_SIZE); + ses->chans[chan_index].signkey, SMB3_SIGN_KEY_SIZE, + SMB2_NTLMV2_SESSKEY_SIZE); } else { generate_key(ses, ptriplet->signing.label, - ptriplet->signing.context, - ses->smb3signingkey, SMB3_SIGN_KEY_SIZE); + ptriplet->signing.context, ses->smb3signingkey, + SMB3_SIGN_KEY_SIZE, SMB2_NTLMV2_SESSKEY_SIZE); + + /* + * Per MS-SMB2 3.2.5.3.1, signing key always uses Session.SessionKey + * (first 16 bytes). Encryption/decryption keys use + * Session.FullSessionKey when dialect is 3.1.1 and cipher is + * AES-256-CCM or AES-256-GCM, otherwise Session.SessionKey. + */ + + if (server->dialect == SMB311_PROT_ID && + (server->cipher_type == SMB2_ENCRYPTION_AES256_CCM || + server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) + full_key_size = ses->auth_key.len; /* safe to access primary channel, since it will never go away */ spin_lock(&ses->chan_lock); @@ -345,10 +359,13 @@ generate_smb3signingkey(struct cifs_ses *ses, generate_key(ses, ptriplet->encryption.label, ptriplet->encryption.context, - ses->smb3encryptionkey, SMB3_ENC_DEC_KEY_SIZE); + ses->smb3encryptionkey, SMB3_ENC_DEC_KEY_SIZE, + full_key_size); + generate_key(ses, ptriplet->decryption.label, ptriplet->decryption.context, - ses->smb3decryptionkey, SMB3_ENC_DEC_KEY_SIZE); + ses->smb3decryptionkey, SMB3_ENC_DEC_KEY_SIZE, + full_key_size); } #ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS @@ -361,7 +378,7 @@ generate_smb3signingkey(struct cifs_ses *ses, &ses->Suid); cifs_dbg(VFS, "Cipher type %d\n", server->cipher_type); cifs_dbg(VFS, "Session Key %*ph\n", - SMB2_NTLMV2_SESSKEY_SIZE, ses->auth_key.response); + (int)ses->auth_key.len, ses->auth_key.response); cifs_dbg(VFS, "Signing Key %*ph\n", SMB3_SIGN_KEY_SIZE, ses->smb3signingkey); if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || From 8cb6fc3231500233ddaf63cb7fd5435008d9ed5f Mon Sep 17 00:00:00 2001 From: Piyush Sachdeva Date: Thu, 7 May 2026 22:22:14 +0530 Subject: [PATCH 529/972] smb: client: Zero-pad short GSS session keys per MS-SMB2 Per MS-SMB2 section 3.2.5.3, Session.SessionKey is the first 16 bytes of the GSS cryptographic key, right-padded with zero bytes if the key is shorter than 16 bytes. SMB2_auth_kerberos() copies the GSS session key from the cifs.upcall response using kmemdup(msg->data, msg->sesskey_len, ...) and stores the GSS-reported length verbatim in ses->auth_key.len. generate_key() reads SMB2_NTLMV2_SESSKEY_SIZE bytes from this buffer when feeding the HMAC-SHA256 KDF for signing key derivation. If a GSS mechanism returns a session key shorter than 16 bytes (e.g. a deprecated single-DES Kerberos enctype with an 8-byte session key), the KDF call performs an out-of-bounds slab read and derives keys that do not match the server, which pads per the spec. Modern KDCs disable short-key enctypes by default, so this is latent rather than reachable in production, but it is still a kernel heap over-read. Allocate auth_key.response with kzalloc() at a length of max(msg->sesskey_len, SMB2_NTLMV2_SESSKEY_SIZE), copy the GSS key in, and rely on kzalloc()'s zero initialization for the spec-mandated padding. Set ses->auth_key.len to the padded length. Larger GSS keys (e.g. the 32-byte aes256-cts-hmac-sha1-96 session key) continue to be stored at their natural length, preserving the FullSessionKey path. Emit a cifs_dbg(VFS, ...) message when a short key is encountered to surface deprecated-enctype usage. NTLMv2 and NTLMSSP code paths produce a 16-byte session key by construction and are unaffected. Signed-off-by: Piyush Sachdeva Signed-off-by: Piyush Sachdeva Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index cb61051f9af3b3..995fcdd306815f 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1713,17 +1713,30 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) is_binding = (ses->ses_status == SES_GOOD); spin_unlock(&ses->ses_lock); + /* + * Per MS-SMB2 3.2.5.3, Session.SessionKey is the first 16 bytes of the + * GSS cryptographic key, right-padded with zero bytes if shorter. + * Allocate at least SMB2_NTLMV2_SESSKEY_SIZE bytes (zeroed) so the KDF + * input buffer is always valid for HMAC-SHA256 even with deprecated + * Kerberos enctypes that return a short session key. + */ + if (unlikely(msg->sesskey_len < SMB2_NTLMV2_SESSKEY_SIZE)) + cifs_dbg(VFS, + "short GSS session key (%u bytes); zero-padding per MS-SMB2 3.2.5.3\n", + msg->sesskey_len); + kfree_sensitive(ses->auth_key.response); - ses->auth_key.response = kmemdup(msg->data, - msg->sesskey_len, - GFP_KERNEL); + ses->auth_key.len = max_t(unsigned int, msg->sesskey_len, + SMB2_NTLMV2_SESSKEY_SIZE); + ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL); if (!ses->auth_key.response) { cifs_dbg(VFS, "%s: can't allocate (%u bytes) memory\n", - __func__, msg->sesskey_len); + __func__, ses->auth_key.len); + ses->auth_key.len = 0; rc = -ENOMEM; goto out_put_spnego_key; } - ses->auth_key.len = msg->sesskey_len; + memcpy(ses->auth_key.response, msg->data, msg->sesskey_len); sess_data->iov[1].iov_base = msg->data + msg->sesskey_len; sess_data->iov[1].iov_len = msg->secblob_len; From d62b8d236fab503c6fec1d3e9a38bea71feaca20 Mon Sep 17 00:00:00 2001 From: Zisen Ye Date: Sat, 2 May 2026 18:48:36 +0800 Subject: [PATCH 530/972] smb/client: fix out-of-bounds read in symlink_data() Since smb2_check_message() returns success without length validation for the symlink error response, in symlink_data() it is possible for iov->iov_len to be smaller than sizeof(struct smb2_err_rsp). If the buffer only contains the base SMB2 header (64 bytes), accessing err->ErrorContextCount (at offset 66) or err->ByteCount later in symlink_data() will cause an out-of-bounds read. Link: https://lore.kernel.org/linux-cifs/297d8d9b-adf7-42fd-a1c2-5b1f230032bc@chenxiaosong.com/ Fixes: 76894f3e2f71 ("cifs: improve symlink handling for smb2+") Cc: Stable@vger.kernel.org Signed-off-by: Zisen Ye Reviewed-by: ChenXiaoSong Signed-off-by: Steve French --- fs/smb/client/smb2misc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index 973fce3c959c4b..2a7355ce1a0783 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -241,7 +241,8 @@ smb2_check_message(char *buf, unsigned int pdu_len, unsigned int len, if (len != calc_len) { /* create failed on symlink */ if (command == SMB2_CREATE_HE && - shdr->Status == STATUS_STOPPED_ON_SYMLINK) + shdr->Status == STATUS_STOPPED_ON_SYMLINK && + len > calc_len) return 0; /* Windows 7 server returns 24 bytes more */ if (calc_len + 24 == len && command == SMB2_OPLOCK_BREAK_HE) From 8d09328dfda089675e4c049f3f256064a1d1996b Mon Sep 17 00:00:00 2001 From: Zisen Ye Date: Wed, 6 May 2026 11:49:08 +0800 Subject: [PATCH 531/972] smb/client: fix out-of-bounds read in smb2_compound_op() If a server sends a truncated response but a large OutputBufferLength, and terminates the EA list early, check_wsl_eas() returns success without validating that the entire OutputBufferLength fits within iov_len. Then smb2_compound_op() does: memcpy(idata->wsl.eas, data[0], size[0]); Where size[0] is OutputBufferLength. If iov_len is smaller than size[0], memcpy can read beyond the end of the rsp_iov allocation and leak adjacent kernel heap memory. Link: https://lore.kernel.org/linux-cifs/d998240c-aca9-420d-9dbd-f5ba24af19e0@chenxiaosong.com/ Fixes: ea41367b2a60 ("smb: client: introduce SMB2_OP_QUERY_WSL_EA") Cc: stable@vger.kernel.org Signed-off-by: Zisen Ye Reviewed-by: ChenXiaoSong Signed-off-by: Steve French --- fs/smb/client/smb2inode.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 286912616c7339..6c9c229b91f654 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -111,7 +111,7 @@ static int check_wsl_eas(struct kvec *rsp_iov) u32 outlen, next; u16 vlen; u8 nlen; - u8 *end; + u8 *ea_end, *iov_end; outlen = le32_to_cpu(rsp->OutputBufferLength); if (outlen < SMB2_WSL_MIN_QUERY_EA_RESP_SIZE || @@ -120,15 +120,19 @@ static int check_wsl_eas(struct kvec *rsp_iov) ea = (void *)((u8 *)rsp_iov->iov_base + le16_to_cpu(rsp->OutputBufferOffset)); - end = (u8 *)rsp_iov->iov_base + rsp_iov->iov_len; + ea_end = (u8 *)ea + outlen; + iov_end = (u8 *)rsp_iov->iov_base + rsp_iov->iov_len; + if (ea_end > iov_end) + return -EINVAL; + for (;;) { - if ((u8 *)ea > end - sizeof(*ea)) + if ((u8 *)ea > ea_end - sizeof(*ea)) return -EINVAL; nlen = ea->ea_name_length; vlen = le16_to_cpu(ea->ea_value_length); if (nlen != SMB2_WSL_XATTR_NAME_LEN || - (u8 *)ea->ea_data + nlen + 1 + vlen > end) + (u8 *)ea->ea_data + nlen + 1 + vlen > ea_end) return -EINVAL; switch (vlen) { From f98b48151cc502ada59d9778f0112d21f2586ca3 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Mon, 20 Apr 2026 10:47:47 -0400 Subject: [PATCH 532/972] smb: client: validate dacloffset before building DACL pointers parse_sec_desc(), build_sec_desc(), and the chown path in id_mode_to_cifs_acl() all add the server-supplied dacloffset to pntsd before proving a DACL header fits inside the returned security descriptor. On 32-bit builds a malicious server can return dacloffset near U32_MAX, wrap the derived DACL pointer below end_of_acl, and then slip past the later pointer-based bounds checks. build_sec_desc() and id_mode_to_cifs_acl() can then dereference DACL fields from the wrapped pointer in the chmod/chown rewrite paths. Validate dacloffset numerically before building any DACL pointer and reuse the same helper at the three DACL entry points. Fixes: bc3e9dd9d104 ("cifs: Change SIDs in ACEs while transferring file ownership.") Cc: stable@vger.kernel.org Assisted-by: Claude:claude-opus-4-6 Signed-off-by: Michael Bommarito Signed-off-by: Steve French --- fs/smb/client/cifsacl.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index a2750f1e3d90bd..786dbbc43c5b99 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -1264,6 +1264,17 @@ static int parse_sid(struct smb_sid *psid, char *end_of_acl) return 0; } +static bool dacl_offset_valid(unsigned int acl_len, __u32 dacloffset) +{ + if (acl_len < sizeof(struct smb_acl)) + return false; + + if (dacloffset < sizeof(struct smb_ntsd)) + return false; + + return dacloffset <= acl_len - sizeof(struct smb_acl); +} + /* Convert CIFS ACL to POSIX form */ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, @@ -1284,7 +1295,6 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, group_sid_ptr = (struct smb_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); dacloffset = le32_to_cpu(pntsd->dacloffset); - dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset); cifs_dbg(NOISY, "revision %d type 0x%x ooffset 0x%x goffset 0x%x sacloffset 0x%x dacloffset 0x%x\n", pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset), le32_to_cpu(pntsd->gsidoffset), @@ -1315,11 +1325,18 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, return rc; } - if (dacloffset) + if (dacloffset) { + if (!dacl_offset_valid(acl_len, dacloffset)) { + cifs_dbg(VFS, "Server returned illegal DACL offset\n"); + return -EINVAL; + } + + dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset); parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, group_sid_ptr, fattr, get_mode_from_special_sid); - else + } else { cifs_dbg(FYI, "no ACL\n"); /* BB grant all or default perms? */ + } return rc; } @@ -1342,6 +1359,11 @@ static int build_sec_desc(struct smb_ntsd *pntsd, struct smb_ntsd *pnntsd, dacloffset = le32_to_cpu(pntsd->dacloffset); if (dacloffset) { + if (!dacl_offset_valid(secdesclen, dacloffset)) { + cifs_dbg(VFS, "Server returned illegal DACL offset\n"); + return -EINVAL; + } + dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset); rc = validate_dacl(dacl_ptr, end_of_acl); if (rc) @@ -1710,6 +1732,12 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, nsecdesclen = sizeof(struct smb_ntsd) + (sizeof(struct smb_sid) * 2); dacloffset = le32_to_cpu(pntsd->dacloffset); if (dacloffset) { + if (!dacl_offset_valid(secdesclen, dacloffset)) { + cifs_dbg(VFS, "Server returned illegal DACL offset\n"); + rc = -EINVAL; + goto id_mode_to_cifs_acl_exit; + } + dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset); rc = validate_dacl(dacl_ptr, (char *)pntsd + secdesclen); if (rc) { @@ -1752,6 +1780,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, rc = ops->set_acl(pnntsd, nsecdesclen, inode, path, aclflag); cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc); } +id_mode_to_cifs_acl_exit: cifs_put_tlink(tlink); kfree(pnntsd); From 4616a9c36be7e2e051ef53b0e8fd729da0277abf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2026 11:05:31 -1000 Subject: [PATCH 533/972] sched_ext: Move scx_error() out of scx_link_sched()'s lock region scx_link_sched() holds scx_sched_lock. The scx_error() calls inside take the same lock through scx_claim_exit() and deadlock. Move them out of the guard. Fixes: 6b4576b09714 ("sched_ext: Reject sub-sched attachment to a disabled parent") Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 3f0d8aeaed819b..7d367c140a36be 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5585,10 +5585,12 @@ static void refresh_watchdog(void) static s32 scx_link_sched(struct scx_sched *sch) { + const char *err_msg; + s32 ret = 0; + scoped_guard(raw_spinlock_irq, &scx_sched_lock) { #ifdef CONFIG_EXT_SUB_SCHED struct scx_sched *parent = scx_parent(sch); - s32 ret; if (parent) { /* @@ -5598,15 +5600,16 @@ static s32 scx_link_sched(struct scx_sched *sch) * parent can shoot us down. */ if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) { - scx_error(sch, "parent disabled"); - return -ENOENT; + err_msg = "parent disabled"; + ret = -ENOENT; + break; } ret = rhashtable_lookup_insert_fast(&scx_sched_hash, &sch->hash_node, scx_sched_hash_params); if (ret) { - scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret); - return ret; + err_msg = "failed to insert into scx_sched_hash"; + break; } list_add_tail(&sch->sibling, &parent->children); @@ -5616,6 +5619,15 @@ static s32 scx_link_sched(struct scx_sched *sch) list_add_tail_rcu(&sch->all, &scx_sched_all); } + /* + * scx_error() takes scx_sched_lock via scx_claim_exit(), so it must run after + * the guard above is released. + */ + if (ret) { + scx_error(sch, "%s (%d)", err_msg, ret); + return ret; + } + refresh_watchdog(); return 0; } From dde2f938d02f2c740d49bb5113dea941f941026a Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Thu, 7 May 2026 18:54:34 +0800 Subject: [PATCH 534/972] cgroup/cpuset: move PF_EXITING check before __GFP_HARDWALL in cpuset_current_node_allowed() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since prepare_alloc_pages() unconditionally adds __GFP_HARDWALL for the fast path when cpusets are enabled, the __GFP_HARDWALL check in cpuset_current_node_allowed() causes the PF_EXITING escape path to be skipped on the first allocation attempt. This makes it unreachable in the common case, so dying tasks can get stuck in direct reclaim or even trigger OOM while trying to exit, despite being allowed to allocate from any node. Move the PF_EXITING check before __GFP_HARDWALL so that dying tasks can allocate memory from any node to exit quickly, even when cpusets are enabled. Also update the function comment to reflect the actual behavior of prepare_alloc_pages() and the corrected check ordering. Signed-off-by: Chen Wandun Acked-by: Michal Koutný Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index e3a081a07c6d51..a48901a0416a28 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4176,11 +4176,11 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, * yes. If current has access to memory reserves as an oom victim, yes. - * Otherwise, no. + * If the current task is PF_EXITING, yes. Otherwise, no. * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset - * unless the task has been OOM killed. + * unless the task has been OOM killed or is exiting. * GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing hardwalled ancestor cpuset. * @@ -4194,7 +4194,9 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * The first call here from mm/page_alloc:get_page_from_freelist() * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, * so no allocation on a node outside the cpuset is allowed (unless - * in interrupt, of course). + * in interrupt, of course). The PF_EXITING check must therefore + * come before the __GFP_HARDWALL check, otherwise a dying task + * would be blocked on the fast path. * * The second pass through get_page_from_freelist() doesn't even call * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() @@ -4204,6 +4206,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok * tsk_is_oom_victim - any node ok + * PF_EXITING - any node ok (let dying task exit quickly) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ @@ -4223,11 +4226,10 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) */ if (unlikely(tsk_is_oom_victim(current))) return true; - if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ - return false; - if (current->flags & PF_EXITING) /* Let dying task have memory */ return true; + if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ + return false; /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); From 363a53749cc483409498e8f6e1525fe081f1d9d2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2026 12:09:21 -1000 Subject: [PATCH 535/972] sched_ext: Drop unused scx_find_sub_sched() stub scx_find_sub_sched()'s only caller, scx_bpf_sub_dispatch(), is gated on CONFIG_EXT_SUB_SCHED. When CONFIG_EXT_SUB_SCHED=n the caller compiles out and the stub becomes dead code, tripping -Wunused-function on randconfigs. Drop the stub. Fixes: 25037af712eb ("sched_ext: Add rhashtable lookup for sub-schedulers") Reported-by: kernel test robot Closes: https://lore.kernel.org/all/202605080556.42PXw8U9-lkp@intel.com/ Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7d367c140a36be..48b4834c702702 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -297,7 +297,6 @@ static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) #else /* CONFIG_EXT_SUB_SCHED */ static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; } -static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) { return NULL; } static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {} #endif /* CONFIG_EXT_SUB_SCHED */ From fc51cba3ebae67f967120e27162e94cfb8594479 Mon Sep 17 00:00:00 2001 From: ZhengYuan Huang Date: Wed, 25 Mar 2026 08:43:39 +0800 Subject: [PATCH 536/972] btrfs: fix check_chunk_block_group_mappings() to iterate all chunk maps [BUG] A corrupted image with a chunk present in the chunk tree but whose corresponding block group item is missing from the extent tree can be mounted successfully, even though check_chunk_block_group_mappings() is supposed to catch exactly this corruption at mount time. Once mounted, running btrfs balance with a usage filter (-dusage=N or -dusage=min..max) triggers a null-ptr-deref: KASAN: null-ptr-deref in range [0x0000000000000070-0x0000000000000077] RIP: 0010:chunk_usage_filter fs/btrfs/volumes.c:3874 [inline] RIP: 0010:should_balance_chunk fs/btrfs/volumes.c:4018 [inline] RIP: 0010:__btrfs_balance fs/btrfs/volumes.c:4172 [inline] RIP: 0010:btrfs_balance+0x2024/0x42b0 fs/btrfs/volumes.c:4604 [CAUSE] The crash occurs because __btrfs_balance() iterates the on-disk chunk tree, finds the orphaned chunk, calls chunk_usage_filter() (or chunk_usage_range_filter()), which queries the in-memory block group cache via btrfs_lookup_block_group(). Since no block group was ever inserted for this chunk, the lookup returns NULL, and the subsequent dereference of cache->used crashes. check_chunk_block_group_mappings() uses btrfs_find_chunk_map() to iterate the in-memory chunk map (fs_info->mapping_tree): map = btrfs_find_chunk_map(fs_info, start, 1); With @start = 0 and @length = 1, btrfs_find_chunk_map() looks for a chunk map that *contains* the logical address 0. If no chunk contains logical address 0, btrfs_find_chunk_map(fs_info, 0, 1) returns NULL immediately and the loop breaks after the very first iteration, having checked zero chunks. The entire verification function is therefore a no-op, and the corrupted image passes the mount-time check undetected. [FIX] Replace the btrfs_find_chunk_map() based loop with a direct in-order walk of fs_info->mapping_tree using rb_first_cached() + rb_next(). This guarantees that every chunk map in the tree is visited regardless of the logical addresses involved. No lock is taken around the traversal. This function is called during mount from btrfs_read_block_groups(), which is invoked from open_ctree() before any background threads (cleaner, transaction kthread, etc.) are started. There are therefore no concurrent writers that could modify mapping_tree at this point. An analogous lockless direct traversal of mapping_tree already exists in fill_dummy_bgs() in the same file. Since we walk the rb-tree directly via rb_entry() without going through btrfs_find_chunk_map(), no reference is taken on each map entry, so the btrfs_free_chunk_map() calls are also removed. Signed-off-by: ZhengYuan Huang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index e6f5a17a13e369..b611c64119dbca 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2412,29 +2412,25 @@ static struct btrfs_block_group *btrfs_create_block_group( */ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) { - u64 start = 0; + struct rb_node *node; int ret = 0; - while (1) { + /* + * This is called during mount from btrfs_read_block_groups(), before + * any background threads are started, so no concurrent writers can + * modify the mapping_tree. No lock is needed here. + */ + for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) { struct btrfs_chunk_map *map; struct btrfs_block_group *bg; - /* - * btrfs_find_chunk_map() will return the first chunk map - * intersecting the range, so setting @length to 1 is enough to - * get the first chunk. - */ - map = btrfs_find_chunk_map(fs_info, start, 1); - if (!map) - break; - + map = rb_entry(node, struct btrfs_chunk_map, rb_node); bg = btrfs_lookup_block_group(fs_info, map->start); if (unlikely(!bg)) { btrfs_err(fs_info, "chunk start=%llu len=%llu doesn't have corresponding block group", map->start, map->chunk_len); ret = -EUCLEAN; - btrfs_free_chunk_map(map); break; } if (unlikely(bg->start != map->start || bg->length != map->chunk_len || @@ -2447,12 +2443,9 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) bg->start, bg->length, bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK); ret = -EUCLEAN; - btrfs_free_chunk_map(map); btrfs_put_block_group(bg); break; } - start = map->start + map->chunk_len; - btrfs_free_chunk_map(map); btrfs_put_block_group(bg); } return ret; From 4822703b150fc25f7bdb8cf266a482619881a97e Mon Sep 17 00:00:00 2001 From: Calvin Owens Date: Wed, 29 Apr 2026 00:10:25 -0700 Subject: [PATCH 537/972] btrfs: always pass __GFP_NOWARN from add_ra_bio_pages() A build workload newly prints order-0 allocation failures on 7.1-rc1: sh: page allocation failure: order:0 mode:0x14084a(__GFP_HIGHMEM|__GFP_MOVABLE|__GFP_IO|__GFP_KSWAPD_RECLAIM| __GFP_COMP|__GFP_HARDWALL) CPU: 27 UID: 1000 PID: 855540 Comm: sh Not tainted 7.1.0-rc1-llvm-00058-gdca922e019dd #1 PREEMPTLAZY Call Trace: dump_stack_lvl+0x50/0x70 warn_alloc+0xeb/0x100 __alloc_pages_slowpath+0x567/0x5a0 ? filemap_get_entry+0x11a/0x140 __alloc_frozen_pages_noprof+0x249/0x2d0 alloc_pages_mpol+0xe4/0x180 folio_alloc_noprof+0x80/0xa0 add_ra_bio_pages+0x13c/0x4b0 btrfs_submit_compressed_read+0x229/0x300 submit_one_bio+0x9e/0xe0 btrfs_readahead+0x185/0x1a0 [...] (lldb) source list -a add_ra_bio_pages+0x13c .../vmlinux.unstripped add_ra_bio_pages + 316 at .../fs/btrfs/compression.c:454:8 451 452 folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, constraint_gfp), 453 0, NULL); -> 454 if (!folio) 455 break; I can reproduce this consistently by running a memory hog concurrently with a buffered writer on a machine with a very large amount of swap. Commit 7ae37b2c94ed ("btrfs: prevent direct reclaim during compressed readahead") clearly intended to suppress these warnings. But because the mask set in the address_space with mapping_set_gfp_mask() doesn't include __GFP_NOWARN, mapping_gfp_constraint() removes it from constraint_gfp before it is passed to filemap_alloc_folio(). Fix by refactoring the code to add __GFP_NOWARN after the call to mapping_gfp_constraint(). Fixes: 7ae37b2c94ed ("btrfs: prevent direct reclaim during compressed readahead") Signed-off-by: Calvin Owens Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c5783ac1b64623..e2ef01a59d0423 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -407,22 +407,18 @@ static noinline int add_ra_bio_pages(struct inode *inode, end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; - /* - * Avoid direct reclaim when the caller does not allow it. Since - * add_ra_bio_pages() is always speculative, suppress allocation warnings - * in either case. - */ + /* Avoid direct reclaim when the caller does not allow it. */ + constraint_gfp = ~__GFP_FS; + cache_gfp = GFP_NOFS | __GFP_NOWARN; if (!direct_reclaim) { - constraint_gfp = ~(__GFP_FS | __GFP_DIRECT_RECLAIM) | __GFP_NOWARN; - cache_gfp = (GFP_NOFS & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN; - } else { - constraint_gfp = (~__GFP_FS) | __GFP_NOWARN; - cache_gfp = GFP_NOFS | __GFP_NOWARN; + constraint_gfp &= ~__GFP_DIRECT_RECLAIM; + cache_gfp &= ~__GFP_DIRECT_RECLAIM; } while (cur < compressed_end) { pgoff_t page_end; pgoff_t pg_index = cur >> PAGE_SHIFT; + gfp_t masked_constraint_gfp; u32 add_size; if (pg_index > end_index) @@ -449,8 +445,14 @@ static noinline int add_ra_bio_pages(struct inode *inode, continue; } - folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, constraint_gfp), - 0, NULL); + /* + * Since add_ra_bio_pages() is always speculative, suppress + * allocation warnings. + */ + masked_constraint_gfp = mapping_gfp_constraint(mapping, constraint_gfp); + masked_constraint_gfp |= __GFP_NOWARN; + + folio = filemap_alloc_folio(masked_constraint_gfp, 0, NULL); if (!folio) break; From c73370c677646e86fc4b1780fb07027bdf847375 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 28 Apr 2026 16:58:56 +0100 Subject: [PATCH 538/972] btrfs: tracepoints: fix sleep while in atomic context in btrfs_sync_file() The trace event btrfs_sync_file() is called in an atomic context (all trace events are) and its call to dput(), which is needed due to the call to dget_parent(), can sleep, triggering a kernel splat. This can be reproduced by enabling the trace event and running btrfs/056 from fstests for example. The splat shown in dmesg is the following: [53.919] BUG: sleeping function called from invalid context at fs/dcache.c:970 [53.947] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 32773, name: xfs_io [53.988] preempt_count: 2, expected: 0 [53.967] RCU nest depth: 0, expected: 0 [53.943] Preemption disabled at: [53.944] [<0000000000000000>] 0x0 [54.078] CPU: 0 UID: 0 PID: 32773 Comm: xfs_io Tainted: G W 7.1.0-rc1-btrfs-next-232+ #1 PREEMPT(full) [54.070] Tainted: [W]=WARN [54.071] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014 [54.072] Call Trace: [54.074] [54.076] dump_stack_lvl+0x56/0x80 [54.079] __might_resched.cold+0xd6/0x10f [54.072] dput.part.0+0x24/0x110 [54.078] trace_event_raw_event_btrfs_sync_file+0x75/0x140 [btrfs] [54.089] btrfs_sync_file+0x1ed/0x530 [btrfs] [54.087] ? __handle_mm_fault+0x8ae/0xed0 [54.089] btrfs_do_write_iter+0x172/0x210 [btrfs] [54.091] vfs_write+0x21f/0x450 [54.094] __x64_sys_pwrite64+0x8d/0xc0 [54.096] ? do_user_addr_fault+0x20c/0x670 [54.099] do_syscall_64+0x60/0xf20 [54.092] ? clear_bhb_loop+0x60/0xb0 [54.094] entry_SYSCALL_64_after_hwframe+0x76/0x7e So stop using dget_parent() and dput() and access the parent dentry directly as dentry->d_parent. This is also what ext4 is doing in its equivalent trace event ext4_sync_file_enter(). Fixes: a85b46db143f ("btrfs: tracepoints: get correct superblock from dentry in event btrfs_sync_file()") Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 8ad7a2d76c1d57..ec1df8b94517c5 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -771,10 +771,8 @@ TRACE_EVENT(btrfs_sync_file, TP_fast_assign( struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); - struct dentry *parent = dget_parent(dentry); - struct inode *parent_inode = d_inode(parent); + struct inode *parent_inode = d_inode(dentry->d_parent); - dput(parent); TP_fast_assign_fsid(btrfs_sb(inode->i_sb)); __entry->ino = btrfs_ino(BTRFS_I(inode)); __entry->parent = btrfs_ino(BTRFS_I(parent_inode)); From 4066c55e109475a06d18a1f127c939d551211956 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 30 Apr 2026 10:37:22 +0930 Subject: [PATCH 539/972] btrfs: only release the dirty pages io tree after successful writes [WARNING] With extra warning on dirty extent buffers at umount (aka, the next patch in the series), test case generic/388 can trigger the following warning about dirty extent buffers at unmount time: BTRFS critical (device dm-2 state E): emergency shutdown BTRFS error (device dm-2 state E): error while writing out transaction: -30 BTRFS warning (device dm-2 state E): Skipping commit of aborted transaction. BTRFS error (device dm-2 state EA): Transaction 9 aborted (error -30) BTRFS: error (device dm-2 state EA) in cleanup_transaction:2068: errno=-30 Readonly filesystem BTRFS info (device dm-2 state EA): forced readonly BTRFS info (device dm-2 state EA): last unmount of filesystem 4fbf2e15-f941-49a0-bc7c-716315d2777c ------------[ cut here ]------------ WARNING: disk-io.c:3311 at invalidate_and_check_btree_folios+0xfd/0x1ca [btrfs], CPU#8: umount/914368 CPU: 8 UID: 0 PID: 914368 Comm: umount Tainted: G OE 7.1.0-rc1-custom+ #372 PREEMPT(full) 2de38db8d1deae71fde295430a0ff3ab98ccf596 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown 02/02/2022 RIP: 0010:invalidate_and_check_btree_folios+0xfd/0x1ca [btrfs] Call Trace: close_ctree+0x52e/0x574 [btrfs d2f0b1cd330d1287e7a9919d112eadfc0e914efd] generic_shutdown_super+0x89/0x1a0 kill_anon_super+0x16/0x40 btrfs_kill_super+0x16/0x20 [btrfs d2f0b1cd330d1287e7a9919d112eadfc0e914efd] deactivate_locked_super+0x2d/0xb0 cleanup_mnt+0xdc/0x140 task_work_run+0x5a/0xa0 exit_to_user_mode_loop+0x123/0x4b0 do_syscall_64+0x243/0x7c0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 ---[ end trace 0000000000000000 ]--- BTRFS warning (device dm-2 state EA): unable to release extent buffer 30539776 owner 9 gen 9 refs 2 flags 0x7 BTRFS warning (device dm-2 state EA): unable to release extent buffer 30621696 owner 257 gen 9 refs 2 flags 0x7 BTRFS warning (device dm-2 state EA): unable to release extent buffer 30638080 owner 258 gen 9 refs 2 flags 0x7 BTRFS warning (device dm-2 state EA): unable to release extent buffer 30654464 owner 7 gen 9 refs 2 flags 0x7 BTRFS warning (device dm-2 state EA): unable to release extent buffer 30703616 owner 2 gen 9 refs 2 flags 0x7 BTRFS warning (device dm-2 state EA): unable to release extent buffer 30720000 owner 10 gen 9 refs 2 flags 0x7 BTRFS warning (device dm-2 state EA): unable to release extent buffer 30736384 owner 4 gen 9 refs 2 flags 0x7 BTRFS warning (device dm-2 state EA): unable to release extent buffer 30752768 owner 11 gen 9 refs 2 flags 0x7 I'm using a stripped down version, which seems to trigger the warning more reliably: _fsstress_pid="" workload() { dmesg -C mkfs.btrfs -f -K $dev > /dev/null echo 1 > /sys/kernel/debug/clear_warn_once mount $dev $mnt $fsstress -w -n 1024 -p 4 -d $mnt & _fsstress_pid=$! sleep 0 $godown $mnt pkill --echo -PIPE fsstress > /dev/null wait $_fsstress_pid unset _fsstress_pid umount $mnt if dmesg | grep -q "WARNING"; then fail fi } for (( i = 0; i < $runtime; i++ )); do echo "=== $i/$runtime ===" workload done [CAUSE] Inside btrfs_write_and_wait_transaction(), we first try to write all dirty ebs, then wait for them to finish. After that we call btrfs_extent_io_tree_release() to free all extent states from dirty_pages io tree. However if we hit an error from btrfs_write_marked_extent(), then we still call btrfs_extent_io_tree_release() to clear that dirty_pages io tree, which may contain dirty records that we haven't yet submitted. Furthermore, the later transaction cleanup path will utilize that dirty_pages io tree to properly cleanup those dirty ebs, but since it's already empty, no dirty ebs are properly cleaned up, thus will later trigger the warnings inside invalidate_btree_folios(). [FIX] Normally such dirty ebs won't cause problems, as when the iput() is called on the btree inode, the dirty ebs will be forcibly written back, and since the fs is already in an error status, such writeback will not reach disk and finish immediately. But it's still better to get rid of such dirty ebs, if we ended up with dirty ebs but the fs is not in an error status, then such writeback at iput() time will be too late, as all workers are already stopped but writeback will utilize workers, which will lead to NULL pointer dereferences. Instead of unconditionally calling btrfs_extent_io_tree_release(), only call it if btrfs_write_and_wait_transaction() finished successfully, so that @dirty_pages extent io tree is kept untouched for transaction cleanup. CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 + fs/btrfs/transaction.c | 9 ++++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8a11be02eeb9be..c0a30bb213d7a0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4686,6 +4686,7 @@ static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, free_extent_buffer_stale(eb); } } + btrfs_extent_io_tree_release(dirty_pages); } static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 248adb785051b7..194f581b36f36e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1293,14 +1293,13 @@ static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans) blk_finish_plug(&plug); ret2 = btrfs_wait_extents(fs_info, dirty_pages); - btrfs_extent_io_tree_release(&trans->transaction->dirty_pages); - if (ret) return ret; - else if (ret2) + if (ret2) return ret2; - else - return 0; + + btrfs_extent_io_tree_release(&trans->transaction->dirty_pages); + return 0; } /* From c562ba61fc5e11798720acc1b172862158f1fa0b Mon Sep 17 00:00:00 2001 From: Robbie Ko Date: Fri, 1 May 2026 10:41:56 +0800 Subject: [PATCH 540/972] btrfs: fix incorrect i_size after remount caused by KEEP_SIZE prealloc gap When fallocate() with FALLOC_FL_KEEP_SIZE preallocates an extent past the current i_size, the file_extent_tree of the inode is updated to cover that range. However, on the next mount, btrfs_read_locked_inode() only re-populates file_extent_tree with [0, round_up(i_size, sectorsize)), losing the marks that belonged to the KEEP_SIZE prealloc extent beyond i_size. Later, when a non-KEEP_SIZE fallocate() extends i_size into / past that old prealloc extent, the reservation loop in btrfs_fallocate() skips already-prealloc segments and does not call into the path that marks the file_extent_tree, so a gap remains inside the file_extent_tree across [old_aligned_i_size, start_of_new_alloc). Then __btrfs_prealloc_file_range() calls btrfs_inode_safe_disk_i_size_write(), which uses find_contiguous_extent_bit() starting at offset 0 to derive disk_i_size. The walk stops at the gap, so disk_i_size ends up smaller than i_size and gets persisted. After the next mount, the file shows the wrong (smaller) size. The following reproducer triggers the problem: $ cat test.sh MNT=/mnt/sdi DEV=/dev/sdi mkdir -p $MNT mkfs.btrfs -f -O ^no-holes $DEV mount $DEV $MNT touch $MNT/file1 # KEEP_SIZE prealloc beyond i_size (i_size stays 0) fallocate -n -o 4M -l 4M $MNT/file1 umount $MNT mount $DEV $MNT # non-KEEP_SIZE fallocate that overlaps the previous prealloc tail # and extends past it fallocate -o 7M -l 2M $MNT/file1 ls -lh $MNT/file1 umount $MNT mount $DEV $MNT ls -lh $MNT/file1 umount $MNT Running the reproducer gives the following result: $ ./test.sh (...) -rw-rw-r-- 1 root root 9.0M May 4 16:35 /mnt/sdi/file1 -rw-rw-r-- 1 root root 7.0M May 4 16:35 /mnt/sdi/file1 The size before the second mount is correct (9M), but after the remount it drops to 7M, i.e. the start of the gap inside file_extent_tree. Fix this in __btrfs_prealloc_file_range() by marking the entire range [round_down(old_i_size, sectorsize), round_up(new_i_size, sectorsize)) in file_extent_tree before updating i_size and calling btrfs_inode_safe_disk_i_size_write(). This ensures the contiguous bit search starting from 0 is not truncated by a stale gap left behind by a previous KEEP_SIZE prealloc that was not restored on inode load. The fix has no effect when the NO_HOLES feature is enabled because btrfs_inode_safe_disk_i_size_write() and btrfs_inode_set_file_extent_range() both take the fast path that directly tracks disk_i_size without consulting file_extent_tree. Fixes: 9ddc959e802b ("btrfs: use the file extent tree infrastructure") Reviewed-by: Filipe Manana Signed-off-by: Robbie Ko [ Minor updates to the change log ] Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 906d5c21ebc477..75136a17271014 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9299,10 +9299,38 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (!(mode & FALLOC_FL_KEEP_SIZE) && (actual_len > inode->i_size) && (cur_offset > inode->i_size)) { + u64 range_start; + u64 range_end; + if (cur_offset > actual_len) i_size = actual_len; else i_size = cur_offset; + + /* + * Make sure the file_extent_tree covers the entire + * range [old_i_size, new_i_size) before we update + * disk_i_size. Without this, a previous KEEP_SIZE + * prealloc that extended past i_size (and was lost + * across umount/mount because file_extent_tree is + * only populated up to round_up(i_size) on inode + * load) can leave a gap inside this range. That gap + * would cause btrfs_inode_safe_disk_i_size_write() + * (via find_contiguous_extent_bit() starting at 0) + * to truncate disk_i_size to the start of the gap, + * making the persisted size smaller than i_size. + */ + range_start = round_down(inode->i_size, fs_info->sectorsize); + range_end = round_up(i_size, fs_info->sectorsize); + ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), + range_start, range_end - range_start); + if (ret) { + btrfs_abort_transaction(trans, ret); + if (own_trans) + btrfs_end_transaction(trans); + break; + } + i_size_write(inode, i_size); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } From 3607422cdeebd1f0c259964bf33aaa9792e21930 Mon Sep 17 00:00:00 2001 From: Markus Stockhausen Date: Sat, 2 May 2026 19:32:06 +0200 Subject: [PATCH 541/972] hwmon: (lm75) Fix AS6200 and TMP112 setup and alarm handling The initialization of the AS6200 has two shortcomings - The device-add-commit states "Conversion mode: continuous" but the the lm75_params structure uses set_mask = 0x94c0. This activates single shot mode (bit 15). According to the datasheet "The device features a single shot measurement mode if the device is in sleep mode (SM=1)". This is quite contradictionary. - It is the only device that activates polarity active-high (bit 10) All this is paired with a undefined clear mask bug in function lm75_write_config() that was introduced with a later refactoring commit. [as6200] = { .config_reg_16bits = true, .set_mask = 0x94C0, -> .clr_mask not defined here .default_resolution = 12, ... static inline int lm75_write_config(struct lm75_data *data, u16 set_mask, u16 clr_mask) { return regmap_update_bits(data->regmap, LM75_REG_CONF, clr_mask | LM75_SHUTDOWN, set_mask); } regmap_update_bits() requires clr_mask to be a superset of set_mask. So basically all sensors with "wrong" masks like the AS6200 are not initialized as intended. Fix that by - Change the set_mask to 0xc010 to reflect the current active-low setup properly and to drive the sensor in continous mode. This takes into account that the config register is little endian and the first byte sent to the chip is the LSB. - Adapt the alarm handling so it can report the alarm correctly even if it is high active. This is done by comparing config register bit 5 and 10 (translated to 2 and 13). This commit does not introduce any ABI breakage as the mutliple bugs effectly drive the AS6200 in standard active-low mode. Fixes: 4b6358e1fe46 ("hwmon: (lm75) Add AMS AS6200 temperature sensor") Suggested-by: Guenter Roeck Signed-off-by: Markus Stockhausen Link: https://lore.kernel.org/r/20260502173207.3567876-2-markus.stockhausen@gmx.de [groeck: Update set_mask for as6200 further: As modeled, the upper bits contain the conversion rate, so the config register needs to be set to 0xc010 instead of 0x10c0 to reflect 8 samples/s and 4 consecutive faults. Fix the same problem for TMP112.] Signed-off-by: Guenter Roeck --- drivers/hwmon/lm75.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/hwmon/lm75.c b/drivers/hwmon/lm75.c index f1a1e5b888f64e..a1bf4e9813ed8e 100644 --- a/drivers/hwmon/lm75.c +++ b/drivers/hwmon/lm75.c @@ -137,7 +137,7 @@ static const struct lm75_params device_params[] = { }, [as6200] = { .config_reg_16bits = true, - .set_mask = 0x94C0, /* 8 sample/s, 4 CF, positive polarity */ + .set_mask = 0xC010, /* 8 sample/s, 4 CF */ .default_resolution = 12, .default_sample_time = 125, .num_sample_times = 4, @@ -286,8 +286,8 @@ static const struct lm75_params device_params[] = { }, [tmp112] = { .config_reg_16bits = true, - .set_mask = 0x60C0, /* 12-bit mode, 8 samples / second */ - .clr_mask = 1 << 15, /* no one-shot mode*/ + .set_mask = 0xC060, /* 12-bit mode, 8 samples / second */ + .clr_mask = 1 << 7, /* no one-shot mode*/ .default_resolution = 12, .default_sample_time = 125, .num_sample_times = 4, @@ -416,7 +416,7 @@ static int lm75_read(struct device *dev, enum hwmon_sensor_types type, switch (data->kind) { case as6200: case tmp112: - *val = (regval >> 13) & 0x1; + *val = !!(regval & BIT(13)) == !!(regval & BIT(2)); break; default: return -EINVAL; From 05aaac8746c5786eaa779b163fae4ddcd5172707 Mon Sep 17 00:00:00 2001 From: Markus Stockhausen Date: Sat, 2 May 2026 19:32:07 +0200 Subject: [PATCH 542/972] hwmon: (lm75) Fix configuration register writes. Sensors configurations are defined by set and clear masks. These do not follow a consistent "clear mask is a superset of set mask" rule. This relaxed definition breaks lm75_write_config() static inline int lm75_write_config(struct lm75_data *data, u16 set_mask, u16 clr_mask) { return regmap_update_bits(data->regmap, LM75_REG_CONF, clr_mask | LM75_SHUTDOWN, set_mask); } Basically all bits from set_mask that are not defined in clr_mask are dropped. Fix that by enhancing the helper to always combine clr_mask and set_mask into the mask bits of regmap_update_bits(). Fixes: 6da24a25f766 ("hwmon: (lm75) Hide register size differences in regmap access functions") Suggested-by: Guenter Roeck Signed-off-by: Markus Stockhausen Link: https://lore.kernel.org/r/20260502173207.3567876-3-markus.stockhausen@gmx.de Signed-off-by: Guenter Roeck --- drivers/hwmon/lm75.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/lm75.c b/drivers/hwmon/lm75.c index a1bf4e9813ed8e..c283443e363b42 100644 --- a/drivers/hwmon/lm75.c +++ b/drivers/hwmon/lm75.c @@ -353,7 +353,7 @@ static inline int lm75_write_config(struct lm75_data *data, u16 set_mask, u16 clr_mask) { return regmap_update_bits(data->regmap, LM75_REG_CONF, - clr_mask | LM75_SHUTDOWN, set_mask); + clr_mask | set_mask | LM75_SHUTDOWN, set_mask); } static irqreturn_t lm75_alarm_handler(int irq, void *private) From 99076a17a112ac43cbd37f6883898ae649166303 Mon Sep 17 00:00:00 2001 From: Tabrez Ahmed Date: Sat, 2 May 2026 07:38:42 +0530 Subject: [PATCH 543/972] hwmon: (ads7871) Fix endianness bug in 16-bit register reads The ads7871_read_reg16() function relies on spi_w8r16() to read the 16-bit sensor output. The ADS7871 device transmits the Least Significant Byte (LSB) first. On Little-Endian architectures, spi_w8r16() correctly reconstructs the 16-bit value. However, on Big-Endian architectures, the byte swapping causes the first received byte (LSB) to be placed in the most significant byte of the u16, resulting in corrupted voltage readings. To fix this, cast the integer result of spi_w8r16() to a restricted __le16 type and convert it to the host CPU's native byte order using le16_to_cpu(). Negative error codes returned by the SPI core are caught and returned prior to the conversion to avoid mangling the error status. Reported-by: Sashiko Closes: https://sashiko.dev/#/patchset/20260418034601.90226-1-tabreztalks@gmail.com Fixes: e0c70b8078629 ("hwmon: add TI ads7871 a/d converter driver") Suggested-by: David Laight Signed-off-by: Tabrez Ahmed Link: https://lore.kernel.org/r/20260502020844.110038-2-tabreztalks@gmail.com Signed-off-by: Guenter Roeck --- drivers/hwmon/ads7871.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/ads7871.c b/drivers/hwmon/ads7871.c index 9bfdf9e6bcd77d..9ee3ce01f130bb 100644 --- a/drivers/hwmon/ads7871.c +++ b/drivers/hwmon/ads7871.c @@ -77,9 +77,13 @@ static int ads7871_read_reg8(struct spi_device *spi, int reg) static int ads7871_read_reg16(struct spi_device *spi, int reg) { int ret; + reg = reg | INST_READ_BM | INST_16BIT_BM; ret = spi_w8r16(spi, reg); - return ret; + if (ret < 0) + return ret; + + return le16_to_cpu((__force __le16)ret); } static int ads7871_write_reg8(struct spi_device *spi, int reg, u8 val) From 8e72510db9fa2d41f2b06d5c01fe9020e076fee4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:13 +0200 Subject: [PATCH 544/972] netfilter: x_tables: allow initial table replace without emitting audit log message At the moment we emit the audit log a bit too early, which makes it necessary to also emit an unregister log in case we have to unwind errors after possible hook register failure. Followup patch will be slightly simpler if we can delay the register message until after the hooks have been wired up. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 2c67c2e6b13288..bb0cb395955151 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1472,11 +1472,9 @@ struct xt_counters *xt_counters_alloc(unsigned int counters) } EXPORT_SYMBOL(xt_counters_alloc); -struct xt_table_info * -xt_replace_table(struct xt_table *table, - unsigned int num_counters, - struct xt_table_info *newinfo, - int *error) +static struct xt_table_info * +do_replace_table(struct xt_table *table, unsigned int num_counters, + struct xt_table_info *newinfo, int *error) { struct xt_table_info *private; unsigned int cpu; @@ -1531,10 +1529,23 @@ xt_replace_table(struct xt_table *table, } } - audit_log_nfcfg(table->name, table->af, private->number, - !private->number ? AUDIT_XT_OP_REGISTER : - AUDIT_XT_OP_REPLACE, - GFP_KERNEL); + return private; +} + +struct xt_table_info * +xt_replace_table(struct xt_table *table, unsigned int num_counters, + struct xt_table_info *newinfo, + int *error) +{ + struct xt_table_info *private; + + private = do_replace_table(table, num_counters, newinfo, error); + if (private) + audit_log_nfcfg(table->name, table->af, private->number, + !private->number ? AUDIT_XT_OP_REGISTER : + AUDIT_XT_OP_REPLACE, + GFP_KERNEL); + return private; } EXPORT_SYMBOL_GPL(xt_replace_table); From b62eb8dcf2c47d4d676a434efbd57c4f776f7829 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:14 +0200 Subject: [PATCH 545/972] netfilter: x_tables: allocate hook ops while under mutex arp/ip(6)t_register_table() add the table to the per-netns list via xt_register_table() before allocating the per-netns hook ops copy via kmemdup_array(). This leaves a window where the table is visible in the list with ops=NULL. If the pernet exit happens runs concurrently the pre_exit callback finds the table via xt_find_table() and passes the NULL ops pointer to nf_unregister_net_hooks(), causing a NULL dereference: general protection fault in nf_unregister_net_hooks+0xbc/0x150 RIP: nf_unregister_net_hooks (net/netfilter/core.c:613) Call Trace: ipt_unregister_table_pre_exit iptable_mangle_net_pre_exit ops_pre_exit_list cleanup_net Fix by moving the ops allocation into the xtables core so the table is never in the list without valid ops. Also ensure the table is no longer processing packets before its torn down on error unwind. nf_register_net_hooks might have published at least one hook; call synchronize_rcu() if there was an error. audit log register message gets deferred until all operations have passed, this avoids need to emit another ureg message in case of error unwinding. Based on earlier patch by Tristan Madani. Fixes: f9006acc8dfe5 ("netfilter: arp_tables: pass table pointer via nf_hook_ops") Fixes: ee177a54413a ("netfilter: ip6_tables: pass table pointer via nf_hook_ops") Fixes: ae689334225f ("netfilter: ip_tables: pass table pointer via nf_hook_ops") Link: https://lore.kernel.org/netfilter-devel/20260429175613.1459342-1-tristmd@gmail.com/ Signed-off-by: Tristan Madani Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 1 + net/ipv4/netfilter/arp_tables.c | 35 +++------------------ net/ipv4/netfilter/ip_tables.c | 41 +++--------------------- net/ipv6/netfilter/ip6_tables.c | 38 +++-------------------- net/netfilter/x_tables.c | 50 +++++++++++++++++++++++++----- 5 files changed, 55 insertions(+), 110 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index a81b46af5118db..cb4b694dd9e402 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -305,6 +305,7 @@ struct xt_counters *xt_counters_alloc(unsigned int counters); struct xt_table *xt_register_table(struct net *net, const struct xt_table *table, + const struct nf_hook_ops *template_ops, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); void *xt_unregister_table(struct xt_table *table); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 97ead883e4a13b..c02e46a0271a0f 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1522,13 +1522,11 @@ int arpt_register_table(struct net *net, const struct arpt_replace *repl, const struct nf_hook_ops *template_ops) { - struct nf_hook_ops *ops; - unsigned int num_ops; - int ret, i; - struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; - void *loc_cpu_entry; + struct xt_table_info *newinfo; struct xt_table *new_table; + void *loc_cpu_entry; + int ret; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) @@ -1543,7 +1541,7 @@ int arpt_register_table(struct net *net, return ret; } - new_table = xt_register_table(net, table, &bootstrap, newinfo); + new_table = xt_register_table(net, table, template_ops, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct arpt_entry *iter; @@ -1553,31 +1551,6 @@ int arpt_register_table(struct net *net, return PTR_ERR(new_table); } - num_ops = hweight32(table->valid_hooks); - if (num_ops == 0) { - ret = -EINVAL; - goto out_free; - } - - ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); - if (!ops) { - ret = -ENOMEM; - goto out_free; - } - - for (i = 0; i < num_ops; i++) - ops[i].priv = new_table; - - new_table->ops = ops; - - ret = nf_register_net_hooks(net, ops, num_ops); - if (ret != 0) - goto out_free; - - return ret; - -out_free: - __arpt_unregister_table(net, new_table); return ret; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 23c8deff8095ae..488c5945ebb235 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1724,13 +1724,11 @@ int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, const struct nf_hook_ops *template_ops) { - struct nf_hook_ops *ops; - unsigned int num_ops; - int ret, i; - struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; - void *loc_cpu_entry; + struct xt_table_info *newinfo; struct xt_table *new_table; + void *loc_cpu_entry; + int ret; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) @@ -1745,7 +1743,7 @@ int ipt_register_table(struct net *net, const struct xt_table *table, return ret; } - new_table = xt_register_table(net, table, &bootstrap, newinfo); + new_table = xt_register_table(net, table, template_ops, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct ipt_entry *iter; @@ -1755,37 +1753,6 @@ int ipt_register_table(struct net *net, const struct xt_table *table, return PTR_ERR(new_table); } - /* No template? No need to do anything. This is used by 'nat' table, it registers - * with the nat core instead of the netfilter core. - */ - if (!template_ops) - return 0; - - num_ops = hweight32(table->valid_hooks); - if (num_ops == 0) { - ret = -EINVAL; - goto out_free; - } - - ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); - if (!ops) { - ret = -ENOMEM; - goto out_free; - } - - for (i = 0; i < num_ops; i++) - ops[i].priv = new_table; - - new_table->ops = ops; - - ret = nf_register_net_hooks(net, ops, num_ops); - if (ret != 0) - goto out_free; - - return ret; - -out_free: - __ipt_unregister_table(net, new_table); return ret; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index d585ac3c111335..dbe7c7acd702ef 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1733,13 +1733,11 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, const struct nf_hook_ops *template_ops) { - struct nf_hook_ops *ops; - unsigned int num_ops; - int ret, i; - struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; - void *loc_cpu_entry; + struct xt_table_info *newinfo; struct xt_table *new_table; + void *loc_cpu_entry; + int ret; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) @@ -1754,7 +1752,7 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, return ret; } - new_table = xt_register_table(net, table, &bootstrap, newinfo); + new_table = xt_register_table(net, table, template_ops, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct ip6t_entry *iter; @@ -1764,34 +1762,6 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, return PTR_ERR(new_table); } - if (!template_ops) - return 0; - - num_ops = hweight32(table->valid_hooks); - if (num_ops == 0) { - ret = -EINVAL; - goto out_free; - } - - ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); - if (!ops) { - ret = -ENOMEM; - goto out_free; - } - - for (i = 0; i < num_ops; i++) - ops[i].priv = new_table; - - new_table->ops = ops; - - ret = nf_register_net_hooks(net, ops, num_ops); - if (ret != 0) - goto out_free; - - return ret; - -out_free: - __ip6t_unregister_table(net, new_table); return ret; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index bb0cb395955151..06f27bea9eedb6 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1542,7 +1542,6 @@ xt_replace_table(struct xt_table *table, unsigned int num_counters, private = do_replace_table(table, num_counters, newinfo, error); if (private) audit_log_nfcfg(table->name, table->af, private->number, - !private->number ? AUDIT_XT_OP_REGISTER : AUDIT_XT_OP_REPLACE, GFP_KERNEL); @@ -1552,20 +1551,32 @@ EXPORT_SYMBOL_GPL(xt_replace_table); struct xt_table *xt_register_table(struct net *net, const struct xt_table *input_table, + const struct nf_hook_ops *template_ops, struct xt_table_info *bootstrap, struct xt_table_info *newinfo) { struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct xt_table *t, *table = NULL; + struct nf_hook_ops *ops = NULL; struct xt_table_info *private; - struct xt_table *t, *table; - int ret; + unsigned int num_ops; + int ret = -EINVAL; + + num_ops = hweight32(input_table->valid_hooks); + if (num_ops == 0) + goto out; + + ret = -ENOMEM; + if (template_ops) { + ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); + if (!ops) + goto out; + } /* Don't add one object to multiple lists. */ table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL); - if (!table) { - ret = -ENOMEM; + if (!table) goto out; - } mutex_lock(&xt[table->af].mutex); /* Don't autoload: we'd eat our tail... */ @@ -1579,7 +1590,7 @@ struct xt_table *xt_register_table(struct net *net, /* Simplifies replace_table code. */ table->private = bootstrap; - if (!xt_replace_table(table, 0, newinfo, &ret)) + if (!do_replace_table(table, 0, newinfo, &ret)) goto unlock; private = table->private; @@ -1588,14 +1599,37 @@ struct xt_table *xt_register_table(struct net *net, /* save number of initial entries */ private->initial_entries = private->number; + if (ops) { + int i; + + for (i = 0; i < num_ops; i++) + ops[i].priv = table; + + ret = nf_register_net_hooks(net, ops, num_ops); + if (ret != 0) { + mutex_unlock(&xt[table->af].mutex); + /* nf_register_net_hooks() might have published a + * base chain before internal error unwind. + */ + synchronize_rcu(); + goto out; + } + + table->ops = ops; + } + + audit_log_nfcfg(table->name, table->af, private->number, + AUDIT_XT_OP_REGISTER, GFP_KERNEL); + list_add(&table->list, &xt_net->tables[table->af]); mutex_unlock(&xt[table->af].mutex); return table; unlock: mutex_unlock(&xt[table->af].mutex); - kfree(table); out: + kfree(table); + kfree(ops); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(xt_register_table); From 527d6931473b75d90e38942aae6537d1a527f1fd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:15 +0200 Subject: [PATCH 546/972] netfilter: x_tables: add and use xt_unregister_table_pre_exit Remove the copypasted variants of _pre_exit and add one single function in the xtables core. ebtables is not compatible with x_tables and therefore unchanged. This is a preparation patch to reduce noise in the followup bug fixes. Reviewed-by: Tristan Madani Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 1 + include/linux/netfilter_arp/arp_tables.h | 1 - include/linux/netfilter_ipv4/ip_tables.h | 1 - include/linux/netfilter_ipv6/ip6_tables.h | 1 - net/ipv4/netfilter/arp_tables.c | 9 ------- net/ipv4/netfilter/arptable_filter.c | 2 +- net/ipv4/netfilter/ip_tables.c | 9 ------- net/ipv4/netfilter/iptable_filter.c | 2 +- net/ipv4/netfilter/iptable_mangle.c | 2 +- net/ipv4/netfilter/iptable_nat.c | 1 + net/ipv4/netfilter/iptable_raw.c | 2 +- net/ipv4/netfilter/iptable_security.c | 2 +- net/ipv6/netfilter/ip6_tables.c | 9 ------- net/ipv6/netfilter/ip6table_filter.c | 2 +- net/ipv6/netfilter/ip6table_mangle.c | 2 +- net/ipv6/netfilter/ip6table_nat.c | 1 + net/ipv6/netfilter/ip6table_raw.c | 2 +- net/ipv6/netfilter/ip6table_security.c | 2 +- net/netfilter/x_tables.c | 29 +++++++++++++++++++++++ 19 files changed, 41 insertions(+), 39 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index cb4b694dd9e402..74486714ae2007 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -309,6 +309,7 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); void *xt_unregister_table(struct xt_table *table); +void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name); struct xt_table_info *xt_replace_table(struct xt_table *table, unsigned int num_counters, diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h index a40aaf645fa479..05631a25e62293 100644 --- a/include/linux/netfilter_arp/arp_tables.h +++ b/include/linux/netfilter_arp/arp_tables.h @@ -53,7 +53,6 @@ int arpt_register_table(struct net *net, const struct xt_table *table, const struct arpt_replace *repl, const struct nf_hook_ops *ops); void arpt_unregister_table(struct net *net, const char *name); -void arpt_unregister_table_pre_exit(struct net *net, const char *name); extern unsigned int arpt_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index 132b0e4a6d4df6..13593391d6058d 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -26,7 +26,6 @@ int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, const struct nf_hook_ops *ops); -void ipt_unregister_table_pre_exit(struct net *net, const char *name); void ipt_unregister_table_exit(struct net *net, const char *name); /* Standard entry. */ diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index 8b8885a73c7649..c6d5b927830dd3 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -27,7 +27,6 @@ extern void *ip6t_alloc_initial_table(const struct xt_table *); int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, const struct nf_hook_ops *ops); -void ip6t_unregister_table_pre_exit(struct net *net, const char *name); void ip6t_unregister_table_exit(struct net *net, const char *name); extern unsigned int ip6t_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index c02e46a0271a0f..bd348b7bad2c5f 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1554,15 +1554,6 @@ int arpt_register_table(struct net *net, return ret; } -void arpt_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); -} -EXPORT_SYMBOL(arpt_unregister_table_pre_exit); - void arpt_unregister_table(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 78cd5ee24448f6..393d9a8c773900 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -43,7 +43,7 @@ static int arptable_filter_table_init(struct net *net) static void __net_exit arptable_filter_net_pre_exit(struct net *net) { - arpt_unregister_table_pre_exit(net, "filter"); + xt_unregister_table_pre_exit(net, NFPROTO_ARP, "filter"); } static void __net_exit arptable_filter_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 488c5945ebb235..864489928fb5aa 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1756,14 +1756,6 @@ int ipt_register_table(struct net *net, const struct xt_table *table, return ret; } -void ipt_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); -} - void ipt_unregister_table_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); @@ -1854,7 +1846,6 @@ static void __exit ip_tables_fini(void) } EXPORT_SYMBOL(ipt_register_table); -EXPORT_SYMBOL(ipt_unregister_table_pre_exit); EXPORT_SYMBOL(ipt_unregister_table_exit); EXPORT_SYMBOL(ipt_do_table); module_init(ip_tables_init); diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 3ab908b7479517..b2fbd9651d61aa 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -61,7 +61,7 @@ static int __net_init iptable_filter_net_init(struct net *net) static void __net_exit iptable_filter_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "filter"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "filter"); } static void __net_exit iptable_filter_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 385d945d8ebea0..a99e619961975f 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -96,7 +96,7 @@ static int iptable_mangle_table_init(struct net *net) static void __net_exit iptable_mangle_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "mangle"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "mangle"); } static void __net_exit iptable_mangle_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 625a1ca13b1bad..8fc4912e790d8f 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -129,6 +129,7 @@ static int iptable_nat_table_init(struct net *net) static void __net_exit iptable_nat_net_pre_exit(struct net *net) { ipt_nat_unregister_lookups(net); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "nat"); } static void __net_exit iptable_nat_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 0e7f53964d0af6..42511721e538a0 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -53,7 +53,7 @@ static int iptable_raw_table_init(struct net *net) static void __net_exit iptable_raw_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "raw"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "raw"); } static void __net_exit iptable_raw_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index d885443cb26798..4646bf6d7d2bd6 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -50,7 +50,7 @@ static int iptable_security_table_init(struct net *net) static void __net_exit iptable_security_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "security"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "security"); } static void __net_exit iptable_security_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index dbe7c7acd702ef..edf50bc7787e56 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1765,14 +1765,6 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, return ret; } -void ip6t_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); -} - void ip6t_unregister_table_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); @@ -1864,7 +1856,6 @@ static void __exit ip6_tables_fini(void) } EXPORT_SYMBOL(ip6t_register_table); -EXPORT_SYMBOL(ip6t_unregister_table_pre_exit); EXPORT_SYMBOL(ip6t_unregister_table_exit); EXPORT_SYMBOL(ip6t_do_table); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index e8992693e14a04..f05a9e4b2c6762 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -60,7 +60,7 @@ static int __net_init ip6table_filter_net_init(struct net *net) static void __net_exit ip6table_filter_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "filter"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "filter"); } static void __net_exit ip6table_filter_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 8dd4cd0c47bd4d..afa4a5703e4339 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -89,7 +89,7 @@ static int ip6table_mangle_table_init(struct net *net) static void __net_exit ip6table_mangle_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "mangle"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "mangle"); } static void __net_exit ip6table_mangle_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 5be723232df8f1..bb8aa3fc42b45e 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -131,6 +131,7 @@ static int ip6table_nat_table_init(struct net *net) static void __net_exit ip6table_nat_net_pre_exit(struct net *net) { ip6t_nat_unregister_lookups(net); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "nat"); } static void __net_exit ip6table_nat_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index fc9f6754028f2c..32d2da81c52a7d 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -52,7 +52,7 @@ static int ip6table_raw_table_init(struct net *net) static void __net_exit ip6table_raw_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "raw"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "raw"); } static void __net_exit ip6table_raw_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 4df14a9bae782d..3dfd8d6ea4b906 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -49,7 +49,7 @@ static int ip6table_security_table_init(struct net *net) static void __net_exit ip6table_security_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "security"); + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "security"); } static void __net_exit ip6table_security_net_exit(struct net *net) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 06f27bea9eedb6..9c1e896c7b03b6 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1650,6 +1650,35 @@ void *xt_unregister_table(struct xt_table *table) return private; } EXPORT_SYMBOL_GPL(xt_unregister_table); + +/** + * xt_unregister_table_pre_exit - pre-shutdown unregister of a table + * @net: network namespace + * @af: address family (e.g., NFPROTO_IPV4, NFPROTO_IPV6) + * @name: name of the table to unregister + * + * Unregisters the specified netfilter table from the given network namespace + * and also unregisters the hooks from netfilter core: no new packets will be + * processed. + */ +void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name) +{ + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct xt_table *t; + + mutex_lock(&xt[af].mutex); + list_for_each_entry(t, &xt_net->tables[af], list) { + if (strcmp(t->name, name) == 0) { + mutex_unlock(&xt[af].mutex); + + if (t->ops) /* nat table registers with nat core, t->ops is NULL. */ + nf_unregister_net_hooks(net, t->ops, hweight32(t->valid_hooks)); + return; + } + } + mutex_unlock(&xt[af].mutex); +} +EXPORT_SYMBOL(xt_unregister_table_pre_exit); #endif #ifdef CONFIG_PROC_FS From d338693d778579b676a61346849bebd892427158 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:16 +0200 Subject: [PATCH 547/972] netfilter: x_tables: unregister the templates first When the module is going away we need to zap the template first. Else there is a small race window where userspace could instantiate a new table after the pernet exit function has removed the current table. Fixes: fdacd57c79b7 ("netfilter: x_tables: never register tables by default") Reported-by: Tristan Madani Reviewed-by: Tristan Madani Closes: https://lore.kernel.org/netfilter-devel/20260429175613.1459342-1-tristmd@gmail.com/ Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arptable_filter.c | 2 +- net/ipv4/netfilter/iptable_filter.c | 2 +- net/ipv4/netfilter/iptable_mangle.c | 2 +- net/ipv4/netfilter/iptable_raw.c | 2 +- net/ipv4/netfilter/iptable_security.c | 2 +- net/ipv6/netfilter/ip6table_filter.c | 2 +- net/ipv6/netfilter/ip6table_mangle.c | 2 +- net/ipv6/netfilter/ip6table_raw.c | 2 +- net/ipv6/netfilter/ip6table_security.c | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 393d9a8c773900..382345567a600a 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -82,8 +82,8 @@ static int __init arptable_filter_init(void) static void __exit arptable_filter_fini(void) { - unregister_pernet_subsys(&arptable_filter_net_ops); xt_unregister_template(&packet_filter); + unregister_pernet_subsys(&arptable_filter_net_ops); kfree(arpfilter_ops); } diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index b2fbd9651d61aa..0dea754a91209a 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -101,8 +101,8 @@ static int __init iptable_filter_init(void) static void __exit iptable_filter_fini(void) { - unregister_pernet_subsys(&iptable_filter_net_ops); xt_unregister_template(&packet_filter); + unregister_pernet_subsys(&iptable_filter_net_ops); kfree(filter_ops); } diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index a99e619961975f..4d3b124923080b 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -135,8 +135,8 @@ static int __init iptable_mangle_init(void) static void __exit iptable_mangle_fini(void) { - unregister_pernet_subsys(&iptable_mangle_net_ops); xt_unregister_template(&packet_mangler); + unregister_pernet_subsys(&iptable_mangle_net_ops); kfree(mangle_ops); } diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 42511721e538a0..6f7afec7954bd4 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -100,9 +100,9 @@ static int __init iptable_raw_init(void) static void __exit iptable_raw_fini(void) { + xt_unregister_template(&packet_raw); unregister_pernet_subsys(&iptable_raw_net_ops); kfree(rawtable_ops); - xt_unregister_template(&packet_raw); } module_init(iptable_raw_init); diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index 4646bf6d7d2bd6..81175c20ccbe8d 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -89,9 +89,9 @@ static int __init iptable_security_init(void) static void __exit iptable_security_fini(void) { + xt_unregister_template(&security_table); unregister_pernet_subsys(&iptable_security_net_ops); kfree(sectbl_ops); - xt_unregister_template(&security_table); } module_init(iptable_security_init); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index f05a9e4b2c6762..cf561919bde84c 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -100,8 +100,8 @@ static int __init ip6table_filter_init(void) static void __exit ip6table_filter_fini(void) { - unregister_pernet_subsys(&ip6table_filter_net_ops); xt_unregister_template(&packet_filter); + unregister_pernet_subsys(&ip6table_filter_net_ops); kfree(filter_ops); } diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index afa4a5703e4339..1a758f2bc5379c 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -128,8 +128,8 @@ static int __init ip6table_mangle_init(void) static void __exit ip6table_mangle_fini(void) { - unregister_pernet_subsys(&ip6table_mangle_net_ops); xt_unregister_template(&packet_mangler); + unregister_pernet_subsys(&ip6table_mangle_net_ops); kfree(mangle_ops); } diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 32d2da81c52a7d..923455921c1ddf 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -98,8 +98,8 @@ static int __init ip6table_raw_init(void) static void __exit ip6table_raw_fini(void) { - unregister_pernet_subsys(&ip6table_raw_net_ops); xt_unregister_template(&packet_raw); + unregister_pernet_subsys(&ip6table_raw_net_ops); kfree(rawtable_ops); } diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 3dfd8d6ea4b906..c44834d93fc792 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -88,8 +88,8 @@ static int __init ip6table_security_init(void) static void __exit ip6table_security_fini(void) { - unregister_pernet_subsys(&ip6table_security_net_ops); xt_unregister_template(&security_table); + unregister_pernet_subsys(&ip6table_security_net_ops); kfree(sectbl_ops); } From b4597d5fd7d2f8cebfffd40dffb5e003cc78964c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:17 +0200 Subject: [PATCH 548/972] netfilter: x_tables: add and use xtables_unregister_table_exit Previous change added xtables_unregister_table_pre_exit to detach the table from the packetpath and to unlink it from the active table list. In case of rmmod, userspace that is doing set/getsockopt for this table will not be able to re-instantiate the table: 1. The larval table has been removed already 2. existing instantiated table is no longer on the xt pernet table list. This adds the second stage helper: unlink the table from the dying list, free the hook ops (if any) and do the audit notification. It replaces xt_unregister_table(). Fixes: fdacd57c79b7 ("netfilter: x_tables: never register tables by default") Reported-by: Tristan Madani Reviewed-by: Tristan Madani Closes: https://lore.kernel.org/netfilter-devel/20260429175613.1459342-1-tristmd@gmail.com/ Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 2 +- net/ipv4/netfilter/arp_tables.c | 9 ++-- net/ipv4/netfilter/ip_tables.c | 9 ++-- net/ipv4/netfilter/iptable_nat.c | 5 +- net/ipv6/netfilter/ip6_tables.c | 9 ++-- net/ipv6/netfilter/ip6table_nat.c | 5 +- net/netfilter/x_tables.c | 81 +++++++++++++++++++++++------- 7 files changed, 83 insertions(+), 37 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 74486714ae2007..5a1c5c336fa4fe 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -308,8 +308,8 @@ struct xt_table *xt_register_table(struct net *net, const struct nf_hook_ops *template_ops, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); -void *xt_unregister_table(struct xt_table *table); void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name); +struct xt_table *xt_unregister_table_exit(struct net *net, u8 af, const char *name); struct xt_table_info *xt_replace_table(struct xt_table *table, unsigned int num_counters, diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index bd348b7bad2c5f..ad2259678c7854 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1501,13 +1501,11 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len static void __arpt_unregister_table(struct net *net, struct xt_table *table) { - struct xt_table_info *private; - void *loc_cpu_entry; + struct xt_table_info *private = table->private; struct module *table_owner = table->me; + void *loc_cpu_entry; struct arpt_entry *iter; - private = xt_unregister_table(table); - /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) @@ -1515,6 +1513,7 @@ static void __arpt_unregister_table(struct net *net, struct xt_table *table) if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); + kfree(table); } int arpt_register_table(struct net *net, @@ -1556,7 +1555,7 @@ int arpt_register_table(struct net *net, void arpt_unregister_table(struct net *net, const char *name) { - struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); + struct xt_table *table = xt_unregister_table_exit(net, NFPROTO_ARP, name); if (table) __arpt_unregister_table(net, table); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 864489928fb5aa..5cbdb0815857f4 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1704,12 +1704,10 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) static void __ipt_unregister_table(struct net *net, struct xt_table *table) { - struct xt_table_info *private; - void *loc_cpu_entry; + struct xt_table_info *private = table->private; struct module *table_owner = table->me; struct ipt_entry *iter; - - private = xt_unregister_table(table); + void *loc_cpu_entry; /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; @@ -1718,6 +1716,7 @@ static void __ipt_unregister_table(struct net *net, struct xt_table *table) if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); + kfree(table); } int ipt_register_table(struct net *net, const struct xt_table *table, @@ -1758,7 +1757,7 @@ int ipt_register_table(struct net *net, const struct xt_table *table, void ipt_unregister_table_exit(struct net *net, const char *name) { - struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); + struct xt_table *table = xt_unregister_table_exit(net, NFPROTO_IPV4, name); if (table) __ipt_unregister_table(net, table); diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 8fc4912e790d8f..a0df7255402514 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -119,8 +119,11 @@ static int iptable_nat_table_init(struct net *net) } ret = ipt_nat_register_lookups(net); - if (ret < 0) + if (ret < 0) { + xt_unregister_table_pre_exit(net, NFPROTO_IPV4, "nat"); + synchronize_rcu(); ipt_unregister_table_exit(net, "nat"); + } kfree(repl); return ret; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index edf50bc7787e56..9d9c3763f2f5e9 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1713,12 +1713,10 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) static void __ip6t_unregister_table(struct net *net, struct xt_table *table) { - struct xt_table_info *private; - void *loc_cpu_entry; + struct xt_table_info *private = table->private; struct module *table_owner = table->me; struct ip6t_entry *iter; - - private = xt_unregister_table(table); + void *loc_cpu_entry; /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; @@ -1727,6 +1725,7 @@ static void __ip6t_unregister_table(struct net *net, struct xt_table *table) if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); + kfree(table); } int ip6t_register_table(struct net *net, const struct xt_table *table, @@ -1767,7 +1766,7 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, void ip6t_unregister_table_exit(struct net *net, const char *name) { - struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); + struct xt_table *table = xt_unregister_table_exit(net, NFPROTO_IPV6, name); if (table) __ip6t_unregister_table(net, table); diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index bb8aa3fc42b45e..c2394e2c94b562 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -121,8 +121,11 @@ static int ip6table_nat_table_init(struct net *net) } ret = ip6t_nat_register_lookups(net); - if (ret < 0) + if (ret < 0) { + xt_unregister_table_pre_exit(net, NFPROTO_IPV6, "nat"); + synchronize_rcu(); ip6t_unregister_table_exit(net, "nat"); + } kfree(repl); return ret; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 9c1e896c7b03b6..4e6708c23922b9 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -55,6 +55,9 @@ static struct list_head xt_templates[NFPROTO_NUMPROTO]; struct xt_pernet { struct list_head tables[NFPROTO_NUMPROTO]; + + /* stash area used during netns exit */ + struct list_head dead_tables[NFPROTO_NUMPROTO]; }; struct compat_delta { @@ -1634,23 +1637,6 @@ struct xt_table *xt_register_table(struct net *net, } EXPORT_SYMBOL_GPL(xt_register_table); -void *xt_unregister_table(struct xt_table *table) -{ - struct xt_table_info *private; - - mutex_lock(&xt[table->af].mutex); - private = table->private; - list_del(&table->list); - mutex_unlock(&xt[table->af].mutex); - audit_log_nfcfg(table->name, table->af, private->number, - AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); - kfree(table->ops); - kfree(table); - - return private; -} -EXPORT_SYMBOL_GPL(xt_unregister_table); - /** * xt_unregister_table_pre_exit - pre-shutdown unregister of a table * @net: network namespace @@ -1660,6 +1646,14 @@ EXPORT_SYMBOL_GPL(xt_unregister_table); * Unregisters the specified netfilter table from the given network namespace * and also unregisters the hooks from netfilter core: no new packets will be * processed. + * + * This must be called prior to xt_unregister_table_exit() from the pernet + * .pre_exit callback. After this call, the table is no longer visible to + * the get/setsockopt path. In case of rmmod, module exit path must have + * called xt_unregister_template() prior to unregistering pernet ops to + * prevent re-instantiation of the table. + * + * See also: xt_unregister_table_exit() */ void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name) { @@ -1669,6 +1663,7 @@ void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name) mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt_net->tables[af], list) { if (strcmp(t->name, name) == 0) { + list_move(&t->list, &xt_net->dead_tables[af]); mutex_unlock(&xt[af].mutex); if (t->ops) /* nat table registers with nat core, t->ops is NULL. */ @@ -1679,6 +1674,50 @@ void xt_unregister_table_pre_exit(struct net *net, u8 af, const char *name) mutex_unlock(&xt[af].mutex); } EXPORT_SYMBOL(xt_unregister_table_pre_exit); + +/** + * xt_unregister_table_exit - remove a table during namespace teardown + * @net: the network namespace from which to unregister the table + * @af: address family (e.g., NFPROTO_IPV4, NFPROTO_IPV6) + * @name: name of the table to unregister + * + * Completes the unregister process for a table. This must be called from + * the pernet ops .exit callback. This is the second stage after + * xt_unregister_table_pre_exit(). + * + * pair with xt_unregister_table_pre_exit() during namespace shutdown. + * + * Return: the unregistered table or NULL if the table was never + * instantiated. The caller needs to kfree() the table after it + * has removed the family specific matches/targets. + */ +struct xt_table *xt_unregister_table_exit(struct net *net, u8 af, const char *name) +{ + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct xt_table *table; + + mutex_lock(&xt[af].mutex); + list_for_each_entry(table, &xt_net->dead_tables[af], list) { + struct nf_hook_ops *ops = NULL; + + if (strcmp(table->name, name) != 0) + continue; + + list_del(&table->list); + + audit_log_nfcfg(table->name, table->af, table->private->number, + AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); + swap(table->ops, ops); + mutex_unlock(&xt[af].mutex); + + kfree(ops); + return table; + } + mutex_unlock(&xt[af].mutex); + + return NULL; +} +EXPORT_SYMBOL_GPL(xt_unregister_table_exit); #endif #ifdef CONFIG_PROC_FS @@ -2125,8 +2164,10 @@ static int __net_init xt_net_init(struct net *net) struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); int i; - for (i = 0; i < NFPROTO_NUMPROTO; i++) + for (i = 0; i < NFPROTO_NUMPROTO; i++) { INIT_LIST_HEAD(&xt_net->tables[i]); + INIT_LIST_HEAD(&xt_net->dead_tables[i]); + } return 0; } @@ -2135,8 +2176,10 @@ static void __net_exit xt_net_exit(struct net *net) struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); int i; - for (i = 0; i < NFPROTO_NUMPROTO; i++) + for (i = 0; i < NFPROTO_NUMPROTO; i++) { WARN_ON_ONCE(!list_empty(&xt_net->tables[i])); + WARN_ON_ONCE(!list_empty(&xt_net->dead_tables[i])); + } } static struct pernet_operations xt_net_ops = { From b7f0544d86d439cb946515d2ef6a0a75e8626710 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:18 +0200 Subject: [PATCH 549/972] netfilter: ebtables: move to two-stage removal scheme Like previous patches for x_tables, follow same pattern in ebtables. We can't reuse xt helpers: ebt_table struct layout is incompatible. table->ops assignment is now done while still holding the ebt mutex to make sure we never expose partially-filled table struct. Fixes: 87663c39f898 ("netfilter: ebtables: do not hook tables by default") Reviewed-by: Tristan Madani Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtable_broute.c | 2 +- net/bridge/netfilter/ebtable_filter.c | 2 +- net/bridge/netfilter/ebtable_nat.c | 2 +- net/bridge/netfilter/ebtables.c | 60 +++++++++++++++++---------- 4 files changed, 40 insertions(+), 26 deletions(-) diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 74136021955257..e6f9e343b41f1a 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -128,8 +128,8 @@ static int __init ebtable_broute_init(void) static void __exit ebtable_broute_fini(void) { - unregister_pernet_subsys(&broute_net_ops); ebt_unregister_template(&broute_table); + unregister_pernet_subsys(&broute_net_ops); } module_init(ebtable_broute_init); diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index dacd81b12e6264..02b6501c15a5e3 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -109,8 +109,8 @@ static int __init ebtable_filter_init(void) static void __exit ebtable_filter_fini(void) { - unregister_pernet_subsys(&frame_filter_net_ops); ebt_unregister_template(&frame_filter); + unregister_pernet_subsys(&frame_filter_net_ops); } module_init(ebtable_filter_init); diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 0f2a8c6118d42e..9985a82555c41c 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -109,8 +109,8 @@ static int __init ebtable_nat_init(void) static void __exit ebtable_nat_fini(void) { - unregister_pernet_subsys(&frame_nat_net_ops); ebt_unregister_template(&frame_nat); + unregister_pernet_subsys(&frame_nat_net_ops); } module_init(ebtable_nat_init); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index aea3e19875c69d..3578ffbc14aee3 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -42,6 +42,7 @@ struct ebt_pernet { struct list_head tables; + struct list_head dead_tables; }; struct ebt_template { @@ -1162,11 +1163,6 @@ static int do_replace(struct net *net, sockptr_t arg, unsigned int len) static void __ebt_unregister_table(struct net *net, struct ebt_table *table) { - mutex_lock(&ebt_mutex); - list_del(&table->list); - mutex_unlock(&ebt_mutex); - audit_log_nfcfg(table->name, AF_BRIDGE, table->private->nentries, - AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size, ebt_cleanup_entry, net, NULL); if (table->private->nentries) @@ -1267,13 +1263,15 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, for (i = 0; i < num_ops; i++) ops[i].priv = table; - list_add(&table->list, &ebt_net->tables); - mutex_unlock(&ebt_mutex); - table->ops = ops; ret = nf_register_net_hooks(net, ops, num_ops); - if (ret) + if (ret) { + synchronize_rcu(); __ebt_unregister_table(net, table); + } else { + list_add(&table->list, &ebt_net->tables); + } + mutex_unlock(&ebt_mutex); audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries, AUDIT_XT_OP_REGISTER, GFP_KERNEL); @@ -1339,7 +1337,7 @@ void ebt_unregister_template(const struct ebt_table *t) } EXPORT_SYMBOL(ebt_unregister_template); -static struct ebt_table *__ebt_find_table(struct net *net, const char *name) +void ebt_unregister_table_pre_exit(struct net *net, const char *name) { struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); struct ebt_table *t; @@ -1348,30 +1346,36 @@ static struct ebt_table *__ebt_find_table(struct net *net, const char *name) list_for_each_entry(t, &ebt_net->tables, list) { if (strcmp(t->name, name) == 0) { + list_move(&t->list, &ebt_net->dead_tables); mutex_unlock(&ebt_mutex); - return t; + nf_unregister_net_hooks(net, t->ops, hweight32(t->valid_hooks)); + return; } } mutex_unlock(&ebt_mutex); - return NULL; -} - -void ebt_unregister_table_pre_exit(struct net *net, const char *name) -{ - struct ebt_table *table = __ebt_find_table(net, name); - - if (table) - nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } EXPORT_SYMBOL(ebt_unregister_table_pre_exit); void ebt_unregister_table(struct net *net, const char *name) { - struct ebt_table *table = __ebt_find_table(net, name); + struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); + struct ebt_table *t; - if (table) - __ebt_unregister_table(net, table); + mutex_lock(&ebt_mutex); + + list_for_each_entry(t, &ebt_net->dead_tables, list) { + if (strcmp(t->name, name) == 0) { + list_del(&t->list); + audit_log_nfcfg(t->name, AF_BRIDGE, t->private->nentries, + AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); + __ebt_unregister_table(net, t); + mutex_unlock(&ebt_mutex); + return; + } + } + + mutex_unlock(&ebt_mutex); } /* userspace just supplied us with counters */ @@ -2556,11 +2560,21 @@ static int __net_init ebt_pernet_init(struct net *net) struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); INIT_LIST_HEAD(&ebt_net->tables); + INIT_LIST_HEAD(&ebt_net->dead_tables); return 0; } +static void __net_exit ebt_pernet_exit(struct net *net) +{ + struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); + + WARN_ON_ONCE(!list_empty(&ebt_net->tables)); + WARN_ON_ONCE(!list_empty(&ebt_net->dead_tables)); +} + static struct pernet_operations ebt_net_ops = { .init = ebt_pernet_init, + .exit = ebt_pernet_exit, .id = &ebt_pernet_id, .size = sizeof(struct ebt_pernet), }; From 92c603fa07bc0d6a17345de3ad7954730b8de44b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:19 +0200 Subject: [PATCH 550/972] netfilter: ebtables: close dangling table module init race sashiko reported for a related patch: In modules like iptable_raw.c, [..], if register_pernet_subsys() fails, the rollback might call kfree(rawtable_ops) before [..] During this window, could a concurrent userspace process find the globally visible template, trigger table_init(), [..] The table init functions must always register the template last. Otherwise, set/getsockopt can instantiate a table in a namespace while the required pernet ops (contain the destructor) isn't available. This change is also required in x_tables, handled in followup change. Fixes: 87663c39f898 ("netfilter: ebtables: do not hook tables by default") Reviewed-by: Tristan Madani Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtable_broute.c | 12 +++++------- net/bridge/netfilter/ebtable_filter.c | 12 +++++------- net/bridge/netfilter/ebtable_nat.c | 10 ++++------ 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index e6f9e343b41f1a..f05c79f215ea06 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -112,18 +112,16 @@ static struct pernet_operations broute_net_ops = { static int __init ebtable_broute_init(void) { - int ret = ebt_register_template(&broute_table, broute_table_init); + int ret = register_pernet_subsys(&broute_net_ops); if (ret) return ret; - ret = register_pernet_subsys(&broute_net_ops); - if (ret) { - ebt_unregister_template(&broute_table); - return ret; - } + ret = ebt_register_template(&broute_table, broute_table_init); + if (ret) + unregister_pernet_subsys(&broute_net_ops); - return 0; + return ret; } static void __exit ebtable_broute_fini(void) diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 02b6501c15a5e3..0fc03b07e62aeb 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -93,18 +93,16 @@ static struct pernet_operations frame_filter_net_ops = { static int __init ebtable_filter_init(void) { - int ret = ebt_register_template(&frame_filter, frame_filter_table_init); + int ret = register_pernet_subsys(&frame_filter_net_ops); if (ret) return ret; - ret = register_pernet_subsys(&frame_filter_net_ops); - if (ret) { - ebt_unregister_template(&frame_filter); - return ret; - } + ret = ebt_register_template(&frame_filter, frame_filter_table_init); + if (ret) + unregister_pernet_subsys(&frame_filter_net_ops); - return 0; + return ret; } static void __exit ebtable_filter_fini(void) diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 9985a82555c41c..8a10375d890992 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -93,16 +93,14 @@ static struct pernet_operations frame_nat_net_ops = { static int __init ebtable_nat_init(void) { - int ret = ebt_register_template(&frame_nat, frame_nat_table_init); + int ret = register_pernet_subsys(&frame_nat_net_ops); if (ret) return ret; - ret = register_pernet_subsys(&frame_nat_net_ops); - if (ret) { - ebt_unregister_template(&frame_nat); - return ret; - } + ret = ebt_register_template(&frame_nat, frame_nat_table_init); + if (ret) + unregister_pernet_subsys(&frame_nat_net_ops); return ret; } From 16bc4b6686b2c112c10e67d6b493adc3607256d3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 May 2026 12:07:20 +0200 Subject: [PATCH 551/972] netfilter: x_tables: close dangling table module init race Similar to the previous ebtables patch: template add exposes the table to userspace, we must do this last to rnsure the pernet ops are set up (contain the destructors). Fixes: fdacd57c79b7 ("netfilter: x_tables: never register tables by default") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arptable_filter.c | 23 ++++++++++++----------- net/ipv4/netfilter/iptable_filter.c | 23 ++++++++++++----------- net/ipv4/netfilter/iptable_mangle.c | 25 +++++++++++++------------ net/ipv4/netfilter/iptable_raw.c | 22 +++++++++++----------- net/ipv4/netfilter/iptable_security.c | 23 ++++++++++++----------- net/ipv6/netfilter/ip6table_filter.c | 22 +++++++++++----------- net/ipv6/netfilter/ip6table_mangle.c | 23 ++++++++++++----------- net/ipv6/netfilter/ip6table_raw.c | 20 ++++++++++---------- net/ipv6/netfilter/ip6table_security.c | 23 ++++++++++++----------- 9 files changed, 105 insertions(+), 99 deletions(-) diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 382345567a600a..370b635e3523b9 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -58,25 +58,26 @@ static struct pernet_operations arptable_filter_net_ops = { static int __init arptable_filter_init(void) { - int ret = xt_register_template(&packet_filter, - arptable_filter_table_init); - - if (ret < 0) - return ret; + int ret; arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arpt_do_table); - if (IS_ERR(arpfilter_ops)) { - xt_unregister_template(&packet_filter); + if (IS_ERR(arpfilter_ops)) return PTR_ERR(arpfilter_ops); - } ret = register_pernet_subsys(&arptable_filter_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_filter, + arptable_filter_table_init); if (ret < 0) { - xt_unregister_template(&packet_filter); - kfree(arpfilter_ops); - return ret; + unregister_pernet_subsys(&arptable_filter_net_ops); + goto err_free; } + return 0; +err_free: + kfree(arpfilter_ops); return ret; } diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 0dea754a91209a..672d7da1071d31 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -77,26 +77,27 @@ static struct pernet_operations iptable_filter_net_ops = { static int __init iptable_filter_init(void) { - int ret = xt_register_template(&packet_filter, - iptable_filter_table_init); - - if (ret < 0) - return ret; + int ret; filter_ops = xt_hook_ops_alloc(&packet_filter, ipt_do_table); - if (IS_ERR(filter_ops)) { - xt_unregister_template(&packet_filter); + if (IS_ERR(filter_ops)) return PTR_ERR(filter_ops); - } ret = register_pernet_subsys(&iptable_filter_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_filter, + iptable_filter_table_init); if (ret < 0) { - xt_unregister_template(&packet_filter); - kfree(filter_ops); - return ret; + unregister_pernet_subsys(&iptable_filter_net_ops); + goto err_free; } return 0; +err_free: + kfree(filter_ops); + return ret; } static void __exit iptable_filter_fini(void) diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 4d3b124923080b..13d25d9a4610e3 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -111,25 +111,26 @@ static struct pernet_operations iptable_mangle_net_ops = { static int __init iptable_mangle_init(void) { - int ret = xt_register_template(&packet_mangler, - iptable_mangle_table_init); - if (ret < 0) - return ret; + int ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook); - if (IS_ERR(mangle_ops)) { - xt_unregister_template(&packet_mangler); - ret = PTR_ERR(mangle_ops); - return ret; - } + if (IS_ERR(mangle_ops)) + return PTR_ERR(mangle_ops); ret = register_pernet_subsys(&iptable_mangle_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_mangler, + iptable_mangle_table_init); if (ret < 0) { - xt_unregister_template(&packet_mangler); - kfree(mangle_ops); - return ret; + unregister_pernet_subsys(&iptable_mangle_net_ops); + goto err_free; } + return 0; +err_free: + kfree(mangle_ops); return ret; } diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 6f7afec7954bd4..2745c22f4034d8 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -77,24 +77,24 @@ static int __init iptable_raw_init(void) pr_info("Enabling raw table before defrag\n"); } - ret = xt_register_template(table, - iptable_raw_table_init); - if (ret < 0) - return ret; - rawtable_ops = xt_hook_ops_alloc(table, ipt_do_table); - if (IS_ERR(rawtable_ops)) { - xt_unregister_template(table); + if (IS_ERR(rawtable_ops)) return PTR_ERR(rawtable_ops); - } ret = register_pernet_subsys(&iptable_raw_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(table, + iptable_raw_table_init); if (ret < 0) { - xt_unregister_template(table); - kfree(rawtable_ops); - return ret; + unregister_pernet_subsys(&iptable_raw_net_ops); + goto err_free; } + return 0; +err_free: + kfree(rawtable_ops); return ret; } diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index 81175c20ccbe8d..491894511c5441 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -65,25 +65,26 @@ static struct pernet_operations iptable_security_net_ops = { static int __init iptable_security_init(void) { - int ret = xt_register_template(&security_table, - iptable_security_table_init); - - if (ret < 0) - return ret; + int ret; sectbl_ops = xt_hook_ops_alloc(&security_table, ipt_do_table); - if (IS_ERR(sectbl_ops)) { - xt_unregister_template(&security_table); + if (IS_ERR(sectbl_ops)) return PTR_ERR(sectbl_ops); - } ret = register_pernet_subsys(&iptable_security_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&security_table, + iptable_security_table_init); if (ret < 0) { - xt_unregister_template(&security_table); - kfree(sectbl_ops); - return ret; + unregister_pernet_subsys(&iptable_security_net_ops); + goto err_free; } + return 0; +err_free: + kfree(sectbl_ops); return ret; } diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index cf561919bde84c..b074fc4776764c 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -76,25 +76,25 @@ static struct pernet_operations ip6table_filter_net_ops = { static int __init ip6table_filter_init(void) { - int ret = xt_register_template(&packet_filter, - ip6table_filter_table_init); - - if (ret < 0) - return ret; + int ret; filter_ops = xt_hook_ops_alloc(&packet_filter, ip6t_do_table); - if (IS_ERR(filter_ops)) { - xt_unregister_template(&packet_filter); + if (IS_ERR(filter_ops)) return PTR_ERR(filter_ops); - } ret = register_pernet_subsys(&ip6table_filter_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_filter, ip6table_filter_table_init); if (ret < 0) { - xt_unregister_template(&packet_filter); - kfree(filter_ops); - return ret; + unregister_pernet_subsys(&ip6table_filter_net_ops); + goto err_free; } + return 0; +err_free: + kfree(filter_ops); return ret; } diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 1a758f2bc5379c..e6ee036a9b2c58 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -104,25 +104,26 @@ static struct pernet_operations ip6table_mangle_net_ops = { static int __init ip6table_mangle_init(void) { - int ret = xt_register_template(&packet_mangler, - ip6table_mangle_table_init); - - if (ret < 0) - return ret; + int ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook); - if (IS_ERR(mangle_ops)) { - xt_unregister_template(&packet_mangler); + if (IS_ERR(mangle_ops)) return PTR_ERR(mangle_ops); - } ret = register_pernet_subsys(&ip6table_mangle_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&packet_mangler, + ip6table_mangle_table_init); if (ret < 0) { - xt_unregister_template(&packet_mangler); - kfree(mangle_ops); - return ret; + unregister_pernet_subsys(&ip6table_mangle_net_ops); + goto err_free; } + return 0; +err_free: + kfree(mangle_ops); return ret; } diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 923455921c1ddf..3b161ee875bcc1 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -75,24 +75,24 @@ static int __init ip6table_raw_init(void) pr_info("Enabling raw table before defrag\n"); } - ret = xt_register_template(table, ip6table_raw_table_init); - if (ret < 0) - return ret; - /* Register hooks */ rawtable_ops = xt_hook_ops_alloc(table, ip6t_do_table); - if (IS_ERR(rawtable_ops)) { - xt_unregister_template(table); + if (IS_ERR(rawtable_ops)) return PTR_ERR(rawtable_ops); - } ret = register_pernet_subsys(&ip6table_raw_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(table, ip6table_raw_table_init); if (ret < 0) { - kfree(rawtable_ops); - xt_unregister_template(table); - return ret; + unregister_pernet_subsys(&ip6table_raw_net_ops); + goto err_free; } + return 0; +err_free: + kfree(rawtable_ops); return ret; } diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index c44834d93fc792..4bd5d97b8ab65d 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -64,25 +64,26 @@ static struct pernet_operations ip6table_security_net_ops = { static int __init ip6table_security_init(void) { - int ret = xt_register_template(&security_table, - ip6table_security_table_init); - - if (ret < 0) - return ret; + int ret; sectbl_ops = xt_hook_ops_alloc(&security_table, ip6t_do_table); - if (IS_ERR(sectbl_ops)) { - xt_unregister_template(&security_table); + if (IS_ERR(sectbl_ops)) return PTR_ERR(sectbl_ops); - } ret = register_pernet_subsys(&ip6table_security_net_ops); + if (ret < 0) + goto err_free; + + ret = xt_register_template(&security_table, + ip6table_security_table_init); if (ret < 0) { - kfree(sectbl_ops); - xt_unregister_template(&security_table); - return ret; + unregister_pernet_subsys(&ip6table_security_net_ops); + goto err_free; } + return 0; +err_free: + kfree(sectbl_ops); return ret; } From 27414ff1b287ea9a2a11675149ec28e05539f3cc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 7 May 2026 11:19:22 +0200 Subject: [PATCH 552/972] netfilter: bridge: eb_tables: close module init race sashiko reports for unrelated patch: Does the core ebtables initialization in ebtables.c suffer from a similar race? Once nf_register_sockopt() completes, the sockopts are exposed globally. sockopt has to be registered last, just like in ip/ip6/arptables. Fixes: 5b53951cfc85 ("netfilter: ebtables: use net_generic infra") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 3578ffbc14aee3..b9f4daac09af36 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -2583,19 +2583,20 @@ static int __init ebtables_init(void) { int ret; - ret = xt_register_target(&ebt_standard_target); + ret = register_pernet_subsys(&ebt_net_ops); if (ret < 0) return ret; - ret = nf_register_sockopt(&ebt_sockopts); + + ret = xt_register_target(&ebt_standard_target); if (ret < 0) { - xt_unregister_target(&ebt_standard_target); + unregister_pernet_subsys(&ebt_net_ops); return ret; } - ret = register_pernet_subsys(&ebt_net_ops); + ret = nf_register_sockopt(&ebt_sockopts); if (ret < 0) { - nf_unregister_sockopt(&ebt_sockopts); xt_unregister_target(&ebt_standard_target); + unregister_pernet_subsys(&ebt_net_ops); return ret; } From dcb0f9aefdd604d36710fda53c25bd7cf4a3e37a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 7 May 2026 13:00:28 +0200 Subject: [PATCH 553/972] netfilter: nf_conntrack_expect: restore helper propagation via expectation A recent series to fix expectations broke helper propagation via expectation, this mechanism is used by the sip and h323 helper. This also propagates the conntrack helper to expected connections. I changed semantics of exp->helper which now tells us the actual helper that created the expectation. Add an explicit assign_helper field to expectations for this purpose and update helpers to use it. Restore this feature for userspace conntrack helper via ctnetlink nfqueue integration so it is again possible to attach a helper to an expectation, where it makes sense. This is not restored via ctnetlink expectation creation as there is no client for such feature. Use the expectation layer 4 protocol number for the helper lookup for consistency. Make sure the expectation using this helper propagation mechanism also go away when the helper is unregistered. Fixes: 9c42bc9db90a ("netfilter: nf_conntrack_expect: honor expectation helper field") Fixes: 917b61fa2042 ("netfilter: ctnetlink: ignore explicit helper on new expectations") Reported-by: Ilya Maximets Tested-by: Ilya Maximets Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_expect.h | 5 ++++- net/netfilter/nf_conntrack_broadcast.c | 1 + net/netfilter/nf_conntrack_core.c | 7 +++++-- net/netfilter/nf_conntrack_expect.c | 1 + net/netfilter/nf_conntrack_h323_main.c | 12 ++++++------ net/netfilter/nf_conntrack_helper.c | 5 +++++ net/netfilter/nf_conntrack_netlink.c | 18 ++++++++++++++++-- net/netfilter/nf_conntrack_sip.c | 2 +- 8 files changed, 39 insertions(+), 12 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index e9a8350e7ccfb0..80f50fd0f7ad27 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -45,9 +45,12 @@ struct nf_conntrack_expect { void (*expectfn)(struct nf_conn *new, struct nf_conntrack_expect *this); - /* Helper to assign to new connection */ + /* Helper that created this expectation */ struct nf_conntrack_helper __rcu *helper; + /* Helper to assign to new connection */ + struct nf_conntrack_helper __rcu *assign_helper; + /* The conntrack of the master connection */ struct nf_conn *master; diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index 4f39bf7c843f2d..75e53fde6b2974 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -72,6 +72,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, exp->flags = NF_CT_EXPECT_PERMANENT; exp->class = NF_CT_EXPECT_CLASS_DEFAULT; rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, NULL); write_pnet(&exp->net, net); #ifdef CONFIG_NF_CONNTRACK_ZONES exp->zone = ct->zone; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index b081892263201c..8ba5b22a1eef2f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1811,14 +1811,17 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, spin_lock_bh(&nf_conntrack_expect_lock); exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl)); if (exp) { + struct nf_conntrack_helper *assign_helper; + /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &ct->status); /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ ct->master = exp->master; - if (exp->helper) { + assign_helper = rcu_dereference(exp->assign_helper); + if (assign_helper) { help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help) - rcu_assign_pointer(help->helper, exp->helper); + rcu_assign_pointer(help->helper, assign_helper); } #ifdef CONFIG_NF_CONNTRACK_MARK diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 24d0576d84b7f6..8e943efbdf0a52 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -344,6 +344,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, helper = rcu_dereference(help->helper); rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, NULL); write_pnet(&exp->net, net); #ifdef CONFIG_NF_CONNTRACK_ZONES exp->zone = ct->zone; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 3f5c50455b716a..b2fe6554b9cf43 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -643,7 +643,7 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3, IPPROTO_TCP, NULL, &port); - rcu_assign_pointer(exp->helper, &nf_conntrack_helper_h245); + rcu_assign_pointer(exp->assign_helper, &nf_conntrack_helper_h245); nathook = rcu_dereference(nfct_h323_nat_hook); if (memcmp(&ct->tuplehash[dir].tuple.src.u3, @@ -767,7 +767,7 @@ static int expect_callforwarding(struct sk_buff *skb, nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_TCP, NULL, &port); - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); nathook = rcu_dereference(nfct_h323_nat_hook); if (memcmp(&ct->tuplehash[dir].tuple.src.u3, @@ -1234,7 +1234,7 @@ static int expect_q931(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3 : NULL, &ct->tuplehash[!dir].tuple.dst.u3, IPPROTO_TCP, NULL, &port); - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); exp->flags = NF_CT_EXPECT_PERMANENT; /* Accept multiple calls */ nathook = rcu_dereference(nfct_h323_nat_hook); @@ -1306,7 +1306,7 @@ static int process_gcf(struct sk_buff *skb, struct nf_conn *ct, nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_UDP, NULL, &port); - rcu_assign_pointer(exp->helper, nf_conntrack_helper_ras); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_ras); if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect RAS "); @@ -1523,7 +1523,7 @@ static int process_acf(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_TCP, NULL, &port); exp->flags = NF_CT_EXPECT_PERMANENT; - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect Q.931 "); @@ -1577,7 +1577,7 @@ static int process_lcf(struct sk_buff *skb, struct nf_conn *ct, &ct->tuplehash[!dir].tuple.src.u3, &addr, IPPROTO_TCP, NULL, &port); exp->flags = NF_CT_EXPECT_PERMANENT; - rcu_assign_pointer(exp->helper, nf_conntrack_helper_q931); + rcu_assign_pointer(exp->assign_helper, nf_conntrack_helper_q931); if (nf_ct_expect_related(exp, 0) == 0) { pr_debug("nf_ct_ras: expect Q.931 "); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index a715304a53d8c2..b594cd244fe1d4 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -400,6 +400,11 @@ static bool expect_iter_me(struct nf_conntrack_expect *exp, void *data) this = rcu_dereference_protected(exp->helper, lockdep_is_held(&nf_conntrack_expect_lock)); + if (this == me) + return true; + + this = rcu_dereference_protected(exp->assign_helper, + lockdep_is_held(&nf_conntrack_expect_lock)); return this == me; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index eda5fe4a75c827..d7209d12411145 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2634,6 +2634,7 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { static struct nf_conntrack_expect * ctnetlink_alloc_expect(const struct nlattr *const cda[], struct nf_conn *ct, + const struct nf_conntrack_helper *assign_helper, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask); @@ -2860,6 +2861,7 @@ static int ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, u32 portid, u32 report) { + struct nf_conntrack_helper *assign_helper = NULL; struct nlattr *cda[CTA_EXPECT_MAX+1]; struct nf_conntrack_tuple tuple, mask; struct nf_conntrack_expect *exp; @@ -2875,8 +2877,18 @@ ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, if (err < 0) return err; + if (cda[CTA_EXPECT_HELP_NAME]) { + const char *helpname = nla_data(cda[CTA_EXPECT_HELP_NAME]); + + assign_helper = __nf_conntrack_helper_find(helpname, + nf_ct_l3num(ct), + tuple.dst.protonum); + if (!assign_helper) + return -EOPNOTSUPP; + } + exp = ctnetlink_alloc_expect((const struct nlattr * const *)cda, ct, - &tuple, &mask); + assign_helper, &tuple, &mask); if (IS_ERR(exp)) return PTR_ERR(exp); @@ -3515,6 +3527,7 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr, static struct nf_conntrack_expect * ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, + const struct nf_conntrack_helper *assign_helper, struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *mask) { @@ -3568,6 +3581,7 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, exp->zone = ct->zone; #endif rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, assign_helper); exp->tuple = *tuple; exp->mask.src.u3 = mask->src.u3; exp->mask.src.u.all = mask->src.u.all; @@ -3623,7 +3637,7 @@ ctnetlink_create_expect(struct net *net, ct = nf_ct_tuplehash_to_ctrack(h); rcu_read_lock(); - exp = ctnetlink_alloc_expect(cda, ct, &tuple, &mask); + exp = ctnetlink_alloc_expect(cda, ct, NULL, &tuple, &mask); if (IS_ERR(exp)) { err = PTR_ERR(exp); goto err_rcu; diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 1eb55907d470d6..d24bfa9e823448 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1383,7 +1383,7 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct), saddr, &daddr, proto, NULL, &port); exp->timeout.expires = sip_timeout * HZ; - rcu_assign_pointer(exp->helper, helper); + rcu_assign_pointer(exp->assign_helper, helper); exp->flags = NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE; hooks = rcu_dereference(nf_nat_sip_hooks); From d8ef54c83ad70b81735b506431affadd2f720aa1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 7 May 2026 23:57:55 +0200 Subject: [PATCH 554/972] netfilter: ctnetlink: check tuple and mask in expectations created via nfqueue Ensure the expectation tuple and mask attributes are present in netlink message, otherwise null-ptr-deref is possible. Fixes: bd0779370588 ("netfilter: nfnetlink_queue: allow to attach expectations to conntracks") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index d7209d12411145..befa7e83ee49f5 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2872,6 +2872,9 @@ ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, if (err < 0) return err; + if (!cda[CTA_EXPECT_TUPLE] || !cda[CTA_EXPECT_MASK]) + return -EINVAL; + err = ctnetlink_glue_exp_parse((const struct nlattr * const *)cda, ct, &tuple, &mask); if (err < 0) From eb6317739b1ea3ab28791e1f91b24781905fa815 Mon Sep 17 00:00:00 2001 From: Li Xiasong Date: Thu, 7 May 2026 22:04:22 +0800 Subject: [PATCH 555/972] netfilter: nf_conntrack_sip: get helper before allocating expectation process_register_request() allocates an expectation and then checks whether a conntrack helper is available. If helper lookup fails, the function returns early and the allocated expectation is left behind. Reorder the code to fetch and validate helper before calling nf_ct_expect_alloc(). This keeps the logic simpler and removes the leak path while preserving existing behavior. Fixes: e14575fa7529 ("netfilter: nf_conntrack: use rcu accessors where needed") Cc: stable@vger.kernel.org Signed-off-by: Li Xiasong Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_sip.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index d24bfa9e823448..e69941f1a10161 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1366,6 +1366,10 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, goto store_cseq; } + helper = rcu_dereference(nfct_help(ct)->helper); + if (!helper) + return NF_DROP; + exp = nf_ct_expect_alloc(ct); if (!exp) { nf_ct_helper_log(skb, ct, "cannot alloc expectation"); @@ -1376,10 +1380,6 @@ static int process_register_request(struct sk_buff *skb, unsigned int protoff, if (sip_direct_signalling) saddr = &ct->tuplehash[!dir].tuple.src.u3; - helper = rcu_dereference(nfct_help(ct)->helper); - if (!helper) - return NF_DROP; - nf_ct_expect_init(exp, SIP_EXPECT_SIGNALLING, nf_ct_l3num(ct), saddr, &daddr, proto, NULL, &port); exp->timeout.expires = sip_timeout * HZ; From 19f94b6fee75b3ef7fbc06f3745b9a771a8a19a4 Mon Sep 17 00:00:00 2001 From: Li Xiasong Date: Thu, 7 May 2026 22:04:23 +0800 Subject: [PATCH 556/972] netfilter: nft_ct: fix missing expect put in obj eval nft_ct_expect_obj_eval() allocates an expectation and may call nf_ct_expect_related(), but never drops its local reference. Add nf_ct_expect_put(exp) before return to balance allocation. Fixes: 857b46027d6f ("netfilter: nft_ct: add ct expectations support") Cc: stable@vger.kernel.org Signed-off-by: Li Xiasong Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 60ee8d932fcb36..fa2cc556331cf5 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -1334,6 +1334,8 @@ static void nft_ct_expect_obj_eval(struct nft_object *obj, if (nf_ct_expect_related(exp, 0) != 0) regs->verdict.code = NF_DROP; + + nf_ct_expect_put(exp); } static const struct nla_policy nft_ct_expect_policy[NFTA_CT_EXPECT_MAX + 1] = { From 1f91d0d5827512816789f74f4d72d16269bde1ec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2026 14:16:59 -1000 Subject: [PATCH 557/972] sched_ext: Fix !CONFIG_EXT_SUB_SCHED build warnings W=1 with CONFIG_EXT_SUB_SCHED=n flags 'err_msg' uninitialized and 'err_free_lb_resched' unused. Initialize err_msg and gate the label. Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 48b4834c702702..f4e2db8e56be62 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5584,7 +5584,7 @@ static void refresh_watchdog(void) static s32 scx_link_sched(struct scx_sched *sch) { - const char *err_msg; + const char *err_msg = ""; s32 ret = 0; scoped_guard(raw_spinlock_irq, &scx_sched_lock) { @@ -6652,8 +6652,10 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, #endif /* CONFIG_EXT_SUB_SCHED */ return sch; +#ifdef CONFIG_EXT_SUB_SCHED err_free_lb_resched: free_cpumask_var(sch->bypass_lb_resched_cpumask); +#endif err_free_lb_cpumask: free_cpumask_var(sch->bypass_lb_donee_cpumask); err_stop_helper: From 04b8f96b10881006e8aadbf72c59427ec5eaa9b4 Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Thu, 7 May 2026 16:21:31 -0700 Subject: [PATCH 558/972] ASoC: tegra: tegra210-mixer: Use div_u64() for 64-bit division A MIPS allmodconfig build with LLVM fails during modpost: ERROR: modpost: "__udivdi3" [sound/soc/tegra/snd-soc-tegra210-mixer.ko] undefined! tegra210_mixer_configure_gain() divides a 64-bit BIT_ULL() value by the fade duration. On 32-bit MIPS, clang emits a call to __udivdi3 for that plain C division, but that compiler helper is not exported to modules. Use div_u64() for the inverse duration calculation so the driver uses the kernel's 64-bit division helper instead of emitting a compiler runtime call. Assisted-by: Codex:GPT-5.5 Signed-off-by: Rosen Penev Link: https://patch.msgid.link/20260507232131.438589-1-rosenp@gmail.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra210_mixer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/soc/tegra/tegra210_mixer.c b/sound/soc/tegra/tegra210_mixer.c index f05617b5f43356..bfdd457f740c18 100644 --- a/sound/soc/tegra/tegra210_mixer.c +++ b/sound/soc/tegra/tegra210_mixer.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -157,8 +158,8 @@ static int tegra210_mixer_configure_gain(struct snd_soc_component *cmpnt, if (i == DURATION_N3_ID) val = mixer->duration[id]; else if (i == DURATION_INV_N3_ID) - val = (u32)(BIT_ULL(31 + TEGRA210_MIXER_PRESCALAR) / - mixer->duration[id]); + val = div_u64(BIT_ULL(31 + TEGRA210_MIXER_PRESCALAR), + mixer->duration[id]); else val = gain_params.duration[i]; } From 307abfac04a254c09c5705d816b33354acee97a0 Mon Sep 17 00:00:00 2001 From: Jianpeng Chang Date: Fri, 8 May 2026 09:56:36 +0900 Subject: [PATCH 559/972] kprobes: skip non-symbol addresses in kprobe_add_ksym_blacklist() When kprobe_add_area_blacklist() iterates through a section like .kprobes.text, the start address may not correspond to a named symbol. On ARM64 with CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS=y (introduced by commit baaf553d3bc3 ("arm64: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS")), the compiler flag -fpatchable-function-entry=4,2 inserts 2 NOPs before each function entry point for ftrace call_ops. These pre-function NOPs sit at the section base address, before the first named function symbol. The compiler emits a $x mapping symbol at offset 0x00 to mark the start of code, but find_kallsyms_symbol() ignores mapping symbols. Without CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS (e.g. defconfig), no pre-function NOPs are inserted, the first function starts at offset 0x00, and the bug does not trigger. This only affects modules that have a .kprobes.text section (i.e. those using the __kprobes annotation). Modules using NOKPROBE_SYMBOL() instead (like kretprobe_example.ko) blacklist exact function addresses via the _kprobe_blacklist section and are not affected. For kprobe_example.ko on ARM64 with -fpatchable-function-entry=4,2, the .kprobes.text section layout is: offset 0x00: $x + 2 NOPs (mapping symbol + ftrace preamble) offset 0x08: handler_post (64 bytes) offset 0x50: handler_pre (68 bytes) kprobe_add_area_blacklist() starts iterating from the section base address (offset 0x00), which only has the $x mapping symbol. kprobe_add_ksym_blacklist() then calls kallsyms_lookup_size_offset() for this address, which goes through: kallsyms_lookup_size_offset() -> module_address_lookup() -> find_kallsyms_symbol() find_kallsyms_symbol() scans all module symbols to find the closest preceding symbol. Since no named text symbol exists at offset 0x00, find_kallsyms_symbol() picks __UNIQUE_ID_vermagic (a .modinfo symbol whose address is in the temporary image) as the "best" match. The computed "size" = next_text_symbol - modinfo_symbol spans across these two unrelated memory regions, creating a blacklist entry with a bogus range of tens of terabytes. Whether this causes a visible failure depends on address randomization, here is what happens on Raspberry Pi 4/5: - On RPi5, the bogus size was ~35 TB. start + size stayed within 64-bit range, so the blacklist entry covered the entire kernel text. register_kprobe() in the module's own init function failed with -EINVAL. - On RPi4, the bogus size was ~75 TB. start + size overflowed 64 bits and wrapped to a small address near zero. The range check (addr >= start && addr < end) then failed because end wrapped around, so the bogus entry was accidentally harmless and kprobes worked by luck. The same bug exists on both machines, but randomization determines whether the integer overflow masks it or not. Fix this by adding notrace to the __kprobes macro. Functions in .kprobes.text are kprobe infrastructure handlers that should never be traced by ftrace. With notrace, the compiler stops inserting them and the non-symbol gap at the section start disappears entirely. Link: https://lore.kernel.org/all/20260506012706.2785785-1-jianpeng.chang.cn@windriver.com/ Fixes: baaf553d3bc3 ("arm64: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS") Signed-off-by: Jianpeng Chang Signed-off-by: Masami Hiramatsu (Google) --- include/asm-generic/kprobes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/kprobes.h b/include/asm-generic/kprobes.h index 060eab094e5a22..5290a2b2e15a0f 100644 --- a/include/asm-generic/kprobes.h +++ b/include/asm-generic/kprobes.h @@ -14,7 +14,7 @@ static unsigned long __used \ _kbl_addr_##fname = (unsigned long)fname; # define NOKPROBE_SYMBOL(fname) __NOKPROBE_SYMBOL(fname) /* Use this to forbid a kprobes attach on very low level functions */ -# define __kprobes __section(".kprobes.text") +# define __kprobes notrace __section(".kprobes.text") # define nokprobe_inline __always_inline #else # define NOKPROBE_SYMBOL(fname) From ef5581bb30efb939cc2bf093475c6cc85258e5cd Mon Sep 17 00:00:00 2001 From: Martin Kaiser Date: Fri, 8 May 2026 09:56:36 +0900 Subject: [PATCH 560/972] test_kprobes: clear kprobes between test runs Running the kprobes sanity tests twice makes all tests fail and eventually crashes the kernel. [root@martin-riscv-1 ~]# echo 1 > /sys/kernel/debug/kunit/kprobes_test/run ... # Totals: pass:5 fail:0 skip:0 total:5 ok 1 kprobes_test [root@martin-riscv-1 ~]# echo 1 > /sys/kernel/debug/kunit/kprobes_test/run ... # test_kprobe: EXPECTATION FAILED at lib/tests/test_kprobes.c:64 Expected 0 == register_kprobe(&kp), but register_kprobe(&kp) == -22 (0xffffffffffffffea) ... Unable to handle kernel paging request ... The testsuite defines several kprobes and kretprobes as static variables that are preserved across test runs. After register_kprobe and unregister_kprobe, a kprobe contains some leftover data that must be cleared before the kprobe can be registered again. The tests are setting symbol_name to define the probe location. Address and flags must be cleared. The existing code clears some of the probes between subsequent tests, but not between two test runs. The leftover data from a previous test run makes the registrations fail in the next run. Move the cleanups for all kprobes into kprobes_test_init, this function is called before each single test (including the first test of a test run). Link: https://lore.kernel.org/all/20260507134615.1010905-1-martin@kaiser.cx/ Fixes: e44e81c5b90f ("kprobes: convert tests to kunit") Signed-off-by: Martin Kaiser Signed-off-by: Masami Hiramatsu (Google) --- lib/tests/test_kprobes.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/lib/tests/test_kprobes.c b/lib/tests/test_kprobes.c index b7582010125c3f..06e729e4de0516 100644 --- a/lib/tests/test_kprobes.c +++ b/lib/tests/test_kprobes.c @@ -12,6 +12,12 @@ #define div_factor 3 +#define KP_CLEAR(_kp) \ +do { \ + (_kp).addr = NULL; \ + (_kp).flags = 0; \ +} while (0) + static u32 rand1, preh_val, posth_val; static u32 (*target)(u32 value); static u32 (*recursed_target)(u32 value); @@ -125,10 +131,6 @@ static void test_kprobes(struct kunit *test) current_test = test; - /* addr and flags should be cleard for reusing kprobe. */ - kp.addr = NULL; - kp.flags = 0; - KUNIT_EXPECT_EQ(test, 0, register_kprobes(kps, 2)); preh_val = 0; posth_val = 0; @@ -226,9 +228,6 @@ static void test_kretprobes(struct kunit *test) struct kretprobe *rps[2] = {&rp, &rp2}; current_test = test; - /* addr and flags should be cleard for reusing kprobe. */ - rp.kp.addr = NULL; - rp.kp.flags = 0; KUNIT_EXPECT_EQ(test, 0, register_kretprobes(rps, 2)); krph_val = 0; @@ -290,8 +289,6 @@ static void test_stacktrace_on_kretprobe(struct kunit *test) unsigned long myretaddr = (unsigned long)__builtin_return_address(0); current_test = test; - rp3.kp.addr = NULL; - rp3.kp.flags = 0; /* * Run the stacktrace_driver() to record correct return address in @@ -352,8 +349,6 @@ static void test_stacktrace_on_nested_kretprobe(struct kunit *test) struct kretprobe *rps[2] = {&rp3, &rp4}; current_test = test; - rp3.kp.addr = NULL; - rp3.kp.flags = 0; //KUNIT_ASSERT_NE(test, myretaddr, stacktrace_driver()); @@ -367,6 +362,18 @@ static void test_stacktrace_on_nested_kretprobe(struct kunit *test) static int kprobes_test_init(struct kunit *test) { + KP_CLEAR(kp); + KP_CLEAR(kp2); + KP_CLEAR(kp_missed); +#ifdef CONFIG_KRETPROBES + KP_CLEAR(rp.kp); + KP_CLEAR(rp2.kp); +#ifdef CONFIG_ARCH_CORRECT_STACKTRACE_ON_KRETPROBE + KP_CLEAR(rp3.kp); + KP_CLEAR(rp4.kp); +#endif +#endif + target = kprobe_target; target2 = kprobe_target2; recursed_target = kprobe_recursed_target; From 41337097f2823e99478d7cbe68d4893582ed0b18 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Wed, 6 May 2026 21:21:52 +0800 Subject: [PATCH 561/972] riscv: cpufeature: Use pre-defined ISA ext macros to index isa2hwcap We have pre-defined ISA extension macros, here use those macros to replace a magic number for isa2hwcap definition and some array indexing for isa2hwcap access. This doesn't change the original functionality, just improve the code maintainability and readability. Signed-off-by: Hui Wang Link: https://patch.msgid.link/20260506132152.53239-1-hui.wang@canonical.com Signed-off-by: Paul Walmsley --- arch/riscv/kernel/cpufeature.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 3dc4c0d31550fe..f46aa5602d74d3 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -1102,16 +1102,16 @@ early_param("riscv_isa_fallback", riscv_isa_fallback_setup); void __init riscv_fill_hwcap(void) { char print_str[NUM_ALPHA_EXTS + 1]; - unsigned long isa2hwcap[26] = {0}; + unsigned long isa2hwcap[RISCV_ISA_EXT_BASE] = {0}; int i, j; - isa2hwcap['i' - 'a'] = COMPAT_HWCAP_ISA_I; - isa2hwcap['m' - 'a'] = COMPAT_HWCAP_ISA_M; - isa2hwcap['a' - 'a'] = COMPAT_HWCAP_ISA_A; - isa2hwcap['f' - 'a'] = COMPAT_HWCAP_ISA_F; - isa2hwcap['d' - 'a'] = COMPAT_HWCAP_ISA_D; - isa2hwcap['c' - 'a'] = COMPAT_HWCAP_ISA_C; - isa2hwcap['v' - 'a'] = COMPAT_HWCAP_ISA_V; + isa2hwcap[RISCV_ISA_EXT_i] = COMPAT_HWCAP_ISA_I; + isa2hwcap[RISCV_ISA_EXT_m] = COMPAT_HWCAP_ISA_M; + isa2hwcap[RISCV_ISA_EXT_a] = COMPAT_HWCAP_ISA_A; + isa2hwcap[RISCV_ISA_EXT_f] = COMPAT_HWCAP_ISA_F; + isa2hwcap[RISCV_ISA_EXT_d] = COMPAT_HWCAP_ISA_D; + isa2hwcap[RISCV_ISA_EXT_c] = COMPAT_HWCAP_ISA_C; + isa2hwcap[RISCV_ISA_EXT_v] = COMPAT_HWCAP_ISA_V; if (!acpi_disabled) { riscv_fill_hwcap_from_isa_string(isa2hwcap); From 5e28b7b94408897e41c63477aabc9e1db439bc8c Mon Sep 17 00:00:00 2001 From: "Francis, David" Date: Tue, 28 Apr 2026 19:25:50 +0000 Subject: [PATCH 562/972] drm: Set old handle to NULL before prime swap in change_handle There was a potential race condition in change_handle. The ioctl briefly had a single object with two idr entries; a concurrent gem_close could delete the object and remove one of the handles while leaving the other one dangling, which could subsequently be dereferenced for a use-after-free. To fix this, do the same dance that gem_close itself does. (f6cd7daecff5 drm: Release driver references to handle before making it available again) First idr_replace the old handle to NULL. Later, if the prime operations are successful, actually close it. create_tail required a similar dance to avoid a similar problem. (bd46cece51a3 drm/gem: Fix race in drm_gem_handle_create_tail()) It idr_allocs the new handle with NULL, then swaps in the correct object later to avoid races. We don't need to do that here, since the only operations that could race are drm_prime, and change_handle holds the prime lock for the entire duration. v2: cleanups of error paths Signed-off-by: David Francis Co-authored-by: Dave Airlie Reported-by: Puttimet Thammasaeng Tested-by: Vitaly Prosyak Cc: Simona Vetter Cc: stable@vger.kernel.org Cc: Christian Koenig Fixes: 53096728b8910 ("drm: Add DRM prime interface to reassign GEM handle") Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_gem.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index d6424267260bdb..51a887cc7fd744 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -1019,7 +1019,7 @@ int drm_gem_change_handle_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_gem_change_handle *args = data; - struct drm_gem_object *obj; + struct drm_gem_object *obj, *idrobj; int handle, ret; if (!drm_core_check_feature(dev, DRIVER_GEM)) @@ -1042,8 +1042,29 @@ int drm_gem_change_handle_ioctl(struct drm_device *dev, void *data, mutex_lock(&file_priv->prime.lock); spin_lock(&file_priv->table_lock); + + /* When create_tail allocs an obj idr, it needs to first alloc as NULL, + * then later replace with the correct object. This is not necessary + * here, because the only operations that could race are drm_prime + * bookkeeping, and we hold the prime lock. + */ ret = idr_alloc(&file_priv->object_idr, obj, handle, handle + 1, GFP_NOWAIT); + + if (ret < 0) { + spin_unlock(&file_priv->table_lock); + goto out_unlock; + } + + idrobj = idr_replace(&file_priv->object_idr, NULL, handle); + if (idrobj != obj) { + idr_replace(&file_priv->object_idr, idrobj, handle); + idr_remove(&file_priv->object_idr, args->new_handle); + spin_unlock(&file_priv->table_lock); + ret = -ENOENT; + goto out_unlock; + } + spin_unlock(&file_priv->table_lock); if (ret < 0) @@ -1055,6 +1076,8 @@ int drm_gem_change_handle_ioctl(struct drm_device *dev, void *data, if (ret < 0) { spin_lock(&file_priv->table_lock); idr_remove(&file_priv->object_idr, handle); + idrobj = idr_replace(&file_priv->object_idr, obj, handle); + WARN_ON(idrobj != NULL); spin_unlock(&file_priv->table_lock); goto out_unlock; } From f03e8583532941b07761c5429de7d50766fa3110 Mon Sep 17 00:00:00 2001 From: Jiexun Wang Date: Sun, 3 May 2026 12:28:58 +0800 Subject: [PATCH 563/972] batman-adv: stop caching unowned originator pointers in BAT IV BAT IV keeps the last-hop neighbor address in each neigh_node, but some paths also cache an originator pointer derived from a temporary lookup. That pointer is not owned by the neigh_node and may no longer refer to a live originator entry after purge handling runs. Stop storing the auxiliary originator pointer in the BAT IV neighbor state. When BAT IV needs the neighbor originator data, resolve it from the stored neighbor address and drop the reference again after use. Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Signed-off-by: Jiexun Wang Signed-off-by: Ren Wei [sven: avoid bonding logic for outgoing OGM] Signed-off-by: Sven Eckelmann --- net/batman-adv/bat_iv_ogm.c | 83 ++++++++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 24 deletions(-) diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 618d1889c04e75..74ef7dc2b2f981 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -173,19 +173,12 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) static struct batadv_neigh_node * batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, const u8 *neigh_addr, - struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh) + struct batadv_orig_node *orig_node) { struct batadv_neigh_node *neigh_node; neigh_node = batadv_neigh_node_get_or_create(orig_node, hard_iface, neigh_addr); - if (!neigh_node) - goto out; - - neigh_node->orig_node = orig_neigh; - -out: return neigh_node; } @@ -906,6 +899,31 @@ static u8 batadv_iv_orig_ifinfo_sum(struct batadv_orig_node *orig_node, return sum; } +/** + * batadv_iv_ogm_neigh_ifinfo_sum() - Get bcast_own sum for a last-hop neighbor + * @bat_priv: the bat priv with all the mesh interface information + * @neigh_node: last-hop neighbor of an originator + * + * Return: Number of replied (rebroadcasted) OGMs for the originator currently + * announced by the neighbor. Returns 0 if the neighbor's originator entry is + * not available anymore. + */ +static u8 batadv_iv_ogm_neigh_ifinfo_sum(struct batadv_priv *bat_priv, + const struct batadv_neigh_node *neigh_node) +{ + struct batadv_orig_node *orig_neigh; + u8 sum; + + orig_neigh = batadv_orig_hash_find(bat_priv, neigh_node->addr); + if (!orig_neigh) + return 0; + + sum = batadv_iv_orig_ifinfo_sum(orig_neigh, neigh_node->if_incoming); + batadv_orig_node_put(orig_neigh); + + return sum; +} + /** * batadv_iv_ogm_orig_update() - use OGM to update corresponding data in an * originator @@ -975,17 +993,9 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, } if (!neigh_node) { - struct batadv_orig_node *orig_tmp; - - orig_tmp = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); - if (!orig_tmp) - goto unlock; - neigh_node = batadv_iv_ogm_neigh_new(if_incoming, ethhdr->h_source, - orig_node, orig_tmp); - - batadv_orig_node_put(orig_tmp); + orig_node); if (!neigh_node) goto unlock; } else { @@ -1037,10 +1047,9 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, */ if (router_ifinfo && neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) { - sum_orig = batadv_iv_orig_ifinfo_sum(router->orig_node, - router->if_incoming); - sum_neigh = batadv_iv_orig_ifinfo_sum(neigh_node->orig_node, - neigh_node->if_incoming); + sum_orig = batadv_iv_ogm_neigh_ifinfo_sum(bat_priv, router); + sum_neigh = batadv_iv_ogm_neigh_ifinfo_sum(bat_priv, + neigh_node); if (sum_orig >= sum_neigh) goto out; } @@ -1106,7 +1115,6 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, if (!neigh_node) neigh_node = batadv_iv_ogm_neigh_new(if_incoming, orig_neigh_node->orig, - orig_neigh_node, orig_neigh_node); if (!neigh_node) @@ -1302,6 +1310,32 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, return ret; } +/** + * batadv_orig_to_direct_router() - get direct next hop neighbor to an orig address + * @bat_priv: the bat priv with all the mesh interface information + * @orig_addr: the originator MAC address to search the best next hop router for + * @if_outgoing: the interface where the OGM should be sent to + * + * Return: A neighbor node which is the best router towards the given originator + * address. Bonding candidates are ignored. + */ +static struct batadv_neigh_node * +batadv_orig_to_direct_router(struct batadv_priv *bat_priv, u8 *orig_addr, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_neigh_node *neigh_node; + struct batadv_orig_node *orig_node; + + orig_node = batadv_orig_hash_find(bat_priv, orig_addr); + if (!orig_node) + return NULL; + + neigh_node = batadv_orig_router_get(orig_node, if_outgoing); + batadv_orig_node_put(orig_node); + + return neigh_node; +} + /** * batadv_iv_ogm_process_per_outif() - process a batman iv OGM for an outgoing * interface @@ -1372,8 +1406,9 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, router = batadv_orig_router_get(orig_node, if_outgoing); if (router) { - router_router = batadv_orig_router_get(router->orig_node, - if_outgoing); + router_router = batadv_orig_to_direct_router(bat_priv, + router->addr, + if_outgoing); router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); } From ce425dd05d0fe7594930a0fb103634f35ac47bb6 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 6 May 2026 22:20:49 +0200 Subject: [PATCH 564/972] batman-adv: tp_meter: fix tp_num leak on kmalloc failure When batadv_tp_start() or batadv_tp_init_recv() fail to allocate a new tp_vars object, the previously incremented bat_priv->tp_num counter is never decremented. This causes tp_num to drift upward on each allocation failure. Since only BATADV_TP_MAX_NUM sessions can be started and the count is never reduced for these failed allocations, it causes to an exhaustion of throughput meter sessions. In worst case, no new throughput meter session can be started until the mesh interface is removed. The error handling must decrement tp_num releasing the lock and aborting the creation of an throughput meter session Cc: stable@kernel.org Fixes: 33a3bb4a3345 ("batman-adv: throughput meter implementation") Signed-off-by: Sven Eckelmann --- net/batman-adv/tp_meter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 58ca59a2799ed1..066c76113fc433 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -994,6 +994,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, tp_vars = kmalloc_obj(*tp_vars, GFP_ATOMIC); if (!tp_vars) { + atomic_dec(&bat_priv->tp_num); spin_unlock_bh(&bat_priv->tp_list_lock); batadv_dbg(BATADV_DBG_TP_METER, bat_priv, "Meter: %s cannot allocate list elements\n", @@ -1366,8 +1367,10 @@ batadv_tp_init_recv(struct batadv_priv *bat_priv, } tp_vars = kmalloc_obj(*tp_vars, GFP_ATOMIC); - if (!tp_vars) + if (!tp_vars) { + atomic_dec(&bat_priv->tp_num); goto out_unlock; + } ether_addr_copy(tp_vars->other_end, icmp->orig); tp_vars->role = BATADV_TP_RECEIVER; From 4ae1709a314060a196981b344610d023ea841e57 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 6 May 2026 22:20:50 +0200 Subject: [PATCH 565/972] batman-adv: bla: prevent use-after-free when deleting claims When batadv_bla_del_backbone_claims() removes all claims for a backbone, it does this by dropping the link entry in the hash list. This list entry itself was one of the references which need to be dropped at the same time via batadv_claim_put(). But the batadv_claim_put() must not be done before the last access to the claim object in this function. Otherwise the claim might be freed already by the batadv_claim_release() function before the list entry was dropped. Cc: stable@kernel.org Fixes: 23721387c409 ("batman-adv: add basic bridge loop avoidance code") Signed-off-by: Sven Eckelmann --- net/batman-adv/bridge_loop_avoidance.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 51fe028b90881e..8b77dd2ecfa419 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -318,8 +318,8 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) if (claim->backbone_gw != backbone_gw) continue; - batadv_claim_put(claim); hlist_del_rcu(&claim->hash_entry); + batadv_claim_put(claim); } spin_unlock_bh(list_lock); } From cf6b604011591865ae39ac82de8978c1120d17af Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 6 May 2026 22:20:51 +0200 Subject: [PATCH 566/972] batman-adv: bla: only purge non-released claims When batadv_bla_purge_claims() goes through the list of claims, it is only traversing the hash list with an rcu_read_lock(). Due to a potential parallel batadv_claim_put(), it can happen that it encounters a claim which was actually in the process of being released+freed by batadv_claim_release(). In this case, backbone_gw is set to NULL before the delayed RCU kfree is started. Calling batadv_bla_claim_get_backbone_gw() is then no longer allowed because it would cause a NULL-ptr derefence. To avoid this, only claims with a valid reference counter must be purged. All others are already taken care of. Cc: stable@kernel.org Fixes: 23721387c409 ("batman-adv: add basic bridge loop avoidance code") Signed-off-by: Sven Eckelmann --- net/batman-adv/bridge_loop_avoidance.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 8b77dd2ecfa419..879ab043d57a9f 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1288,6 +1288,13 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { + /* only purge claims not currently in the process of being released. + * Such claims could otherwise have a NULL-ptr backbone_gw set because + * they already went through batadv_claim_release() + */ + if (!kref_get_unless_zero(&claim->refcount)) + continue; + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); if (now) goto purge_now; @@ -1313,6 +1320,7 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, claim->addr, claim->vid); skip: batadv_backbone_gw_put(backbone_gw); + batadv_claim_put(claim); } rcu_read_unlock(); } From ba9d20ee9076dac32c371116bacbe72480eb356c Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Wed, 6 May 2026 22:20:52 +0200 Subject: [PATCH 567/972] batman-adv: bla: put backbone reference on failed claim hash insert When batadv_bla_add_claim() fails to insert a new claim into the hash, it leaked a reference to the backbone_gw for which the claim was intended. Call batadv_backbone_gw_put() on the error path to release the reference and avoid leaking the backbone_gw object. Cc: stable@kernel.org Fixes: 3db0decf1185 ("batman-adv: Fix non-atomic bla_claim::backbone_gw access") Signed-off-by: Sven Eckelmann --- net/batman-adv/bridge_loop_avoidance.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 879ab043d57a9f..cec11f1251d66a 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -723,6 +723,7 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, if (unlikely(hash_added != 0)) { /* only local changes happened. */ + batadv_backbone_gw_put(backbone_gw); kfree(claim); return; } From f7700a4415afb3ac1767a556094e4ef8bd440e41 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 8 May 2026 20:37:46 +0800 Subject: [PATCH 568/972] ublk: fix use-after-free in ublk_cancel_cmd() When ublk_reset_ch_dev() clears io->cmd via ublk_queue_reinit() concurrently with ublk_cancel_cmd(), ublk_cancel_cmd() can read a stale pointer and pass it to io_uring_cmd_done(), causing a use-after-free. Fix by synchronizing the two paths with ubq->cancel_lock: - ublk_cancel_cmd(): read and clear io->cmd under cancel_lock, then call io_uring_cmd_done() on the saved local copy outside the lock. - ublk_reset_ch_dev(): hold cancel_lock across ublk_queue_reinit() so that io->cmd and io->flags are cleared atomically with respect to ublk_cancel_cmd(). Fixes: 216c8f5ef0f2 ("ublk: replace monitor with cancelable uring_cmd") Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260508123746.242018-1-tom.leiming@gmail.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 57ec900f0ce0f6..6d13f1481de053 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2411,8 +2411,14 @@ static void ublk_reset_ch_dev(struct ublk_device *ub) { int i; - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) - ublk_queue_reinit(ub, ublk_get_queue(ub, i)); + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { + struct ublk_queue *ubq = ublk_get_queue(ub, i); + + /* Sync with ublk_cancel_cmd() */ + spin_lock(&ubq->cancel_lock); + ublk_queue_reinit(ub, ubq); + spin_unlock(&ubq->cancel_lock); + } /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */ ub->mm = NULL; @@ -2753,6 +2759,7 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag, { struct ublk_io *io = &ubq->ios[tag]; struct ublk_device *ub = ubq->dev; + struct io_uring_cmd *cmd = NULL; struct request *req; bool done; @@ -2775,12 +2782,15 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag, spin_lock(&ubq->cancel_lock); done = !!(io->flags & UBLK_IO_FLAG_CANCELED); - if (!done) + if (!done) { io->flags |= UBLK_IO_FLAG_CANCELED; + cmd = io->cmd; + io->cmd = NULL; + } spin_unlock(&ubq->cancel_lock); - if (!done) - io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags); + if (!done && cmd) + io_uring_cmd_done(cmd, UBLK_IO_RES_ABORT, issue_flags); } /* From 786a45757dcdf8f2beb9d4a6db605db16c18b2b4 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 28 Apr 2026 21:59:52 +0100 Subject: [PATCH 569/972] x86/kexec: Push kjump return address even for non-kjump kexec The version of purgatory code shipped by kexec-tools attempts to look above the top of its stack to find a return address for a kjump, even in a non-kjump kexec. After the commit in Fixes: the word above the stack might not be there, leading to a fault (which is at least now caught by my exception-handling code in kexec). That commit fixed things for the actual kjump path, but no longer "gratuitously" pushes the unused return address to the stack in the non-kjump path. Put that *back* in the non-kjump path, to prevent purgatory from crashing when trying to access it. Fixes: 2cacf7f23a02 ("x86/kexec: Fix stack and handling of re-entry point for ::preserve_context") Reported-by: Rohan Kakulawaram Signed-off-by: David Woodhouse Signed-off-by: Borislav Petkov (AMD) Acked-by: Dave Hansen Tested-by: Rohan Kakulawaram Cc: Link: https://patch.msgid.link/32d627134143ffd957891cb697138e839c623211.camel@infradead.org --- arch/x86/kernel/relocate_kernel_64.S | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 4ffba68dc57b29..eaeb77464c066e 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -136,6 +136,14 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) * %r13 original CR4 when relocate_kernel() was invoked */ + /* + * Set return address to 0 if not preserving context. The purgatory + * shipped in kexec-tools will unconditionally look for the return + * address on the stack and set a kexec_jump_back_entry= command + * line option if it's non-zero. There's no other way that it can + * tell a preserve-context (kjump) kexec from a normal one. + */ + pushq $0 /* store the start address on the stack */ pushq %rdx From 411c1cf430392c905e39f12bc305dd994da0b426 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 8 May 2026 15:20:23 +0100 Subject: [PATCH 570/972] arm64/entry: Fix arm64-specific rseq brokenness Mathias Stearn reports that since v6.19, there are two big issues affecting rseq: (1) On arm64 specifically, rseq critical sections aren't aborted when they should be. (2) The 'cpu_id_start' field is no longer written by the kernel in all cases it used to be, including some cases where TCMalloc depends on the kernel clobbering the field. This patch fixes issue #1. This patch DOES NOT fix issue #2, which will need to be addressed by other patches. The arm64-specific brokenness is a result of commits: 2fc0e4b4126c ("rseq: Record interrupt from user space") 39a167560a61 ("rseq: Optimize event setting") The first commit failed to add a call to rseq_note_user_irq_entry() on arm64. Thus arm64 never sets rseq_event::user_irq to record that it may be necessary to abort an active rseq critical section upon return to userspace. On its own, this commit had no functional impact as the value of rseq_event::user_irq was not consumed. The second commit relied upon rseq_event::user_irq to determine whether or not to bother to perform rseq work when returning to userspace. As rseq_event::user_irq wasn't set on arm64, this work would be skipped, and consequently an active rseq critical section would not be aborted. Fix this by giving arm64 syscall-specific entry/exit paths, and performing the relevant logic in syscall and non-syscall paths, including calling rseq_note_user_irq_entry() for non-syscall entry. Currently arm64 cannot use syscall_enter_from_user_mode(), syscall_exit_to_user_mode(), and irqentry_exit_to_user_mode(), due to ordering constraints with exception masking, and risk of ABI breakage for syscall tracing/audit/etc. For the moment the entry/exit logic is left as arm64-specific, directly using enter_from_user_mode() and exit_to_user_mode(), but mirroring the generic code. I intend to follow up with refactoring/cleanup, as we did for kernel mode entry paths in commit: 041aa7a85390 ("entry: Split preemption from irqentry_exit_to_kernel_mode()") ... which will allow arm64 to use the GENERIC_IRQ_ENTRY functions directly. Fixes: 39a167560a61 ("rseq: Optimize event setting") Reported-by: Mathias Stearn Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Acked-by: Catalin Marinas Link: https://lore.kernel.org/regressions/CAHnCjA25b+nO2n5CeifknSKHssJpPrjnf+dtr7UgzRw4Zgu=oA@mail.gmail.com/ Link: https://patch.msgid.link/20260508142023.3268622-1-mark.rutland@arm.com --- arch/arm64/kernel/entry-common.c | 31 ++++++++++++++++++++++++------- include/linux/irq-entry-common.h | 8 -------- include/linux/rseq_entry.h | 19 ------------------- 3 files changed, 24 insertions(+), 34 deletions(-) diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index cb54335465f667..c7a23f7c221220 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -62,6 +62,13 @@ static void noinstr arm64_exit_to_kernel_mode(struct pt_regs *regs, irqentry_exit_to_kernel_mode_after_preempt(regs, state); } +static __always_inline void arm64_syscall_enter_from_user_mode(struct pt_regs *regs) +{ + enter_from_user_mode(regs); + mte_disable_tco_entry(current); + sme_enter_from_user_mode(); +} + /* * Handle IRQ/context state management when entering from user mode. * Before this function is called it is not safe to call regular kernel code, @@ -70,20 +77,30 @@ static void noinstr arm64_exit_to_kernel_mode(struct pt_regs *regs, static __always_inline void arm64_enter_from_user_mode(struct pt_regs *regs) { enter_from_user_mode(regs); + rseq_note_user_irq_entry(); mte_disable_tco_entry(current); sme_enter_from_user_mode(); } +static __always_inline void arm64_syscall_exit_to_user_mode(struct pt_regs *regs) +{ + local_irq_disable(); + syscall_exit_to_user_mode_prepare(regs); + local_daif_mask(); + sme_exit_to_user_mode(); + mte_check_tfsr_exit(); + exit_to_user_mode(); +} + /* * Handle IRQ/context state management when exiting to user mode. * After this function returns it is not safe to call regular kernel code, * instrumentable code, or any code which may trigger an exception. */ - static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) { local_irq_disable(); - exit_to_user_mode_prepare_legacy(regs); + irqentry_exit_to_user_mode_prepare(regs); local_daif_mask(); sme_exit_to_user_mode(); mte_check_tfsr_exit(); @@ -92,7 +109,7 @@ static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) asmlinkage void noinstr asm_exit_to_user_mode(struct pt_regs *regs) { - arm64_exit_to_user_mode(regs); + arm64_syscall_exit_to_user_mode(regs); } /* @@ -716,12 +733,12 @@ static void noinstr el0_brk64(struct pt_regs *regs, unsigned long esr) static void noinstr el0_svc(struct pt_regs *regs) { - arm64_enter_from_user_mode(regs); + arm64_syscall_enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); fpsimd_syscall_enter(); local_daif_restore(DAIF_PROCCTX); do_el0_svc(regs); - arm64_exit_to_user_mode(regs); + arm64_syscall_exit_to_user_mode(regs); fpsimd_syscall_exit(); } @@ -868,11 +885,11 @@ static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr) static void noinstr el0_svc_compat(struct pt_regs *regs) { - arm64_enter_from_user_mode(regs); + arm64_syscall_enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); local_daif_restore(DAIF_PROCCTX); do_el0_svc_compat(regs); - arm64_exit_to_user_mode(regs); + arm64_syscall_exit_to_user_mode(regs); } static void noinstr el0_bkpt32(struct pt_regs *regs, unsigned long esr) diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 167fba7dbf043f..1fabf0f5ea8e76 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -218,14 +218,6 @@ static __always_inline void __exit_to_user_mode_validate(void) lockdep_sys_exit(); } -/* Temporary workaround to keep ARM64 alive */ -static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) -{ - __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK); - rseq_exit_to_user_mode_legacy(); - __exit_to_user_mode_validate(); -} - /** * syscall_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required * @regs: Pointer to pt_regs on entry stack diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 2d0295df5107c7..63bc72086e75bb 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -749,24 +749,6 @@ static __always_inline void rseq_irqentry_exit_to_user_mode(void) ev->events = 0; } -/* Required to keep ARM64 working */ -static __always_inline void rseq_exit_to_user_mode_legacy(void) -{ - struct rseq_event *ev = ¤t->rseq.event; - - rseq_stat_inc(rseq_stats.exit); - - if (static_branch_unlikely(&rseq_debug_enabled)) - WARN_ON_ONCE(ev->sched_switch); - - /* - * Ensure that event (especially user_irq) is cleared when the - * interrupt did not result in a schedule and therefore the - * rseq processing did not clear it. - */ - ev->events = 0; -} - void __rseq_debug_syscall_return(struct pt_regs *regs); static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs) @@ -782,7 +764,6 @@ static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned } static inline void rseq_syscall_exit_to_user_mode(void) { } static inline void rseq_irqentry_exit_to_user_mode(void) { } -static inline void rseq_exit_to_user_mode_legacy(void) { } static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } #endif /* !CONFIG_RSEQ */ From ab28a0673daabe7f0fcbd7a5e36334f2f003f02f Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 8 May 2026 19:50:45 +0800 Subject: [PATCH 571/972] sched_ext: Use IRQ_WORK_INIT_HARD() to initialize sch->disable_irq_work For built with PREEMPT_RT kernels, the scx_disable_irq_workfn() is called from per-cpu irq_work kthreads context, this means that when call the scx_dump_state() in the scx_disable_irq_workfn() to output current->comm/pid, it always output current irq_work kthread's comm/pid. this commit therefore use the IRQ_WORK_INIT_HARD() to initialize sch->disable_irq_work to make scx_disable_irq_workfn() is called from hardirq context. Fixes: f4a6c506d118 ("sched_ext: Always bounce scx_disable() through irq_work") Signed-off-by: Zqiang Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f4e2db8e56be62..df305712a2d424 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6589,7 +6589,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, sch->slice_dfl = SCX_SLICE_DFL; atomic_set(&sch->exit_kind, SCX_EXIT_NONE); - init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn); + sch->disable_irq_work = IRQ_WORK_INIT_HARD(scx_disable_irq_workfn); kthread_init_work(&sch->disable_work, scx_disable_workfn); timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0); From b2ed01e7ad3de80333e9b962a44024b094bc0b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Tue, 28 Apr 2026 11:44:42 +0200 Subject: [PATCH 572/972] drm/ttm: Fix ttm_bo_swapout() infinite LRU walk on swapout failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When ttm_tt_swapout() fails, the current code calls ttm_resource_add_bulk_move() followed by ttm_resource_move_to_lru_tail() to restore the resource's bulk_move membership. However, ttm_resource_move_to_lru_tail() places the resource at the tail of the LRU list which, relative to the walk cursor's hitch node (placed immediately after the resource when it was yielded), puts the resource *in front of the* the hitch. The next list_for_each_entry_continue() from the hitch finds the same resource again, causing an infinite loop. Fix by deferring del_bulk_move to the success path only. On the success path, TTM_TT_FLAG_SWAPPED has just been set by ttm_tt_swapout() but the resource is still tracked in the bulk_move range, so ttm_resource_del_bulk_move()'s !ttm_resource_unevictable() guard would incorrectly skip the removal. Introduce ttm_resource_del_bulk_move_unevictable() which bypasses that guard. Reported-by: Jatin Kataria Fixes: fc5d96670eb2 ("drm/ttm: Move swapped objects off the manager's LRU list") Cc: Christian König Cc: Matthew Brost Cc: Cc: # v6.13+ Assisted-by: GitHub_Copilot:claude-sonnet-4.6 Signed-off-by: Thomas Hellström Reviewed-by: Christian König Tested-by: Boqun Feng Link: https://patch.msgid.link/20260428094442.16985-1-thomas.hellstrom@linux.intel.com --- drivers/gpu/drm/ttm/ttm_bo.c | 16 ++++++---------- drivers/gpu/drm/ttm/ttm_resource.c | 13 +++++++++++++ include/drm/ttm/ttm_resource.h | 2 ++ 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index d85f0a37ac35f3..2934017055429c 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -1177,17 +1177,13 @@ ttm_bo_swapout_cb(struct ttm_lru_walk *walk, struct ttm_buffer_object *bo) bdev->funcs->swap_notify(bo); if (ttm_tt_is_populated(tt)) { - spin_lock(&bdev->lru_lock); - ttm_resource_del_bulk_move(bo->resource, bo); - spin_unlock(&bdev->lru_lock); - ret = ttm_tt_swapout(bdev, tt, swapout_walk->gfp_flags); - - spin_lock(&bdev->lru_lock); - if (ret) - ttm_resource_add_bulk_move(bo->resource, bo); - ttm_resource_move_to_lru_tail(bo->resource); - spin_unlock(&bdev->lru_lock); + if (!ret) { + spin_lock(&bdev->lru_lock); + ttm_resource_del_bulk_move_unevictable(bo->resource, bo); + ttm_resource_move_to_lru_tail(bo->resource); + spin_unlock(&bdev->lru_lock); + } } out: diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c index 9f36631d48b617..0e5f1582f13d84 100644 --- a/drivers/gpu/drm/ttm/ttm_resource.c +++ b/drivers/gpu/drm/ttm/ttm_resource.c @@ -292,6 +292,19 @@ void ttm_resource_del_bulk_move(struct ttm_resource *res, ttm_lru_bulk_move_del(bo->bulk_move, res); } +/* + * Remove a resource from its bulk_move, bypassing the unevictable check. + * Use only when the resource is known to still be tracked in the range despite + * the BO having just become unevictable; asserts that this is the case. + */ +void ttm_resource_del_bulk_move_unevictable(struct ttm_resource *res, + struct ttm_buffer_object *bo) +{ + WARN_ON_ONCE(!ttm_resource_unevictable(res, bo)); + if (bo->bulk_move) + ttm_lru_bulk_move_del(bo->bulk_move, res); +} + /* Move a resource to the LRU or bulk tail */ void ttm_resource_move_to_lru_tail(struct ttm_resource *res) { diff --git a/include/drm/ttm/ttm_resource.h b/include/drm/ttm/ttm_resource.h index 33e80f30b8b822..a5d386583fb6e0 100644 --- a/include/drm/ttm/ttm_resource.h +++ b/include/drm/ttm/ttm_resource.h @@ -448,6 +448,8 @@ void ttm_resource_add_bulk_move(struct ttm_resource *res, struct ttm_buffer_object *bo); void ttm_resource_del_bulk_move(struct ttm_resource *res, struct ttm_buffer_object *bo); +void ttm_resource_del_bulk_move_unevictable(struct ttm_resource *res, + struct ttm_buffer_object *bo); void ttm_resource_move_to_lru_tail(struct ttm_resource *res); void ttm_resource_init(struct ttm_buffer_object *bo, From 481c2265286ef302327c93403a8cf7b3fe4506d0 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 4 May 2026 21:04:48 +0000 Subject: [PATCH 573/972] bpf: tcp: Fix type confusion in bpf_tcp_sock(). bpf_tcp_sock() only checks if sk->sk_protocol is IPPROTO_TCP, but RAW socket can bypass it: socket(AF_INET, SOCK_RAW, IPPROTO_TCP) Calling bpf_setsockopt() in SOCKOPT prog triggers out-of-bounds access to another slab object. [0] Let's use sk_is_tcp(). [0]: BUG: KASAN: slab-out-of-bounds in sol_tcp_sockopt (net/core/filter.c:5519) Read of size 8 at addr ffff88801083d760 by task test_progs/1259 CPU: 1 UID: 0 PID: 1259 Comm: test_progs Tainted: G OE 7.0.0-11175-gb5c111f4967b #1 PREEMPT(full) Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:94 lib/dump_stack.c:120) print_report (mm/kasan/report.c:378 mm/kasan/report.c:482) kasan_report (mm/kasan/report.c:595) sol_tcp_sockopt (net/core/filter.c:5519) __bpf_getsockopt (net/core/filter.c:5633) bpf_sk_getsockopt (net/core/filter.c:5654) bpf_prog_629ba00a1601e9f2__setsockopt+0x86/0x22c __cgroup_bpf_run_filter_setsockopt (./include/linux/bpf.h:1402 ./include/linux/filter.h:722 ./include/linux/filter.h:729 kernel/bpf/cgroup.c:81 kernel/bpf/cgroup.c:2026) do_sock_setsockopt (net/socket.c:2363) __x64_sys_setsockopt (net/socket.c:2406) do_syscall_64 (arch/x86/entry/syscall_64.c:63) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:121) RIP: 0033:0x7f85f82fe7de Code: 55 48 63 c9 48 63 ff 45 89 c9 48 89 e5 48 83 ec 08 6a 2c e8 34 69 f7 ff c9 c3 66 90 f3 0f 1e fa 49 89 ca b8 36 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 0a c3 66 0f 1f 84 00 00 00 00 00 48 8b 15 e1 RSP: 002b:00007ffe59dcecd8 EFLAGS: 00000202 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f85f82fe7de RDX: 000000000000001c RSI: 0000000000000006 RDI: 000000000000000d RBP: 00007ffe59dcef20 R08: 000000000000003c R09: 0000000000000000 R10: 00007ffe59dcef00 R11: 0000000000000202 R12: 00007ffe59dcf268 R13: 0000000000000003 R14: 00007f85f9da5000 R15: 000055b2f3201400 The buggy address belongs to the object at ffff88801083d280 which belongs to the cache RAW of size 1792 The buggy address is located 1248 bytes inside of allocated 1792-byte region [ffff88801083d280, ffff88801083d980) Fixes: 655a51e536c0 ("bpf: Add struct bpf_tcp_sock and BPF_FUNC_tcp_sock") Reported-by: Damiano Melotti Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Link: https://patch.msgid.link/20260504210610.180150-2-kuniyu@google.com --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index bc96c18df4e03b..cd88633f8dc148 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7475,7 +7475,7 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) { - if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + if (sk_fullsock(sk) && sk_is_tcp(sk)) return (unsigned long)sk; return (unsigned long)NULL; From a7488f089bdfa87c4fef1744d4dca9f4f8b46f8b Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 7 May 2026 04:04:46 -0700 Subject: [PATCH 574/972] workqueue: Release PENDING in __queue_work() drain/destroy reject path The caller of __queue_work() owns WORK_STRUCT_PENDING, won via test_and_set_bit() in queue_work_on()/__queue_delayed_work(). The state machine documented above __queue_work() requires that owner to either hand the token to a pwq (insert_work() -> set_work_pwq()), hand it to a timer, or release it via set_work_pool_and_clear_pending(). try_to_grab_pending() relies on this: when it observes "PENDING && off-queue" it busy-loops, trusting the current owner to make progress. The (__WQ_DESTROYING | __WQ_DRAINING) early-return path violates that contract. It WARN_ONCE()s and bare-returns, leaving work->data with PENDING set, WORK_STRUCT_PWQ clear, and work->entry empty. The path is reachable without explicit API abuse: queue_delayed_work() arms a timer with PENDING set; if drain_workqueue() runs while the timer is still pending, delayed_work_timer_fn() -> __queue_work() in softirq context hits the WARN, current is not a wq worker so is_chained_work() is false, and the work is silently dropped with PENDING leaked. Mirror what clear_pending_if_disabled() already does on its analogous reject path: unpack the off-queue data and call set_work_pool_and_clear_pending() to release the token before returning. I was able to reproduce this by queueing several slow works on a max_active=1 wq, arm a delayed_work whose timer fires while drain_workqueue() is blocked, then call cancel_delayed_work_sync(). Without this patch the cancel livelocks at 100% CPU; with it the cancel returns immediately. Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3d2e3b2ec5283f..c0e58f877e0892 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2296,6 +2296,18 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n", work->func, wq->name))) { + struct work_offq_data offqd; + + /* + * State on entry: PENDING is set, work is off-queue (no + * insert_work() has run). + * + * Returning without clearing PENDING would leave the work + * in a weird state (PENDING=1, PWQ=0, entry empty) + */ + work_offqd_unpack(&offqd, *work_data_bits(work)); + set_work_pool_and_clear_pending(work, offqd.pool_id, + work_offqd_pack_flags(&offqd)); return; } rcu_read_lock(); From 0143033dc22cdff912cfc13419f5db92fea3b4cb Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 8 May 2026 09:22:03 -0700 Subject: [PATCH 575/972] workqueue: Fix wq->cpu_pwq leak in alloc_and_link_pwqs() WQ_UNBOUND path For WQ_UNBOUND workqueues, alloc_and_link_pwqs() allocates wq->cpu_pwq via alloc_percpu() and then calls apply_workqueue_attrs_locked(). On failure it returns the error directly, bypassing the enomem: label which holds the only free_percpu(wq->cpu_pwq) in this function. The caller's error path kfree()s wq without touching wq->cpu_pwq, leaking one percpu pointer table (nr_cpu_ids * sizeof(void *) bytes) per failed call. If kmemleak is enabled, we can see: unreferenced object (percpu) 0xc0fffa5b121048 (size 8): comm "insmod", pid 776, jiffies 4294682844 backtrace (crc 0): pcpu_alloc_noprof+0x665/0xac0 __alloc_workqueue+0x33f/0xa20 alloc_workqueue_noprof+0x60/0x100 Route the error through the existing enomem: cleanup and any error before this one. Cc: stable@kernel.org Fixes: 636b927eba5b ("workqueue: Make unbound workqueues to use per-cpu pool_workqueues") Signed-off-by: Breno Leitao Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c0e58f877e0892..33b721a9af0223 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5654,7 +5654,9 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]); } - return ret; + if (ret) + goto enomem; + return 0; enomem: if (wq->cpu_pwq) { From d73549b8bb7fa6147666c579d66f72bf076f719f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 4 May 2026 21:04:49 +0000 Subject: [PATCH 576/972] selftest: bpf: Add test for bpf_tcp_sock() and RAW socket. Let's extend sockopt_sk.c to cover bpf_tcp_sock() for the wrong socket type. Before: # ./test_progs -t sockopt_sk [ 151.948613] ================================================================== [ 151.951376] BUG: KASAN: slab-out-of-bounds in sol_tcp_sockopt+0xc7/0x8e0 [ 151.954159] Read of size 8 at addr ffff88801083d760 by task test_progs/1259 ... run_test:FAIL:getsetsockopt unexpected error: -1 (errno 0) #427 sockopt_sk:FAIL After: #427 sockopt_sk:OK While at it, missing free() is fixed up. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260504210610.180150-3-kuniyu@google.com --- .../selftests/bpf/prog_tests/sockopt_sk.c | 17 ++++++++++++++++- tools/testing/selftests/bpf/progs/sockopt_sk.c | 16 ++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index 53637431ec5de3..3a41c517b9182b 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -190,7 +190,7 @@ static int getsetsockopt(void) fd = socket(AF_NETLINK, SOCK_RAW, 0); if (fd < 0) { log_err("Failed to create AF_NETLINK socket"); - return -1; + goto err; } buf.u32 = 1; @@ -211,6 +211,21 @@ static int getsetsockopt(void) } ASSERT_EQ(optlen, 8, "Unexpected NETLINK_LIST_MEMBERSHIPS value"); + /* Trick bpf_tcp_sock() with IPPROTO_TCP */ + close(fd); + fd = socket(AF_INET, SOCK_RAW, IPPROTO_TCP); + if (!ASSERT_OK_FD(fd, "socket")) + goto err; + + /* The BPF prog intercepts this before the kernel sees it, any + * optlen works. Go with 4 bytes for simplicity. + */ + buf.u32 = 1; + optlen = sizeof(buf.u32); + err = setsockopt(fd, SOL_TCP, TCP_SAVED_SYN, &buf, optlen); + if (!ASSERT_ERR(err, "setsockopt(TCP_SAVED_SYN)")) + goto err; + free(big_buf); close(fd); return 0; diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c index cb990a7d3d4586..5e0b27e7855cb8 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c @@ -149,6 +149,20 @@ int _setsockopt(struct bpf_sockopt *ctx) if (sk && sk->family == AF_NETLINK) goto out; + if (sk && sk->family == AF_INET && sk->type == SOCK_RAW) { + struct bpf_tcp_sock *tp = bpf_tcp_sock(sk); + + if (tp) { + char saved_syn[60]; + + bpf_getsockopt(sk, SOL_TCP, TCP_SAVED_SYN, + &saved_syn, sizeof(saved_syn)); + goto consumed; + } + + goto out; + } + /* Make sure bpf_get_netns_cookie is callable. */ if (bpf_get_netns_cookie(NULL) == 0) @@ -224,6 +238,8 @@ int _setsockopt(struct bpf_sockopt *ctx) return 0; /* couldn't get sk storage */ storage->val = optval[0]; + +consumed: ctx->optlen = -1; /* BPF has consumed this option, don't call kernel * setsockopt handler. */ From 7995b216a731db657f356f6ae37a42f445b9a0ec Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 4 May 2026 21:04:50 +0000 Subject: [PATCH 577/972] mptcp: bpf: Fix type confusion in bpf_mptcp_sock_from_subflow() bpf_mptcp_sock_from_subflow() only checks if sk->sk_protocol is IPPROTO_TCP, but RAW socket can bypass it: socket(AF_INET, SOCK_RAW, IPPROTO_TCP) In this case, it would NOT be valid to call sk_is_mptcp() which will assume sk is a pointer to a struct tcp_sock, and wrongly checks for: tcp_sk(sk)->is_mptcp. Fixes: 3bc253c2e652 ("bpf: Add bpf_skc_to_mptcp_sock_proto") Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260504210610.180150-4-kuniyu@google.com --- net/mptcp/bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c index 8a16672b94e238..4cc16cbeb32817 100644 --- a/net/mptcp/bpf.c +++ b/net/mptcp/bpf.c @@ -14,7 +14,7 @@ struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) { - if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && sk_is_mptcp(sk)) + if (sk && sk_fullsock(sk) && sk_is_tcp(sk) && sk_is_mptcp(sk)) return mptcp_sk(mptcp_subflow_ctx(sk)->conn); return NULL; From decb84b8383ab7acff94db208ef7ed19f9c55e1f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 4 May 2026 21:04:51 +0000 Subject: [PATCH 578/972] bpf: tcp: Fix type confusion in bpf_skc_to_tcp_sock(). bpf_skc_to_tcp_sock() only checks if sk->sk_protocol is IPPROTO_TCP, but RAW socket can bypass it: socket(AF_INET, SOCK_RAW, IPPROTO_TCP) Let's use sk_is_tcp(). Fixes: 478cfbdf5f13 ("bpf: Add bpf_skc_to_{tcp, tcp_timewait, tcp_request}_sock() helpers") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260504210610.180150-5-kuniyu@google.com --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index cd88633f8dc148..7d945dc2cb92d1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -11963,7 +11963,7 @@ const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) { - if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + if (sk && sk_fullsock(sk) && sk_is_tcp(sk)) return (unsigned long)sk; return (unsigned long)NULL; From 843064b0a77eed3d6d63ffc53aeaa359672b4e12 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 4 May 2026 21:04:52 +0000 Subject: [PATCH 579/972] bpf: tcp: Fix type confusion in bpf_skc_to_tcp6_sock(). bpf_skc_to_tcp6_sock() only checks if sk->sk_protocol is IPPROTO_TCP and sk->sk_family is AF_INET6, but RAW socket can bypass it: socket(AF_INET6, SOCK_RAW, IPPROTO_TCP) Let's check sk->sk_type too. Fixes: af7ec1383361 ("bpf: Add bpf_skc_to_tcp6_sock() helper") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260504210610.180150-6-kuniyu@google.com --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 7d945dc2cb92d1..684922efd48106 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -11947,7 +11947,7 @@ BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) */ BTF_TYPE_EMIT(struct tcp6_sock); if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && - sk->sk_family == AF_INET6) + sk->sk_type == SOCK_STREAM && sk->sk_family == AF_INET6) return (unsigned long)sk; return (unsigned long)NULL; From 1c2958e4ab1ed4594db16425dbcab33c56ea8330 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 4 May 2026 21:04:53 +0000 Subject: [PATCH 580/972] bpf: tcp: Fix type confusion in sol_tcp_sockopt(). sol_tcp_sockopt() only checks if sk->sk_protocol is IPPROTO_TCP, but RAW socket can bypass it: socket(AF_INET, SOCK_RAW, IPPROTO_TCP) Let's use sk_is_tcp(). Note that initially sol_tcp_sockopt() checked sk->sk_prot->setsockopt. Fixes: 2ab42c7b871f ("bpf: Check the protocol of a sock to agree the calls to bpf_setsockopt().") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20260504210610.180150-7-kuniyu@google.com --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 684922efd48106..ef0877eefaa7cf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5481,7 +5481,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { - if (sk->sk_protocol != IPPROTO_TCP) + if (!sk_is_tcp(sk)) return -EINVAL; switch (optname) { From db5dadb562cabb6da49959b473ed0d9645b6f2da Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 4 May 2026 18:01:37 -0500 Subject: [PATCH 581/972] Revert "ACPI: CPPC: Adjust debug messages in amd_set_max_freq_ratio() to warn" Some older systems don't support CPPC in the firmware and this just makes noise for them when booting. Drop back to debug. This reverts commit 21fb59ab4b9767085f4fe1edbdbe3177fbb9ec97. Fixes: 21fb59ab4b976 ("ACPI: CPPC: Adjust debug messages in amd_set_max_freq_ratio() to warn") Suggested-by: Kim Phillips Signed-off-by: Mario Limonciello Tested-by: Kim Phillips Cc: All applicable Link: https://patch.msgid.link/20260504230141.484743-2-mario.limonciello@amd.com Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/cppc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index d7c8ef1e354d30..be4c5e9e5ff6f9 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -88,19 +88,19 @@ static void amd_set_max_freq_ratio(void) rc = cppc_get_perf_caps(0, &perf_caps); if (rc) { - pr_warn("Could not retrieve perf counters (%d)\n", rc); + pr_debug("Could not retrieve perf counters (%d)\n", rc); return; } rc = amd_get_boost_ratio_numerator(0, &numerator); if (rc) { - pr_warn("Could not retrieve highest performance (%d)\n", rc); + pr_debug("Could not retrieve highest performance (%d)\n", rc); return; } nominal_perf = perf_caps.nominal_perf; if (!nominal_perf) { - pr_warn("Could not retrieve nominal performance\n"); + pr_debug("Could not retrieve nominal performance\n"); return; } From 18fc650ccd7fe3376eca89203668cfb8268f60df Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sun, 26 Apr 2026 01:26:43 +0000 Subject: [PATCH 582/972] bpf: Free reuseport cBPF prog after RCU grace period. Eulgyu Kim reported the splat below with a repro. [0] The repro sets up a UDP reuseport group with a cBPF prog and replaces it with a new one while another thread is sending a UDP packet to the group. The reuseport prog is freed by sk_reuseport_prog_free(). bpf_prog_put() is called for "e"BPF prog to destruct through multiple stages while cBPF prog is freed immediately by bpf_release_orig_filter() and bpf_prog_free(). If a reuseport prog is detached from the setsockopt() path (reuseport_attach_prog() or reuseport_detach_prog()), sk_reuseport_prog_free() is called without waiting for RCU readers to complete, resulting in various bugs. Let's defer freeing the reuseport cBPF prog after one RCU grace period. Note "e"BPF prog is safe as is unless the fast path starts to touch fields destroyed in bpf_prog_put_deferred() and __bpf_prog_put_noref(). [0]: BUG: KASAN: vmalloc-out-of-bounds in reuseport_select_sock+0xedc/0x1220 net/core/sock_reuseport.c:596 Read of size 4 at addr ffffc9000051e004 by task slowme/10208 CPU: 6 UID: 1000 PID: 10208 Comm: slowme Not tainted 7.0.0-geb7ac95ff75e #32 PREEMPT(full) Hardware name: QEMU Ubuntu 24.04 PC v2 (i440FX + PIIX, arch_caps fix, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 Call Trace: dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xca/0x240 mm/kasan/report.c:482 kasan_report+0x118/0x150 mm/kasan/report.c:595 reuseport_select_sock+0xedc/0x1220 net/core/sock_reuseport.c:596 udp4_lib_lookup2+0x3bc/0x950 net/ipv4/udp.c:495 __udp4_lib_lookup+0x768/0xe20 net/ipv4/udp.c:723 __udp4_lib_lookup_skb+0x297/0x390 net/ipv4/udp.c:752 __udp4_lib_rcv+0x1312/0x2620 net/ipv4/udp.c:2752 ip_protocol_deliver_rcu+0x282/0x440 net/ipv4/ip_input.c:207 ip_local_deliver_finish+0x3bb/0x6f0 net/ipv4/ip_input.c:241 NF_HOOK+0x30c/0x3a0 include/linux/netfilter.h:318 NF_HOOK+0x30c/0x3a0 include/linux/netfilter.h:318 __netif_receive_skb_one_core net/core/dev.c:6181 [inline] __netif_receive_skb net/core/dev.c:6294 [inline] process_backlog+0xaa4/0x1960 net/core/dev.c:6645 __napi_poll+0xae/0x340 net/core/dev.c:7709 napi_poll net/core/dev.c:7772 [inline] net_rx_action+0x5d7/0xf50 net/core/dev.c:7929 handle_softirqs+0x22b/0x870 kernel/softirq.c:622 do_softirq+0x76/0xd0 kernel/softirq.c:523 __local_bh_enable_ip+0xf8/0x130 kernel/softirq.c:450 local_bh_enable include/linux/bottom_half.h:33 [inline] rcu_read_unlock_bh include/linux/rcupdate.h:924 [inline] __dev_queue_xmit+0x1dd7/0x3710 net/core/dev.c:4890 neigh_output include/net/neighbour.h:556 [inline] ip_finish_output2+0xca9/0x1070 net/ipv4/ip_output.c:237 NF_HOOK_COND include/linux/netfilter.h:307 [inline] ip_output+0x29f/0x450 net/ipv4/ip_output.c:438 ip_send_skb+0x45/0xc0 net/ipv4/ip_output.c:1508 udp_send_skb+0xb04/0x1510 net/ipv4/udp.c:1195 udp_sendmsg+0x1a71/0x2350 net/ipv4/udp.c:1485 sock_sendmsg_nosec net/socket.c:727 [inline] __sock_sendmsg net/socket.c:742 [inline] __sys_sendto+0x554/0x680 net/socket.c:2206 __do_sys_sendto net/socket.c:2213 [inline] __se_sys_sendto net/socket.c:2209 [inline] __x64_sys_sendto+0xde/0x100 net/socket.c:2209 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x160/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x415a2d Code: b3 66 2e 0f 1f 84 00 00 00 00 00 66 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f6bc31e41e8 EFLAGS: 00000212 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00007f6bc31e4cdc RCX: 0000000000415a2d RDX: 0000000000000001 RSI: 00007f6bc31e421f RDI: 0000000000000003 RBP: 00007f6bc31e4240 R08: 00007f6bc31e4220 R09: 0000000000000010 R10: 0000000000000000 R11: 0000000000000212 R12: 00007f6bc31e46c0 R13: ffffffffffffffb8 R14: 0000000000000000 R15: 00007ffc9b0d70b0 Fixes: 538950a1b752 ("soreuseport: setsockopt SO_ATTACH_REUSEPORT_[CE]BPF") Reported-by: Eulgyu Kim Reported-by: Taeyang Lee <0wn@theori.io> Signed-off-by: Kuniyuki Iwashima Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20260426012647.3233119-1-kuniyu@google.com --- net/core/filter.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index ef0877eefaa7cf..70eef14edb990c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1654,15 +1654,24 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) return err; } +static void sk_reuseport_prog_free_rcu(struct rcu_head *rcu) +{ + struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); + struct bpf_prog *prog = aux->prog; + + bpf_release_orig_filter(prog); + bpf_prog_free(prog); +} + void sk_reuseport_prog_free(struct bpf_prog *prog) { if (!prog) return; - if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) - bpf_prog_put(prog); + if (bpf_prog_was_classic(prog)) + call_rcu(&prog->aux->rcu, sk_reuseport_prog_free_rcu); else - bpf_prog_destroy(prog); + bpf_prog_put(prog); } static inline int __bpf_try_make_writable(struct sk_buff *skb, From 909f7bf9b080c10df3c3b38533906dbf09ff1d8b Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 15 Apr 2026 17:56:06 +0200 Subject: [PATCH 583/972] PCI: Update saved_config_space upon resource assignment Bernd reports passthrough failure of a Digital Devices Cine S2 V6 DVB adapter plugged into an ASRock X570S PG Riptide board with BIOS version P5.41 (09/07/2023): ddbridge 0000:05:00.0: detected Digital Devices Cine S2 V6 DVB adapter ddbridge 0000:05:00.0: cannot read registers ddbridge 0000:05:00.0: fail BIOS assigns an incorrect BAR to the DVB adapter which doesn't fit into the upstream bridge window. The kernel corrects the BAR assignment: pci 0000:07:00.0: BAR 0 [mem 0xfffffffffc500000-0xfffffffffc50ffff 64bit]: can't claim; no compatible bridge window pci 0000:07:00.0: BAR 0 [mem 0xfc500000-0xfc50ffff 64bit]: assigned Correction of the BAR assignment happens in an x86-specific fs_initcall, pcibios_assign_resources(), after device enumeration in a subsys_initcall. This order was introduced at the behest of Linus in 2004: https://git.kernel.org/tglx/history/c/a06a30144bbc No other architecture performs such a late BAR correction. Bernd bisected the issue to commit a2f1e22390ac ("PCI/ERR: Ensure error recoverability at all times"), but it only occurs in the absence of commit 4d4c10f763d7 ("PCI: Explicitly put devices into D0 when initializing"). This combination exists in stable kernel v6.12.70, but not in mainline, hence Bernd cannot reproduce the issue with mainline. Since a2f1e22390ac, config space is saved on enumeration, prior to BAR correction. Upon passthrough, the corrected BAR is overwritten with the incorrect saved value by: vfio_pci_core_register_device() vfio_pci_set_power_state() pci_restore_state() But only if the device's current_state is PCI_UNKNOWN, as it was prior to commit 4d4c10f763d7. Since the commit, it is PCI_D0, which changes the behavior of vfio_pci_set_power_state() to no longer restore the state without saving it first. Alexandre is reporting the same issue as Bernd, but in his case, mainline is affected as well. The difference is that on Alexandre's system, the host kernel binds a driver to the device which is unbound prior to passthrough, whereas on Bernd's system no driver gets bound by the host kernel. Unbinding sets current_state to PCI_UNKNOWN in pci_device_remove(), so when vfio-pci is subsequently bound to the device, pci_restore_state() is once again called without invoking pci_save_state() first. To robustly fix the issue, always update saved_config_space upon resource assignment. Reported-by: Bernd Schumacher Closes: https://lore.kernel.org/r/acfZrlP0Ua_5D3U4@eldamar.lan/ Reported-by: Alexandre N. Closes: https://lore.kernel.org/r/dd3c3358-de0f-4a56-9c81-04aceaab4058@mailo.com/ Fixes: a2f1e22390ac ("PCI/ERR: Ensure error recoverability at all times") Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas Tested-by: Bernd Schumacher Tested-by: Alexandre N. Cc: stable@vger.kernel.org # v6.12+ Link: https://patch.msgid.link/febc3f354e0c1f5a9f5b3ee9ffddaa44caccf651.1776268054.git.lukas@wunner.de --- drivers/pci/setup-res.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index fbc05cda96ee01..991d3ed543f58c 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -102,6 +102,7 @@ static void pci_std_update_resource(struct pci_dev *dev, int resno) } pci_write_config_dword(dev, reg, new); + dev->saved_config_space[reg / 4] = new; pci_read_config_dword(dev, reg, &check); if ((new ^ check) & mask) { @@ -112,6 +113,7 @@ static void pci_std_update_resource(struct pci_dev *dev, int resno) if (res->flags & IORESOURCE_MEM_64) { new = region.start >> 16 >> 16; pci_write_config_dword(dev, reg + 4, new); + dev->saved_config_space[(reg + 4) / 4] = new; pci_read_config_dword(dev, reg + 4, &check); if (check != new) { pci_err(dev, "%s: error updating (high %#010x != %#010x)\n", From f45a49a2380a47332817b7248c61a0ebbc6f0d00 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Tue, 5 May 2026 23:43:27 +0000 Subject: [PATCH 584/972] PCI: Initialize temporary device in new_id_store() When setting new_id of a PCI device driver using sysfs a lockdep splat occurs. This is because new_id_store() builds a temporary pci_dev for pci_match_device(), which calls device_match_driver_override(). That depends on the driver_override.lock added by cb3d1049f4ea ("driver core: generalize driver_override in struct device"). The new driver_override.lock was not initialized in the temporary pci_dev, resulting in this lockdep splat. Initialize the temporary pci_dev to fix this. Repro: Build with CONFIG_LOCKDEP=y, boot with QEMU, and add a new ID: # echo "8086 10f5" > /sys/bus/pci/drivers/e1000e/new_id INFO: trying to register non-static key. The code is fine but needs lockdep annotation, or maybe you didn't initialize this object before use? turning off the locking correctness validator. CPU: 2 UID: 0 PID: 177 Comm: liveupdate-iomm Not tainted 7.0.0+ #9 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x5d/0x80 register_lock_class+0x77e/0x790 lock_acquire+0xbf/0x2e0 pci_match_device+0x24/0x180 new_id_store+0x189/0x1d0 kernfs_fop_write_iter+0x14f/0x210 vfs_write+0x263/0x5e0 ksys_write+0x79/0xf0 do_syscall_64+0x117/0xf80 Fixes: 10a4206a2401 ("PCI: use generic driver_override infrastructure") Fixes: 8895d3bcb8ba ("PCI: Fail new_id for vendor/device values already built into driver") Signed-off-by: Samiullah Khawaja [bhelgaas: add commit log details and repro, trim backtrace] Signed-off-by: Bjorn Helgaas Reviewed-by: Danilo Krummrich Link: https://patch.msgid.link/20260505234327.716630-1-skhawaja@google.com --- drivers/pci/pci-driver.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index d10ece0889f0f4..e3f59001785a17 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -179,6 +179,11 @@ static const struct pci_device_id *pci_match_device(struct pci_driver *drv, return NULL; } +static void _pci_free_device(struct device *dev) +{ + kfree(to_pci_dev(dev)); +} + /** * new_id_store - sysfs frontend to pci_add_dynid() * @driver: target device driver @@ -214,11 +219,13 @@ static ssize_t new_id_store(struct device_driver *driver, const char *buf, pdev->subsystem_vendor = subvendor; pdev->subsystem_device = subdevice; pdev->class = class; + pdev->dev.release = _pci_free_device; + device_initialize(&pdev->dev); if (pci_match_device(pdrv, pdev)) retval = -EEXIST; - kfree(pdev); + put_device(&pdev->dev); if (retval) return retval; From bf5421b3d8d3d9216d045af89d4f7e2f75375554 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Tue, 28 Apr 2026 07:19:54 +0200 Subject: [PATCH 585/972] MAINTAINERS: Update Marek Vasut email for PCIe R-Car Use up to date address. No functional change. Signed-off-by: Marek Vasut Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20260428052030.51101-1-marek.vasut+renesas@mailbox.org --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2fb1c75afd1638..db06875cfb46ef 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20433,7 +20433,7 @@ F: drivers/pci/controller/plda/pcie-plda-host.c F: drivers/pci/controller/plda/pcie-plda.h PCI DRIVER FOR RENESAS R-CAR -M: Marek Vasut +M: Marek Vasut M: Yoshihiro Shimoda L: linux-pci@vger.kernel.org L: linux-renesas-soc@vger.kernel.org From 78e115d806b0eea83a0e4df182a689823aa80511 Mon Sep 17 00:00:00 2001 From: Hans Zhang <18255117159@163.com> Date: Fri, 8 May 2026 10:30:06 +0800 Subject: [PATCH 586/972] MAINTAINERS: Update Hans Zhang email for PCIe CIX Sky1 Update my email address as my work email account is no longer in use. Signed-off-by: Hans Zhang <18255117159@163.com> Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20260508023006.1787674-1-18255117159@163.com --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index db06875cfb46ef..b10ee9a3b5f552 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20321,7 +20321,7 @@ F: Documentation/devicetree/bindings/pci/cdns,* F: drivers/pci/controller/cadence/*cadence* PCI DRIVER FOR CIX Sky1 -M: Hans Zhang +M: Hans Zhang <18255117159@163.com> L: linux-pci@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/pci/cix,sky1-pcie-*.yaml From 9ef40a09c5de18f0d275a451f5c2a7fd4f07158b Mon Sep 17 00:00:00 2001 From: Aksh Garg Date: Fri, 8 May 2026 11:39:51 +0530 Subject: [PATCH 587/972] MAINTAINERS: Add Aksh Garg as PCIe CADENCE reviewer I wish to contribute to the review process for Cadence PCIe IP drivers, hence add myself as a reviewer. Signed-off-by: Aksh Garg Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20260508060951.840233-1-a-garg7@ti.com --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index b10ee9a3b5f552..0cafa37693a672 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20315,10 +20315,11 @@ F: Documentation/devicetree/bindings/pci/marvell,armada8k-pcie.yaml F: drivers/pci/controller/dwc/pcie-armada8k.c PCI DRIVER FOR CADENCE PCIE IP +R: Aksh Garg L: linux-pci@vger.kernel.org S: Orphan F: Documentation/devicetree/bindings/pci/cdns,* -F: drivers/pci/controller/cadence/*cadence* +F: drivers/pci/controller/cadence/ PCI DRIVER FOR CIX Sky1 M: Hans Zhang <18255117159@163.com> From 97c8a3c1f73d828de43a5a88e8a9a143efb2b661 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 May 2026 03:59:18 +0000 Subject: [PATCH 588/972] tcp: Fix potential UAF in reqsk_timer_handler(). When TCP socket migration fails at inet_ehash_insert() in reqsk_timer_handler(), we jump to the no_ownership: label and free the new reqsk immediately with __reqsk_free(). Thus, we must stop the new reqsk's timer before jumping to the label, but the timer might be missed since the cited commit, resulting in UAF. As we are in the original reqsk's timer context, we can safely call timer_delete_sync() for the new reqsk. Let's pass false to __inet_csk_reqsk_queue_drop() to stop the new reqsk's timer. Fixes: 83fccfc3940c ("inet: fix potential deadlock in reqsk_queue_unlink()") Reported-by: Damiano Melotti Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260506035954.1563147-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 928654c34156b6..971f9db2c5869a 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1108,7 +1108,7 @@ static void reqsk_timer_handler(struct timer_list *t) if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { /* delete timer */ - __inet_csk_reqsk_queue_drop(sk_listener, nreq, true); + __inet_csk_reqsk_queue_drop(sk_listener, nreq, false); goto no_ownership; } From 7eca3292cac7c26dad4c236f51ba225c39a0523f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 May 2026 03:59:19 +0000 Subject: [PATCH 589/972] tcp: Fix imbalanced icsk_accept_queue count. When TCP socket migration happens in reqsk_timer_handler(), @sk_listener will be updated with the new listener. When we call __inet_csk_reqsk_queue_drop(), the listener must be the one stored in req->rsk_listener. The cited commit accidentally replaced oreq->rsk_listener with sk_listener, leading to imbalanced icsk_accept_queue count. Let's pass the correct listener to __inet_csk_reqsk_queue_drop(). Fixes: e8c526f2bdf1 ("tcp/dccp: Don't use timer_pending() in reqsk_queue_unlink().") Reported-by: Damiano Melotti Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260506035954.1563147-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 971f9db2c5869a..dbcd37dfdc15b1 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1134,7 +1134,7 @@ static void reqsk_timer_handler(struct timer_list *t) } drop: - __inet_csk_reqsk_queue_drop(sk_listener, oreq, true); + __inet_csk_reqsk_queue_drop(oreq->rsk_listener, oreq, true); reqsk_put(oreq); } From e539acf9f9c2550452914fb85aeb8fda67dd762f Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 27 Apr 2026 11:23:17 +0100 Subject: [PATCH 590/972] MAINTAINERS: Add self for the 3c509 network driver It appears there's a need for a maintainer for the 3Com EtherLink III family of Ethernet network adapters. There is documentation available and the driver is very mature so the task ought to be of little hassle, so I think I should be able to squeeze in any issues to be addressed. Signed-off-by: Maciej W. Rozycki Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/alpine.DEB.2.21.2604271056460.28583@angie.orcam.me.uk Signed-off-by: Jakub Kicinski --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0650fa014f240e..1a485549ee966a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -68,6 +68,12 @@ Maintainers List first. When adding to this list, please keep the entries in alphabetical order. +3C509 NETWORK DRIVER +M: "Maciej W. Rozycki" +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/ethernet/3com/3c509.c + 3C59X NETWORK DRIVER M: Steffen Klassert L: netdev@vger.kernel.org From 7ce5556f255a680d80daa31b1cedecf7f89e2c22 Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Wed, 6 May 2026 16:24:15 +0800 Subject: [PATCH 591/972] ipv6: flowlabel: take ip6_fl_lock across mem_check and fl_intern mem_check() in net/ipv6/ip6_flowlabel.c reads fl_size without holding ip6_fl_lock. fl_intern() takes the lock immediately afterwards. The two checks therefore race against concurrent fl_intern, ip6_fl_gc and ip6_fl_purge writers, which makes the mem_check budget check approximate. Move spin_lock_bh(&ip6_fl_lock) and the matching unlock from fl_intern() into its only caller ipv6_flowlabel_get(). The mem_check() call now runs under the same critical section as the fl_intern() insert, so the budget check is exact. With all writers and the read of fl_size under ip6_fl_lock, convert fl_size from atomic_t to plain int. The four sites that update or read fl_size are fl_intern (insert path), ip6_fl_gc (garbage collector, the !sched check and the per-entry decrement), ip6_fl_purge (per-netns purge), and mem_check (budget check), and all four now run under ip6_fl_lock. This is a prerequisite for adding a per-netns budget alongside fl_size. The follow-up patch adds netns_ipv6::flowlabel_count and folds it into mem_check(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Suggested-by: Willem de Bruijn Reviewed-by: Willem de Bruijn Signed-off-by: Maoyi Xie Link: https://patch.msgid.link/20260506082416.2259567-2-maoyixie.tju@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_flowlabel.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index c92f98c6f6ecca..a8974643195a15 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -40,7 +40,7 @@ #define FL_HASH_MASK 255 #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) -static atomic_t fl_size = ATOMIC_INIT(0); +static int fl_size; static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1]; static void ip6_fl_gc(struct timer_list *unused); @@ -163,7 +163,7 @@ static void ip6_fl_gc(struct timer_list *unused) if (time_after_eq(now, ttd)) { *flp = fl->next; fl_free(fl); - atomic_dec(&fl_size); + fl_size--; continue; } if (!sched || time_before(ttd, sched)) @@ -172,7 +172,7 @@ static void ip6_fl_gc(struct timer_list *unused) flp = &fl->next; } } - if (!sched && atomic_read(&fl_size)) + if (!sched && fl_size) sched = now + FL_MAX_LINGER; if (sched) { mod_timer(&ip6_fl_gc_timer, sched); @@ -196,7 +196,7 @@ static void __net_exit ip6_fl_purge(struct net *net) atomic_read(&fl->users) == 0) { *flp = fl->next; fl_free(fl); - atomic_dec(&fl_size); + fl_size--; continue; } flp = &fl->next; @@ -210,10 +210,10 @@ static struct ip6_flowlabel *fl_intern(struct net *net, { struct ip6_flowlabel *lfl; + lockdep_assert_held(&ip6_fl_lock); + fl->label = label & IPV6_FLOWLABEL_MASK; - rcu_read_lock(); - spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK; @@ -235,8 +235,6 @@ static struct ip6_flowlabel *fl_intern(struct net *net, lfl = __fl_lookup(net, fl->label); if (lfl) { atomic_inc(&lfl->users); - spin_unlock_bh(&ip6_fl_lock); - rcu_read_unlock(); return lfl; } } @@ -244,9 +242,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, fl->lastuse = jiffies; fl->next = fl_ht[FL_HASH(fl->label)]; rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); - atomic_inc(&fl_size); - spin_unlock_bh(&ip6_fl_lock); - rcu_read_unlock(); + fl_size++; return NULL; } @@ -464,10 +460,14 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, static int mem_check(struct sock *sk) { - int room = FL_MAX_SIZE - atomic_read(&fl_size); + int room; struct ipv6_fl_socklist *sfl; int count = 0; + lockdep_assert_held(&ip6_fl_lock); + + room = FL_MAX_SIZE - fl_size; + if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; @@ -692,11 +692,19 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, if (!sfl1) goto done; + rcu_read_lock(); + spin_lock_bh(&ip6_fl_lock); err = mem_check(sk); + if (err == 0) + fl1 = fl_intern(net, fl, freq->flr_label); + else + fl1 = NULL; + spin_unlock_bh(&ip6_fl_lock); + rcu_read_unlock(); + if (err != 0) goto done; - fl1 = fl_intern(net, fl, freq->flr_label); if (fl1) goto recheck; From e68eadffb724b36ffd3d5619e0efcaf29ec2a175 Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Wed, 6 May 2026 16:24:16 +0800 Subject: [PATCH 592/972] ipv6: flowlabel: enforce per-netns limit for unprivileged callers fl_size, fl_ht and ip6_fl_lock in net/ipv6/ip6_flowlabel.c are file scope and shared across netns. mem_check() reads fl_size to decide whether to deny non-CAP_NET_ADMIN callers. capable() runs against init_user_ns, so an unprivileged user in any non-init userns can push fl_size past FL_MAX_SIZE - FL_MAX_SIZE / 4 and starve every other unprivileged userns on the host. Add struct netns_ipv6::flowlabel_count, bumped and decremented next to fl_size in fl_intern, ip6_fl_gc and ip6_fl_purge. The new field fills the existing 4-byte hole after ipmr_seq, so struct netns_ipv6 stays the same size on 64-bit builds. Bump FL_MAX_SIZE from 4096 to 8192. It has been 4096 since the file was added. Machines and connection counts have grown. mem_check() folds an extra per-netns ceiling into the existing non-CAP_NET_ADMIN conditional. The ceiling is half of the total budget that unprivileged callers have ever been able to use, i.e. (FL_MAX_SIZE - FL_MAX_SIZE / 4) / 2 = 3072 entries. With FL_MAX_SIZE doubled, this preserves the original per-user reach of 3K (what an unprivileged caller could already obtain before this change), while forcing an attacker to spread allocations across at least two netns to exhaust the global non-CAP_NET_ADMIN budget. CAP_NET_ADMIN against init_user_ns still bypasses both caps. The previous patch took ip6_fl_lock across mem_check and fl_intern, so the new flowlabel_count read in mem_check and the new flowlabel_count++ in fl_intern run under the same critical section. flowlabel_count is therefore plain int, like fl_size. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Suggested-by: Willem de Bruijn Reviewed-by: Willem de Bruijn Cc: stable@vger.kernel.org # v5.15+ Signed-off-by: Maoyi Xie Link: https://patch.msgid.link/20260506082416.2259567-3-maoyixie.tju@gmail.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv6.h | 1 + net/ipv6/ip6_flowlabel.c | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 499e4288170fc6..875916d60bfe4e 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -119,6 +119,7 @@ struct netns_ipv6 { struct fib_notifier_ops *notifier_ops; struct fib_notifier_ops *ip6mr_notifier_ops; atomic_t ipmr_seq; + int flowlabel_count; struct { struct hlist_head head; spinlock_t lock; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index a8974643195a15..b1ccdf0dc64699 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -36,7 +36,7 @@ /* FL hash table */ #define FL_MAX_PER_SOCK 32 -#define FL_MAX_SIZE 4096 +#define FL_MAX_SIZE 8192 #define FL_HASH_MASK 255 #define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) @@ -162,8 +162,9 @@ static void ip6_fl_gc(struct timer_list *unused) ttd = fl->expires; if (time_after_eq(now, ttd)) { *flp = fl->next; - fl_free(fl); fl_size--; + fl->fl_net->ipv6.flowlabel_count--; + fl_free(fl); continue; } if (!sched || time_before(ttd, sched)) @@ -197,6 +198,7 @@ static void __net_exit ip6_fl_purge(struct net *net) *flp = fl->next; fl_free(fl); fl_size--; + net->ipv6.flowlabel_count--; continue; } flp = &fl->next; @@ -243,6 +245,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, fl->next = fl_ht[FL_HASH(fl->label)]; rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl); fl_size++; + net->ipv6.flowlabel_count++; return NULL; } @@ -460,6 +463,9 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, static int mem_check(struct sock *sk) { + const int unpriv_total_limit = FL_MAX_SIZE - (FL_MAX_SIZE / 4); + const int unpriv_user_limit = unpriv_total_limit / 2; + struct net *net = sock_net(sk); int room; struct ipv6_fl_socklist *sfl; int count = 0; @@ -478,7 +484,9 @@ static int mem_check(struct sock *sk) if (room <= 0 || ((count >= FL_MAX_PER_SOCK || - (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) && + (count > 0 && room < FL_MAX_SIZE / 2) || + room < FL_MAX_SIZE / 4 || + net->ipv6.flowlabel_count >= unpriv_user_limit) && !capable(CAP_NET_ADMIN))) return -ENOBUFS; From 58e2330bd45572a6e3d46ea94cf7a9641f43591a Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Wed, 6 May 2026 09:08:08 +0000 Subject: [PATCH 593/972] net: napi: Avoid gro timer misfiring at end of busypoll When in irq deferral mode (defer-hard-irqs > 0), a short enough gro-flush timeout can trigger before NAPI_STATE_SCHED is cleared if the last poll in busy_poll_stop() takes too long. This can have the effect of leaving the queue stuck with interrupts disabled and no timer armed which results in a tx timeout if there is no subsequent busypoll cycle. To prevent this, defer the gro-flush timer arm after the last poll. Fixes: 7fd3253a7de6 ("net: Introduce preferred busy-polling") Co-developed-by: Martin Karsten Signed-off-by: Martin Karsten Signed-off-by: Dragos Tatulea Reviewed-by: Tariq Toukan Reviewed-by: Cosmin Ratiu Reviewed-by: Joe Damato Link: https://patch.msgid.link/20260506090808.820559-2-dtatulea@nvidia.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 8bfa8313ef62ed..0c6c270d9f7d11 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6862,9 +6862,9 @@ static void skb_defer_free_flush(void) #if defined(CONFIG_NET_RX_BUSY_POLL) -static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) +static void __busy_poll_stop(struct napi_struct *napi, unsigned long timeout) { - if (!skip_schedule) { + if (!timeout) { gro_normal_list(&napi->gro); __napi_schedule(napi); return; @@ -6874,6 +6874,8 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) gro_flush_normal(&napi->gro, HZ >= 1000); clear_bit(NAPI_STATE_SCHED, &napi->state); + hrtimer_start(&napi->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); } enum { @@ -6885,8 +6887,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, unsigned flags, u16 budget) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; - bool skip_schedule = false; - unsigned long timeout; + unsigned long timeout = 0; int rc; /* Busy polling means there is a high chance device driver hard irq @@ -6906,10 +6907,12 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); - timeout = napi_get_gro_flush_timeout(napi); - if (napi->defer_hard_irqs_count && timeout) { - hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); - skip_schedule = true; + if (napi->defer_hard_irqs_count) { + /* A short enough gro flush timeout and long enough + * poll can result in timer firing too early. + * Timer will be armed later if necessary. + */ + timeout = napi_get_gro_flush_timeout(napi); } } @@ -6924,7 +6927,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); if (rc == budget) - __busy_poll_stop(napi, skip_schedule); + __busy_poll_stop(napi, timeout); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); } From 0a549298f452a83ae57e6582e6ca389357f9355d Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Thu, 7 May 2026 14:04:44 +0200 Subject: [PATCH 594/972] MAINTAINERS: change maintainers for macb Ethernet driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I would like to hand over the macb maintenance to Théo, as I'm unable to keep up with the recent flow of patches for this driver. After speaking with Claudiu, he indicated that he is in the same position as me. To help with this work, Conor has agreed to act as a reviewer. I was given responsibility for this driver years ago, and I'm glad to see it continue with talented developers. Signed-off-by: Nicolas Ferre Acked-by: Claudiu Beznea Acked-by: Conor Dooley Link: https://patch.msgid.link/20260507120444.9733-1-nicolas.ferre@microchip.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1a485549ee966a..8a134cf6e9bb1d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4187,8 +4187,8 @@ F: include/uapi/linux/sonet.h F: net/atm/ ATMEL MACB ETHERNET DRIVER -M: Nicolas Ferre -M: Claudiu Beznea +M: Théo Lebrun +R: Conor Dooley S: Maintained F: drivers/net/ethernet/cadence/ From 4908f1395fb1b832ceec11584af649874a2732ea Mon Sep 17 00:00:00 2001 From: Quan Sun <2022090917019@std.uestc.edu.cn> Date: Thu, 7 May 2026 21:17:38 +0800 Subject: [PATCH 595/972] net: ethtool: fix NULL pointer dereference in phy_reply_size In phy_prepare_data(), several strings such as 'name', 'drvname', 'upstream_sfp_name', and 'downstream_sfp_name' are allocated using kstrdup(). However, these allocations were not checked for failure. If kstrdup() fails for 'name', it returns NULL while the function continues. This leads to a kernel NULL pointer dereference and panic later in phy_reply_size() when it unconditionally calls strlen() on the NULL pointer. While other strings like 'upstream_sfp_name' might be checked before access in certain code paths, failing to handle these allocations consistently can lead to incomplete data reporting or hidden bugs. Fix this by adding proper NULL checks for all kstrdup() calls in phy_prepare_data() and implement a centralized error handling path using goto labels to ensure all previously allocated resources are freed on failure. Fixes: 9dd2ad5e92b9 ("net: ethtool: phy: Convert the PHY_GET command to generic phy dump") Signed-off-by: Quan Sun <2022090917019@std.uestc.edu.cn> Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20260507131738.1173835-1-2022090917019@std.uestc.edu.cn Signed-off-by: Jakub Kicinski --- net/ethtool/phy.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/net/ethtool/phy.c b/net/ethtool/phy.c index d4e6887055ab15..f76d94d848d6da 100644 --- a/net/ethtool/phy.c +++ b/net/ethtool/phy.c @@ -76,6 +76,7 @@ static int phy_prepare_data(const struct ethnl_req_info *req_info, struct nlattr **tb = info->attrs; struct phy_device_node *pdn; struct phy_device *phydev; + int ret; /* RTNL is held by the caller */ phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PHY_HEADER, @@ -88,8 +89,17 @@ static int phy_prepare_data(const struct ethnl_req_info *req_info, return -EOPNOTSUPP; rep_data->phyindex = phydev->phyindex; + rep_data->name = kstrdup(dev_name(&phydev->mdio.dev), GFP_KERNEL); + if (!rep_data->name) + return -ENOMEM; + rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL); + if (!rep_data->drvname) { + ret = -ENOMEM; + goto err_free_name; + } + rep_data->upstream_type = pdn->upstream_type; if (pdn->upstream_type == PHY_UPSTREAM_PHY) { @@ -97,15 +107,33 @@ static int phy_prepare_data(const struct ethnl_req_info *req_info, rep_data->upstream_index = upstream->phyindex; } - if (pdn->parent_sfp_bus) + if (pdn->parent_sfp_bus) { rep_data->upstream_sfp_name = kstrdup(sfp_get_name(pdn->parent_sfp_bus), GFP_KERNEL); + if (!rep_data->upstream_sfp_name) { + ret = -ENOMEM; + goto err_free_drvname; + } + } - if (phydev->sfp_bus) + if (phydev->sfp_bus) { rep_data->downstream_sfp_name = kstrdup(sfp_get_name(phydev->sfp_bus), GFP_KERNEL); + if (!rep_data->downstream_sfp_name) { + ret = -ENOMEM; + goto err_free_upstream_sfp; + } + } return 0; + +err_free_upstream_sfp: + kfree(rep_data->upstream_sfp_name); +err_free_drvname: + kfree(rep_data->drvname); +err_free_name: + kfree(rep_data->name); + return ret; } static int phy_fill_reply(struct sk_buff *skb, From f2ab4fd02777c4081be38c35f939e4dc529b8952 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 7 May 2026 14:04:26 +0200 Subject: [PATCH 596/972] net: nsh: fix incorrect header length macros NSH header length is a 6-bit field that encodes the total length of the header in 4-byte words. So the maximum length is 0b111111 * 4, which is 252 and not 256. The maximum context length is the same number minus the length of the base header (8), so 244. These macros are used to validate push_nsh() action in openvswitch. Miscalculation here doesn't cause any real issues. In the worst case the oversized context is truncated while building the header, so we'll construct and send a broken packet, which is not a big problem, as any receiver should validate the fields. No invalid memory accesses will happen during the header push. But we should fix the macros to reject the incorrect actions in the first place. Using previously defined values and calculating the length instead of defining numbers directly, so it's easier to understand where they come from and harder to make a mistake. Fixes: 1f0b7744c505 ("net: add NSH header structures and helpers") Signed-off-by: Ilya Maximets Reviewed-by: Aaron Conole Link: https://patch.msgid.link/20260507120434.2962505-1-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- include/net/nsh.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/nsh.h b/include/net/nsh.h index 16a75109389693..15a26c59081514 100644 --- a/include/net/nsh.h +++ b/include/net/nsh.h @@ -247,10 +247,10 @@ struct nshhdr { #define NSH_M_TYPE1_LEN 24 /* NSH header maximum Length. */ -#define NSH_HDR_MAX_LEN 256 +#define NSH_HDR_MAX_LEN ((NSH_LEN_MASK >> NSH_LEN_SHIFT) * 4) /* NSH context headers maximum Length. */ -#define NSH_CTX_HDRS_MAX_LEN 248 +#define NSH_CTX_HDRS_MAX_LEN (NSH_HDR_MAX_LEN - NSH_BASE_HDR_LEN) static inline struct nshhdr *nsh_hdr(struct sk_buff *skb) { From efda25ee84325385f859d10872590e90ce837243 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Wed, 6 May 2026 20:07:13 +0000 Subject: [PATCH 597/972] genetlink: free the skb on 'group >= family->n_mcgrps' These methods generally consume ownership of the provided skb, so even if an error path is encountered, the skb is freed. This is because the very first thing they do after some initial setup is to unconditionally consume the skb via consume_skb(skb). Any subsequent errors lead to the core netlink layer freeing the skb. However, there is one check that occurs before ownership is passed, which is the check for the group index. So if this error condition is encountered, then the skb is leaked. This error condition is generally considered a violation of the netlink API, so it's not expected to occur under normal circumstances. For the same reason, no callers check for this error condition, and no callers need to be adjusted. However, we should still follow the same ownership semantics of the rest of the function. Thus, free the skb in this codepath. Suggested-by: Andrew Lunn Suggested-by: Matthew Maurer Fixes: 2a94fe48f32c ("genetlink: make multicast groups const, prevent abuse") Link: https://lore.kernel.org/r/845b36ba-7b3a-41f2-acb2-b284f253e2ca@lunn.ch Signed-off-by: Alice Ryhl Link: https://patch.msgid.link/20260506-genlmsg-return-v2-1-a63ee2a055d6@google.com Signed-off-by: Jakub Kicinski --- include/net/genetlink.h | 4 +++- net/netlink/genetlink.c | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 7b84f2cef8b1f9..d70510ac31ab54 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -489,8 +489,10 @@ genlmsg_multicast_netns_filtered(const struct genl_family *family, netlink_filter_fn filter, void *filter_data) { - if (WARN_ON_ONCE(group >= family->n_mcgrps)) + if (WARN_ON_ONCE(group >= family->n_mcgrps)) { + nlmsg_free(skb); return -EINVAL; + } group = family->mcgrp_offset + group; return nlmsg_multicast_filtered(net->genl_sock, skb, portid, group, flags, filter, filter_data); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index d251d894afd468..0da39eaed255f4 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1972,8 +1972,10 @@ int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group) { - if (WARN_ON_ONCE(group >= family->n_mcgrps)) + if (WARN_ON_ONCE(group >= family->n_mcgrps)) { + kfree_skb(skb); return -EINVAL; + } group = family->mcgrp_offset + group; return genlmsg_mcast(skb, portid, group); @@ -1986,8 +1988,10 @@ void genl_notify(const struct genl_family *family, struct sk_buff *skb, struct net *net = genl_info_net(info); struct sock *sk = net->genl_sock; - if (WARN_ON_ONCE(group >= family->n_mcgrps)) + if (WARN_ON_ONCE(group >= family->n_mcgrps)) { + kfree_skb(skb); return; + } group = family->mcgrp_offset + group; nlmsg_notify(sk, skb, info->snd_portid, group, From a77d5a069d959dc45f5f472d48cba37d8cba0f1c Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Wed, 6 May 2026 16:37:45 -0700 Subject: [PATCH 598/972] net: shaper: Reject reparenting of existing nodes When an existing node-scope shaper is moved to a different parent via the group operation, the framework fails to update the leaves count on both the old and new parent shapers. Only newly created nodes (handle.id == NET_SHAPER_ID_UNSPEC) trigger the parent leaves increment at line 1039. This causes the parent's leaves counter to diverge from the actual number of children in the xarray. When the node is later deleted, pre_del_node() allocates an array sized by the stale leaves count, but the xarray iteration finds more children than expected, hitting the WARN_ON_ONCE guard and returning -EINVAL. Rather than adding reparenting support with complex leaves count bookkeeping, reject group calls that attempt to change an existing node's parent. Updates to an existing node's rate or leaves under the same parent remain permitted. We expect that for any modification of the topology user should always create new groups and let the kernel garbage collect the leaf-less nodes. Fixes: 5d5d4700e75d ("net-shapers: implement NL group operation") Signed-off-by: Mohsin Bashir Link: https://patch.msgid.link/20260506233745.111895-1-mohsin.bashr@gmail.com Signed-off-by: Jakub Kicinski --- net/shaper/shaper.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 94bc9c7382ea62..1069fa4eb9f606 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -964,15 +964,22 @@ static int __net_shaper_group(struct net_shaper_binding *binding, int i, ret; if (node->handle.scope == NET_SHAPER_SCOPE_NODE) { + struct net_shaper *cur = NULL; + new_node = node->handle.id == NET_SHAPER_ID_UNSPEC; - if (!new_node && !net_shaper_lookup(binding, &node->handle)) { - /* The related attribute is not available when - * reaching here from the delete() op. - */ - NL_SET_ERR_MSG_FMT(extack, "Node shaper %d:%d does not exists", - node->handle.scope, node->handle.id); - return -ENOENT; + if (!new_node) { + cur = net_shaper_lookup(binding, &node->handle); + if (!cur) { + /* The related attribute is not available + * when reaching here from the delete() op. + */ + NL_SET_ERR_MSG_FMT(extack, + "Node shaper %d:%d does not exist", + node->handle.scope, + node->handle.id); + return -ENOENT; + } } /* When unspecified, the node parent scope is inherited from @@ -986,6 +993,15 @@ static int __net_shaper_group(struct net_shaper_binding *binding, return ret; } + if (cur && net_shaper_handle_cmp(&cur->parent, + &node->parent)) { + NL_SET_ERR_MSG_FMT(extack, + "Cannot reparent node shaper %d:%d", + node->handle.scope, + node->handle.id); + return -EOPNOTSUPP; + } + } else { net_shaper_default_parent(&node->handle, &node->parent); } From 1619553b0a6ba7a966b17b0226f3acb9dd4d5380 Mon Sep 17 00:00:00 2001 From: Matt Vollrath Date: Wed, 6 May 2026 14:48:10 -0700 Subject: [PATCH 599/972] i40e: Cleanup PTP registration on probe failure Fix two conditions which would leak PTP registration on probe failure: 1. i40e_setup_pf_switch can encounter an error in i40e_setup_pf_filter_control, call i40e_ptp_init, then return non-zero, sending i40e_probe to err_vsis. 2. i40e_setup_misc_vector can return non-zero, sending i40e_probe to err_vsis. Both of these conditions have been present since PTP was introduced in this driver. Found with coccinelle. Fixes: beb0dff1251db ("i40e: enable PTP") Signed-off-by: Matt Vollrath Tested-by: Sunitha Mekala Reviewed-by: Aleksandr Loktionov Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260506-jk-iwl-net-2026-05-04-v2-1-a5ea4dc837a9@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 028bd500603a54..f06fcef644e571 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -16108,6 +16108,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* Unwind what we've done if something failed in the setup */ err_vsis: set_bit(__I40E_DOWN, pf->state); + i40e_ptp_stop(pf); i40e_clear_interrupt_scheme(pf); kfree(pf->vsi); err_switch_setup: From 678b713ece1e853f11e670a84cb887c35e1381b7 Mon Sep 17 00:00:00 2001 From: Matt Vollrath Date: Wed, 6 May 2026 14:48:11 -0700 Subject: [PATCH 600/972] i40e: Cleanup PTP pins on probe failure PTP pin structs are allocated early in probe, but never cleaned up. Fix this by calling i40e_ptp_free_pins in the error path. To support this, i40e_ptp_free_pins is added to the header and pin_config is correctly nullified after being freed. This has been an issue since i40e_ptp_alloc_pins was introduced. Fixes: 1050713026a08 ("i40e: add support for PTP external synchronization clock") Reported-by: Kohei Enju Cc: stable@vger.kernel.org Signed-off-by: Matt Vollrath Reviewed-by: Paul Menzel Reviewed-by: Aleksandr Loktionov Reviewed-by: Kohei Enju Tested-by: Sunitha Mekala Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260506-jk-iwl-net-2026-05-04-v2-2-a5ea4dc837a9@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e.h | 1 + drivers/net/ethernet/intel/i40e/i40e_main.c | 1 + drivers/net/ethernet/intel/i40e/i40e_ptp.c | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index dcb50c2e1aa277..83e780919ac97f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -1318,6 +1318,7 @@ void i40e_ptp_restore_hw_time(struct i40e_pf *pf); void i40e_ptp_init(struct i40e_pf *pf); void i40e_ptp_stop(struct i40e_pf *pf); int i40e_ptp_alloc_pins(struct i40e_pf *pf); +void i40e_ptp_free_pins(struct i40e_pf *pf); int i40e_update_adq_vsi_queues(struct i40e_vsi *vsi, int vsi_offset); int i40e_is_vsi_uplink_mode_veb(struct i40e_vsi *vsi); int i40e_get_partition_bw_setting(struct i40e_pf *pf); diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index f06fcef644e571..6d4f9218dc6847 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -16112,6 +16112,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) i40e_clear_interrupt_scheme(pf); kfree(pf->vsi); err_switch_setup: + i40e_ptp_free_pins(pf); i40e_reset_interrupt_capability(pf); timer_shutdown_sync(&pf->service_timer); err_mac_addr: diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c index 404a716db8da71..7d07c389bb2312 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c @@ -940,12 +940,13 @@ int i40e_ptp_hwtstamp_get(struct net_device *netdev, * * Release memory allocated for PTP pins. **/ -static void i40e_ptp_free_pins(struct i40e_pf *pf) +void i40e_ptp_free_pins(struct i40e_pf *pf) { if (i40e_is_ptp_pin_dev(&pf->hw)) { kfree(pf->ptp_pins); kfree(pf->ptp_caps.pin_config); pf->ptp_pins = NULL; + pf->ptp_caps.pin_config = NULL; } } From da4f76b6a84ede14a71282ef841768299ead0221 Mon Sep 17 00:00:00 2001 From: Emil Tantilov Date: Wed, 6 May 2026 14:48:12 -0700 Subject: [PATCH 601/972] idpf: fix read_dev_clk_lock spinlock init in idpf_ptp_init() In idpf_ptp_init(), read_dev_clk_lock is initialized after ptp_schedule_worker() had already been called (and after idpf_ptp_settime64() could reach the lock). The PTP aux worker fires immediately upon scheduling and can call into idpf_ptp_read_src_clk_reg_direct(), which takes spin_lock(&ptp->read_dev_clk_lock) on an uninitialized lock, triggering the lockdep "non-static key" warning: [12973.796587] idpf 0000:83:00.0: Device HW Reset initiated [12974.094507] INFO: trying to register non-static key. ... [12974.097208] Call Trace: [12974.097213] [12974.097218] dump_stack_lvl+0x93/0xe0 [12974.097234] register_lock_class+0x4c4/0x4e0 [12974.097249] ? __lock_acquire+0x427/0x2290 [12974.097259] __lock_acquire+0x98/0x2290 [12974.097272] lock_acquire+0xc6/0x310 [12974.097281] ? idpf_ptp_read_src_clk_reg+0xb7/0x150 [idpf] [12974.097311] ? lockdep_hardirqs_on_prepare+0xde/0x190 [12974.097318] ? finish_task_switch.isra.0+0xd2/0x350 [12974.097330] ? __pfx_ptp_aux_kworker+0x10/0x10 [ptp] [12974.097343] _raw_spin_lock+0x30/0x40 [12974.097353] ? idpf_ptp_read_src_clk_reg+0xb7/0x150 [idpf] [12974.097373] idpf_ptp_read_src_clk_reg+0xb7/0x150 [idpf] [12974.097391] ? kthread_worker_fn+0x88/0x3d0 [12974.097404] ? kthread_worker_fn+0x4e/0x3d0 [12974.097411] idpf_ptp_update_cached_phctime+0x26/0x120 [idpf] [12974.097428] ? _raw_spin_unlock_irq+0x28/0x50 [12974.097436] idpf_ptp_do_aux_work+0x15/0x20 [idpf] [12974.097454] ptp_aux_kworker+0x20/0x40 [ptp] [12974.097464] kthread_worker_fn+0xd5/0x3d0 [12974.097474] ? __pfx_kthread_worker_fn+0x10/0x10 [12974.097482] kthread+0xf4/0x130 [12974.097489] ? __pfx_kthread+0x10/0x10 [12974.097498] ret_from_fork+0x32c/0x410 [12974.097512] ? __pfx_kthread+0x10/0x10 [12974.097519] ret_from_fork_asm+0x1a/0x30 [12974.097540] Move the call to spin_lock_init() up a bit to make sure read_dev_clk_lock is not touched before it's been initialized. Fixes: 5cb8805d2366 ("idpf: negotiate PTP capabilities and get PTP clock") Signed-off-by: Emil Tantilov Reviewed-by: Madhu Chittim Reviewed-by: Aleksandr Loktionov Reviewed-by: Simon Horman Tested-by: Samuel Salin Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260506-jk-iwl-net-2026-05-04-v2-3-a5ea4dc837a9@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/idpf_ptp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_ptp.c b/drivers/net/ethernet/intel/idpf/idpf_ptp.c index eec91c4f0a75a0..4a51d2727547d9 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_ptp.c +++ b/drivers/net/ethernet/intel/idpf/idpf_ptp.c @@ -952,6 +952,8 @@ int idpf_ptp_init(struct idpf_adapter *adapter) goto free_ptp; } + spin_lock_init(&adapter->ptp->read_dev_clk_lock); + err = idpf_ptp_create_clock(adapter); if (err) goto free_ptp; @@ -977,8 +979,6 @@ int idpf_ptp_init(struct idpf_adapter *adapter) goto remove_clock; } - spin_lock_init(&adapter->ptp->read_dev_clk_lock); - pci_dbg(adapter->pdev, "PTP init successful\n"); return 0; From 6c77b9510829a424d1b74409b7db9456e3522871 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 6 May 2026 14:48:13 -0700 Subject: [PATCH 602/972] idpf: fix double free and use-after-free in aux device error paths When auxiliary_device_add() fails in idpf_plug_vport_aux_dev() or idpf_plug_core_aux_dev(), the err_aux_dev_add label calls auxiliary_device_uninit() and falls through to err_aux_dev_init. The uninit call will trigger put_device(), which invokes the release callback (idpf_vport_adev_release / idpf_core_adev_release) that frees iadev. The fall-through then reads adev->id from the freed iadev for ida_free() and double-frees iadev with kfree(). Free the IDA slot and clear the back-pointer before uninit, while adev is still valid, then return immediately. Commit 65637c3a1811 ("idpf: fix UAF in RDMA core aux dev deinitialization") fixed the same use-after-free in the matching unplug path in this file but missed both probe error paths. Cc: Tony Nguyen Cc: Przemek Kitszel Cc: Andrew Lunn Cc: stable@kernel.org Fixes: be91128c579c ("idpf: implement RDMA vport auxiliary dev create, init, and destroy") Fixes: f4312e6bfa2a ("idpf: implement core RDMA auxiliary dev create, init, and destroy") Signed-off-by: Greg Kroah-Hartman Reviewed-by: Aleksandr Loktionov Reviewed-by: Paul Menzel Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260506-jk-iwl-net-2026-05-04-v2-4-a5ea4dc837a9@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/idpf_idc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/intel/idpf/idpf_idc.c b/drivers/net/ethernet/intel/idpf/idpf_idc.c index 7e4f4ac9265377..b7d6b08fc89e89 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_idc.c +++ b/drivers/net/ethernet/intel/idpf/idpf_idc.c @@ -90,7 +90,10 @@ static int idpf_plug_vport_aux_dev(struct iidc_rdma_core_dev_info *cdev_info, return 0; err_aux_dev_add: + ida_free(&idpf_idc_ida, adev->id); + vdev_info->adev = NULL; auxiliary_device_uninit(adev); + return ret; err_aux_dev_init: ida_free(&idpf_idc_ida, adev->id); err_ida_alloc: @@ -228,7 +231,10 @@ static int idpf_plug_core_aux_dev(struct iidc_rdma_core_dev_info *cdev_info) return 0; err_aux_dev_add: + ida_free(&idpf_idc_ida, adev->id); + cdev_info->adev = NULL; auxiliary_device_uninit(adev); + return ret; err_aux_dev_init: ida_free(&idpf_idc_ida, adev->id); err_ida_alloc: From b3cda96feb60d91fe88d52b974ff110dcfa91239 Mon Sep 17 00:00:00 2001 From: Marcin Szycik Date: Wed, 6 May 2026 14:48:14 -0700 Subject: [PATCH 603/972] ice: fix setting RSS VSI hash for E830 ice_set_rss_hfunc() performs a VSI update, in which it sets hashing function, leaving other VSI options unchanged. However, ::q_opt_flags is mistakenly set to the value of another field, instead of its original value, probably due to a typo. What happens next is hardware-dependent: On E810, only the first bit is meaningful (see ICE_AQ_VSI_Q_OPT_PE_FLTR_EN) and can potentially end up in a different state than before VSI update. On E830, some of the remaining bits are not reserved. Setting them to some unrelated values can cause the firmware to reject the update because of invalid settings, or worse - succeed. Reproducer: sudo ethtool -X $PF1 equal 8 Output in dmesg: Failed to configure RSS hash for VSI 6, error -5 Fixes: 352e9bf23813 ("ice: enable symmetric-xor RSS for Toeplitz hash function") Reviewed-by: Aleksandr Loktionov Reviewed-by: Przemek Kitszel Signed-off-by: Marcin Szycik Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260506-jk-iwl-net-2026-05-04-v2-5-a5ea4dc837a9@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 1d1947a7fe1191..c52c465280f742 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -8046,7 +8046,7 @@ int ice_set_rss_hfunc(struct ice_vsi *vsi, u8 hfunc) ctx->info.q_opt_rss |= FIELD_PREP(ICE_AQ_VSI_Q_OPT_RSS_HASH_M, hfunc); ctx->info.q_opt_tc = vsi->info.q_opt_tc; - ctx->info.q_opt_flags = vsi->info.q_opt_rss; + ctx->info.q_opt_flags = vsi->info.q_opt_flags; err = ice_update_vsi(hw, vsi->idx, ctx, NULL); if (err) { From 0ded1f36ba4021cba50513e80be6b6e173710168 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 6 May 2026 14:48:15 -0700 Subject: [PATCH 604/972] ice: fix locking in ice_dcb_rebuild() Move the mutex_lock() call up to prevent that DCB settings change after the first ice_query_port_ets() call. The second ice_query_port_ets() call in ice_dcb_rebuild() is already protected by pf->tc_mutex. This also fixes a bug in an error path, as before taking the first "goto dcb_error" in the function jumped over mutex_lock() to mutex_unlock(). This bug has been detected by the clang thread-safety analyzer. Cc: intel-wired-lan@lists.osuosl.org Fixes: 242b5e068b25 ("ice: Fix DCB rebuild after reset") Signed-off-by: Bart Van Assche Reviewed-by: Aleksandr Loktionov Reviewed-by: Przemek Kitszel Tested-by: Arpana Arland Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260506-jk-iwl-net-2026-05-04-v2-6-a5ea4dc837a9@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_dcb_lib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c index 16aa255351523d..0bc6dd37568792 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c @@ -537,14 +537,14 @@ void ice_dcb_rebuild(struct ice_pf *pf) struct ice_dcbx_cfg *err_cfg; int ret; + mutex_lock(&pf->tc_mutex); + ret = ice_query_port_ets(pf->hw.port_info, &buf, sizeof(buf), NULL); if (ret) { dev_err(dev, "Query Port ETS failed\n"); goto dcb_error; } - mutex_lock(&pf->tc_mutex); - if (!pf->hw.port_info->qos_cfg.is_sw_lldp) ice_cfg_etsrec_defaults(pf->hw.port_info); From cce709d8df6ba6d2a0a0dbf34acc2cdd9e23bd46 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Wed, 6 May 2026 14:48:16 -0700 Subject: [PATCH 605/972] ice: dpll: fix rclk pin state get for E810 The refactoring of ice_dpll_rclk_state_on_pin_get() to use ice_dpll_pin_get_parent_idx() omitted the base_rclk_idx adjustment that was correctly added in the ice_dpll_rclk_state_on_pin_set() path. This breaks E810 devices where base_rclk_idx is non-zero, causing the wrong hardware index to be used for pin state lookup and incorrect recovered clock state to be reported via the DPLL subsystem. E825C is unaffected as its base_rclk_idx is 0. While at it, add bounds check against ICE_DPLL_RCLK_NUM_MAX on hw_idx after the base_rclk_idx subtraction in both ice_dpll_rclk_state_on_pin_{get,set}() to prevent out-of-bounds access on the pin state array. Fixes: ad1df4f2d591 ("ice: dpll: Support E825-C SyncE and dynamic pin discovery") Signed-off-by: Ivan Vecera Reviewed-by: Aleksandr Loktionov Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260506-jk-iwl-net-2026-05-04-v2-7-a5ea4dc837a9@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_dpll.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index 27b460926baced..892bc7c2e28b46 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -2523,6 +2523,8 @@ ice_dpll_rclk_state_on_pin_set(const struct dpll_pin *pin, void *pin_priv, if (hw_idx < 0) goto unlock; hw_idx -= pf->dplls.base_rclk_idx; + if (hw_idx >= ICE_DPLL_RCLK_NUM_MAX) + goto unlock; if ((enable && p->state[hw_idx] == DPLL_PIN_STATE_CONNECTED) || (!enable && p->state[hw_idx] == DPLL_PIN_STATE_DISCONNECTED)) { @@ -2586,6 +2588,9 @@ ice_dpll_rclk_state_on_pin_get(const struct dpll_pin *pin, void *pin_priv, hw_idx = ice_dpll_pin_get_parent_idx(p, parent_pin); if (hw_idx < 0) goto unlock; + hw_idx -= pf->dplls.base_rclk_idx; + if (hw_idx >= ICE_DPLL_RCLK_NUM_MAX) + goto unlock; ret = ice_dpll_pin_state_update(pf, p, ICE_DPLL_PIN_TYPE_RCLK_INPUT, extack); From 30f1658fc5387384c7a60b9d15c79cb959512c1a Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Wed, 6 May 2026 14:48:17 -0700 Subject: [PATCH 606/972] ice: dpll: fix misplaced header macros The CGU register definitions (ICE_CGU_R10, ICE_CGU_R11 and related field masks) were placed after the #endif of the _ICE_DPLL_H_ include guard, leaving them unprotected. Move them inside the guard. Fixes: ad1df4f2d591 ("ice: dpll: Support E825-C SyncE and dynamic pin discovery") Signed-off-by: Ivan Vecera Reviewed-by: Aleksandr Loktionov Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260506-jk-iwl-net-2026-05-04-v2-8-a5ea4dc837a9@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_dpll.h | 32 +++++++++++------------ 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.h b/drivers/net/ethernet/intel/ice/ice_dpll.h index ae42cdea0ee145..8678575359b929 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.h +++ b/drivers/net/ethernet/intel/ice/ice_dpll.h @@ -8,6 +8,22 @@ #define ICE_DPLL_RCLK_NUM_MAX 4 +#define ICE_CGU_R10 0x28 +#define ICE_CGU_R10_SYNCE_CLKO_SEL GENMASK(8, 5) +#define ICE_CGU_R10_SYNCE_CLKODIV_M1 GENMASK(13, 9) +#define ICE_CGU_R10_SYNCE_CLKODIV_LOAD BIT(14) +#define ICE_CGU_R10_SYNCE_DCK_RST BIT(15) +#define ICE_CGU_R10_SYNCE_ETHCLKO_SEL GENMASK(18, 16) +#define ICE_CGU_R10_SYNCE_ETHDIV_M1 GENMASK(23, 19) +#define ICE_CGU_R10_SYNCE_ETHDIV_LOAD BIT(24) +#define ICE_CGU_R10_SYNCE_DCK2_RST BIT(25) +#define ICE_CGU_R10_SYNCE_S_REF_CLK GENMASK(31, 27) + +#define ICE_CGU_R11 0x2C +#define ICE_CGU_R11_SYNCE_S_BYP_CLK GENMASK(6, 1) + +#define ICE_CGU_BYPASS_MUX_OFFSET_E825C 3 + /** * enum ice_dpll_pin_sw - enumerate ice software pin indices: * @ICE_DPLL_PIN_SW_1_IDX: index of first SW pin @@ -157,19 +173,3 @@ static inline void ice_dpll_deinit(struct ice_pf *pf) { } #endif #endif - -#define ICE_CGU_R10 0x28 -#define ICE_CGU_R10_SYNCE_CLKO_SEL GENMASK(8, 5) -#define ICE_CGU_R10_SYNCE_CLKODIV_M1 GENMASK(13, 9) -#define ICE_CGU_R10_SYNCE_CLKODIV_LOAD BIT(14) -#define ICE_CGU_R10_SYNCE_DCK_RST BIT(15) -#define ICE_CGU_R10_SYNCE_ETHCLKO_SEL GENMASK(18, 16) -#define ICE_CGU_R10_SYNCE_ETHDIV_M1 GENMASK(23, 19) -#define ICE_CGU_R10_SYNCE_ETHDIV_LOAD BIT(24) -#define ICE_CGU_R10_SYNCE_DCK2_RST BIT(25) -#define ICE_CGU_R10_SYNCE_S_REF_CLK GENMASK(31, 27) - -#define ICE_CGU_R11 0x2C -#define ICE_CGU_R11_SYNCE_S_BYP_CLK GENMASK(6, 1) - -#define ICE_CGU_BYPASS_MUX_OFFSET_E825C 3 From c4f3d6eb1fcf6cd9ce4644f604d5aad1ce594dfc Mon Sep 17 00:00:00 2001 From: Myeonghun Pak Date: Wed, 6 May 2026 21:43:11 +0900 Subject: [PATCH 607/972] net: lan966x: avoid unregistering netdev on register failure lan966x_probe_port() stores the newly allocated net_device in the port before calling register_netdev(). If register_netdev() fails, the probe error path calls lan966x_cleanup_ports(), which sees port->dev and calls unregister_netdev() for a device that was never registered. Destroy the phylink instance created for this port and clear port->dev before returning the registration error. The common cleanup path now skips ports without port->dev before reaching the registered netdev cleanup, so it only handles ports that reached the registered-netdev lifetime. This also avoids treating an uninitialized FDMA netdev and the failed port as a NULL == NULL match in the common cleanup path. Fixes: d28d6d2e37d1 ("net: lan966x: add port module support") Co-developed-by: Ijae Kim Signed-off-by: Ijae Kim Signed-off-by: Myeonghun Pak Link: https://patch.msgid.link/20260506124331.31945-1-mhun512@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan966x/lan966x_main.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c index 47752d3fde0b10..1179a6e127c527 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c @@ -749,11 +749,10 @@ static void lan966x_cleanup_ports(struct lan966x *lan966x) for (p = 0; p < lan966x->num_phys_ports; p++) { port = lan966x->ports[p]; - if (!port) + if (!port || !port->dev) continue; - if (port->dev) - unregister_netdev(port->dev); + unregister_netdev(port->dev); lan966x_xdp_port_deinit(port); if (lan966x->fdma && lan966x->fdma_ndev == port->dev) @@ -873,6 +872,9 @@ static int lan966x_probe_port(struct lan966x *lan966x, u32 p, err = register_netdev(dev); if (err) { dev_err(lan966x->dev, "register_netdev failed\n"); + phylink_destroy(phylink); + port->phylink = NULL; + port->dev = NULL; return err; } From 6635fa84403c3a59455b66007c019a7cc632db30 Mon Sep 17 00:00:00 2001 From: Shitalkumar Gandhi Date: Thu, 7 May 2026 01:28:13 +0530 Subject: [PATCH 608/972] net: ti: icssm-prueth: fix eth_ports_node leak in probe The error path on of_property_read_u32() failure inside icssm_prueth_probe() returns without putting eth_ports_node, which was acquired before the for_each_child_of_node() loop. Drop it before returning. Fixes: 511f6c1ae093 ("net: ti: icssm-prueth: Adds ICSSM Ethernet driver") Signed-off-by: Shitalkumar Gandhi Link: https://patch.msgid.link/20260506195813.641610-1-shitalkumar.gandhi@cambiumnetworks.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/icssm/icssm_prueth.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/ti/icssm/icssm_prueth.c b/drivers/net/ethernet/ti/icssm/icssm_prueth.c index 53bbd929090423..b7e94244355a3b 100644 --- a/drivers/net/ethernet/ti/icssm/icssm_prueth.c +++ b/drivers/net/ethernet/ti/icssm/icssm_prueth.c @@ -1825,6 +1825,7 @@ static int icssm_prueth_probe(struct platform_device *pdev) dev_err(dev, "%pOF error reading port_id %d\n", eth_node, ret); of_node_put(eth_node); + of_node_put(eth_ports_node); return ret; } From abb5f36771cc4c05899b34000829a787572a8817 Mon Sep 17 00:00:00 2001 From: Ben Morris Date: Thu, 7 May 2026 17:14:55 -0700 Subject: [PATCH 609/972] sctp: revalidate list cursor after sctp_sendmsg_to_asoc() in SCTP_SENDALL The SCTP_SENDALL path in sctp_sendmsg() iterates ep->asocs with list_for_each_entry_safe(), which caches the next entry in @tmp before the loop body runs. The body calls sctp_sendmsg_to_asoc(), which may drop the socket lock inside sctp_wait_for_sndbuf(). While the lock is dropped, another thread can SCTP_SOCKOPT_PEELOFF the association cached in @tmp, migrating it to a new endpoint via sctp_sock_migrate() (list_del_init() + list_add_tail() to newep->asocs), and optionally close the new socket which frees the association via kfree_rcu(). The cached @tmp can also be freed by a network ABORT for that association, processed in softirq while the lock is dropped. sctp_wait_for_sndbuf() revalidates @asoc (the current entry) on re-lock via the "sk != asoc->base.sk" and "asoc->base.dead" checks, but nothing revalidates @tmp. After a successful return, the iterator advances to the stale @tmp, yielding either a use-after-free (if the peeled socket was closed) or a list-walk onto the new endpoint's list head (type confusion of &newep->asocs as a struct sctp_association *). Both are reachable from CapEff=0; the type-confusion path gives controlled indirect call via the outqueue.sched->init_sid pointer. Fix by re-deriving @tmp from @asoc after sctp_sendmsg_to_asoc() returns. @asoc is known to still be on ep->asocs at that point: the only callers that list_del an association from ep->asocs are sctp_association_free() (which sets asoc->base.dead) and sctp_assoc_migrate() (which changes asoc->base.sk), and sctp_wait_for_sndbuf() checks both under the lock before any successful return; a tripped check propagates as err < 0 and the loop bails before the re-derive. The SCTP_ABORT path in sctp_sendmsg_check_sflags() returns 0 and the loop hits 'continue' before sctp_sendmsg_to_asoc() is ever called, so the @tmp cached by list_for_each_entry_safe() still covers the lock-held free that ba59fb027307 ("sctp: walk the list of asoc safely") was added for. Fixes: 4910280503f3 ("sctp: add support for snd flag SCTP_SENDALL process in sendmsg") Cc: stable@vger.kernel.org Signed-off-by: Ben Morris Acked-by: Xin Long Link: https://patch.msgid.link/20260508001455.3137-1-joycathacker@gmail.com Signed-off-by: Jakub Kicinski --- net/sctp/socket.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 58d0d9747f0b39..1d2568bb6bc277 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1986,6 +1986,15 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) goto out_unlock; iov_iter_revert(&msg->msg_iter, err); + + /* sctp_sendmsg_to_asoc() may have released the socket + * lock (sctp_wait_for_sndbuf), during which other + * associations on ep->asocs could have been peeled + * off or freed. @asoc itself is revalidated by the + * base.dead and base.sk checks in sctp_wait_for_sndbuf, + * so re-derive the cached cursor from it. + */ + tmp = list_next_entry(asoc, asocs); } goto out_unlock; From 496c0c4c53bbe1bad97e82cd12103df61a6e459d Mon Sep 17 00:00:00 2001 From: Holger Brunck Date: Thu, 7 May 2026 17:53:32 +0200 Subject: [PATCH 610/972] net: wan: fsl_ucc_hdlc: free tx_skbuff in uhdlc_memclean When the device is removed all allocated resources should be freed. In uhdlc_memclean the netdev transmit queue was already stopped. But at this point we may have pending skb in the transmit queue which must be freed. Therefore iterate over the tx_skbuff pointers and free all pending skb. The issue was discovered by sashiko. Tested on a ls1043a board running HDLC in bus mode on kernel 6.12. https: //sashiko.dev/#/patchset/20260429114208.941011-1-holger.brunck%40hitachienergy.com Fixes: c19b6d246a35 ("drivers/net: support hdlc function for QE-UCC") Signed-off-by: Holger Brunck Link: https://patch.msgid.link/20260507155332.3452319-1-holger.brunck@hitachienergy.com Signed-off-by: Jakub Kicinski --- drivers/net/wan/fsl_ucc_hdlc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c index 15bfb78381d4cb..809f21fb93f56e 100644 --- a/drivers/net/wan/fsl_ucc_hdlc.c +++ b/drivers/net/wan/fsl_ucc_hdlc.c @@ -740,6 +740,8 @@ static int uhdlc_open(struct net_device *dev) static void uhdlc_memclean(struct ucc_hdlc_private *priv) { + int i; + qe_muram_free(ioread16be(&priv->ucc_pram->riptr)); qe_muram_free(ioread16be(&priv->ucc_pram->tiptr)); @@ -770,6 +772,11 @@ static void uhdlc_memclean(struct ucc_hdlc_private *priv) kfree(priv->rx_skbuff); priv->rx_skbuff = NULL; + for (i = 0; i < TX_BD_RING_LEN; i++) { + dev_kfree_skb(priv->tx_skbuff[i]); + priv->tx_skbuff[i] = NULL; + } + kfree(priv->tx_skbuff); priv->tx_skbuff = NULL; From 512809bb8a370d071f66fc53abe67368e171dec5 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Thu, 7 May 2026 20:22:06 +0200 Subject: [PATCH 611/972] bpf: Don't run arg-tracking analysis twice on main subprog Because subprog 0, the main subprog, is considered a global function, we end up running the arg-tracking dataflow analysis twice on it. That results in slightly longer verification but mostly in more verbose verifier logs. This patch fixes it by keeping only the iteration over global subprogs. When running over all of Cilium's programs with BPF_LOG_LEVEL2, this reduces verbosity by ~20% on average. Fixes: bf0c571f7feb6 ("bpf: introduce forward arg-tracking dataflow analysis") Signed-off-by: Paul Chaignon Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/e4d7b53d4963ef520541a782f5fc8108a168877c.1778176504.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/liveness.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 332e6e003f270a..58197d73b12010 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -1914,26 +1914,15 @@ int bpf_compute_subprog_arg_access(struct bpf_verifier_env *env) return -ENOMEM; } - instance = call_instance(env, NULL, 0, 0); - if (IS_ERR(instance)) { - err = PTR_ERR(instance); - goto out; - } - err = analyze_subprog(env, NULL, info, instance, callsites); - if (err) - goto out; - /* - * Subprogs and callbacks that don't receive FP-derived arguments - * cannot access ancestor stack frames, so they were skipped during - * the recursive walk above. Async callbacks (timer, workqueue) are - * also not reachable from the main program's call graph. Analyze - * all unvisited subprogs as independent roots at depth 0. + * Analyze every subprog in reverse topological order (callers + * before callees) so that each subprog is analyzed before its + * callees, allowing the recursive walk inside analyze_subprog() + * to naturally reach callees that receive FP-derived args. * - * Use reverse topological order (callers before callees) so that - * each subprog is analyzed before its callees, allowing the - * recursive walk inside analyze_subprog() to naturally - * reach nested callees that also lack FP-derived args. + * Subprogs and callbacks that don't receive FP-derived arguments + * cannot access ancestor stack frames are analyzed independently. + * Async callbacks (timer, workqueue) are handled the same way. */ for (k = env->subprog_cnt - 1; k >= 0; k--) { int sub = env->subprog_topo_order[k]; From bf6d507f7e3c65751d52fd8caf1ea4e003922624 Mon Sep 17 00:00:00 2001 From: Linpu Yu Date: Fri, 8 May 2026 22:43:43 +0800 Subject: [PATCH 612/972] xskmap: reject TX-only AF_XDP sockets XSKMAP entries are used as redirect targets for incoming XDP frames. A TX-only AF_XDP socket lacks an Rx ring and cannot handle redirected traffic, but xsk_map_update_elem() currently allows such sockets to be inserted into the map. Redirecting packets to such a socket on the veth generic-XDP path causes a kernel crash in xsk_generic_rcv(). This became possible after xsk_is_setup_for_bpf_map() was removed from the XSKMAP update path, which allowed bound TX-only sockets to be inserted into the map. Reject TX-only sockets during XSKMAP updates to avoid the crash. They remain fully operational for pure Tx purposes outside XSKMAP. Fixes: 968be23ceaca ("xsk: Fix possible segfault at xskmap entry insertion") Reported-by: Juefei Pu Reported-by: Yuan Tan Reported-by: Xin Liu Signed-off-by: Yifan Wu Signed-off-by: Linpu Yu Reviewed-by: Jason Xing Link: https://lore.kernel.org/r/20260508144344.694-1-linpu5433@gmail.com Signed-off-by: Alexei Starovoitov --- net/xdp/xskmap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index afa457506274c1..3bff346308d0f1 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -184,6 +184,10 @@ static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value, } xs = (struct xdp_sock *)sock->sk; + if (!READ_ONCE(xs->rx)) { + sockfd_put(sock); + return -ENOBUFS; + } map_entry = &m->xsk_map[i]; node = xsk_map_node_alloc(m, map_entry); From 3ac1a467e37683f602221e243fa3c59b0de81165 Mon Sep 17 00:00:00 2001 From: Junyoung Jang Date: Mon, 27 Apr 2026 02:25:05 +0900 Subject: [PATCH 613/972] bpf: Fix off-by-one boundary validation in arena direct-value access BPF_MAP_TYPE_ARENA accepts BPF_PSEUDO_MAP_VALUE offsets at exactly the end of the arena mapping (off == arena_size). The boundary check in arena_map_direct_value_addr() uses `>` instead of `>=`, which incorrectly allows a one-past-end pointer to be accepted. Change the condition to `>=` to correctly reject offsets that fall outside the valid arena user_vm range. Fixes: 317460317a02 ("bpf: Introduce bpf_arena.") Signed-off-by: Junyoung Jang Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260426172505.1947915-1-graypanda.inzag@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 802656c6fd3c03..49a8f7b1beef59 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -511,7 +511,7 @@ static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 { struct bpf_arena *arena = container_of(map, struct bpf_arena, map); - if ((u64)off > arena->user_vm_end - arena->user_vm_start) + if ((u64)off >= arena->user_vm_end - arena->user_vm_start) return -ERANGE; *imm = (unsigned long)arena->user_vm_start; return 0; From 45efb8fbdae303539e7fb5562e147583d4ed63ad Mon Sep 17 00:00:00 2001 From: Ethan Nelson-Moore Date: Sat, 9 May 2026 22:39:37 -0700 Subject: [PATCH 614/972] ASoC: ti: omap3pandora: update board check to use DT compatible The omap3pandora driver contains a check for the ARM machine ID via the machine_is_omap3_pandora() macro. The board concerned now supports only FDT booting, which does not use machine IDs, and therefore the code should be updated to check the DT compatible property instead. The legacy board file for this machine was removed in commit 7fcf7e061edd ("ARM: OMAP2+: Remove legacy booting support for Pandora"). The presence of this machine ID check prevents the removal of machine IDs no longer used by the kernel from arch/arm/tools/mach-types, because the machine_is_*() macros are generated from mach-types. To resolve this issue, use of_machine_is_compatible() instead. Signed-off-by: Ethan Nelson-Moore Acked-by: Jarkko Nikula Signed-off-by: Mark Brown --- sound/soc/ti/omap3pandora.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/soc/ti/omap3pandora.c b/sound/soc/ti/omap3pandora.c index f11b1d8a1306c0..6c9c184cd9d6f6 100644 --- a/sound/soc/ti/omap3pandora.c +++ b/sound/soc/ti/omap3pandora.c @@ -11,12 +11,12 @@ #include #include #include +#include #include #include #include -#include #include #include "omap-mcbsp.h" @@ -225,7 +225,8 @@ static int __init omap3pandora_soc_init(void) { int ret; - if (!machine_is_omap3_pandora()) + if (!of_machine_is_compatible("openpandora,omap3-pandora-600mhz") && + !of_machine_is_compatible("openpandora,omap3-pandora-1ghz")) return -ENODEV; pr_info("OMAP3 Pandora SoC init\n"); From b2d1eaa9b6608ba6e34f0b9419254ec9e5d47d41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Fri, 8 May 2026 01:20:50 -0300 Subject: [PATCH 615/972] ASoC: docs: Fix stale and misspelled references MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While reading some docs, I have found some minor misspelled words and some parts that could be improved. The ASoC USB documentation refers to the USB DPCM backend link as DCPM in two places. The ASoC platform documentation still points readers to the old DPCM.txt name. The ALSA configuration guide also has two references that omit their current subdirectory. Fix a few stale and misspelled references in the ALSA and ASoC documentation and update the references and fix a typo in the ASoC index. Assisted-by: Codex:GPT-5.5 Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260508-asoc-doc-fixes-v1-1-b53eec42e340@gmail.com Signed-off-by: Mark Brown --- Documentation/sound/alsa-configuration.rst | 4 ++-- Documentation/sound/soc/index.rst | 2 +- Documentation/sound/soc/platform.rst | 2 +- Documentation/sound/soc/usb.rst | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/sound/alsa-configuration.rst b/Documentation/sound/alsa-configuration.rst index f75f087639415d..394cf87f340b4e 100644 --- a/Documentation/sound/alsa-configuration.rst +++ b/Documentation/sound/alsa-configuration.rst @@ -1097,7 +1097,7 @@ output (with ``--no-upload`` option) to kernel bugzilla or alsa-devel ML (see the section `Links and Addresses`_). ``power_save`` and ``power_save_controller`` options are for power-saving -mode. See powersave.rst for details. +mode. See Documentation/sound/designs/powersave.rst for details. Note 2: If you get click noises on output, try the module option ``position_fix=1`` or ``2``. ``position_fix=1`` will use the SD_LPIB @@ -1168,7 +1168,7 @@ line_outs_monitor enable_monitor Enable Analog Out on Channel 63/64 by default. -See hdspm.rst for details. +See Documentation/sound/cards/hdspm.rst for details. Module snd-ice1712 ------------------ diff --git a/Documentation/sound/soc/index.rst b/Documentation/sound/soc/index.rst index 8bed8f8f48daee..22878d73f7a54a 100644 --- a/Documentation/sound/soc/index.rst +++ b/Documentation/sound/soc/index.rst @@ -2,7 +2,7 @@ ALSA SoC Layer ============== -The documentation is spilt into the following sections:- +The documentation is split into the following sections:- .. toctree:: :maxdepth: 2 diff --git a/Documentation/sound/soc/platform.rst b/Documentation/sound/soc/platform.rst index bd21d0a4dd9b0b..53caba6ce4cea5 100644 --- a/Documentation/sound/soc/platform.rst +++ b/Documentation/sound/soc/platform.rst @@ -75,4 +75,4 @@ Each SoC DSP driver usually supplies the following features :- 3. DMA IO to/from DSP buffers (if applicable) 4. Definition of DSP front end (FE) PCM devices. -Please see DPCM.txt for a description of item 4. +Please see dpcm.rst for a description of item 4. diff --git a/Documentation/sound/soc/usb.rst b/Documentation/sound/soc/usb.rst index 94c12f9d9dd129..9cb860547f22d6 100644 --- a/Documentation/sound/soc/usb.rst +++ b/Documentation/sound/soc/usb.rst @@ -103,7 +103,7 @@ Returns 0 on success, and -EOPNOTSUPP on failure. - ``usbdev``: the usb device that was discovered - ``sdev``: capabilities of the device -**snd_soc_usb_connect()** notifies the ASoC USB DCPM BE DAI link of a USB +**snd_soc_usb_connect()** notifies the ASoC USB DPCM BE DAI link of a USB audio device detection. This can be utilized in the BE DAI driver to keep track of available USB audio devices. This is intended to be called by the USB offload driver residing in USB SND. @@ -118,7 +118,7 @@ Returns 0 on success, negative error code on failure. - ``usbdev``: the usb device that was removed - ``sdev``: capabilities to free -**snd_soc_usb_disconnect()** notifies the ASoC USB DCPM BE DAI link of a USB +**snd_soc_usb_disconnect()** notifies the ASoC USB DPCM BE DAI link of a USB audio device removal. This is intended to be called by the USB offload driver that resides in USB SND. From ee05c329c0487c86d37635a7503ba989c6aa7636 Mon Sep 17 00:00:00 2001 From: Ethan Nelson-Moore Date: Sat, 9 May 2026 22:42:14 -0700 Subject: [PATCH 616/972] ASoC: ti: rx51: remove stale reference to machine_is_nokia_rx51() The rx51 driver relies on the machine_is_nokia_rx51() macro, which was used with the legacy board file dropped in commit 9b7141d01a76 ("ARM: OMAP2+: Drop legacy board file for n900"). The use of this macro prevents the removal of machine IDs no longer used by the kernel from arch/arm/tools/mach-types, because the machine_is_*() macros are generated from mach-types. Drop this unused code. Tested-by: Jarkko Nikula # v1 Signed-off-by: Ethan Nelson-Moore Acked-by: Jarkko Nikula Link: https://patch.msgid.link/20260510054214.564950-1-enelsonmoore@gmail.com Signed-off-by: Mark Brown --- sound/soc/ti/rx51.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sound/soc/ti/rx51.c b/sound/soc/ti/rx51.c index 7eeb12e5066c4a..cfc23e0838c25e 100644 --- a/sound/soc/ti/rx51.c +++ b/sound/soc/ti/rx51.c @@ -19,8 +19,6 @@ #include #include -#include - #include "omap-mcbsp.h" enum { @@ -364,7 +362,7 @@ static int rx51_soc_probe(struct platform_device *pdev) struct snd_soc_card *card = &rx51_sound_card; int err; - if (!machine_is_nokia_rx51() && !of_machine_is_compatible("nokia,omap3-n900")) + if (!of_machine_is_compatible("nokia,omap3-n900")) return -ENODEV; card->dev = &pdev->dev; From aa54b1d27fe0c2b78e664a34fd0fdf7cd1960d71 Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim Date: Fri, 8 May 2026 17:53:09 +0900 Subject: [PATCH 617/972] rxrpc: Also unshare DATA/RESPONSE packets when paged frags are present The DATA-packet handler in rxrpc_input_call_event() and the RESPONSE handler in rxrpc_verify_response() copy the skb to a linear one before calling into the security ops only when skb_cloned() is true. An skb that is not cloned but still carries externally-owned paged fragments (e.g. SKBFL_SHARED_FRAG set by splice() into a UDP socket via __ip_append_data, or a chained skb_has_frag_list()) falls through to the in-place decryption path, which binds the frag pages directly into the AEAD/skcipher SGL via skb_to_sgvec(). Extend the gate to also unshare when skb_has_frag_list() or skb_has_shared_frag() is true. This catches the splice-loopback vector and other externally-shared frag sources while preserving the zero-copy fast path for skbs whose frags are kernel-private (e.g. NIC page_pool RX, GRO). The OOM/trace handling already in place is reused. Fixes: d0d5c0cd1e71 ("rxrpc: Use skb_unshare() rather than skb_cow_data()") Cc: stable@vger.kernel.org Signed-off-by: Hyunwoo Kim Reviewed-by: Jiayuan Chen Acked-by: David Howells Signed-off-by: Linus Torvalds --- net/rxrpc/call_event.c | 4 +++- net/rxrpc/conn_event.c | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index fdd683261226cf..2b19b252225e55 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -334,7 +334,9 @@ bool rxrpc_input_call_event(struct rxrpc_call *call) if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && sp->hdr.securityIndex != 0 && - skb_cloned(skb)) { + (skb_cloned(skb) || + skb_has_frag_list(skb) || + skb_has_shared_frag(skb))) { /* Unshare the packet so that it can be * modified by in-place decryption. */ diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index a2130d25aaa9b7..442414d90ba1cd 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -245,7 +245,8 @@ static int rxrpc_verify_response(struct rxrpc_connection *conn, { int ret; - if (skb_cloned(skb)) { + if (skb_cloned(skb) || skb_has_frag_list(skb) || + skb_has_shared_frag(skb)) { /* Copy the packet if shared so that we can do in-place * decryption. */ From 46e9b0224475abc739612ef72c35b7c90211a0c1 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 May 2026 13:41:13 -0700 Subject: [PATCH 618/972] tools/ynl: add missing uapi header deps in Makefile.deps ethtool.h includes linux/typelimits.h which is a relatively new header not yet shipped in most distro kernel-header packages. Without the explicit entry, the build silently falls through to -idirafter. dev_energymodel.h is a new YNL family whose uapi header is not in system paths at all and was missing a CFLAGS entry entirely. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20260508204114.205896-2-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- tools/net/ynl/Makefile.deps | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index 08205f9fc52575..cc53b2f21c4446 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -15,9 +15,11 @@ UAPI_PATH:=../../../../include/uapi/ get_hdr_inc=-D$(1) -include $(UAPI_PATH)/linux/$(2) get_hdr_inc2=-D$(1) -D$(2) -include $(UAPI_PATH)/linux/$(3) +CFLAGS_dev-energymodel:=$(call get_hdr_inc,_LINUX_DEV_ENERGYMODEL_H,dev_energymodel.h) CFLAGS_devlink:=$(call get_hdr_inc,_LINUX_DEVLINK_H_,devlink.h) CFLAGS_dpll:=$(call get_hdr_inc,_LINUX_DPLL_H,dpll.h) -CFLAGS_ethtool:=$(call get_hdr_inc,_LINUX_ETHTOOL_H,ethtool.h) \ +CFLAGS_ethtool:=$(call get_hdr_inc,_LINUX_TYPELIMITS_H,typelimits.h) \ + $(call get_hdr_inc,_LINUX_ETHTOOL_H,ethtool.h) \ $(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_H_,ethtool_netlink.h) \ $(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_GENERATED_H,ethtool_netlink_generated.h) CFLAGS_handshake:=$(call get_hdr_inc,_LINUX_HANDSHAKE_H,handshake.h) From 304d81a2fbf2b454def4debcb38ea173911b72cd Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Tue, 7 Apr 2026 18:08:57 -0400 Subject: [PATCH 619/972] nfsd: fix file change detection in CB_GETATTR RFC 8881, section 10.4.3 doesn't say anything about caching the file size in the delegation record, nor does it say anything about comparing a cached file size with the size reported by the client in the CB_GETATTR reply for the purpose of determining if the client holds modified data for the file. What section 10.4.3 of RFC 8881 does say is that the server should compare the *current* file size with the size reported by the client holding the delegation in the CB_GETATTR reply, and if they differ to treat it as a modification regardless of the change attribute retrieved via the CB_GETATTR. Doing otherwise would cause the server to believe the client holding the delegation has a modified version of the file, even if the client flushed the modifications to the server prior to the CB_GETATTR. This would have the added side effect of subsequent CB_GETATTRs causing updates to the mtime, ctime, and change attribute even if the client holding the delegation makes no further updates to the file. Modify nfsd4_deleg_getattr_conflict() to obtain the current file size via i_size_read(). Retain the ncf_cur_fsize field, since it's a convenient way to return the file size back to nfsd4_encode_fattr4(), but don't use it for the purpose of detecting file changes. Remove the unnecessary initialization of ncf_cur_fsize in nfs4_open_delegation(). Also, if we recall the delegation (because the client didn't respond to the CB_GETATTR), then skip the logic that checks the nfs4_cb_fattr fields. Fixes: c5967721e106 ("NFSD: handle GETATTR conflict with write delegation") Cc: stable@vger.kernel.org Signed-off-by: Scott Mayhew Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b4d0e82b2690f1..a9f7bd491b8cd3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6378,7 +6378,6 @@ nfs4_open_delegation(struct svc_rqst *rqstp, struct nfsd4_open *open, } open->op_delegate_type = deleg_ts ? OPEN_DELEGATE_WRITE_ATTRS_DELEG : OPEN_DELEGATE_WRITE; - dp->dl_cb_fattr.ncf_cur_fsize = stat.size; dp->dl_cb_fattr.ncf_initial_cinfo = nfsd4_change_attribute(&stat); dp->dl_atime = stat.atime; dp->dl_ctime = stat.ctime; @@ -9429,11 +9428,15 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, if (status != nfserr_jukebox || !nfsd_wait_for_delegreturn(rqstp, inode)) goto out_status; + status = nfs_ok; + goto out_status; + } + if (!ncf->ncf_file_modified) { + if (ncf->ncf_initial_cinfo != ncf->ncf_cb_change) + ncf->ncf_file_modified = true; + else if (i_size_read(inode) != ncf->ncf_cb_fsize) + ncf->ncf_file_modified = true; } - if (!ncf->ncf_file_modified && - (ncf->ncf_initial_cinfo != ncf->ncf_cb_change || - ncf->ncf_cur_fsize != ncf->ncf_cb_fsize)) - ncf->ncf_file_modified = true; if (ncf->ncf_file_modified) { int err; From 2863bac7f49c4acd80a048ce52506a2b9c8db015 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 10 Apr 2026 12:09:19 -0400 Subject: [PATCH 620/972] nfsd: update mtime/ctime on CLONE in presense of delegated attributes When delegated attributes are given on open, the file is opened with NOCMTIME and modifying operations do not update mtime/ctime as to not get out-of-sync with the client's delegated view. However, for CLONE operation, the server should update its view of mtime/ctime and reflect that in any GETATTR queries. Fixes: e5e9b24ab8fa ("nfsd: freeze c/mtime updates with outstanding WRITE_ATTRS delegation") Cc: stable@vger.kernel.org Signed-off-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 3 +++ fs/nfsd/nfs4state.c | 44 +++++++++++++++++++++++++++++--------------- fs/nfsd/state.h | 1 + 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 2797da8cc9501f..5de8f37df78a29 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1413,6 +1413,9 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dst, clone->cl_dst_pos, clone->cl_count, EX_ISSYNC(cstate->current_fh.fh_export)); + if (!status && (READ_ONCE(dst->nf_file->f_mode) & FMODE_NOCMTIME) != 0) + nfsd_update_cmtime_attr(dst->nf_file, 0); + nfsd_file_put(dst); nfsd_file_put(src); out: diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a9f7bd491b8cd3..3cd8b3b59de465 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1221,10 +1221,6 @@ static void put_deleg_file(struct nfs4_file *fp) static void nfsd4_finalize_deleg_timestamps(struct nfs4_delegation *dp, struct file *f) { - struct iattr ia = { .ia_valid = ATTR_ATIME | ATTR_CTIME | ATTR_MTIME | ATTR_DELEG }; - struct inode *inode = file_inode(f); - int ret; - /* don't do anything if FMODE_NOCMTIME isn't set */ if ((READ_ONCE(f->f_mode) & FMODE_NOCMTIME) == 0) return; @@ -1242,17 +1238,7 @@ static void nfsd4_finalize_deleg_timestamps(struct nfs4_delegation *dp, struct f return; /* Stamp everything to "now" */ - inode_lock(inode); - ret = notify_change(&nop_mnt_idmap, f->f_path.dentry, &ia, NULL); - inode_unlock(inode); - if (ret) { - struct inode *inode = file_inode(f); - - pr_notice_ratelimited("nfsd: Unable to update timestamps on inode %02x:%02x:%lu: %d\n", - MAJOR(inode->i_sb->s_dev), - MINOR(inode->i_sb->s_dev), - inode->i_ino, ret); - } + nfsd_update_cmtime_attr(f, ATTR_ATIME); } static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) @@ -9563,3 +9549,31 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate, put_nfs4_file(fp); return ERR_PTR(status); } + +/** + * nfsd_update_cmtime_attr - update file's delegated ctime/mtime, + * and optionally other attributes (ie ATTR_ATIME). + * @f: pointer to an opened file + * @flags: any additional flags that should be updated + * + * Given upon opening a file delegated attributes were issues, update + * @f attributes to current times. + */ +void nfsd_update_cmtime_attr(struct file *f, unsigned int flags) +{ + int ret; + struct inode *inode = file_inode(f); + struct iattr attr = { + .ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_DELEG | flags, + }; + + inode_lock(inode); + ret = notify_change(&nop_mnt_idmap, f->f_path.dentry, &attr, NULL); + inode_unlock(inode); + if (ret) + pr_notice_ratelimited("nfsd: Unable to update timestamps on " + "inode %02x:%02x:%lu: %d\n", + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), + inode->i_ino, ret); +} diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 953675eba5c366..c5ccea64c2817f 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -843,6 +843,7 @@ extern void nfsd4_shutdown_copy(struct nfs4_client *clp); void nfsd4_put_client(struct nfs4_client *clp); void nfsd4_async_copy_reaper(struct nfsd_net *nn); bool nfsd4_has_active_async_copies(struct nfs4_client *clp); +void nfsd_update_cmtime_attr(struct file *f, unsigned int flags); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, struct nfsd_net *nn); extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn); From 4183cf383b6faec17a0882b84cd2d901dba62b16 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 10 Apr 2026 12:09:20 -0400 Subject: [PATCH 621/972] nfsd: update mtime/ctime on COPY in presence of delegated attributes When delegated attributes are given on open, the file is opened with NOCMTIME and modifying operations do not update mtime/ctime as to not get out-of-sync with the client's delegated view. However, for COPY operation, the server should update its view of mtime/ctime and reflect that in any GETATTR queries. Fixes: e5e9b24ab8fa ("nfsd: freeze c/mtime updates with outstanding WRITE_ATTRS delegation") Cc: stable@vger.kernel.org Signed-off-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 11 ++++++++++- fs/nfsd/xdr4.h | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 5de8f37df78a29..ab39ec88544050 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -2121,8 +2121,10 @@ static int nfsd4_do_async_copy(void *data) set_bit(NFSD4_COPY_F_COMPLETED, ©->cp_flags); trace_nfsd_copy_async_done(copy); - nfsd4_send_cb_offload(copy); atomic_dec(©->cp_nn->pending_async_copies); + if (copy->cp_res.wr_bytes_written > 0 && copy->attr_update) + nfsd_update_cmtime_attr(copy->nf_dst->nf_file, 0); + nfsd4_send_cb_offload(copy); return 0; } @@ -2182,6 +2184,9 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, memcpy(&result->cb_stateid, ©->cp_stateid.cs_stid, sizeof(result->cb_stateid)); dup_copy_fields(copy, async_copy); + if ((READ_ONCE(copy->nf_dst->nf_file->f_mode) & + FMODE_NOCMTIME) != 0) + async_copy->attr_update = true; memcpy(async_copy->cp_cb_offload.co_referring_sessionid.data, cstate->session->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); @@ -2200,6 +2205,10 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } else { status = nfsd4_do_copy(copy, copy->nf_src->nf_file, copy->nf_dst->nf_file, true); + if ((READ_ONCE(copy->nf_dst->nf_file->f_mode) & + FMODE_NOCMTIME) != 0 && + copy->cp_res.wr_bytes_written > 0) + nfsd_update_cmtime_attr(copy->nf_dst->nf_file, 0); } out: trace_nfsd_copy_done(copy, status); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 417e9ad9fbb397..9a4124c77e049d 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -752,6 +752,7 @@ struct nfsd4_copy { struct nfsd_file *nf_src; struct nfsd_file *nf_dst; + bool attr_update; copy_stateid_t cp_stateid; From c00b472a1322d4f5424cd7b6c7d00270eae673bd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 11 Apr 2026 17:12:16 -0400 Subject: [PATCH 622/972] sunrpc: start cache request seqno at 1 to fix netlink GET_REQS sunrpc_cache_requests_snapshot() filters requests with crq->seqno <= min_seqno. The min_seqno for the first netlink dump call is cb->args[0] which is 0. Since next_seqno was initialized to 0, the very first cache request got seqno=0 and was silently skipped by the snapshot (0 <= 0 is true). This caused netlink-based GET_REQS to return 0 pending requests even when a request was queued, preventing mountd from resolving cache entries (particularly expkey/nfsd.fh). The unresolved CACHE_PENDING state blocked all further notifications for the entry, leading to permanent NFS4ERR_DELAY hangs. Start next_seqno at 1 so all requests have seqno >= 1 and pass the snapshot filter when min_seqno is 0. Fixes: facc4e3c8042 ("sunrpc: split cache_detail queue into request and reader lists") Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 7081c1214e6c32..b5474ce534fb9d 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -403,7 +403,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd) INIT_LIST_HEAD(&cd->readers); spin_lock_init(&cd->queue_lock); init_waitqueue_head(&cd->queue_wait); - cd->next_seqno = 0; + cd->next_seqno = 1; spin_lock(&cache_list_lock); cd->nextcheck = 0; cd->entries = 0; From 4f8ef58c10bfe5f86a643c7c8331b37e69e3dae1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 19 Apr 2026 14:52:59 -0400 Subject: [PATCH 623/972] NFSD: Fix infinite loop in layout state revocation find_one_sb_stid() skips stids whose sc_status is non-zero, but the SC_TYPE_LAYOUT case in nfsd4_revoke_states() never sets sc_status before calling nfsd4_close_layout(). The retry loop therefore finds the same layout stid on every iteration, hanging the revoker indefinitely. Fixes: 1e33e1414bec ("nfsd: allow layout state to be admin-revoked.") Reported-by: Dai Ngo Reviewed-by: Jeff Layton Tested-by: Dai Ngo Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3cd8b3b59de465..c92bc0c110650e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1851,6 +1851,13 @@ void nfsd4_revoke_states(struct nfsd_net *nn, struct super_block *sb) break; case SC_TYPE_LAYOUT: ls = layoutstateid(stid); + spin_lock(&clp->cl_lock); + if (stid->sc_status == 0) { + stid->sc_status |= + SC_STATUS_ADMIN_REVOKED; + atomic_inc(&clp->cl_admin_revoked); + } + spin_unlock(&clp->cl_lock); nfsd4_close_layout(ls); break; } From e42c755582f0960e684298762f0ab927b3778376 Mon Sep 17 00:00:00 2001 From: Arthur Kiyanovski Date: Fri, 8 May 2026 06:21:21 +0000 Subject: [PATCH 624/972] net: ena: PHC: Fix potential use-after-free in get_timestamp Move the phc->active check and resp pointer assignment to after acquiring the spinlock. Previously, phc->active was checked without holding the lock, and resp was cached from ena_dev->phc.virt_addr before the lock was acquired. If ena_com_phc_destroy() runs between the lockless active check and the lock acquisition, it sets active=false, releases the lock, frees the DMA memory, and sets virt_addr=NULL. The get_timestamp path would then read a NULL virt_addr and dereference it. With both the active check and the pointer read under the lock, destroy cannot free the memory while get_timestamp is using it. Fixes: e0ea34158ee8 ("net: ena: Add PHC support in the ENA driver") Cc: stable@vger.kernel.org Signed-off-by: Arthur Kiyanovski Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20260508062126.7273-1-akiyano@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amazon/ena/ena_com.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index e67b592e569763..8c86789d867a5f 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -1782,20 +1782,23 @@ void ena_com_phc_destroy(struct ena_com_dev *ena_dev) int ena_com_phc_get_timestamp(struct ena_com_dev *ena_dev, u64 *timestamp) { - volatile struct ena_admin_phc_resp *resp = ena_dev->phc.virt_addr; const ktime_t zero_system_time = ktime_set(0, 0); struct ena_com_phc_info *phc = &ena_dev->phc; + volatile struct ena_admin_phc_resp *resp; ktime_t expire_time; ktime_t block_time; unsigned long flags = 0; int ret = 0; + spin_lock_irqsave(&phc->lock, flags); + if (!phc->active) { + spin_unlock_irqrestore(&phc->lock, flags); netdev_err(ena_dev->net_device, "PHC feature is not active in the device\n"); return -EOPNOTSUPP; } - spin_lock_irqsave(&phc->lock, flags); + resp = ena_dev->phc.virt_addr; /* Check if PHC is in blocked state */ if (unlikely(ktime_compare(phc->system_time, zero_system_time))) { From a450063ef86b9967234ca1f896c0d77400c74f11 Mon Sep 17 00:00:00 2001 From: Shitalkumar Gandhi Date: Thu, 7 May 2026 19:50:24 +0530 Subject: [PATCH 625/972] net: xgene: fix mdio_np leak in xgene_mdiobus_register() The for_each_child_of_node() loop captures mdio_np via break, holding the refcount. of_mdiobus_register() does not consume the reference, so it leaks on success. Put it after registration. Fixes: e6ad767305eb ("drivers: net: Add APM X-Gene SoC ethernet driver support.") Signed-off-by: Shitalkumar Gandhi Link: https://patch.msgid.link/20260507142024.811543-1-shitalkumar.gandhi@cambiumnetworks.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/apm/xgene/xgene_enet_hw.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c index b854b6b42d77b9..2926e1e599419d 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c @@ -910,7 +910,9 @@ static int xgene_mdiobus_register(struct xgene_enet_pdata *pdata, return -ENXIO; } - return of_mdiobus_register(mdio, mdio_np); + ret = of_mdiobus_register(mdio, mdio_np); + of_node_put(mdio_np); + return ret; } /* Mask out all PHYs from auto probing. */ From 6947bea4b79115f50138882512f85fa9c93b2827 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: [PATCH 626/972] sched_ext: Cleanups in preparation for the SCX_TASK_INIT_BEGIN/DEAD work Cleanups in preparation for the state-machine work that follows: - Convert three sub-sched call sites that open-code rcu_assign_pointer(p->scx.sched, ...) to scx_set_task_sched(). - Move scx_get_task_state()/scx_set_task_state() above the SCX task iter section so scx_task_iter_next_locked() can use them without a forward declaration. No functional change. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 76 +++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index df305712a2d424..10c6e0261f1119 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -711,6 +711,41 @@ struct bpf_iter_scx_dsq { } __attribute__((aligned(8))); +static u32 scx_get_task_state(const struct task_struct *p) +{ + return p->scx.flags & SCX_TASK_STATE_MASK; +} + +static void scx_set_task_state(struct task_struct *p, u32 state) +{ + u32 prev_state = scx_get_task_state(p); + bool warn = false; + + switch (state) { + case SCX_TASK_NONE: + break; + case SCX_TASK_INIT: + warn = prev_state != SCX_TASK_NONE; + break; + case SCX_TASK_READY: + warn = prev_state == SCX_TASK_NONE; + break; + case SCX_TASK_ENABLED: + warn = prev_state != SCX_TASK_READY; + break; + default: + WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", + prev_state, state, p->comm, p->pid); + return; + } + + WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", + prev_state, state, p->comm, p->pid); + + p->scx.flags &= ~SCX_TASK_STATE_MASK; + p->scx.flags |= state; +} + /* * SCX task iterator. */ @@ -3499,41 +3534,6 @@ static struct cgroup *tg_cgrp(struct task_group *tg) #endif /* CONFIG_EXT_GROUP_SCHED */ -static u32 scx_get_task_state(const struct task_struct *p) -{ - return p->scx.flags & SCX_TASK_STATE_MASK; -} - -static void scx_set_task_state(struct task_struct *p, u32 state) -{ - u32 prev_state = scx_get_task_state(p); - bool warn = false; - - switch (state) { - case SCX_TASK_NONE: - break; - case SCX_TASK_INIT: - warn = prev_state != SCX_TASK_NONE; - break; - case SCX_TASK_READY: - warn = prev_state == SCX_TASK_NONE; - break; - case SCX_TASK_ENABLED: - warn = prev_state != SCX_TASK_READY; - break; - default: - WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", - prev_state, state, p->comm, p->pid); - return; - } - - WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", - prev_state, state, p->comm, p->pid); - - p->scx.flags &= ~SCX_TASK_STATE_MASK; - p->scx.flags |= state; -} - static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) { int ret; @@ -5696,7 +5696,7 @@ static void scx_fail_parent(struct scx_sched *sch, scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { scx_disable_and_exit_task(sch, p); - rcu_assign_pointer(p->scx.sched, parent); + scx_set_task_sched(p, parent); } } scx_task_iter_stop(&sti); @@ -5783,7 +5783,7 @@ static void scx_sub_disable(struct scx_sched *sch) */ scx_disable_and_exit_task(sch, p); scx_set_task_state(p, SCX_TASK_INIT); - rcu_assign_pointer(p->scx.sched, parent); + scx_set_task_sched(p, parent); scx_set_task_state(p, SCX_TASK_READY); scx_enable_task(parent, p); } @@ -7212,7 +7212,7 @@ static void scx_sub_enable_workfn(struct kthread_work *work) * $p is now only initialized for @sch and READY, which * is what we want. Assign it to @sch and enable. */ - rcu_assign_pointer(p->scx.sched, sch); + scx_set_task_sched(p, sch); scx_enable_task(sch, p); p->scx.flags &= ~SCX_TASK_SUB_INIT; From 938dd9ab2bd7df0a7e58ce4249794156be9530b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: [PATCH 627/972] sched_ext: Inline scx_init_task() and move RESET_RUNNABLE_AT into scx_set_task_state() Prepare for the SCX_TASK_INIT_BEGIN/DEAD work that follows by collapsing the scx_init_task() helper. Move the SCX_TASK_RESET_RUNNABLE_AT setting into scx_set_task_state() on the INIT transition (it was set unconditionally at every INIT site through the scx_init_task() helper), inline scx_init_task() into scx_fork() and scx_root_enable_workfn(), and drop the helper. As a side effect, scx_sub_disable() migration sequence now also sets RESET_RUNNABLE_AT (it previously wrote INIT directly without going through scx_init_task()). The flag triggers a runnable_at reset on the next set_task_runnable(), which is harmless on a task that has just been moved between scheds. On root-enable, p->scx.flags is written without the task's rq lock. The task isn't visible to scx yet, and a follow-up patch restores the lock-held write. v2: Note p->scx.flags rq-lock relaxation on root-enable path. (Andrea) Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 10c6e0261f1119..81841277a54f6e 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -726,6 +726,7 @@ static void scx_set_task_state(struct task_struct *p, u32 state) break; case SCX_TASK_INIT: warn = prev_state != SCX_TASK_NONE; + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; break; case SCX_TASK_READY: warn = prev_state == SCX_TASK_NONE; @@ -3585,22 +3586,6 @@ static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fo return 0; } -static int scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) -{ - int ret; - - ret = __scx_init_task(sch, p, fork); - if (!ret) { - /* - * While @p's rq is not locked. @p is not visible to the rest of - * SCX yet and it's safe to update the flags and state. - */ - p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; - scx_set_task_state(p, SCX_TASK_INIT); - } - return ret; -} - static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p) { struct rq *rq = task_rq(p); @@ -3763,10 +3748,11 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) #else struct scx_sched *sch = scx_root; #endif - ret = scx_init_task(sch, p, true); - if (!ret) - scx_set_task_sched(p, sch); - return ret; + ret = __scx_init_task(sch, p, true); + if (unlikely(ret)) + return ret; + scx_set_task_state(p, SCX_TASK_INIT); + scx_set_task_sched(p, sch); } return 0; @@ -6897,8 +6883,8 @@ static void scx_root_enable_workfn(struct kthread_work *work) scx_task_iter_unlock(&sti); - ret = scx_init_task(sch, p, false); - if (ret) { + ret = __scx_init_task(sch, p, false); + if (unlikely(ret)) { put_task_struct(p); scx_task_iter_stop(&sti); scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", @@ -6906,6 +6892,7 @@ static void scx_root_enable_workfn(struct kthread_work *work) goto err_disable_unlock_all; } + scx_set_task_state(p, SCX_TASK_INIT); scx_set_task_sched(p, sch); scx_set_task_state(p, SCX_TASK_READY); From cceb8fa9cb2cf98e31d81ecf6353b6ba5ac57744 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: [PATCH 628/972] sched_ext: Replace SCX_TASK_OFF_TASKS flag with SCX_TASK_DEAD state SCX_TASK_OFF_TASKS marked tasks already through sched_ext_dead() so cgroup task iteration would skip them. This can be expressed better with a task state. Replace the flag with SCX_TASK_DEAD. scx_disable_and_exit_task() resets state to NONE on its way out, so sched_ext_dead() now sets DEAD after the wrapper returns. The validation matrix grows NONE -> DEAD, warns on DEAD -> NONE, and tightens READY's predecessor to INIT or ENABLED so the new DEAD value cannot silently transition to READY. Prepares for the following enable vs dead race fix. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 9 +++++---- kernel/sched/ext.c | 17 +++++++++++------ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index adb9a4de068ae8..9f1a326ad03ec7 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -101,24 +101,25 @@ enum scx_ent_flags { SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ SCX_TASK_IMMED = 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */ - SCX_TASK_OFF_TASKS = 1 << 6, /* removed from scx_tasks by sched_ext_dead() */ /* - * Bits 8 and 9 are used to carry task state: + * Bits 8 to 10 are used to carry task state: * * NONE ops.init_task() not called yet * INIT ops.init_task() succeeded, but task can be cancelled * READY fully initialized, but not in sched_ext * ENABLED fully initialized and in sched_ext + * DEAD terminal state set by sched_ext_dead() */ - SCX_TASK_STATE_SHIFT = 8, /* bits 8 and 9 are used to carry task state */ - SCX_TASK_STATE_BITS = 2, + SCX_TASK_STATE_SHIFT = 8, + SCX_TASK_STATE_BITS = 3, SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT, SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT, SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, + SCX_TASK_DEAD = 4 << SCX_TASK_STATE_SHIFT, /* * Bits 12 and 13 are used to carry reenqueue reason. In addition to diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 81841277a54f6e..2fc4a12711f9d5 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -723,17 +723,22 @@ static void scx_set_task_state(struct task_struct *p, u32 state) switch (state) { case SCX_TASK_NONE: + warn = prev_state == SCX_TASK_DEAD; break; case SCX_TASK_INIT: warn = prev_state != SCX_TASK_NONE; p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; break; case SCX_TASK_READY: - warn = prev_state == SCX_TASK_NONE; + warn = !(prev_state == SCX_TASK_INIT || + prev_state == SCX_TASK_ENABLED); break; case SCX_TASK_ENABLED: warn = prev_state != SCX_TASK_READY; break; + case SCX_TASK_DEAD: + warn = prev_state != SCX_TASK_NONE; + break; default: WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", prev_state, state, p->comm, p->pid); @@ -972,11 +977,11 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) /* * cgroup_task_dead() removes the dead tasks from cset->tasks * after sched_ext_dead() and cgroup iteration may see tasks - * which already finished sched_ext_dead(). %SCX_TASK_OFF_TASKS - * is set by sched_ext_dead() under @p's rq lock. Test it to + * which already finished sched_ext_dead(). %SCX_TASK_DEAD is + * set by sched_ext_dead() under @p's rq lock. Test it to * avoid visiting tasks which are already dead from SCX POV. */ - if (p->scx.flags & SCX_TASK_OFF_TASKS) { + if (scx_get_task_state(p) == SCX_TASK_DEAD) { __scx_task_iter_rq_unlock(iter); continue; } @@ -3847,7 +3852,7 @@ void sched_ext_dead(struct task_struct *p) * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY -> * ENABLED transitions can't race us. Disable ops for @p. * - * %SCX_TASK_OFF_TASKS synchronizes against cgroup task iteration - see + * %SCX_TASK_DEAD synchronizes against cgroup task iteration - see * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup * iteration is only used from sub-sched paths, which require root * enabled. Root enable transitions every live task to at least READY. @@ -3858,7 +3863,7 @@ void sched_ext_dead(struct task_struct *p) rq = task_rq_lock(p, &rf); scx_disable_and_exit_task(scx_task_sched(p), p); - p->scx.flags |= SCX_TASK_OFF_TASKS; + scx_set_task_state(p, SCX_TASK_DEAD); task_rq_unlock(rq, p, &rf); } } From c941d7391f258d5d06e0f7e962a52f99a547a83e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: [PATCH 629/972] sched_ext: Close root-enable vs sched_ext_dead() race with SCX_TASK_INIT_BEGIN scx_root_enable_workfn() drops the iter rq lock for ops.init_task() and a TASK_DEAD @p can fall through sched_ext_dead() in that window. The race hits when sched_ext_dead() observes SCX_TASK_INIT (the intermediate state before @p->scx.sched is published) and dereferences NULL via SCX_HAS_OP(NULL, exit_task), or observes SCX_TASK_NONE during the unlocked init window and skips cleanup so exit_task() never runs. Add SCX_TASK_INIT_BEGIN. The enable path writes NONE -> INIT_BEGIN under the iter rq lock, then takes the rq lock again after init to walk INIT_BEGIN -> INIT -> READY. sched_ext_dead() that wins the rq-lock race observes INIT_BEGIN and sets DEAD without calling into ops; the post-init recheck unwinds via scx_sub_init_cancel_task(). scx_fork() runs single-threaded against sched_ext_dead() (the task is not on scx_tasks until scx_post_fork() adds it) so its INIT_BEGIN -> INIT walk needs no rq-lock pairing; it rolls back to NONE on ops.init_task() failure. The validation matrix grows the INIT_BEGIN row and the INIT_BEGIN -> DEAD edge; INIT now requires INIT_BEGIN as the predecessor. scx_sub_disable()'s migration writes INIT_BEGIN as a synthetic predecessor to satisfy the tightened verification. The sub-sched paths still race with sched_ext_dead() during the unlocked init window. This will be fixed by the next patch. Reported-by: zhidao su Link: https://lore.kernel.org/all/20260429133155.3825247-1-suzhidao@xiaomi.com/ Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 10 ++++--- kernel/sched/ext.c | 56 ++++++++++++++++++++++++++++++++++----- 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 9f1a326ad03ec7..2129e18ada58ba 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -106,6 +106,7 @@ enum scx_ent_flags { * Bits 8 to 10 are used to carry task state: * * NONE ops.init_task() not called yet + * INIT_BEGIN ops.init_task() in flight; see sched_ext_dead() * INIT ops.init_task() succeeded, but task can be cancelled * READY fully initialized, but not in sched_ext * ENABLED fully initialized and in sched_ext @@ -116,10 +117,11 @@ enum scx_ent_flags { SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT, - SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT, - SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, - SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, - SCX_TASK_DEAD = 4 << SCX_TASK_STATE_SHIFT, + SCX_TASK_INIT_BEGIN = 1 << SCX_TASK_STATE_SHIFT, + SCX_TASK_INIT = 2 << SCX_TASK_STATE_SHIFT, + SCX_TASK_READY = 3 << SCX_TASK_STATE_SHIFT, + SCX_TASK_ENABLED = 4 << SCX_TASK_STATE_SHIFT, + SCX_TASK_DEAD = 5 << SCX_TASK_STATE_SHIFT, /* * Bits 12 and 13 are used to carry reenqueue reason. In addition to diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2fc4a12711f9d5..29fa9ffe7c7ba3 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -725,8 +725,11 @@ static void scx_set_task_state(struct task_struct *p, u32 state) case SCX_TASK_NONE: warn = prev_state == SCX_TASK_DEAD; break; - case SCX_TASK_INIT: + case SCX_TASK_INIT_BEGIN: warn = prev_state != SCX_TASK_NONE; + break; + case SCX_TASK_INIT: + warn = prev_state != SCX_TASK_INIT_BEGIN; p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; break; case SCX_TASK_READY: @@ -737,7 +740,8 @@ static void scx_set_task_state(struct task_struct *p, u32 state) warn = prev_state != SCX_TASK_READY; break; case SCX_TASK_DEAD: - warn = prev_state != SCX_TASK_NONE; + warn = !(prev_state == SCX_TASK_NONE || + prev_state == SCX_TASK_INIT_BEGIN); break; default: WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", @@ -3753,9 +3757,12 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) #else struct scx_sched *sch = scx_root; #endif + scx_set_task_state(p, SCX_TASK_INIT_BEGIN); ret = __scx_init_task(sch, p, true); - if (unlikely(ret)) + if (unlikely(ret)) { + scx_set_task_state(p, SCX_TASK_NONE); return ret; + } scx_set_task_state(p, SCX_TASK_INIT); scx_set_task_sched(p, sch); } @@ -3856,13 +3863,18 @@ void sched_ext_dead(struct task_struct *p) * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup * iteration is only used from sub-sched paths, which require root * enabled. Root enable transitions every live task to at least READY. + * + * %INIT_BEGIN means ops.init_task() is running for @p. Don't call + * into ops; transition to %DEAD so the post-init recheck unwinds + * via scx_sub_init_cancel_task(). */ if (scx_get_task_state(p) != SCX_TASK_NONE) { struct rq_flags rf; struct rq *rq; rq = task_rq_lock(p, &rf); - scx_disable_and_exit_task(scx_task_sched(p), p); + if (scx_get_task_state(p) != SCX_TASK_INIT_BEGIN) + scx_disable_and_exit_task(scx_task_sched(p), p); scx_set_task_state(p, SCX_TASK_DEAD); task_rq_unlock(rq, p, &rf); } @@ -5773,6 +5785,7 @@ static void scx_sub_disable(struct scx_sched *sch) * $p having already been initialized, and then enable. */ scx_disable_and_exit_task(sch, p); + scx_set_task_state(p, SCX_TASK_INIT_BEGIN); scx_set_task_state(p, SCX_TASK_INIT); scx_set_task_sched(p, parent); scx_set_task_state(p, SCX_TASK_READY); @@ -6878,6 +6891,9 @@ static void scx_root_enable_workfn(struct kthread_work *work) scx_task_iter_start(&sti, NULL); while ((p = scx_task_iter_next_locked(&sti))) { + struct rq_flags rf; + struct rq *rq; + /* * @p may already be dead, have lost all its usages counts and * be waiting for RCU grace period before being freed. @p can't @@ -6886,10 +6902,26 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (!tryget_task_struct(p)) continue; + /* + * Set %INIT_BEGIN under the iter's rq lock so that a concurrent + * sched_ext_dead() does not call ops.exit_task() on @p while + * ops.init_task() is running. If sched_ext_dead() runs before + * this store, it has already removed @p from scx_tasks and the + * iter won't visit @p; if it runs after, it observes + * %INIT_BEGIN and transitions to %DEAD without calling ops, + * leaving the post-init recheck below to unwind. + */ + scx_set_task_state(p, SCX_TASK_INIT_BEGIN); scx_task_iter_unlock(&sti); ret = __scx_init_task(sch, p, false); + + rq = task_rq_lock(p, &rf); + if (unlikely(ret)) { + if (scx_get_task_state(p) != SCX_TASK_DEAD) + scx_set_task_state(p, SCX_TASK_NONE); + task_rq_unlock(rq, p, &rf); put_task_struct(p); scx_task_iter_stop(&sti); scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", @@ -6897,10 +6929,20 @@ static void scx_root_enable_workfn(struct kthread_work *work) goto err_disable_unlock_all; } - scx_set_task_state(p, SCX_TASK_INIT); - scx_set_task_sched(p, sch); - scx_set_task_state(p, SCX_TASK_READY); + if (scx_get_task_state(p) == SCX_TASK_DEAD) { + /* + * sched_ext_dead() observed %INIT_BEGIN and set %DEAD. + * ops.exit_task() is owed to the sched __scx_init_task() + * ran against; call it now. + */ + scx_sub_init_cancel_task(sch, p); + } else { + scx_set_task_state(p, SCX_TASK_INIT); + scx_set_task_sched(p, sch); + scx_set_task_state(p, SCX_TASK_READY); + } + task_rq_unlock(rq, p, &rf); put_task_struct(p); } scx_task_iter_stop(&sti); From cd6aab736702f981ac4d128e04a4e33105ea797d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: [PATCH 630/972] sched_ext: Close sub-sched init race with post-init DEAD recheck scx_sub_enable_workfn()'s init pass and scx_sub_disable() migration both drop the rq lock to call __scx_init_task() against the other sched. A TASK_DEAD @p can fall through sched_ext_dead() in that window. sched_ext_dead() runs ops.exit_task() on the sched @p was attached to, not on the sched whose init just completed, so the new allocation leaks. Reuse the DEAD signal set by sched_ext_dead(). After __scx_init_task() returns, take task_rq_lock(p) and check for DEAD; on hit, call scx_sub_init_cancel_task() against the sub sched the init ran for and drop @p; on miss, proceed as before. Reported-by: zhidao su Link: https://lore.kernel.org/all/20260429133155.3825247-1-suzhidao@xiaomi.com/ Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 29fa9ffe7c7ba3..6fbe3160eccd66 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5777,6 +5777,21 @@ static void scx_sub_disable(struct scx_sched *sch) } rq = task_rq_lock(p, &rf); + + if (scx_get_task_state(p) == SCX_TASK_DEAD) { + /* + * sched_ext_dead() raced us between __scx_init_task() + * and this rq lock and ran exit_task() on @sch (the + * sched @p was on at that point), not on $parent. + * $parent's just-completed init is owed an exit_task() + * and we issue it here. + */ + scx_sub_init_cancel_task(parent, p); + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + continue; + } + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { /* * $p is initialized for $parent and still attached to @@ -5791,8 +5806,8 @@ static void scx_sub_disable(struct scx_sched *sch) scx_set_task_state(p, SCX_TASK_READY); scx_enable_task(parent, p); } - task_rq_unlock(rq, p, &rf); + task_rq_unlock(rq, p, &rf); put_task_struct(p); } scx_task_iter_stop(&sti); @@ -7212,6 +7227,21 @@ static void scx_sub_enable_workfn(struct kthread_work *work) goto abort; rq = task_rq_lock(p, &rf); + + if (scx_get_task_state(p) == SCX_TASK_DEAD) { + /* + * sched_ext_dead() raced us between __scx_init_task() + * and this rq lock and ran exit_task() on $parent (the + * sched @p was on at that point), not on @sch. @sch's + * just-completed init is owed an exit_task() and we + * issue it here. + */ + scx_sub_init_cancel_task(sch, p); + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + continue; + } + p->scx.flags |= SCX_TASK_SUB_INIT; task_rq_unlock(rq, p, &rf); From d3e73a0808ddfb91ac36cd548643cbbeb00ad4db Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 10 May 2026 10:08:16 -1000 Subject: [PATCH 631/972] sched_ext: Handle SCX_TASK_NONE in disable/switched_from paths scx_fail_parent() leaves cgroup tasks at (state=NONE, sched=parent, sched_class=ext) until the parent itself is torn down by the scx_error() it raised. When the later root_disable iterates them, two paths trip on NONE. scx_disable_and_exit_task() re-enters the wrapper at NONE: the inner switch returns early but the trailing scx_set_task_sched(p, NULL) clobbers the parent sched left by scx_fail_parent(), and scx_set_task_state(p, NONE) wastes a write on an already-NONE task. switched_from_scx() then calls scx_disable_task(), which WARNs on non-ENABLED state and writes state=READY, producing a NONE -> READY transition the validation matrix rejects. Treat NONE as "nothing to do" in both paths. Add a NONE early-return at the top of scx_disable_and_exit_task() and a parallel NONE check in switched_from_scx() next to task_dead_and_done(). Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 6fbe3160eccd66..4efe0099f79af8 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3703,6 +3703,15 @@ static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct * static void scx_disable_and_exit_task(struct scx_sched *sch, struct task_struct *p) { + /* + * %NONE means @p is already detached at the SCX level (e.g. handed + * back to the parent by scx_fail_parent() with no init to undo). + * Skip to avoid clobbering scx_task_sched() and writing %NONE again + * on a state that's already %NONE. + */ + if (scx_get_task_state(p) == SCX_TASK_NONE) + return; + __scx_disable_and_exit_task(sch, p); /* @@ -3921,6 +3930,16 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p) if (task_dead_and_done(p)) return; + /* + * %NONE means SCX is no longer tracking @p at the task level (e.g. + * scx_fail_parent() handed @p back to the parent at NONE pending the + * parent's own teardown). There is nothing to disable; calling + * scx_disable_task() would WARN on the non-%ENABLED state and trigger a + * NONE -> READY validation failure. + */ + if (scx_get_task_state(p) == SCX_TASK_NONE) + return; + scx_disable_task(scx_task_sched(p), p); } From 5d6919055dec134de3c40167a490f33c74c12581 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 10 May 2026 14:08:09 -0700 Subject: [PATCH 632/972] Linux 7.1-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 93eded47ccd2e5..b7b80e84e1eb26 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 7 PATCHLEVEL = 1 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Baby Opossum Posse # *DOCUMENTATION* From b9d16482bebdeded6e61891a5158b51f4ef04f5f Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 5 May 2026 19:47:44 +0300 Subject: [PATCH 633/972] MAINTAINERS: ASoC/ti: Remove myself and add Sen Wang as maintainer As I cannot spend adequate time to fulfill my role as maintainer for the TI ASoC drivers, it is for the better if I resign and hand over the role to Sen Wang. Signed-off-by: Peter Ujfalusi Acked-by: Nishanth Menon Link: https://patch.msgid.link/20260505164744.16134-1-peter.ujfalusi@gmail.com Signed-off-by: Mark Brown --- MAINTAINERS | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index b2040011a38659..c9495b60f31d0f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19448,7 +19448,6 @@ F: include/misc/ocxl* F: include/uapi/misc/ocxl.h OMAP AUDIO SUPPORT -M: Peter Ujfalusi M: Jarkko Nikula L: linux-sound@vger.kernel.org L: linux-omap@vger.kernel.org @@ -26351,7 +26350,7 @@ F: arch/xtensa/ F: drivers/irqchip/irq-xtensa-* TEXAS INSTRUMENTS ASoC DRIVERS -M: Peter Ujfalusi +M: Sen Wang L: linux-sound@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/sound/davinci-mcasp-audio.yaml @@ -26853,12 +26852,6 @@ S: Maintained F: Documentation/devicetree/bindings/iio/adc/ti,tsc2046.yaml F: drivers/iio/adc/ti-tsc2046.c -TI TWL4030 SERIES SOC CODEC DRIVER -M: Peter Ujfalusi -L: linux-sound@vger.kernel.org -S: Maintained -F: sound/soc/codecs/twl4030* - TI VPE/CAL DRIVERS M: Yemike Abhilash Chandra L: linux-media@vger.kernel.org From cb196d50a78ddae227f09b3cd0b145f74a70d241 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Fri, 8 May 2026 13:24:37 -0500 Subject: [PATCH 634/972] ASoC; dt-bindings: mediatek,mt8173-rt5650-rt5514: Fix mediatek,audio-codec constraints A phandle-array is really a matrix and needs constraints on the number of elements for both the inner and outer dimensions. Add the missing inner constraints. Fixes: 472d77bdc511 ("ASoC: dt-bindings: mediatek,mt8173-rt5650-rt5514: convert to DT schema") Signed-off-by: Rob Herring (Arm) Link: https://patch.msgid.link/20260508182438.1757394-1-robh@kernel.org Signed-off-by: Mark Brown --- .../bindings/sound/mediatek,mt8173-rt5650-rt5514.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/sound/mediatek,mt8173-rt5650-rt5514.yaml b/Documentation/devicetree/bindings/sound/mediatek,mt8173-rt5650-rt5514.yaml index ed698c9ff42b0e..becc7a11f8dc14 100644 --- a/Documentation/devicetree/bindings/sound/mediatek,mt8173-rt5650-rt5514.yaml +++ b/Documentation/devicetree/bindings/sound/mediatek,mt8173-rt5650-rt5514.yaml @@ -18,7 +18,9 @@ properties: description: Phandles of rt5650 and rt5514 codecs items: - description: phandle of rt5650 codec + maxItems: 1 - description: phandle of rt5514 codec + maxItems: 1 mediatek,platform: $ref: /schemas/types.yaml#/definitions/phandle From 422bd00b71ab42163aa3b8f8370276fe4c1581e7 Mon Sep 17 00:00:00 2001 From: Krishnamoorthi M Date: Thu, 7 May 2026 23:30:51 +0530 Subject: [PATCH 635/972] spi: amd: Set correct bus number in ACPI probe path On platforms where the HID2 SPI controller (AMDI0063) is enumerated via ACPI instead of PCI, amd_spi_probe() unconditionally sets bus_num to 0, while the PCI probe path assigns bus_num 2 for HID2 controller. Align the ACPI probe path to use the same bus number so that userspace and SPI client drivers see a consistent bus assignment regardless of the enumeration method. Fixes: b644c2776652 ("spi: spi_amd: Add PCI-based driver for AMD HID2 SPI controller") Cc: stable@vger.kernel.org # v6.16+ Signed-off-by: Krishnamoorthi M Link: https://patch.msgid.link/20260507180051.4158674-1-krishnamoorthi.m@amd.com Signed-off-by: Mark Brown --- drivers/spi/spi-amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-amd.c b/drivers/spi/spi-amd.c index 4d1dce4f497406..71a6e5c475b037 100644 --- a/drivers/spi/spi-amd.c +++ b/drivers/spi/spi-amd.c @@ -868,7 +868,7 @@ static int amd_spi_probe(struct platform_device *pdev) dev_dbg(dev, "io_remap_address: %p\n", amd_spi->io_remap_addr); amd_spi->version = (uintptr_t)device_get_match_data(dev); - host->bus_num = 0; + host->bus_num = (amd_spi->version == AMD_HID2_SPI) ? 2 : 0; return amd_spi_probe_common(dev, host); } From 3760befa5c08b229df76ab458520beeb26024716 Mon Sep 17 00:00:00 2001 From: Ethan Nelson-Moore Date: Sun, 10 May 2026 15:14:00 -0700 Subject: [PATCH 636/972] ASoC: fsl: eukrea-tlv320: update board checks to use the DT The eukrea-tlv320 driver contains checks for ARM machine IDs via machine_is_*() macros. The boards concerned now support only FDT booting, which does not use machine IDs, and therefore the code should be updated to check the DT compatible property instead. Non-DT booting support for these machines was removed in these commits: commit f2f55499942a ("ARM: imx: Remove eukrea_mbimxsd35 non-dt support") commit 3877942b0c7f ("ARM: imx25: Remove eukrea mx25 board files") commit 7c5deaf77526 ("ARM: i.MX: Remove mach-cpuimx27sd board file") commit 8da4d6b2f798 ("ARM: mx51: Remove mach-cpuimx51sd board file") The presence of these machine ID checks prevents the removal of machine IDs no longer used by the kernel from arch/arm/tools/mach-types, because the machine_is_*() macros are generated from mach-types. To resolve this issue, use of_machine_is_compatible() instead. Signed-off-by: Ethan Nelson-Moore Link: https://patch.msgid.link/20260510221403.11283-1-enelsonmoore@gmail.com Signed-off-by: Mark Brown --- sound/soc/fsl/eukrea-tlv320.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sound/soc/fsl/eukrea-tlv320.c b/sound/soc/fsl/eukrea-tlv320.c index 6be074ea0b3f04..5bb31a5cdf23c1 100644 --- a/sound/soc/fsl/eukrea-tlv320.c +++ b/sound/soc/fsl/eukrea-tlv320.c @@ -19,7 +19,6 @@ #include #include #include -#include #include "../codecs/tlv320aic23.h" #include "imx-ssi.h" @@ -142,7 +141,7 @@ static int eukrea_tlv320_probe(struct platform_device *pdev) eukrea_tlv320.name = "cpuimx-audio"; } - if (machine_is_eukrea_cpuimx27() || + if (of_machine_is_compatible("eukrea,cpuimx27") || (tmp_np = of_find_compatible_node(NULL, NULL, "fsl,imx21-audmux"))) { imx_audmux_v1_configure_port(MX27_AUDMUX_HPCR1_SSI0, IMX_AUDMUX_V1_PCR_SYN | @@ -159,12 +158,12 @@ static int eukrea_tlv320_probe(struct platform_device *pdev) IMX_AUDMUX_V1_PCR_RXDSEL(MX27_AUDMUX_HPCR1_SSI0) ); of_node_put(tmp_np); - } else if (machine_is_eukrea_cpuimx25sd() || - machine_is_eukrea_cpuimx35sd() || - machine_is_eukrea_cpuimx51sd() || + } else if (of_machine_is_compatible("eukrea,cpuimx25") || + of_machine_is_compatible("eukrea,cpuimx35") || + of_machine_is_compatible("eukrea,cpuimx51") || (tmp_np = of_find_compatible_node(NULL, NULL, "fsl,imx31-audmux"))) { if (!np) - ext_port = machine_is_eukrea_cpuimx25sd() ? + ext_port = of_machine_is_compatible("eukrea,cpuimx25") ? 4 : 3; imx_audmux_v2_configure_port(int_port, From 45f7d7af0eeb3e59141a2197e796d675ad5416c0 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 8 May 2026 17:03:18 +0200 Subject: [PATCH 637/972] ASoC: cs42l43: Move long delayed work on system_dfl_long_wq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move long delayed work on system_dfl_long_wq Currently the code enqueue work items using {queue|mod}_delayed_work(), using system_long_wq. This workqueue should be used when long works are expected and it is a per-cpu workqueue. The function(s) end up calling __queue_delayed_work(), which set a global timer that could fire anywhere, enqueuing the work where the timer fired. Unbound works could benefit from scheduler task placement, to optimize performance and power consumption. Long work shouldn't stick to a single CPU. Recently, a new unbound workqueue specific for long running work has been added:     c116737e972e ("workqueue: Add system_dfl_long_wq for long unbound works") Since the workqueue work doesn't rely on per-cpu variables, there is no obvious reason that justify the use of a per-cpu workqueue. So change system_long_wq with system_dfl_long_wq so that the work may benefit from scheduler task placement. Cc: David Rhodes Cc: Richard Fitzgerald Cc: patches@opensource.cirrus.com Signed-off-by: Marco Crivellari Link: https://patch.msgid.link/20260508150327.351779-2-marco.crivellari@suse.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs42l43-jack.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/cs42l43-jack.c b/sound/soc/codecs/cs42l43-jack.c index 3e04e6897b1423..934666295ee307 100644 --- a/sound/soc/codecs/cs42l43-jack.c +++ b/sound/soc/codecs/cs42l43-jack.c @@ -805,7 +805,7 @@ irqreturn_t cs42l43_tip_sense(int irq, void *data) if (priv->suspend_jack_debounce) db_delay += priv->tip_fall_db_ms + priv->tip_rise_db_ms; - queue_delayed_work(system_long_wq, &priv->tip_sense_work, + queue_delayed_work(system_dfl_long_wq, &priv->tip_sense_work, msecs_to_jiffies(db_delay)); return IRQ_HANDLED; @@ -932,7 +932,8 @@ int cs42l43_jack_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *u snd_soc_jack_report(priv->jack_hp, 0, 0xFFFF); if (!override) { - queue_delayed_work(system_long_wq, &priv->tip_sense_work, 0); + queue_delayed_work(system_dfl_long_wq, &priv->tip_sense_work, + 0); } else { override--; From 11b92ac8df4418d553ba7d4656e6284fa54737c2 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 8 May 2026 17:03:19 +0200 Subject: [PATCH 638/972] ASoC: codecs: rt5640: Move long delayed work on system_dfl_long_wq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the code enqueue work items using {queue|mod}_delayed_work(), using system_long_wq. This workqueue should be used when long works are expected and it is a per-cpu workqueue. The function(s) end up calling __queue_delayed_work(), which set a global timer that could fire anywhere, enqueuing the work where the timer fired. Unbound works could benefit from scheduler task placement, to optimize performance and power consumption. Long work shouldn't stick to a single CPU. Recently, a new unbound workqueue specific for long running work has been added:     c116737e972e ("workqueue: Add system_dfl_long_wq for long unbound works") Since the workqueue work doesn't rely on per-cpu variables, there is no obvious reason that justify the use of a per-cpu workqueue. So change system_long_wq with system_dfl_long_wq so that the work may benefit from scheduler task placement. Cc: Oder Chiou Signed-off-by: Marco Crivellari Link: https://patch.msgid.link/20260508150327.351779-3-marco.crivellari@suse.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt5640.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sound/soc/codecs/rt5640.c b/sound/soc/codecs/rt5640.c index f6c6294e158805..1a5bb72b7d0301 100644 --- a/sound/soc/codecs/rt5640.c +++ b/sound/soc/codecs/rt5640.c @@ -2397,7 +2397,7 @@ static void rt5640_jack_work(struct work_struct *work) * disabled the OVCD IRQ, the IRQ pin will stay high and as * we react to edges, we miss the unplug event -> recheck. */ - queue_delayed_work(system_long_wq, &rt5640->jack_work, 0); + queue_delayed_work(system_dfl_long_wq, &rt5640->jack_work, 0); } } @@ -2410,7 +2410,8 @@ static irqreturn_t rt5640_irq(int irq, void *data) delay = 100; if (rt5640->jack) - mod_delayed_work(system_long_wq, &rt5640->jack_work, delay); + mod_delayed_work(system_dfl_long_wq, &rt5640->jack_work, + delay); return IRQ_HANDLED; } @@ -2419,7 +2420,7 @@ static irqreturn_t rt5640_jd_gpio_irq(int irq, void *data) { struct rt5640_priv *rt5640 = data; - queue_delayed_work(system_long_wq, &rt5640->jack_work, + queue_delayed_work(system_dfl_long_wq, &rt5640->jack_work, msecs_to_jiffies(JACK_SETTLE_TIME)); return IRQ_HANDLED; @@ -2579,7 +2580,7 @@ static void rt5640_enable_jack_detect(struct snd_soc_component *component, rt5640->irq_requested = true; /* sync initial jack state */ - queue_delayed_work(system_long_wq, &rt5640->jack_work, 0); + queue_delayed_work(system_dfl_long_wq, &rt5640->jack_work, 0); } static const struct snd_soc_dapm_route rt5640_hda_jack_dapm_routes[] = { @@ -2632,7 +2633,7 @@ static void rt5640_enable_hda_jack_detect( rt5640->irq_requested = true; /* sync initial jack state */ - queue_delayed_work(system_long_wq, &rt5640->jack_work, 0); + queue_delayed_work(system_dfl_long_wq, &rt5640->jack_work, 0); snd_soc_dapm_add_routes(dapm, rt5640_hda_jack_dapm_routes, ARRAY_SIZE(rt5640_hda_jack_dapm_routes)); @@ -2860,7 +2861,7 @@ static int rt5640_resume(struct snd_soc_component *component) } enable_irq(rt5640->irq); - queue_delayed_work(system_long_wq, &rt5640->jack_work, 0); + queue_delayed_work(system_dfl_long_wq, &rt5640->jack_work, 0); } return 0; From 6f4cf77320ae0a39f251cd0e4398efafac154dfb Mon Sep 17 00:00:00 2001 From: bui duc phuc Date: Fri, 8 May 2026 17:38:31 +0700 Subject: [PATCH 639/972] ASoC: ti: j721e-evm: Use guard() for mutex locks Clean up the code using guard() for mutex locks. Merely code refactoring, and no behavior change. Signed-off-by: bui duc phuc Acked-by: Jarkko Nikula Link: https://patch.msgid.link/20260508103837.138142-2-phucduc.bui@gmail.com Signed-off-by: Mark Brown --- sound/soc/ti/j721e-evm.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/sound/soc/ti/j721e-evm.c b/sound/soc/ti/j721e-evm.c index faa62c1a9b8e12..8a3333aa32ee02 100644 --- a/sound/soc/ti/j721e-evm.c +++ b/sound/soc/ti/j721e-evm.c @@ -263,7 +263,7 @@ static int j721e_audio_startup(struct snd_pcm_substream *substream) int ret = 0; int i; - mutex_lock(&priv->mutex); + guard(mutex)(&priv->mutex); domain->active++; @@ -303,7 +303,6 @@ static int j721e_audio_startup(struct snd_pcm_substream *substream) out: if (ret) domain->active--; - mutex_unlock(&priv->mutex); return ret; } @@ -323,30 +322,28 @@ static int j721e_audio_hw_params(struct snd_pcm_substream *substream, int ret; int i; - mutex_lock(&priv->mutex); + guard(mutex)(&priv->mutex); - if (domain->rate && domain->rate != params_rate(params)) { - ret = -EINVAL; - goto out; - } + if (domain->rate && domain->rate != params_rate(params)) + return -EINVAL; if (params_width(params) == 16) slot_width = 16; ret = snd_soc_dai_set_tdm_slot(cpu_dai, 0x3, 0x3, 2, slot_width); if (ret && ret != -ENOTSUPP) - goto out; + return ret; for_each_rtd_codec_dais(rtd, i, codec_dai) { ret = snd_soc_dai_set_tdm_slot(codec_dai, 0x3, 0x3, 2, slot_width); if (ret && ret != -ENOTSUPP) - goto out; + return ret; } ret = j721e_configure_refclk(priv, domain_id, params_rate(params)); if (ret) - goto out; + return ret; sysclk_rate = priv->hsdiv_rates[domain->parent_clk_id]; for_each_rtd_codec_dais(rtd, i, codec_dai) { @@ -356,7 +353,7 @@ static int j721e_audio_hw_params(struct snd_pcm_substream *substream, dev_err(priv->dev, "codec set_sysclk failed for %u Hz\n", sysclk_rate); - goto out; + return ret; } } @@ -371,8 +368,6 @@ static int j721e_audio_hw_params(struct snd_pcm_substream *substream, ret = 0; } -out: - mutex_unlock(&priv->mutex); return ret; } @@ -383,15 +378,13 @@ static void j721e_audio_shutdown(struct snd_pcm_substream *substream) unsigned int domain_id = rtd->dai_link->id; struct j721e_audio_domain *domain = &priv->audio_domains[domain_id]; - mutex_lock(&priv->mutex); + guard(mutex)(&priv->mutex); domain->active--; if (!domain->active) { domain->rate = 0; domain->active_link = 0; } - - mutex_unlock(&priv->mutex); } static const struct snd_soc_ops j721e_audio_ops = { From bc59728f93499b2c8655b2379df93f1fd92bc7e3 Mon Sep 17 00:00:00 2001 From: bui duc phuc Date: Fri, 8 May 2026 17:38:32 +0700 Subject: [PATCH 640/972] ASoC: ti: omap-dmic: Use guard() for mutex locks Replace open-coded mutex_lock()/mutex_unlock() pairs with guard(mutex)() and scoped_guard() helpers. This also simplifies the control flow by removing temporary return variables and unnecessary goto-based cleanup paths. No functional change intended. Signed-off-by: bui duc phuc Acked-by: Jarkko Nikula Link: https://patch.msgid.link/20260508103837.138142-3-phucduc.bui@gmail.com Signed-off-by: Mark Brown --- sound/soc/ti/omap-dmic.c | 44 ++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/sound/soc/ti/omap-dmic.c b/sound/soc/ti/omap-dmic.c index fb92bb88eb5c2b..dc92fdb89a0fb1 100644 --- a/sound/soc/ti/omap-dmic.c +++ b/sound/soc/ti/omap-dmic.c @@ -91,18 +91,14 @@ static int omap_dmic_dai_startup(struct snd_pcm_substream *substream, struct snd_soc_dai *dai) { struct omap_dmic *dmic = snd_soc_dai_get_drvdata(dai); - int ret = 0; - - mutex_lock(&dmic->mutex); - if (!snd_soc_dai_active(dai)) - dmic->active = 1; - else - ret = -EBUSY; + guard(mutex)(&dmic->mutex); - mutex_unlock(&dmic->mutex); + if (snd_soc_dai_active(dai)) + return -EBUSY; - return ret; + dmic->active = 1; + return 0; } static void omap_dmic_dai_shutdown(struct snd_pcm_substream *substream, @@ -110,14 +106,12 @@ static void omap_dmic_dai_shutdown(struct snd_pcm_substream *substream, { struct omap_dmic *dmic = snd_soc_dai_get_drvdata(dai); - mutex_lock(&dmic->mutex); + guard(mutex)(&dmic->mutex); cpu_latency_qos_remove_request(&dmic->pm_qos_req); if (!snd_soc_dai_active(dai)) dmic->active = 0; - - mutex_unlock(&dmic->mutex); } static int omap_dmic_select_divider(struct omap_dmic *dmic, int sample_rate) @@ -334,26 +328,24 @@ static int omap_dmic_select_fclk(struct omap_dmic *dmic, int clk_id, return -ENODEV; } - mutex_lock(&dmic->mutex); - if (dmic->active) { - /* disable clock while reparenting */ - pm_runtime_put_sync(dmic->dev); - ret = clk_set_parent(mux, parent_clk); - pm_runtime_get_sync(dmic->dev); - } else { - ret = clk_set_parent(mux, parent_clk); + scoped_guard(mutex, &dmic->mutex) { + if (dmic->active) { + /* disable clock while reparenting */ + pm_runtime_put_sync(dmic->dev); + ret = clk_set_parent(mux, parent_clk); + pm_runtime_get_sync(dmic->dev); + } else { + ret = clk_set_parent(mux, parent_clk); + } } - mutex_unlock(&dmic->mutex); if (ret < 0) { dev_err(dmic->dev, "re-parent failed\n"); - goto err_busy; + } else { + dmic->sysclk = clk_id; + dmic->fclk_freq = freq; } - dmic->sysclk = clk_id; - dmic->fclk_freq = freq; - -err_busy: clk_put(mux); clk_put(parent_clk); From 59115f79acd2132d1ee92ff30d63a3f2113e3c6d Mon Sep 17 00:00:00 2001 From: bui duc phuc Date: Fri, 8 May 2026 17:38:33 +0700 Subject: [PATCH 641/972] ASoC: ti: omap-hdmi: Use guard() for mutex locks Clean up the code using guard() for mutex locks. Merely code refactoring, and no behavior change. Signed-off-by: bui duc phuc Acked-by: Jarkko Nikula Link: https://patch.msgid.link/20260508103837.138142-4-phucduc.bui@gmail.com Signed-off-by: Mark Brown --- sound/soc/ti/omap-hdmi.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/sound/soc/ti/omap-hdmi.c b/sound/soc/ti/omap-hdmi.c index 55e7cb96858fca..e60f5b483fc574 100644 --- a/sound/soc/ti/omap-hdmi.c +++ b/sound/soc/ti/omap-hdmi.c @@ -49,7 +49,7 @@ static void hdmi_dai_abort(struct device *dev) { struct hdmi_audio_data *ad = dev_get_drvdata(dev); - mutex_lock(&ad->current_stream_lock); + guard(mutex)(&ad->current_stream_lock); if (ad->current_stream && ad->current_stream->runtime && snd_pcm_running(ad->current_stream)) { dev_err(dev, "HDMI display disabled, aborting playback\n"); @@ -57,7 +57,6 @@ static void hdmi_dai_abort(struct device *dev) snd_pcm_stop(ad->current_stream, SNDRV_PCM_STATE_DISCONNECTED); snd_pcm_stream_unlock_irq(ad->current_stream); } - mutex_unlock(&ad->current_stream_lock); } static int hdmi_dai_startup(struct snd_pcm_substream *substream, @@ -86,16 +85,14 @@ static int hdmi_dai_startup(struct snd_pcm_substream *substream, snd_soc_dai_set_dma_data(dai, substream, &ad->dma_data); - mutex_lock(&ad->current_stream_lock); - ad->current_stream = substream; - mutex_unlock(&ad->current_stream_lock); + scoped_guard(mutex, &ad->current_stream_lock) + ad->current_stream = substream; ret = ad->ops->audio_startup(ad->dssdev, hdmi_dai_abort); if (ret) { - mutex_lock(&ad->current_stream_lock); - ad->current_stream = NULL; - mutex_unlock(&ad->current_stream_lock); + scoped_guard(mutex, &ad->current_stream_lock) + ad->current_stream = NULL; } return ret; @@ -261,9 +258,8 @@ static void hdmi_dai_shutdown(struct snd_pcm_substream *substream, ad->ops->audio_shutdown(ad->dssdev); - mutex_lock(&ad->current_stream_lock); - ad->current_stream = NULL; - mutex_unlock(&ad->current_stream_lock); + scoped_guard(mutex, &ad->current_stream_lock) + ad->current_stream = NULL; } static const struct snd_soc_dai_ops hdmi_dai_ops = { From 70031e9ad601591cff5562df2b9b9412a700e949 Mon Sep 17 00:00:00 2001 From: bui duc phuc Date: Fri, 8 May 2026 17:38:34 +0700 Subject: [PATCH 642/972] ASoC: ti: omap-mcpdm: Use guard() for mutex locks Clean up the code using guard() for mutex locks. Merely code refactoring, and no behavior change. Signed-off-by: bui duc phuc Acked-by: Jarkko Nikula Link: https://patch.msgid.link/20260508103837.138142-5-phucduc.bui@gmail.com Signed-off-by: Mark Brown --- sound/soc/ti/omap-mcpdm.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sound/soc/ti/omap-mcpdm.c b/sound/soc/ti/omap-mcpdm.c index 1a5d19937c6424..c7d7b502f120fe 100644 --- a/sound/soc/ti/omap-mcpdm.c +++ b/sound/soc/ti/omap-mcpdm.c @@ -251,13 +251,11 @@ static int omap_mcpdm_dai_startup(struct snd_pcm_substream *substream, { struct omap_mcpdm *mcpdm = snd_soc_dai_get_drvdata(dai); - mutex_lock(&mcpdm->mutex); + guard(mutex)(&mcpdm->mutex); if (!snd_soc_dai_active(dai)) omap_mcpdm_open_streams(mcpdm); - mutex_unlock(&mcpdm->mutex); - return 0; } @@ -269,7 +267,7 @@ static void omap_mcpdm_dai_shutdown(struct snd_pcm_substream *substream, int stream1 = tx ? SNDRV_PCM_STREAM_PLAYBACK : SNDRV_PCM_STREAM_CAPTURE; int stream2 = tx ? SNDRV_PCM_STREAM_CAPTURE : SNDRV_PCM_STREAM_PLAYBACK; - mutex_lock(&mcpdm->mutex); + guard(mutex)(&mcpdm->mutex); if (!snd_soc_dai_active(dai)) { if (omap_mcpdm_active(mcpdm)) { @@ -287,8 +285,6 @@ static void omap_mcpdm_dai_shutdown(struct snd_pcm_substream *substream, cpu_latency_qos_remove_request(&mcpdm->pm_qos_req); mcpdm->latency[stream1] = 0; - - mutex_unlock(&mcpdm->mutex); } static int omap_mcpdm_dai_hw_params(struct snd_pcm_substream *substream, From a8217778be47b648163dd6e78537b7aace5f0271 Mon Sep 17 00:00:00 2001 From: bui duc phuc Date: Fri, 8 May 2026 17:38:35 +0700 Subject: [PATCH 643/972] ASoC: ti: ams-delta: Use guard() for spin locks Clean up the code using guard() for spin locks. Merely code refactoring, and no behavior change. Signed-off-by: bui duc phuc Acked-by: Jarkko Nikula Link: https://patch.msgid.link/20260508103837.138142-6-phucduc.bui@gmail.com Signed-off-by: Mark Brown --- sound/soc/ti/ams-delta.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sound/soc/ti/ams-delta.c b/sound/soc/ti/ams-delta.c index ba173d9fcba91d..6dd0d051a757e6 100644 --- a/sound/soc/ti/ams-delta.c +++ b/sound/soc/ti/ams-delta.c @@ -264,10 +264,10 @@ static void cx81801_timeout(struct timer_list *unused) { int muted; - spin_lock(&ams_delta_lock); - cx81801_cmd_pending = 0; - muted = ams_delta_muted; - spin_unlock(&ams_delta_lock); + scoped_guard(spinlock, &ams_delta_lock) { + cx81801_cmd_pending = 0; + muted = ams_delta_muted; + } /* Reconnect the codec DAI back from the modem to the CPU DAI * only if digital mute still off */ @@ -373,11 +373,11 @@ static void cx81801_receive(struct tty_struct *tty, const u8 *cp, const u8 *fp, continue; /* Complete modem response received, apply config to codec */ - spin_lock_bh(&ams_delta_lock); - mod_timer(&cx81801_timer, jiffies + msecs_to_jiffies(150)); - apply = !ams_delta_muted && !cx81801_cmd_pending; - cx81801_cmd_pending = 1; - spin_unlock_bh(&ams_delta_lock); + scoped_guard(spinlock_bh, &ams_delta_lock) { + mod_timer(&cx81801_timer, jiffies + msecs_to_jiffies(150)); + apply = !ams_delta_muted && !cx81801_cmd_pending; + cx81801_cmd_pending = 1; + } /* Apply config pulse by connecting the codec to the modem * if not already done */ @@ -426,10 +426,10 @@ static int ams_delta_mute(struct snd_soc_dai *dai, int mute, int direction) if (ams_delta_muted == mute) return 0; - spin_lock_bh(&ams_delta_lock); - ams_delta_muted = mute; - apply = !cx81801_cmd_pending; - spin_unlock_bh(&ams_delta_lock); + scoped_guard(spinlock_bh, &ams_delta_lock) { + ams_delta_muted = mute; + apply = !cx81801_cmd_pending; + } if (apply) gpiod_set_value(gpiod_modem_codec, !!mute); From d94e794e8294382dd438b1d7ae6cd1958b4af1c3 Mon Sep 17 00:00:00 2001 From: bui duc phuc Date: Fri, 8 May 2026 17:38:36 +0700 Subject: [PATCH 644/972] ASoC: ti: omap-mcbsp-st: Use guard() for spin locks Clean up the code using guard() for spin locks. Merely code refactoring, and no behavior change. Signed-off-by: bui duc phuc Acked-by: Jarkko Nikula Tested-by: Jarkko Nikula Link: https://patch.msgid.link/20260508103837.138142-7-phucduc.bui@gmail.com Signed-off-by: Mark Brown --- sound/soc/ti/omap-mcbsp-st.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/sound/soc/ti/omap-mcbsp-st.c b/sound/soc/ti/omap-mcbsp-st.c index 901578896ef324..b762d5d3e33b6e 100644 --- a/sound/soc/ti/omap-mcbsp-st.c +++ b/sound/soc/ti/omap-mcbsp-st.c @@ -156,7 +156,7 @@ static int omap_mcbsp_st_set_chgain(struct omap_mcbsp *mcbsp, int channel, if (!st_data) return -ENOENT; - spin_lock_irq(&mcbsp->lock); + guard(spinlock_irq)(&mcbsp->lock); if (channel == 0) st_data->ch0gain = chgain; else if (channel == 1) @@ -166,7 +166,6 @@ static int omap_mcbsp_st_set_chgain(struct omap_mcbsp *mcbsp, int channel, if (st_data->enabled) omap_mcbsp_st_chgain(mcbsp); - spin_unlock_irq(&mcbsp->lock); return ret; } @@ -180,14 +179,13 @@ static int omap_mcbsp_st_get_chgain(struct omap_mcbsp *mcbsp, int channel, if (!st_data) return -ENOENT; - spin_lock_irq(&mcbsp->lock); + guard(spinlock_irq)(&mcbsp->lock); if (channel == 0) *chgain = st_data->ch0gain; else if (channel == 1) *chgain = st_data->ch1gain; else ret = -EINVAL; - spin_unlock_irq(&mcbsp->lock); return ret; } @@ -199,10 +197,9 @@ static int omap_mcbsp_st_enable(struct omap_mcbsp *mcbsp) if (!st_data) return -ENODEV; - spin_lock_irq(&mcbsp->lock); + guard(spinlock_irq)(&mcbsp->lock); st_data->enabled = 1; omap_mcbsp_st_start(mcbsp); - spin_unlock_irq(&mcbsp->lock); return 0; } @@ -215,10 +212,9 @@ static int omap_mcbsp_st_disable(struct omap_mcbsp *mcbsp) if (!st_data) return -ENODEV; - spin_lock_irq(&mcbsp->lock); + guard(spinlock_irq)(&mcbsp->lock); omap_mcbsp_st_stop(mcbsp); st_data->enabled = 0; - spin_unlock_irq(&mcbsp->lock); return ret; } @@ -241,13 +237,12 @@ static ssize_t st_taps_show(struct device *dev, ssize_t status = 0; int i; - spin_lock_irq(&mcbsp->lock); + guard(spinlock_irq)(&mcbsp->lock); for (i = 0; i < st_data->nr_taps; i++) status += sysfs_emit_at(buf, status, (i ? ", %d" : "%d"), st_data->taps[i]); if (i) status += sysfs_emit_at(buf, status, "\n"); - spin_unlock_irq(&mcbsp->lock); return status; } @@ -260,19 +255,17 @@ static ssize_t st_taps_store(struct device *dev, struct omap_mcbsp_st_data *st_data = mcbsp->st_data; int val, tmp, status, i = 0; - spin_lock_irq(&mcbsp->lock); + guard(spinlock_irq)(&mcbsp->lock); memset(st_data->taps, 0, sizeof(st_data->taps)); st_data->nr_taps = 0; do { status = sscanf(buf, "%d%n", &val, &tmp); if (status < 0 || status == 0) { - size = -EINVAL; - goto out; + return -EINVAL; } if (val < -32768 || val > 32767) { - size = -EINVAL; - goto out; + return -EINVAL; } st_data->taps[i++] = val; buf += tmp; @@ -283,9 +276,6 @@ static ssize_t st_taps_store(struct device *dev, st_data->nr_taps = i; -out: - spin_unlock_irq(&mcbsp->lock); - return size; } From 822f67bc269d6c393a978fad9915a28ac272738f Mon Sep 17 00:00:00 2001 From: bui duc phuc Date: Fri, 8 May 2026 17:38:37 +0700 Subject: [PATCH 645/972] ASoC: ti: omap-mcbsp: Simplify lock and resource handling Convert spinlock protected sections to guard()/scoped_guard() helpers and simplify the cleanup paths, including the reg_cache lifetime handling in omap_mcbsp_request(). No functional change intended. Signed-off-by: bui duc phuc Acked-by: Jarkko Nikula Link: https://patch.msgid.link/20260508103837.138142-8-phucduc.bui@gmail.com Signed-off-by: Mark Brown --- sound/soc/ti/omap-mcbsp.c | 54 +++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/sound/soc/ti/omap-mcbsp.c b/sound/soc/ti/omap-mcbsp.c index 411970399271a2..d82fef629867aa 100644 --- a/sound/soc/ti/omap-mcbsp.c +++ b/sound/soc/ti/omap-mcbsp.c @@ -290,23 +290,22 @@ static u16 omap_mcbsp_get_rx_delay(struct omap_mcbsp *mcbsp) static int omap_mcbsp_request(struct omap_mcbsp *mcbsp) { - void *reg_cache; + void *reg_cache __free(kfree) = kzalloc(mcbsp->reg_cache_size, GFP_KERNEL); int err; - reg_cache = kzalloc(mcbsp->reg_cache_size, GFP_KERNEL); if (!reg_cache) return -ENOMEM; - spin_lock(&mcbsp->lock); - if (!mcbsp->free) { - dev_err(mcbsp->dev, "McBSP%d is currently in use\n", mcbsp->id); - err = -EBUSY; - goto err_kfree; - } + scoped_guard(spinlock, &mcbsp->lock) { + if (!mcbsp->free) { + dev_err(mcbsp->dev, "McBSP%d is currently in use\n", mcbsp->id); + return -EBUSY; + } - mcbsp->free = false; - mcbsp->reg_cache = reg_cache; - spin_unlock(&mcbsp->lock); + mcbsp->free = false; + mcbsp->reg_cache = reg_cache; + reg_cache = NULL; + } if(mcbsp->pdata->ops && mcbsp->pdata->ops->request) mcbsp->pdata->ops->request(mcbsp->id - 1); @@ -352,12 +351,11 @@ static int omap_mcbsp_request(struct omap_mcbsp *mcbsp) if (mcbsp->pdata->has_wakeup) MCBSP_WRITE(mcbsp, WAKEUPEN, 0); - spin_lock(&mcbsp->lock); - mcbsp->free = true; - mcbsp->reg_cache = NULL; -err_kfree: - spin_unlock(&mcbsp->lock); - kfree(reg_cache); + scoped_guard(spinlock, &mcbsp->lock) { + reg_cache = mcbsp->reg_cache; + mcbsp->free = true; + mcbsp->reg_cache = NULL; + } return err; } @@ -395,13 +393,13 @@ static void omap_mcbsp_free(struct omap_mcbsp *mcbsp) if (!mcbsp_omap1()) omap2_mcbsp_set_clks_src(mcbsp, MCBSP_CLKS_PRCM_SRC); - spin_lock(&mcbsp->lock); - if (mcbsp->free) - dev_err(mcbsp->dev, "McBSP%d was not reserved\n", mcbsp->id); - else - mcbsp->free = true; - mcbsp->reg_cache = NULL; - spin_unlock(&mcbsp->lock); + scoped_guard(spinlock, &mcbsp->lock) { + if (mcbsp->free) + dev_err(mcbsp->dev, "McBSP%d was not reserved\n", mcbsp->id); + else + mcbsp->free = true; + mcbsp->reg_cache = NULL; + } kfree(reg_cache); } @@ -581,16 +579,12 @@ static ssize_t dma_op_mode_store(struct device *dev, if (i < 0) return i; - spin_lock_irq(&mcbsp->lock); + guard(spinlock_irq)(&mcbsp->lock); if (!mcbsp->free) { - size = -EBUSY; - goto unlock; + return -EBUSY; } mcbsp->dma_op_mode = i; -unlock: - spin_unlock_irq(&mcbsp->lock); - return size; } From ac2f21ceddeec5553285e9fc4837a1f23d5e6a37 Mon Sep 17 00:00:00 2001 From: Mac Chiang Date: Fri, 8 May 2026 18:42:37 +0800 Subject: [PATCH 646/972] ASoC: Intel: soc-acpi-intel-arl-match: Reorder ACPI machine tables When the SOF device driver enumerates the machine tables, it selects the entry with the most numbers of matched links in ascending order. Align the ordering with commit 08095e20995ad6e3648af7416c90163627fe7e44 ("ASoC: Intel: soc-acpi-intel-ptl-match: Sort ACPI link/machine tables"). Signed-off-by: Mac Chiang Signed-off-by: Bard Liao Link: https://patch.msgid.link/20260508104239.1247525-2-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- .../intel/common/soc-acpi-intel-arl-match.c | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sound/soc/intel/common/soc-acpi-intel-arl-match.c b/sound/soc/intel/common/soc-acpi-intel-arl-match.c index c952f7d2b2c0e0..cd4023ccadeb65 100644 --- a/sound/soc/intel/common/soc-acpi-intel-arl-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-arl-match.c @@ -483,12 +483,18 @@ struct snd_soc_acpi_mach snd_soc_acpi_intel_arl_sdw_machines[] = { .get_function_tplg_files = sof_sdw_get_tplg_files, }, { - .link_mask = BIT(0), - .links = arl_cs42l43_l0, + .link_mask = BIT(0) | BIT(2), + .links = arl_rt722_l0_rt1320_l2, .drv_name = "sof_sdw", - .sof_tplg_filename = "sof-arl-cs42l43-l0.tplg", + .sof_tplg_filename = "sof-arl-rt722-l0_rt1320-l2.tplg", .get_function_tplg_files = sof_sdw_get_tplg_files, }, + { + .link_mask = BIT(0) | BIT(3), + .links = arl_rt711_l0_rt1316_l3, + .drv_name = "sof_sdw", + .sof_tplg_filename = "sof-arl-rt711-l0-rt1316-l3.tplg", + }, { .link_mask = BIT(2) | BIT(3), .links = arl_cs42l43_l2_cs35l56_l3, @@ -497,18 +503,12 @@ struct snd_soc_acpi_mach snd_soc_acpi_intel_arl_sdw_machines[] = { .get_function_tplg_files = sof_sdw_get_tplg_files, }, { - .link_mask = BIT(2), - .links = arl_cs42l43_l2, + .link_mask = BIT(0), + .links = arl_cs42l43_l0, .drv_name = "sof_sdw", - .sof_tplg_filename = "sof-arl-cs42l43-l2.tplg", + .sof_tplg_filename = "sof-arl-cs42l43-l0.tplg", .get_function_tplg_files = sof_sdw_get_tplg_files, }, - { - .link_mask = BIT(0) | BIT(3), - .links = arl_rt711_l0_rt1316_l3, - .drv_name = "sof_sdw", - .sof_tplg_filename = "sof-arl-rt711-l0-rt1316-l3.tplg", - }, { .link_mask = 0x1, /* link0 required */ .links = arl_rvp, @@ -522,10 +522,10 @@ struct snd_soc_acpi_mach snd_soc_acpi_intel_arl_sdw_machines[] = { .sof_tplg_filename = "sof-arl-rt711-l0.tplg", }, { - .link_mask = BIT(0) | BIT(2), - .links = arl_rt722_l0_rt1320_l2, + .link_mask = BIT(2), + .links = arl_cs42l43_l2, .drv_name = "sof_sdw", - .sof_tplg_filename = "sof-arl-rt722-l0_rt1320-l2.tplg", + .sof_tplg_filename = "sof-arl-cs42l43-l2.tplg", .get_function_tplg_files = sof_sdw_get_tplg_files, }, {}, From 242200c297030d9bab62c0ea65f2094981bcf013 Mon Sep 17 00:00:00 2001 From: Gary C Wang Date: Fri, 8 May 2026 18:42:38 +0800 Subject: [PATCH 647/972] ASoC: soc-acpi-intel-arl-match: add rt712_l0_rt1320_l3 support Add support for using the rt712 multi-function codec on link 0 and the rt1320 amplifier on link 3 on ARL platforms. Signed-off-by: Gary C Wang Co-developed-by: Mac Chiang Signed-off-by: Mac Chiang Signed-off-by: Bard Liao Link: https://patch.msgid.link/20260508104239.1247525-3-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- .../intel/common/soc-acpi-intel-arl-match.c | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/sound/soc/intel/common/soc-acpi-intel-arl-match.c b/sound/soc/intel/common/soc-acpi-intel-arl-match.c index cd4023ccadeb65..52c5b5719f51c0 100644 --- a/sound/soc/intel/common/soc-acpi-intel-arl-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-arl-match.c @@ -8,6 +8,7 @@ #include #include #include +#include "soc-acpi-intel-sdca-quirks.h" #include "sof-function-topology-lib.h" static const struct snd_soc_acpi_endpoint single_endpoint = { @@ -237,6 +238,15 @@ static const struct snd_soc_acpi_adr_device rt722_0_agg_adr[] = { } }; +static const struct snd_soc_acpi_adr_device rt712_0_agg_adr[] = { + { + .adr = 0x000030025D071201ull, + .num_endpoints = ARRAY_SIZE(jack_amp_g1_dmic_endpoints), + .endpoints = jack_amp_g1_dmic_endpoints, + .name_prefix = "rt712" + } +}; + static const struct snd_soc_acpi_adr_device rt1316_3_single_adr[] = { { .adr = 0x000330025D131601ull, @@ -255,6 +265,15 @@ static const struct snd_soc_acpi_adr_device rt1320_2_single_adr[] = { } }; +static const struct snd_soc_acpi_adr_device rt1320_3_group1_adr[] = { + { + .adr = 0x000330025D132001ull, + .num_endpoints = 1, + .endpoints = &spk_r_endpoint, + .name_prefix = "rt1320-1" + } +}; + static const struct snd_soc_acpi_link_adr arl_cs42l43_l0[] = { { .mask = BIT(0), @@ -404,6 +423,20 @@ static const struct snd_soc_acpi_link_adr arl_rt722_l0_rt1320_l2[] = { {} }; +static const struct snd_soc_acpi_link_adr arl_rt712_l0_rt1320_l3[] = { + { + .mask = BIT(0), + .num_adr = ARRAY_SIZE(rt712_0_agg_adr), + .adr_d = rt712_0_agg_adr, + }, + { + .mask = BIT(3), + .num_adr = ARRAY_SIZE(rt1320_3_group1_adr), + .adr_d = rt1320_3_group1_adr, + }, + {} +}; + static const struct snd_soc_acpi_codecs arl_essx_83x6 = { .num_codecs = 3, .codecs = { "ESSX8316", "ESSX8326", "ESSX8336"}, @@ -495,6 +528,14 @@ struct snd_soc_acpi_mach snd_soc_acpi_intel_arl_sdw_machines[] = { .drv_name = "sof_sdw", .sof_tplg_filename = "sof-arl-rt711-l0-rt1316-l3.tplg", }, + { + .link_mask = BIT(0) | BIT(3), + .links = arl_rt712_l0_rt1320_l3, + .drv_name = "sof_sdw", + .machine_check = snd_soc_acpi_intel_sdca_is_device_rt712_vb, + .sof_tplg_filename = "sof-arl-rt712-l0-rt1320-l3.tplg", + .get_function_tplg_files = sof_sdw_get_tplg_files, + }, { .link_mask = BIT(2) | BIT(3), .links = arl_cs42l43_l2_cs35l56_l3, From d714913b61d55b936e9060e51e7b78216d34f6b1 Mon Sep 17 00:00:00 2001 From: Jang Pyohwan Date: Sat, 9 May 2026 17:53:10 +0900 Subject: [PATCH 648/972] ASoC: Intel: soc-acpi: add LG Gram 16Z90U RT713 + single RT1320 quirk Add a SoundWire machine table entry for the LG Gram Pro 2026 (16Z90U-KU7BK), which has an unusual configuration: sdw:0:1:025d:1320:01 single stereo RT1320 SmartAmp on link 1 sdw:0:3:025d:0713:01 RT713 jack/headset codec on link 3 Existing rt713-rt1320 boards have two RT1320 amps on different links ("link_mask = BIT(1) | BIT(2) | BIT(3)"). The LG Gram uses a single stereo RT1320 chip, so the new entry uses "link_mask = BIT(1) | BIT(3)" with the existing rt1320_1_group2_adr structure, leaving the two-channel routing to the topology. The RT713 on this board does not expose a SMART_MIC function in ACPI, so the .machine_check callback used by the existing entries (snd_soc_acpi_intel_sdca_is_device_rt712_vb) would reject this board. Drop machine_check for the new entry; speaker output and the headset jack do not depend on the SMART_MIC presence check. The corresponding topology source has been submitted to the SOF project at https://github.com/thesofproject/sof/pull/10760 . The generated sof-ptl-rt713-l3-rt1320-l1-2ch.tplg and nhlt-sof-ptl-rt713-l3-rt1320-l1.bin will follow in linux-firmware once that lands. Tested on Ubuntu 26.04 with kernel 7.0.0-15: speaker (RT1320 stereo), headphone jack with auto-routing, headset mic, and the internal NHLT DMIC array all work via the UCM HiFi profile. Signed-off-by: Jang Pyohwan Link: https://patch.msgid.link/20260509175317.DnhjxHczQay7kkp5z6t4lg@vhgksl.daum.net Signed-off-by: Mark Brown --- .../intel/common/soc-acpi-intel-ptl-match.c | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sound/soc/intel/common/soc-acpi-intel-ptl-match.c b/sound/soc/intel/common/soc-acpi-intel-ptl-match.c index 3b7818355ff645..ad3af8834e431a 100644 --- a/sound/soc/intel/common/soc-acpi-intel-ptl-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-ptl-match.c @@ -493,6 +493,20 @@ static const struct snd_soc_acpi_link_adr ptl_sdw_rt713_vb_l3_rt1320_l12[] = { {} }; +static const struct snd_soc_acpi_link_adr ptl_sdw_rt713_vb_l3_rt1320_l1[] = { + { + .mask = BIT(3), + .num_adr = ARRAY_SIZE(rt713_vb_3_adr), + .adr_d = rt713_vb_3_adr, + }, + { + .mask = BIT(1), + .num_adr = ARRAY_SIZE(rt1320_1_group2_adr), + .adr_d = rt1320_1_group2_adr, + }, + {} +}; + static const struct snd_soc_acpi_link_adr ptl_sdw_rt712_vb_l2_rt1320_l1[] = { { .mask = BIT(2), @@ -578,6 +592,13 @@ struct snd_soc_acpi_mach snd_soc_acpi_intel_ptl_sdw_machines[] = { .sof_tplg_filename = "sof-ptl-rt713-l3-rt1320-l12.tplg", .get_function_tplg_files = sof_sdw_get_tplg_files, }, + { + .link_mask = BIT(1) | BIT(3), + .links = ptl_sdw_rt713_vb_l3_rt1320_l1, + .drv_name = "sof_sdw", + .sof_tplg_filename = "sof-ptl-rt713-l3-rt1320-l1.tplg", + .get_function_tplg_files = sof_sdw_get_tplg_files, + }, { .link_mask = BIT(1) | BIT(2) | BIT(3), .links = ptl_cs42l43_l2_cs35l56x6_l13, From 9c37daee7c17fa17e8d41089ee1f658b06cb672a Mon Sep 17 00:00:00 2001 From: Mac Chiang Date: Fri, 8 May 2026 17:32:23 +0800 Subject: [PATCH 649/972] ASoC: sdw_utils: Add quirk to ignore RT712 CODEC_MIC Some devices do not use CODEC_MIC but use the host PCH_DMIC instead. Add a quirk to skip the CODEC_MIC DAI when it is not present in disco table, ensuring the correct capture device is used. If CODEC_MIC is present, it continues to be used as default. Fixes: 9489db97f6f0 ("ASoC: sdw_utils: add SmartMic DAI for RT712 VB") Signed-off-by: Mac Chiang Signed-off-by: Bard Liao Link: https://patch.msgid.link/20260508093224.1246282-2-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sdw_utils/soc_sdw_utils.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/sdw_utils/soc_sdw_utils.c b/sound/soc/sdw_utils/soc_sdw_utils.c index 849ae876d7a442..863ded2ceda541 100644 --- a/sound/soc/sdw_utils/soc_sdw_utils.c +++ b/sound/soc/sdw_utils/soc_sdw_utils.c @@ -194,6 +194,8 @@ struct asoc_sdw_codec_info codec_info_list[] = { .dai_type = SOC_SDW_DAI_TYPE_MIC, .dailink = {SOC_SDW_UNUSED_DAI_ID, SOC_SDW_DMIC_DAI_ID}, .rtd_init = asoc_sdw_rt_dmic_rtd_init, + .quirk = SOC_SDW_CODEC_MIC, + .quirk_exclude = true, }, }, .dai_num = 3, From fa749a77bdc50f0d695aaf81f1bd55967d77d10f Mon Sep 17 00:00:00 2001 From: Mac Chiang Date: Fri, 8 May 2026 17:32:24 +0800 Subject: [PATCH 650/972] ASoC: sdw_utils: Add quirk to ignore RT721 CODEC_MIC Add a quirk to skip the CODEC_MIC DAI when it is not present. This ensures PCH_DMIC is used as the fallback; otherwise, CODEC_MIC remains the default. Fixes: 846a8d3cf3ba ("ASoC: Intel: soc-acpi-intel-ptl-match: Add rt721 support") Signed-off-by: Mac Chiang Signed-off-by: Bard Liao Link: https://patch.msgid.link/20260508093224.1246282-3-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sdw_utils/soc_sdw_utils.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/sdw_utils/soc_sdw_utils.c b/sound/soc/sdw_utils/soc_sdw_utils.c index 863ded2ceda541..be45a2e62d3ea6 100644 --- a/sound/soc/sdw_utils/soc_sdw_utils.c +++ b/sound/soc/sdw_utils/soc_sdw_utils.c @@ -503,6 +503,8 @@ struct asoc_sdw_codec_info codec_info_list[] = { .dai_type = SOC_SDW_DAI_TYPE_MIC, .dailink = {SOC_SDW_UNUSED_DAI_ID, SOC_SDW_DMIC_DAI_ID}, .rtd_init = asoc_sdw_rt_dmic_rtd_init, + .quirk = SOC_SDW_CODEC_MIC, + .quirk_exclude = true, }, }, .dai_num = 3, From 954222bb686d22f4359b7014cbce16bca2470927 Mon Sep 17 00:00:00 2001 From: Maciej Strozek Date: Fri, 8 May 2026 17:47:49 +0800 Subject: [PATCH 651/972] ASoC: soc_sdw_utils: skip aux device if it is not present Similarly to codec endpoints that may not be used [1], aux devices (like HID) also may not be used. Hence skip aux devices which are not present. [1] https://lore.kernel.org/all/20250414063239.85200-12-yung-chuan.liao@linux.intel.com/ Signed-off-by: Maciej Strozek Signed-off-by: Bard Liao Link: https://patch.msgid.link/20260508094750.1246796-2-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sdw_utils/soc_sdw_utils.c | 76 ++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 2 deletions(-) diff --git a/sound/soc/sdw_utils/soc_sdw_utils.c b/sound/soc/sdw_utils/soc_sdw_utils.c index 849ae876d7a442..88aedc2099650b 100644 --- a/sound/soc/sdw_utils/soc_sdw_utils.c +++ b/sound/soc/sdw_utils/soc_sdw_utils.c @@ -1553,6 +1553,63 @@ int asoc_sdw_init_simple_dai_link(struct device *dev, struct snd_soc_dai_link *d } EXPORT_SYMBOL_NS(asoc_sdw_init_simple_dai_link, "SND_SOC_SDW_UTILS"); +/** + * is_sdca_aux_dev_present - Check if an SDCA aux device is present on the SDW peripheral + * @dev: Device pointer + * @aux_codec_name: Aux codec name from the codec info (e.g. "snd_soc_sdca.HID.2") + * @adr_link: ACPI link address + * @adr_index: Index of the ACPI link address + * + * Return: 1 if the aux is present, 0 if the aux is not present, or negative error code. + */ +static int is_sdca_aux_dev_present(struct device *dev, + const char *aux_codec_name, + const struct snd_soc_acpi_link_adr *adr_link, + int adr_index) +{ + struct sdw_slave *slave; + struct device *sdw_dev; + const char *sdw_codec_name; + int ret = 0; + int i; + + if (!aux_codec_name) + return 0; + + sdw_codec_name = _asoc_sdw_get_codec_name(dev, adr_link, adr_index); + if (!sdw_codec_name) + return -ENOMEM; + + sdw_dev = bus_find_device_by_name(&sdw_bus_type, NULL, sdw_codec_name); + if (!sdw_dev) { + dev_err(dev, "codec %s not found\n", sdw_codec_name); + return -EINVAL; + } + + slave = dev_to_sdw_dev(sdw_dev); + + if (!slave->sdca_data.interface_revision) { + dev_warn(dev, "No SDCA properties, assuming aux '%s' present\n", aux_codec_name); + ret = 1; + goto put_dev; + } + + for (i = 0; i < slave->sdca_data.num_functions; i++) { + const char *fname = slave->sdca_data.function[i].name; + + if (fname && strstr(aux_codec_name, fname)) { + ret = 1; + goto put_dev; + } + } + + dev_dbg(dev, "SDCA function for aux '%s' NOT FOUND on slave, skipping\n", aux_codec_name); + +put_dev: + put_device(sdw_dev); + return ret; +} + int asoc_sdw_count_sdw_endpoints(struct snd_soc_card *card, int *num_devs, int *num_ends, int *num_aux) { @@ -1560,7 +1617,7 @@ int asoc_sdw_count_sdw_endpoints(struct snd_soc_card *card, struct snd_soc_acpi_mach *mach = dev_get_platdata(dev); struct snd_soc_acpi_mach_params *mach_params = &mach->mach_params; const struct snd_soc_acpi_link_adr *adr_link; - int i; + int i, j, ret; for (adr_link = mach_params->links; adr_link->num_adr; adr_link++) { *num_devs += adr_link->num_adr; @@ -1575,7 +1632,14 @@ int asoc_sdw_count_sdw_endpoints(struct snd_soc_card *card, if (!codec_info) return -EINVAL; - *num_aux += codec_info->aux_num; + for (j = 0; j < codec_info->aux_num; j++) { + ret = is_sdca_aux_dev_present(dev, codec_info->auxs[j].codec_name, + adr_link, i); + if (ret < 0) + return ret; + if (ret) + (*num_aux)++; + } } } @@ -1739,6 +1803,14 @@ int asoc_sdw_parse_sdw_endpoints(struct snd_soc_card *card, for (j = 0; j < codec_info->aux_num; j++) { struct snd_soc_component *component; + ret = is_sdca_aux_dev_present(dev, codec_info->auxs[j].codec_name, + adr_link, i); + if (ret < 0) + return ret; + + if (ret == 0) + continue; + component = snd_soc_lookup_component_by_name(codec_info->auxs[j].codec_name); if (component) { dev_dbg(dev, "%s found component %s for aux name %s\n", From 21bcd34e70afbe998e143d173d80a02ccace9fe0 Mon Sep 17 00:00:00 2001 From: Maciej Strozek Date: Fri, 8 May 2026 17:47:50 +0800 Subject: [PATCH 652/972] ASoC: soc_sdw_utils: Change comment into proper kernel doc Update the comment above is_sdca_endpoint_present(). Signed-off-by: Maciej Strozek Signed-off-by: Bard Liao Link: https://patch.msgid.link/20260508094750.1246796-3-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sdw_utils/soc_sdw_utils.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/sound/soc/sdw_utils/soc_sdw_utils.c b/sound/soc/sdw_utils/soc_sdw_utils.c index 88aedc2099650b..4ade1d9e79bfd2 100644 --- a/sound/soc/sdw_utils/soc_sdw_utils.c +++ b/sound/soc/sdw_utils/soc_sdw_utils.c @@ -1688,20 +1688,16 @@ int asoc_sdw_get_dai_type(u32 type) } EXPORT_SYMBOL_NS(asoc_sdw_get_dai_type, "SND_SOC_SDW_UTILS"); -/* - * Check if the SDCA endpoint is present by the SDW peripheral - * +/** + * is_sdca_endpoint_present - Check if an SDCA endpoint is present on the SDW peripheral * @dev: Device pointer * @codec_info: Codec info pointer * @adr_link: ACPI link address * @adr_index: Index of the ACPI link address * @end_index: Index of the endpoint * - * Return: 1 if the endpoint is present, - * 0 if the endpoint is not present, - * negative error code. + * Return: 1 if the endpoint is present, 0 if the endpoint is not present, or negative error code. */ - static int is_sdca_endpoint_present(struct device *dev, struct asoc_sdw_codec_info *codec_info, const struct snd_soc_acpi_link_adr *adr_link, From 42d99857d6f08a40a8bde7b9e68d330f18b159a0 Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Thu, 30 Apr 2026 16:07:52 +0200 Subject: [PATCH 653/972] ASoC: core: Move all users to deferrable card binding Commit a3375522bb5e2 ("ASoC: core: Complete support for card rebinding") completed the feature and at the same time divided ASoC users into two groups: 1) cards that fail to enumerate the moment one of the components is not available 2) cards that succeed to enumerate even if some of their components become available late Given the component-based nature of ASoC, approach 2) is preferred and can be used by all ASoC users. By dropping 1) the card binding code can also be simplified. Flatten code that is currently conditional based on ->devres_dev and convert snd_soc_rebind_card() to call_soc_bind_card(). The latter is a selector between managed and unmanaged card-binding behaviour to keep non-devm users happy. With rebinding being the default, devm_snd_soc_register_card() takes form of its deferrable friend - all the devm job is already done by devm_snd_soc_bind_card(). Suggested-by: Kuninori Morimoto Signed-off-by: Cezary Rojewski Link: https://patch.msgid.link/20260430140752.766130-1-cezary.rojewski@intel.com Signed-off-by: Mark Brown --- include/sound/soc.h | 2 +- sound/soc/soc-core.c | 44 ++++++++++++++---------------------------- sound/soc/soc-devres.c | 28 +-------------------------- 3 files changed, 17 insertions(+), 57 deletions(-) diff --git a/include/sound/soc.h b/include/sound/soc.h index 5e3eb617d8323a..50d1fd110a4ab1 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -431,7 +431,7 @@ struct snd_soc_jack_pin; int snd_soc_register_card(struct snd_soc_card *card); void snd_soc_unregister_card(struct snd_soc_card *card); int devm_snd_soc_register_card(struct device *dev, struct snd_soc_card *card); -int devm_snd_soc_register_deferrable_card(struct device *dev, struct snd_soc_card *card); +#define devm_snd_soc_register_deferrable_card(d, c) devm_snd_soc_register_card(d, c) #ifdef CONFIG_PM_SLEEP int snd_soc_suspend(struct device *dev); int snd_soc_resume(struct device *dev); diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 3fecf9fc903c00..1d9099595a88ef 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -2168,6 +2168,7 @@ static int snd_soc_bind_card(struct snd_soc_card *card) snd_soc_fill_dummy_dai(card); snd_soc_dapm_init(dapm, card, NULL); + list_del_init(&card->list); /* check whether any platform is ignore machine FE and using topology */ soc_check_tplg_fes(card); @@ -2311,6 +2312,10 @@ static int snd_soc_bind_card(struct snd_soc_card *card) probe_end: if (ret < 0) soc_cleanup_card_resources(card); + if (ret == -EPROBE_DEFER) { + list_add(&card->list, &unbind_card_list); + ret = 0; + } snd_soc_card_mutex_unlock(card); return ret; @@ -2326,12 +2331,15 @@ static int devm_snd_soc_bind_card(struct device *dev, struct snd_soc_card *card) struct snd_soc_card **ptr; int ret; + /* The procedure may be called many times during the lifetime of the card. */ + devres_destroy(dev, devm_card_bind_release, NULL, NULL); + ptr = devres_alloc(devm_card_bind_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) return -ENOMEM; ret = snd_soc_bind_card(card); - if (ret == 0 || ret == -EPROBE_DEFER) { + if (ret == 0) { *ptr = card; devres_add(dev, ptr); } else { @@ -2341,21 +2349,11 @@ static int devm_snd_soc_bind_card(struct device *dev, struct snd_soc_card *card) return ret; } -static int snd_soc_rebind_card(struct snd_soc_card *card) +static int call_soc_bind_card(struct snd_soc_card *card) { - int ret; - - if (card->devres_dev) { - devres_destroy(card->devres_dev, devm_card_bind_release, NULL, NULL); - ret = devm_snd_soc_bind_card(card->devres_dev, card); - } else { - ret = snd_soc_bind_card(card); - } - - if (ret != -EPROBE_DEFER) - list_del_init(&card->list); - - return ret; + if (card->devres_dev) + return devm_snd_soc_bind_card(card->devres_dev, card); + return snd_soc_bind_card(card); } /* probes a new socdev */ @@ -2553,8 +2551,6 @@ EXPORT_SYMBOL_GPL(snd_soc_add_dai_controls); */ int snd_soc_register_card(struct snd_soc_card *card) { - int ret; - if (!card->name || !card->dev) return -EINVAL; @@ -2580,17 +2576,7 @@ int snd_soc_register_card(struct snd_soc_card *card) guard(mutex)(&client_mutex); - if (card->devres_dev) { - ret = devm_snd_soc_bind_card(card->devres_dev, card); - if (ret == -EPROBE_DEFER) { - list_add(&card->list, &unbind_card_list); - ret = 0; - } - } else { - ret = snd_soc_bind_card(card); - } - - return ret; + return call_soc_bind_card(card); } EXPORT_SYMBOL_GPL(snd_soc_register_card); @@ -2910,7 +2896,7 @@ int snd_soc_add_component(struct snd_soc_component *component, list_add(&component->list, &component_list); list_for_each_entry_safe(card, c, &unbind_card_list, list) - snd_soc_rebind_card(card); + call_soc_bind_card(card); err_cleanup: if (ret < 0) diff --git a/sound/soc/soc-devres.c b/sound/soc/soc-devres.c index d33f83ec24f275..718165ba84ac87 100644 --- a/sound/soc/soc-devres.c +++ b/sound/soc/soc-devres.c @@ -49,11 +49,6 @@ int devm_snd_soc_register_component(struct device *dev, } EXPORT_SYMBOL_GPL(devm_snd_soc_register_component); -static void devm_card_release(struct device *dev, void *res) -{ - snd_soc_unregister_card(*(struct snd_soc_card **)res); -} - /** * devm_snd_soc_register_card - resource managed card registration * @dev: Device used to manage card @@ -63,32 +58,11 @@ static void devm_card_release(struct device *dev, void *res) * unregistered. */ int devm_snd_soc_register_card(struct device *dev, struct snd_soc_card *card) -{ - struct snd_soc_card **ptr; - int ret; - - ptr = devres_alloc(devm_card_release, sizeof(*ptr), GFP_KERNEL); - if (!ptr) - return -ENOMEM; - - ret = snd_soc_register_card(card); - if (ret == 0) { - *ptr = card; - devres_add(dev, ptr); - } else { - devres_free(ptr); - } - - return ret; -} -EXPORT_SYMBOL_GPL(devm_snd_soc_register_card); - -int devm_snd_soc_register_deferrable_card(struct device *dev, struct snd_soc_card *card) { card->devres_dev = dev; return snd_soc_register_card(card); } -EXPORT_SYMBOL_GPL(devm_snd_soc_register_deferrable_card); +EXPORT_SYMBOL_GPL(devm_snd_soc_register_card); #ifdef CONFIG_SND_SOC_GENERIC_DMAENGINE_PCM From 796ad622040f7f955ccc3973085e953415920496 Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Mon, 11 May 2026 09:31:50 +0800 Subject: [PATCH 654/972] cgroup/dmem: Return -ENOMEM on failed pool preallocation get_cg_pool_unlocked() handles allocation failures under dmemcg_lock by dropping the lock, preallocating a pool with GFP_KERNEL, and retrying the locked lookup and creation path. If the fallback allocation fails too, pool remains NULL. Since the loop condition is while (!pool), the function can keep retrying instead of propagating the allocation failure to the caller. Set pool to ERR_PTR(-ENOMEM) when the fallback allocation fails so the loop exits through the existing common return path. The callers already handle ERR_PTR() from get_cg_pool_unlocked(), so this restores the expected error path. Fixes: b168ed458dde ("kernel/cgroup: Add "dmem" memory accounting cgroup") Cc: stable@vger.kernel.org # v6.14+ Signed-off-by: Guopeng Zhang Signed-off-by: Tejun Heo --- kernel/cgroup/dmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c index 1ab1fb47f2711e..4753a67d0f0f2b 100644 --- a/kernel/cgroup/dmem.c +++ b/kernel/cgroup/dmem.c @@ -602,6 +602,7 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region) pool = NULL; continue; } + pool = ERR_PTR(-ENOMEM); } } From e32e6f02168f2ad7991eb5d160d312d2001520c8 Mon Sep 17 00:00:00 2001 From: Hongfu Li Date: Sat, 9 May 2026 16:03:28 +0800 Subject: [PATCH 655/972] selftests/cgroup: Fix cg_read_strcmp() empty string comparison cg_read_strcmp() allocated a buffer sized to strlen(expected) + 1, then passed it to read_text() which calls read(fd, buf, size-1). When comparing against an empty string (""), strlen("") = 0 gives a 1-byte buffer, and read() is asked to read 0 bytes. The file content is never actually read, so strcmp("", buf) always returns 0 regardless of the real content. This caused cg_test_proc_killed() to always report the cgroup as empty immediately, making OOM tests pass without verifying that processes were killed. Signed-off-by: Hongfu Li Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/lib/cgroup_util.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/cgroup/lib/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c index 6a7295347e90b1..42f54936f4bbd3 100644 --- a/tools/testing/selftests/cgroup/lib/cgroup_util.c +++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c @@ -106,8 +106,9 @@ int cg_read_strcmp(const char *cgroup, const char *control, /* Handle the case of comparing against empty string */ if (!expected) return -1; - else - size = strlen(expected) + 1; + + /* needs size > 1, otherwise cg_read() reads 0 bytes */ + size = (expected[0] == '\0') ? 2 : strlen(expected) + 1; buf = malloc(size); if (!buf) From 2a3d7256faf06d1a15bb5b07e851ac4e1680c26d Mon Sep 17 00:00:00 2001 From: Hongfu Li Date: Mon, 11 May 2026 09:39:57 +0800 Subject: [PATCH 656/972] selftests/cgroup: Fix string comparison in write_test Use string comparison (!=) instead of numeric comparison (-ne) for cpuset values like "0-1". For example: $ [[ "0-1" != "2-3" ]] && echo "true" || echo "false" true $ [[ "0-1" -ne "2-3" ]] && echo "true" || echo "false" false Signed-off-by: Hongfu Li Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_cpuset_v1_base.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh index 42a6628fb8bc3e..1c0444729e707d 100755 --- a/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh +++ b/tools/testing/selftests/cgroup/test_cpuset_v1_base.sh @@ -18,7 +18,7 @@ write_test() { echo "testing $interface $value" echo $value > $dir/$interface new=$(cat $dir/$interface) - [[ $value -ne $(cat $dir/$interface) ]] && { + [[ "$value" != "$new" ]] && { echo "$interface write $value failed: new:$new" exit 1 } From 3788e32516530dee66cf9186f846480a16799b05 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sun, 10 May 2026 19:52:11 +0200 Subject: [PATCH 657/972] selftests/sched_ext: Fix build error in dequeue selftest Building the dequeue selftest with newer compilers (e.g., gcc 16) triggers the following error: dequeue.c:28:22: error: variable 'sum' set but not used The 'volatile' qualifier prevents the writes from being optimized away, but does not silence the unused variable 'sum' is indeed only written and never read. Consume 'sum' via an empty asm() with a register input constraint. This forces the compiler to keep the accumulated value (preserving the CPU stress loop) and avoiding the build error. Fixes: 658ad2259b3e ("selftests/sched_ext: Add test to validate ops.dequeue() semantics") Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/dequeue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/sched_ext/dequeue.c b/tools/testing/selftests/sched_ext/dequeue.c index 4e93262703ca84..383d06e972a463 100644 --- a/tools/testing/selftests/sched_ext/dequeue.c +++ b/tools/testing/selftests/sched_ext/dequeue.c @@ -33,6 +33,7 @@ static void worker_fn(int id) /* Do some work to trigger scheduling events */ for (j = 0; j < 10000; j++) sum += j; + asm volatile("" : : "r"(sum)); /* Sleep to trigger dequeue */ usleep(1000 + (id * 100)); From bbf30b383cf6e87f2fe57c292fbd640b1d88b4c3 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 11 May 2026 08:18:12 +0200 Subject: [PATCH 658/972] sched_ext: Fix ops->priv clobber on concurrent attach/detach Under heavy concurrent attach/detach operations, scx_claim_exit() can trigger a NULL pointer dereference. This can be reproduced running the reload_loop kselftests inside a virtme-ng session: $ vng -v -- ./tools/testing/selftests/sched_ext/runner -t reload_loop ... BUG: kernel NULL pointer dereference, address: 0000000000000400 RIP: 0010:scx_claim_exit+0x3b/0x120 Call Trace: bpf_scx_unreg+0x45/0xb0 bpf_struct_ops_map_link_dealloc+0x39/0x50 bpf_link_release+0x18/0x20 __fput+0x10b/0x2e0 __x64_sys_close+0x47/0xa0 The underlying race (diagnosed by Tejun Heo) is a stomp of @ops->priv, not a missing NULL check: T2 unreg(K) T1 reg(K) ----------- --------- sch = ops->priv = sch_b800 scx_disable; flush_disable_work [scx_root_disable: scx_root=NULL, mutex_unlock, state=DISABLED] mutex_lock; state ok scx_alloc_and_add_sched: ops->priv = sch_a800 scx_root = sch_a800; init=0 state=ENABLED; mutex_unlock [flush returns] RCU_INIT_POINTER(ops->priv, NULL) <-- clobbers sch_a800 kobject_put(sch_b800) T1 acquires scx_enable_mutex inside scx_root_disable()'s mutex_unlock window and starts a fresh attach on the same kdata, assigning sch_a800 to @ops->priv. T2 then continues out of scx_disable()/flush_disable_work and clobbers @ops->priv to NULL, leaking sch_a800; the bpf_link is gone but state stays SCX_ENABLED, so all future attaches fail with -EBUSY permanently. The next bpf_scx_unreg() on that kdata then reads NULL @ops->priv and dereferences it in scx_claim_exit(). Make @ops->priv the lifecycle binding: in scx_root_enable_workfn() and scx_sub_enable_workfn(), after the existing state check and still under scx_enable_mutex, refuse with -EBUSY if @ops->priv is non-NULL. This rejects an attempt to reuse a kdata that is still bound to a previous scheduler instance, closing the race without changing the unreg side. Fixes: 105dcd005be2 ("sched_ext: Introduce scx_prog_sched()") Suggested-by: Tejun Heo Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4efe0099f79af8..8e06694094d78b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6803,6 +6803,19 @@ static void scx_root_enable_workfn(struct kthread_work *work) goto err_unlock; } + /* + * @ops->priv binds @ops to its scx_sched instance. It is set here by + * scx_alloc_and_add_sched() and cleared at the tail of bpf_scx_unreg(), + * which runs after scx_root_disable() has dropped scx_enable_mutex. If + * it's still non-NULL here, a previous attachment on @ops has not + * finished tearing down; proceeding would let the in-flight unreg's + * RCU_INIT_POINTER(NULL) clobber the @ops->priv we are about to assign. + */ + if (rcu_access_pointer(ops->priv)) { + ret = -EBUSY; + goto err_unlock; + } + ret = alloc_kick_syncs(); if (ret) goto err_unlock; @@ -7120,6 +7133,12 @@ static void scx_sub_enable_workfn(struct kthread_work *work) goto out_unlock; } + /* See scx_root_enable_workfn() for the @ops->priv check. */ + if (rcu_access_pointer(ops->priv)) { + ret = -EBUSY; + goto out_unlock; + } + cgrp = cgroup_get_from_id(ops->sub_cgroup_id); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); From 8dfd3d8d74435344ee8dc9237596959c8b2a6cbe Mon Sep 17 00:00:00 2001 From: Eder Zulian Date: Fri, 10 Apr 2026 14:55:50 +0200 Subject: [PATCH 659/972] iommu/amd: Remove latent out-of-bounds access in IOMMU debugfs In iommu_mmio_write() and iommu_capability_write(), the variables dbg_mmio_offset and dbg_cap_offset are declared as int. However, they are populated using kstrtou32_from_user(). If a user provides a sufficiently large value, it can become a negative integer. Prior to this patch, the AMD IOMMU debugfs implementation was already protected by different mechanisms. 1. #define OFS_IN_SZ 8 ensures the user string <= 8 bytes, so e.g. 0xffffffff isn't a valid input. if (cnt > OFS_IN_SZ) return -EINVAL; 2. Implicit type promotion in iommu_mmio_write(), dbg_mmio_offset is int and iommu->mmio_phys_end is u64 if (dbg_mmio_offset > iommu->mmio_phys_end - sizeof(u64)) return -EINVAL; 3. The show handlers would currently catch the negative number and refuse to perform the read. Replace kstrtou32_from_user() with kstrtos32_from_user() to parse the input, and check for negative values to explicitly prevent out-of-bounds memory accesses directly in iommu_mmio_write() and iommu_capability_write(). Signed-off-by: Eder Zulian Fixes: 7a4ee419e8c1 ("iommu/amd: Add debugfs support to dump IOMMU MMIO registers") Cc: stable@vger.kernel.org Signed-off-by: Joerg Roedel --- drivers/iommu/amd/debugfs.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd/debugfs.c b/drivers/iommu/amd/debugfs.c index 4e66473d7ceafe..4c53b63613148e 100644 --- a/drivers/iommu/amd/debugfs.c +++ b/drivers/iommu/amd/debugfs.c @@ -31,11 +31,12 @@ static ssize_t iommu_mmio_write(struct file *filp, const char __user *ubuf, if (cnt > OFS_IN_SZ) return -EINVAL; - ret = kstrtou32_from_user(ubuf, cnt, 0, &dbg_mmio_offset); + ret = kstrtos32_from_user(ubuf, cnt, 0, &dbg_mmio_offset); if (ret) return ret; - if (dbg_mmio_offset > iommu->mmio_phys_end - sizeof(u64)) + if (dbg_mmio_offset < 0 || dbg_mmio_offset > + iommu->mmio_phys_end - sizeof(u64)) return -EINVAL; iommu->dbg_mmio_offset = dbg_mmio_offset; @@ -71,12 +72,12 @@ static ssize_t iommu_capability_write(struct file *filp, const char __user *ubuf if (cnt > OFS_IN_SZ) return -EINVAL; - ret = kstrtou32_from_user(ubuf, cnt, 0, &dbg_cap_offset); + ret = kstrtos32_from_user(ubuf, cnt, 0, &dbg_cap_offset); if (ret) return ret; /* Capability register at offset 0x14 is the last IOMMU capability register. */ - if (dbg_cap_offset > 0x14) + if (dbg_cap_offset < 0 || dbg_cap_offset > 0x14) return -EINVAL; iommu->dbg_cap_offset = dbg_cap_offset; From 07d0f496fe7ec5abe3bee7e38be709521567bb33 Mon Sep 17 00:00:00 2001 From: "Jose Fernandez (Anthropic)" Date: Tue, 21 Apr 2026 19:26:13 +0000 Subject: [PATCH 660/972] iommu/amd: Bounds-check devid in __rlookup_amd_iommu() iommu_device_register() walks every device on the PCI bus via bus_for_each_dev() and calls amd_iommu_probe_device() for each. The inlined check_device() path computes the device's sbdf, calls rlookup_amd_iommu() to find the owning IOMMU, and only afterwards verifies devid <= pci_seg->last_bdf. __rlookup_amd_iommu() indexes rlookup_table[devid] with no bounds check of its own, so for a PCI device whose BDF is not described by the IVRS, the lookup reads past the end of the allocation before the caller's bounds check can run. This was harmless before commit e874c666b15b ("iommu/amd: Change rlookup, irq_lookup, and alias to use kvalloc()"): the table was a zeroed page-order allocation, so the over-read returned NULL and the caller's NULL check skipped the device. After that commit the table is a tight kvcalloc() and the over-read returns adjacent slab contents, which check_device() then dereferences as a struct amd_iommu *, causing a boot-time GPF. Seen on Google Compute Engine ct6e VMs, where the virtualized IVRS describes only the four TPU endpoints 00:04.0-07.0; the gVNIC at 00:08.0 (devid 0x40) indexes 56 bytes past the 456-byte allocation, into the adjacent kmalloc-512 slab object: pci 0000:00:04.0: Adding to iommu group 0 pci 0000:00:05.0: Adding to iommu group 1 pci 0000:00:06.0: Adding to iommu group 2 pci 0000:00:07.0: Adding to iommu group 3 Oops: general protection fault, probably for non-canonical address 0x3a64695f78746382: 0000 [#1] SMP NOPTI CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.18.22 #1 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 12/06/2025 RIP: 0010:amd_iommu_probe_device+0x54/0x3a0 Call Trace: __iommu_probe_device+0x107/0x520 probe_iommu_group+0x29/0x50 bus_for_each_dev+0x7e/0xe0 iommu_device_register+0xc9/0x240 iommu_go_to_state+0x9c0/0x1c60 amd_iommu_init+0x14/0x40 pci_iommu_init+0x16/0x60 do_one_initcall+0x47/0x2f0 Guard the array access in __rlookup_amd_iommu(). With the fix applied on 6.18.22, the gVNIC at 00:08.0 is skipped cleanly and the VM boots. Fixes: e874c666b15b ("iommu/amd: Change rlookup, irq_lookup, and alias to use kvalloc()") Cc: stable@vger.kernel.org Reported-by: Ziyuan Chen Tested-by: Ziyuan Chen Reviewed-by: Josef Bacik Assisted-by: Claude:unspecified Signed-off-by: Jose Fernandez (Anthropic) Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index f78e23f03938d6..57dc8fabc7d9b9 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -351,8 +351,12 @@ static struct amd_iommu *__rlookup_amd_iommu(u16 seg, u16 devid) struct amd_iommu_pci_seg *pci_seg; for_each_pci_segment(pci_seg) { - if (pci_seg->id == seg) - return pci_seg->rlookup_table[devid]; + if (pci_seg->id != seg) + continue; + /* IVRS may not describe every device on the bus */ + if (devid > pci_seg->last_bdf) + return NULL; + return pci_seg->rlookup_table[devid]; } return NULL; } From d769711fcddd005f1e654b3bde547140917fe696 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:20 -0700 Subject: [PATCH 661/972] iommu: Fix NULL group->domain dereference in pci_dev_reset_iommu_done() Local sashiko review pointed it out that group->domain could be NULL when a default domain fails to allocate during the first probe, which can crash at domain->ops->attach_dev dereference in __iommu_attach_device() invoked by pci_dev_reset_iommu_done(). pci_dev_reset_iommu_prepare() is fine as an old_domain pointer can be NULL. Skip the re-attach in pci_dev_reset_iommu_done() to fix the bug. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Cc: stable@vger.kernel.org Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 61c12ba782066a..b8847cc43e764c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -4073,8 +4073,13 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) if (WARN_ON(!group->blocking_domain)) return; - /* Re-attach RID domain back to group->domain */ - if (group->domain != group->blocking_domain) { + /* + * Re-attach RID domain back to group->domain + * + * Leave the device parked in the blocking_domain if group->domain isn't + * initialized yet + */ + if (group->domain && group->domain != group->blocking_domain) { WARN_ON(__iommu_attach_device(group->domain, &pdev->dev, group->blocking_domain)); } From 834ab85aa96656f87bb215a8285d34af870f4b66 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:21 -0700 Subject: [PATCH 662/972] iommu: Fix kdocs of pci_dev_reset_iommu_done() Remove the duplicated word. No functional change. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Reviewed-by: Shuai Xue Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b8847cc43e764c..66659e5d1ec20c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -4045,9 +4045,9 @@ EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare); * @pdev: PCI device that has finished a reset routine * * After a PCIe device finishes a reset routine, it wants to restore its IOMMU - * IOMMU activity, including new translation as well as cache invalidation, by - * re-attaching all RID/PASID of the device's back to the domains retained in - * the core-level structure. + * activity, including new translation and cache invalidation, by re-attaching + * all RID/PASID of the device back to the domains retained in the core-level + * structure. * * Caller must pair it with a successful pci_dev_reset_iommu_prepare(). * From b296ca1fb43aa435edd86131f5230f70c03b2829 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:22 -0700 Subject: [PATCH 663/972] iommu: Replace per-group resetting_domain with per-gdev blocked flag The core tracks device resetting states with a per-group resetting_domain, while a reset is actually per group-device. Such a mismatch might lead to confusion and even difficulty to untangle per-gdev handling requirement. Shuai found that cxl_reset_bus_function() calls pci_reset_bus_function() internally while both are calling pci_dev_reset_iommu_prepare/done(). And the solution requires the core to track at the group_device level as well. Introduce a 'blocked' flag to struct group_device, to allow a multi-device group to isolate concurrent device resets independently. As the reset routine is per gdev, it cannot clear group->resetting_domain without iterating over the device list to ensure no other device is being reset. Simplify it by replacing the resetting_domain with a 'recovery_cnt' in the struct iommu_group. No functional change. But this is essential to apply following bug fixes. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Cc: stable@vger.kernel.org Reported-by: Shuai Xue Closes: https://lore.kernel.org/all/absKsk7qQOwzhpzv@Asurada-Nvidia/ Reviewed-by: Shuai Xue Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 102 ++++++++++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 24 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 66659e5d1ec20c..2515786d415628 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -62,14 +62,14 @@ struct iommu_group { int id; struct iommu_domain *default_domain; struct iommu_domain *blocking_domain; - /* - * During a group device reset, @resetting_domain points to the physical - * domain, while @domain points to the attached domain before the reset. - */ - struct iommu_domain *resetting_domain; struct iommu_domain *domain; struct list_head entry; unsigned int owner_cnt; + /* + * Number of devices in the group undergoing or awaiting recovery. + * If non-zero, concurrent domain attachments are rejected. + */ + unsigned int recovery_cnt; void *owner; }; @@ -77,12 +77,32 @@ struct group_device { struct list_head list; struct device *dev; char *name; + /* + * Device is blocked for a pending recovery while its group->domain is + * retained. This can happen when: + * - Device is undergoing a reset + */ + bool blocked; }; /* Iterate over each struct group_device in a struct iommu_group */ #define for_each_group_device(group, pos) \ list_for_each_entry(pos, &(group)->devices, list) +static struct group_device *__dev_to_gdev(struct device *dev) +{ + struct iommu_group *group = dev->iommu_group; + struct group_device *gdev; + + lockdep_assert_held(&group->mutex); + + for_each_group_device(group, gdev) { + if (gdev->dev == dev) + return gdev; + } + return NULL; +} + struct iommu_group_attribute { struct attribute attr; ssize_t (*show)(struct iommu_group *group, char *buf); @@ -2196,6 +2216,8 @@ EXPORT_SYMBOL_GPL(iommu_attach_device); int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) { + struct group_device *gdev; + /* * This is called on the dma mapping fast path so avoid locking. This is * racy, but we have an expectation that the driver will setup its DMAs @@ -2206,14 +2228,18 @@ int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) guard(mutex)(&dev->iommu_group->mutex); + gdev = __dev_to_gdev(dev); + if (WARN_ON(!gdev)) + return -ENODEV; + /* - * This is a concurrent attach during a device reset. Reject it until + * This is a concurrent attach during device recovery. Reject it until * pci_dev_reset_iommu_done() attaches the device to group->domain. * * Note that this might fail the iommu_dma_map(). But there's nothing * more we can do here. */ - if (dev->iommu_group->resetting_domain) + if (gdev->blocked) return -EBUSY; return __iommu_attach_device(domain, dev, NULL); } @@ -2270,19 +2296,24 @@ EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev); struct iommu_domain *iommu_driver_get_domain_for_dev(struct device *dev) { struct iommu_group *group = dev->iommu_group; + struct group_device *gdev; lockdep_assert_held(&group->mutex); + gdev = __dev_to_gdev(dev); + if (WARN_ON(!gdev)) + return NULL; + /* * Driver handles the low-level __iommu_attach_device(), including the * one invoked by pci_dev_reset_iommu_done() re-attaching the device to * the cached group->domain. In this case, the driver must get the old - * domain from group->resetting_domain rather than group->domain. This + * domain from group->blocking_domain rather than group->domain. This * prevents it from re-attaching the device from group->domain (old) to * group->domain (new). */ - if (group->resetting_domain) - return group->resetting_domain; + if (gdev->blocked) + return group->blocking_domain; return group->domain; } @@ -2441,10 +2472,10 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, return -EINVAL; /* - * This is a concurrent attach during a device reset. Reject it until + * This is a concurrent attach during device recovery. Reject it until * pci_dev_reset_iommu_done() attaches the device to group->domain. */ - if (group->resetting_domain) + if (group->recovery_cnt) return -EBUSY; /* @@ -3615,10 +3646,10 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, mutex_lock(&group->mutex); /* - * This is a concurrent attach during a device reset. Reject it until + * This is a concurrent attach during device recovery. Reject it until * pci_dev_reset_iommu_done() attaches the device to group->domain. */ - if (group->resetting_domain) { + if (group->recovery_cnt) { ret = -EBUSY; goto out_unlock; } @@ -3708,10 +3739,10 @@ int iommu_replace_device_pasid(struct iommu_domain *domain, mutex_lock(&group->mutex); /* - * This is a concurrent attach during a device reset. Reject it until + * This is a concurrent attach during device recovery. Reject it until * pci_dev_reset_iommu_done() attaches the device to group->domain. */ - if (group->resetting_domain) { + if (group->recovery_cnt) { ret = -EBUSY; goto out_unlock; } @@ -3982,12 +4013,12 @@ EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL"); * routine wants to block any IOMMU activity: translation and ATS invalidation. * * This function attaches the device's RID/PASID(s) the group->blocking_domain, - * setting the group->resetting_domain. This allows the IOMMU driver pausing any + * incrementing the group->recovery_cnt, to allow the IOMMU driver pausing any * IOMMU activity while leaving the group->domain pointer intact. Later when the * reset is finished, pci_dev_reset_iommu_done() can restore everything. * * Caller must use pci_dev_reset_iommu_prepare() with pci_dev_reset_iommu_done() - * before/after the core-level reset routine, to unset the resetting_domain. + * before/after the core-level reset routine, to decrement the recovery_cnt. * * Return: 0 on success or negative error code if the preparation failed. * @@ -4000,6 +4031,7 @@ EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL"); int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) { struct iommu_group *group = pdev->dev.iommu_group; + struct group_device *gdev; unsigned long pasid; void *entry; int ret; @@ -4009,8 +4041,12 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) guard(mutex)(&group->mutex); + gdev = __dev_to_gdev(&pdev->dev); + if (WARN_ON(!gdev)) + return -ENODEV; + /* Re-entry is not allowed */ - if (WARN_ON(group->resetting_domain)) + if (WARN_ON(gdev->blocked)) return -EBUSY; ret = __iommu_group_alloc_blocking_domain(group); @@ -4025,6 +4061,13 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) return ret; } + /* + * Update gdev->blocked upon the domain change, as it is used to return + * the correct domain in iommu_driver_get_domain_for_dev() that might be + * called in a set_dev_pasid callback function. + */ + gdev->blocked = true; + /* * Stage PASID domains at blocking_domain while retaining pasid_array. * @@ -4035,7 +4078,7 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) iommu_remove_dev_pasid(&pdev->dev, pasid, pasid_array_entry_to_domain(entry)); - group->resetting_domain = group->blocking_domain; + group->recovery_cnt++; return ret; } EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare); @@ -4057,6 +4100,7 @@ EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare); void pci_dev_reset_iommu_done(struct pci_dev *pdev) { struct iommu_group *group = pdev->dev.iommu_group; + struct group_device *gdev; unsigned long pasid; void *entry; @@ -4065,11 +4109,13 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) guard(mutex)(&group->mutex); - /* pci_dev_reset_iommu_prepare() was bypassed for the device */ - if (!group->resetting_domain) + gdev = __dev_to_gdev(&pdev->dev); + if (WARN_ON(!gdev)) + return; + + if (!gdev->blocked) return; - /* pci_dev_reset_iommu_prepare() was not successfully called */ if (WARN_ON(!group->blocking_domain)) return; @@ -4084,6 +4130,13 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) group->blocking_domain)); } + /* + * Update gdev->blocked upon the domain change, as it is used to return + * the correct domain in iommu_driver_get_domain_for_dev() that might be + * called in a set_dev_pasid callback function. + */ + gdev->blocked = false; + /* * Re-attach PASID domains back to the domains retained in pasid_array. * @@ -4095,7 +4148,8 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) pasid_array_entry_to_domain(entry), group, pasid, group->blocking_domain)); - group->resetting_domain = NULL; + if (!WARN_ON(group->recovery_cnt == 0)) + group->recovery_cnt--; } EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_done); From 1615e8896a8f6d7b2adf6495e538a81bf6cea3e0 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:23 -0700 Subject: [PATCH 664/972] iommu: Fix pasid attach in pci_dev_reset_iommu_prepare/done() Now the helpers handle per-gdev resets. Replace __iommu_set_group_pasid() with set_dev_pasid() accordingly, in the pci_dev_reset_iommu_done(). Also add max_pasids check as other callers. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Cc: stable@vger.kernel.org Reported-by: Shuai Xue Closes: https://lore.kernel.org/all/ad858513-09fc-455e-bbc5-fe38a225cc78@linux.alibaba.com/ Reviewed-by: Shuai Xue Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 2515786d415628..221be84db5ad54 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -4074,9 +4074,14 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) * The pasid_array is mostly fenced by group->mutex, except one reader * in iommu_attach_handle_get(), so it's safe to read without xa_lock. */ - xa_for_each_start(&group->pasid_array, pasid, entry, 1) - iommu_remove_dev_pasid(&pdev->dev, pasid, - pasid_array_entry_to_domain(entry)); + if (pdev->dev.iommu->max_pasids > 0) { + xa_for_each_start(&group->pasid_array, pasid, entry, 1) { + struct iommu_domain *pasid_dom = + pasid_array_entry_to_domain(entry); + + iommu_remove_dev_pasid(&pdev->dev, pasid, pasid_dom); + } + } group->recovery_cnt++; return ret; @@ -4143,10 +4148,16 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) * The pasid_array is mostly fenced by group->mutex, except one reader * in iommu_attach_handle_get(), so it's safe to read without xa_lock. */ - xa_for_each_start(&group->pasid_array, pasid, entry, 1) - WARN_ON(__iommu_set_group_pasid( - pasid_array_entry_to_domain(entry), group, pasid, - group->blocking_domain)); + if (pdev->dev.iommu->max_pasids > 0) { + xa_for_each_start(&group->pasid_array, pasid, entry, 1) { + struct iommu_domain *pasid_dom = + pasid_array_entry_to_domain(entry); + + WARN_ON(pasid_dom->ops->set_dev_pasid( + pasid_dom, &pdev->dev, pasid, + group->blocking_domain)); + } + } if (!WARN_ON(group->recovery_cnt == 0)) group->recovery_cnt--; From 0d5fd7a9323ce6bedd170e21e1e90b8904917c75 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:24 -0700 Subject: [PATCH 665/972] iommu: Fix nested pci_dev_reset_iommu_prepare/done() Shuai found that cxl_reset_bus_function() calls pci_reset_bus_function() internally while both are calling pci_dev_reset_iommu_prepare/done(). As pci_dev_reset_iommu_prepare() doesn't support re-entry, the inner call will trigger a WARN_ON and return -EBUSY, resulting in failing the entire device reset. On the other hand, removing the outer calls in the PCI callers is unsafe. As pointed out by Kevin, device-specific quirks like reset_hinic_vf_dev() execute custom firmware waits after their inner pcie_flr() completes. If the IOMMU protection relies solely on the inner reset, the IOMMU will be unblocked prematurely while the device is still resetting. Instead, fix this by making pci_dev_reset_iommu_prepare/done() reentrant. Introduce gdev->reset_depth to handle the re-entries on the same device. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Cc: stable@vger.kernel.org Reported-by: Shuai Xue Closes: https://lore.kernel.org/all/absKsk7qQOwzhpzv@Asurada-Nvidia/ Suggested-by: Kevin Tian Reviewed-by: Shuai Xue Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 221be84db5ad54..301c76c40e3d0b 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -83,6 +83,7 @@ struct group_device { * - Device is undergoing a reset */ bool blocked; + unsigned int reset_depth; }; /* Iterate over each struct group_device in a struct iommu_group */ @@ -4045,20 +4046,23 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) if (WARN_ON(!gdev)) return -ENODEV; - /* Re-entry is not allowed */ - if (WARN_ON(gdev->blocked)) - return -EBUSY; + if (gdev->reset_depth++) + return 0; ret = __iommu_group_alloc_blocking_domain(group); - if (ret) + if (ret) { + gdev->reset_depth--; return ret; + } /* Stage RID domain at blocking_domain while retaining group->domain */ if (group->domain != group->blocking_domain) { ret = __iommu_attach_device(group->blocking_domain, &pdev->dev, group->domain); - if (ret) + if (ret) { + gdev->reset_depth--; return ret; + } } /* @@ -4118,7 +4122,10 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) if (WARN_ON(!gdev)) return; - if (!gdev->blocked) + /* Unbalanced done() calls would underflow the counter */ + if (WARN_ON(gdev->reset_depth == 0)) + return; + if (--gdev->reset_depth) return; if (WARN_ON(!group->blocking_domain)) From fc3523b16d2b4b88e61e69504b0ae0b18b869c8f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:25 -0700 Subject: [PATCH 666/972] iommu: Fix ATS invalidation timeouts during __iommu_remove_group_pasid() If a device is blocked, its PASID domains are already detached. Repeating iommu_remove_dev_pasid() is unnecessary and might trigger ATS invalidation timeouts. Skip the iommu_remove_dev_pasid() call upon gdev->blocked. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Cc: stable@vger.kernel.org Closes: https://sashiko.dev/#/patchset/20260407194644.171304-1-nicolinc%40nvidia.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 301c76c40e3d0b..a5e3e90883f84f 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3602,7 +3602,12 @@ static void __iommu_remove_group_pasid(struct iommu_group *group, struct group_device *device; for_each_group_device(group, device) { - if (device->dev->iommu->max_pasids > 0) + /* + * A group-level detach cannot fail, even if there is a blocked + * device. In fact, blocked devices must be already detached for + * a pending device recovery. + */ + if (!device->blocked && device->dev->iommu->max_pasids > 0) iommu_remove_dev_pasid(device->dev, pasid, domain); } } From 5474e6e17a262db45c60575c73f70210f5c7001f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:26 -0700 Subject: [PATCH 667/972] iommu: Fix WARN_ON in __iommu_group_set_domain_nofail() due to reset In __iommu_group_set_domain_internal(), concurrent domain attachments are rejected when any device in the group is recovering. This is necessary to fence concurrent attachments to a multi-device group where devices might share the same RID due to PCI DMA alias quirks, but triggers the WARN_ON in __iommu_group_set_domain_nofail(). Other IOMMU_SET_DOMAIN_MUST_SUCCEED callers in detach/teardown paths, such as __iommu_group_set_core_domain and __iommu_release_dma_ownership, should not be rejected, as the domain would be freed anyway in these nofail paths while group->domain is still pointing to it. So pci_dev_reset_iommu_done() could trigger a UAF when re-attaching group->domain. Honor the IOMMU_SET_DOMAIN_MUST_SUCCEED flag, allowing the callers through the group->recovery_cnt fence, so as to update the group->domain pointer. Instead add a gdev->blocked check in the device iteration loop, to prevent any concurrent per-device detachment. Fixes: c279e83953d9 ("iommu: Introduce pci_dev_reset_iommu_prepare/done()") Cc: stable@vger.kernel.org Closes: https://sashiko.dev/#/patchset/20260407194644.171304-1-nicolinc%40nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index a5e3e90883f84f..845284a9eeaf4e 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2474,9 +2474,10 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, /* * This is a concurrent attach during device recovery. Reject it until - * pci_dev_reset_iommu_done() attaches the device to group->domain. + * pci_dev_reset_iommu_done() attaches the device to group->domain, if + * IOMMU_SET_DOMAIN_MUST_SUCCEED is not set. */ - if (group->recovery_cnt) + if (group->recovery_cnt && !(flags & IOMMU_SET_DOMAIN_MUST_SUCCEED)) return -EBUSY; /* @@ -2487,6 +2488,13 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group, */ result = 0; for_each_group_device(group, gdev) { + /* + * Device under recovery is attached to group->blocking_domain. + * Don't change that. pci_dev_reset_iommu_done() will re-attach + * its domain to the updated group->domain, after the recovery. + */ + if (gdev->blocked) + continue; ret = __iommu_device_set_domain(group, gdev->dev, new_domain, group->domain, flags); if (ret) { From 15dd29ca620648a18a0334a3f6b26af181154a23 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 24 Apr 2026 18:15:27 -0700 Subject: [PATCH 668/972] iommu: Warn on premature unblock during DMA aliased sibling reset When two aliased siblings are in the same iommu_group, they might share the same RID. The reset functions don't support this case, though it is unclear whether there is a real case of having an ATS capable device on a PCI/PCI-X bus. Theoretically, however, if two aliased devices are resetting concurrently, one might be unblocked prematurely in the middle of the reset by the other sibling who completes the reset first. This isn't a regression from this series but it's better to spit a warning, so we can know if such use case is common enough for us to make subsequent patches for its coverage. Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 49 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 845284a9eeaf4e..e7bd28cc77eeb4 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -4105,6 +4105,41 @@ int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) } EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare); +static int __group_device_cmp_dma_alias(struct pci_dev *dev, u16 alias, + void *data) +{ + return alias == *(u16 *)data; +} + +static int group_device_cmp_dma_alias(struct pci_dev *dev, u16 alias, + void *data) +{ + return pci_for_each_dma_alias(data, __group_device_cmp_dma_alias, + &alias); +} + +static bool group_device_dma_alias_is_blocked(struct iommu_group *group, + struct group_device *gdev) +{ + struct group_device *sibling; + + lockdep_assert_held(&group->mutex); + + if (!dev_is_pci(gdev->dev)) + return false; + + for_each_group_device(group, sibling) { + if (sibling == gdev || !sibling->blocked || + !dev_is_pci(sibling->dev)) + continue; + if (pci_for_each_dma_alias(to_pci_dev(gdev->dev), + group_device_cmp_dma_alias, + to_pci_dev(sibling->dev))) + return true; + } + return false; +} + /** * pci_dev_reset_iommu_done() - Restore IOMMU after a PCI device reset is done * @pdev: PCI device that has finished a reset routine @@ -4144,6 +4179,20 @@ void pci_dev_reset_iommu_done(struct pci_dev *pdev) if (WARN_ON(!group->blocking_domain)) return; + if (group_device_dma_alias_is_blocked(group, gdev)) { + /* + * FIXME: DMA aliased devices share the same RID, which would be + * convoluted to handle, as "gdev->blocked" is not sufficient: + * - "blocked" state is effectively shared across these devices + * - if the core skipped the blocking on the second device, the + * IOMMU driver's attachment state would diverge from the HW + * state + * For now, just warn and see whether real ATS use cases hit it. + */ + pci_warn(pdev, + "DMA-aliased sibling may be prematurely unblocked\n"); + } + /* * Re-attach RID domain back to group->domain * From 4a39eda5fdd867fc39f3c039714dd432cee00268 Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Sat, 9 May 2026 18:20:30 +0800 Subject: [PATCH 669/972] cgroup/cpuset: Reset DL migration state on can_attach() failure cpuset_can_attach() accumulates temporary SCHED_DEADLINE migration state in the destination cpuset while walking the taskset. If a later task_can_attach() or security_task_setscheduler() check fails, cgroup_migrate_execute() treats cpuset as the failing subsystem and does not call cpuset_cancel_attach() for it. The partially accumulated state is then left behind and can be consumed by a later attach, corrupting cpuset DL task accounting and pending DL bandwidth accounting. Reset the pending DL migration state from the common error exit when ret is non-zero. Successful can_attach() keeps the state for cpuset_attach() or cpuset_cancel_attach(). Fixes: 2ef269ef1ac0 ("cgroup/cpuset: Free DL BW in case can_attach() fails") Cc: stable@vger.kernel.org # v6.10+ Signed-off-by: Guopeng Zhang Signed-off-by: Tejun Heo Reviewed-by: Chen Ridong Reviewed-by: Waiman Long --- kernel/cgroup/cpuset.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a48901a0416a28..3fbf6e7f68c311 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3050,16 +3050,13 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); if (unlikely(cpu >= nr_cpu_ids)) { - reset_migrate_dl_data(cs); ret = -EINVAL; goto out_unlock; } ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); - if (ret) { - reset_migrate_dl_data(cs); + if (ret) goto out_unlock; - } cs->dl_bw_cpu = cpu; } @@ -3070,7 +3067,10 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) * changes which zero cpus/mems_allowed. */ cs->attach_in_progress++; + out_unlock: + if (ret) + reset_migrate_dl_data(cs); mutex_unlock(&cpuset_mutex); return ret; } From 2cda2e10dc8343ae01eae9e999a876b7e7d37861 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Naval=20Alcal=C3=A1?= Date: Sat, 9 May 2026 10:43:44 +0800 Subject: [PATCH 670/972] iommu/vt-d: Disable DMAR for Intel Q35 IGFX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Intel Q35 integrated graphics (8086:29b2) exhibits broken DMAR behaviour similar to other G4x/GM45 devices for which DMAR is already disabled via quirks. When DMAR is enabled, the system may hard lock up during boot or early device initialization, requiring a reset. Add the missing PCI ID to the existing quirk list to disable DMAR for this device. Fixes: 1f76249cc3be ("iommu/vt-d: Declare Broadwell igfx dmar support snafu") Cc: stable@vger.kernel.org Closes: https://bugzilla.kernel.org/show_bug.cgi?id=201185 Closes: https://bugzilla.kernel.org/show_bug.cgi?id=216064 Signed-off-by: Naval Alcalá Link: https://lore.kernel.org/r/20260410161622.13549-1-ari@naval.cat Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index c3d18cd77d2f1a..2a6b6813a78dcd 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3937,6 +3937,9 @@ static void quirk_iommu_igfx(struct pci_dev *dev) disable_igfx_iommu = 1; } +/* Q35 integrated gfx dmar support is totally busted. */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x29b2, quirk_iommu_igfx); + /* G4x/GM45 integrated gfx dmar support is totally busted. */ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx); From a6dea58d8625c06b9654c0555f101742481335c3 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 9 May 2026 10:43:45 +0800 Subject: [PATCH 671/972] iommu/vt-d: Fix oops due to out of scope access Below oops triggers when kill QEMU process: Oops: general protection fault, probably for non-canonical address 0x7fffffff844eaaa7: 0000 [#1] SMP NOPTI Call Trace: do_raw_spin_lock+0xaa/0xc0 _raw_spin_lock_irqsave+0x21/0x40 domain_remove_dev_pasid+0x52/0x160 intel_nested_set_dev_pasid+0x1b9/0x1e0 __iommu_set_group_pasid+0x56/0x120 pci_dev_reset_iommu_done+0xe3/0x180 pcie_flr+0x65/0x160 __pci_reset_function_locked+0x5b/0x120 vfio_pci_core_close_device+0x63/0xe0 [vfio_pci_core] vfio_df_close+0x4f/0xa0 vfio_df_unbind_iommufd+0x2d/0x60 vfio_device_fops_release+0x3e/0x40 __fput+0xe5/0x2c0 task_work_run+0x58/0xa0 do_exit+0x2c8/0x600 do_group_exit+0x2f/0xa0 get_signal+0x863/0x8c0 arch_do_signal_or_restart+0x24/0x100 exit_to_user_mode_loop+0x87/0x380 do_syscall_64+0x2ff/0x11e0 entry_SYSCALL_64_after_hwframe+0x76/0x7e The global static blocked domain is a dummy domain without corresponding dmar_domain structure, accessing beyond iommu_domain structure triggers oops easily. Fix it by return early in domain_remove_dev_pasid() like identity domain. Fixes: 7d0c9da6c150 ("iommu/vt-d: Add set_dev_pasid callback for dma domain") Cc: stable@vger.kernel.org Signed-off-by: Zhenzhong Duan Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20260421031347.1408890-1-zhenzhong.duan@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 2a6b6813a78dcd..a4b123c330221f 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3530,8 +3530,8 @@ void domain_remove_dev_pasid(struct iommu_domain *domain, if (!domain) return; - /* Identity domain has no meta data for pasid. */ - if (domain->type == IOMMU_DOMAIN_IDENTITY) + /* Identity domain and blocked domain have no meta data for pasid. */ + if (domain->type == IOMMU_DOMAIN_IDENTITY || domain->type == IOMMU_DOMAIN_BLOCKED) return; dmar_domain = to_dmar_domain(domain); From 79ea2feb917b05366b49d85573c9c5331f043b2c Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Sat, 9 May 2026 10:43:46 +0800 Subject: [PATCH 672/972] iommu/vt-d: Avoid NULL pointer dereference or refcount corruption Commit 60f030f7418d ("iommu/vt-d: Avoid use of NULL after WARN_ON_ONCE") fixed a NULL pointer dereference in an unlikely situation partly. If dev_pasid is not found in the dev_pasids list, it remains NULL. However, the teardown operations are executed unconditionally, this lead to a NULL pointer dereference or refcount corruption. If the domain was never attached to this IOMMU, info will be NULL, which would cause an immediate dereference when checking --info->refcnt. Even if info is not NULL, decrementing the refcount without having removed a valid PASID might unbalance the count. This could lead to premature dropping of the refcount to 0, potentially causing a use-after-free for the remaining active devices sharing the domain. Fix it by returning early if dev_pasid is NULL, before executing the teardown operations. Issue found by AI review and suggested by Kevin Tian. https://sashiko.dev/#/patchset/20260421031347.1408890-1-zhenzhong.duan%40intel.com Fixes: 60f030f7418d ("iommu/vt-d: Avoid use of NULL after WARN_ON_ONCE") Cc: stable@vger.kernel.org Suggested-by: Kevin Tian Signed-off-by: Zhenzhong Duan Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20260422033538.95000-1-zhenzhong.duan@intel.com Signed-off-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index a4b123c330221f..4d0e65bc131d77 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3545,12 +3545,13 @@ void domain_remove_dev_pasid(struct iommu_domain *domain, } spin_unlock_irqrestore(&dmar_domain->lock, flags); + if (WARN_ON_ONCE(!dev_pasid)) + return; + cache_tag_unassign_domain(dmar_domain, dev, pasid); domain_detach_iommu(dmar_domain, iommu); - if (!WARN_ON_ONCE(!dev_pasid)) { - intel_iommu_debugfs_remove_dev_pasid(dev_pasid); - kfree(dev_pasid); - } + intel_iommu_debugfs_remove_dev_pasid(dev_pasid); + kfree(dev_pasid); } static int blocking_domain_set_dev_pasid(struct iommu_domain *domain, From 4c79fc2d598694bda845b46229c9d48b65042970 Mon Sep 17 00:00:00 2001 From: Raphael Zimmer Date: Wed, 22 Apr 2026 10:47:13 +0200 Subject: [PATCH 673/972] libceph: Fix potential out-of-bounds access in crush_decode() A message of type CEPH_MSG_OSD_MAP containing a crush map with at least one bucket has two fields holding the bucket algorithm. If the values in these two fields differ, an out-of-bounds access can occur. This is the case because the first algorithm field (alg) is used to allocate the correct amount of memory for a bucket of this type, while the second algorithm field inside the bucket (b->alg) is used in the subsequent processing. This patch fixes the issue by adding a check that compares alg and b->alg and aborts the processing in case they differ. Furthermore, b->alg is set to 0 in this case, because the destruction of the crush map also uses this field to determine the bucket type, which can again result in an out-of-bounds access when trying to free the memory pointed to by the fields of the bucket. To correctly free the memory allocated for the bucket in such a case, the corresponding call to kfree is moved from the algorithm-specific crush_destroy_bucket functions to the generic crush_destroy_bucket(). Cc: stable@vger.kernel.org Signed-off-by: Raphael Zimmer Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/crush/crush.c | 6 +----- net/ceph/osdmap.c | 4 ++++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 254ded0b05f6a1..521aec1d5fc060 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -47,7 +47,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) { kfree(b->h.items); - kfree(b); } void crush_destroy_bucket_list(struct crush_bucket_list *b) @@ -55,14 +54,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b) kfree(b->item_weights); kfree(b->sum_weights); kfree(b->h.items); - kfree(b); } void crush_destroy_bucket_tree(struct crush_bucket_tree *b) { kfree(b->h.items); kfree(b->node_weights); - kfree(b); } void crush_destroy_bucket_straw(struct crush_bucket_straw *b) @@ -70,14 +67,12 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) kfree(b->straws); kfree(b->item_weights); kfree(b->h.items); - kfree(b); } void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) { kfree(b->item_weights); kfree(b->h.items); - kfree(b); } void crush_destroy_bucket(struct crush_bucket *b) @@ -99,6 +94,7 @@ void crush_destroy_bucket(struct crush_bucket *b) crush_destroy_bucket_straw2((struct crush_bucket_straw2 *)b); break; } + kfree(b); } /** diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index c89e66d4fcb7fe..a87268058e61c0 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -516,6 +516,10 @@ static struct crush_map *crush_decode(void *pbyval, void *end) b->id = ceph_decode_32(p); b->type = ceph_decode_16(p); b->alg = ceph_decode_8(p); + if (b->alg != alg) { + b->alg = 0; + goto bad; + } b->hash = ceph_decode_8(p); b->weight = ceph_decode_32(p); b->size = ceph_decode_32(p); From 596f91294b351866956808b1ecb8dfae15382a6d Mon Sep 17 00:00:00 2001 From: Raphael Zimmer Date: Fri, 24 Apr 2026 15:37:37 +0200 Subject: [PATCH 674/972] libceph: Fix unnecessarily high ceph_decode_need() for uniform bucket In crush_decode_uniform_bucket(), the item_weight field of the bucket is set. This is a single field of type u32 since the uniform bucket uses the same weight for all items. The value in ceph_decode_need() is set to (1+b->h.size) * sizeof(u32), which is higher than actually needed. This patch removes the call to ceph_decode_need() with the unnecessarily high value and switches the subsequent operation from ceph_decode_32() to ceph_decode_32_safe(), which already includes the correct bounds check. Signed-off-by: Raphael Zimmer Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/osdmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index a87268058e61c0..669348d883f09b 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -72,8 +72,7 @@ static int crush_decode_uniform_bucket(void **p, void *end, struct crush_bucket_uniform *b) { dout("crush_decode_uniform_bucket %p to %p\n", *p, end); - ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad); - b->item_weight = ceph_decode_32(p); + ceph_decode_32_safe(p, end, b->item_weight, bad); return 0; bad: return -EINVAL; From 5d3cc36b4e77a27ce7b686b7c59c7072bcb3fa8e Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Thu, 9 Apr 2026 12:26:02 -0700 Subject: [PATCH 675/972] ceph: fix a buffer leak in __ceph_setxattr() The old_blob in __ceph_setxattr() can store ci->i_xattrs.prealloc_blob value during the retry. However, it is never called the ceph_buffer_put() for the old_blob object. This patch fixes the issue of the buffer leak. Cc: stable@vger.kernel.org Signed-off-by: Viacheslav Dubeyko Reviewed-by: Alex Markuze Signed-off-by: Ilya Dryomov --- fs/ceph/xattr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 5f87f62091a144..c6fcbf4283177f 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1294,6 +1294,7 @@ int __ceph_setxattr(struct inode *inode, const char *name, do_sync: spin_unlock(&ci->i_ceph_lock); + ceph_buffer_put(old_blob); do_sync_unlocked: if (lock_snap_rwsem) up_read(&mdsc->snap_rwsem); From 0c22d9511cbde746622f8e4c11aaa63fe76d45f9 Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Thu, 9 Apr 2026 12:43:40 -0700 Subject: [PATCH 676/972] ceph: fix BUG_ON in __ceph_build_xattrs_blob() due to stale blob size The generic/642 test-case can reproduce the kernel crash: [40243.605254] ------------[ cut here ]------------ [40243.605956] kernel BUG at fs/ceph/xattr.c:918! [40243.607142] Oops: invalid opcode: 0000 [#1] SMP PTI [40243.608067] CPU: 7 UID: 0 PID: 498762 Comm: kworker/7:1 Not tainted 7.0.0-rc7+ #3 PREEMPT(full) [40243.609700] Hardware name: QEMU Ubuntu 25.10 PC v2 (i440FX + PIIX, + 10.1 machine, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [40243.611820] Workqueue: ceph-msgr ceph_con_workfn [40243.612715] RIP: 0010:__ceph_build_xattrs_blob+0x1b8/0x1e0 [40243.613731] Code: 0f 84 82 fe ff ff e9 cf 8e 56 ff 48 8d 65 e8 31 c0 5b 41 5c 41 5d 5d 31 d2 31 c9 31 f6 31 ff 45 31 c0 45 31 c9 c3 cc cc cc cc <0f> 0b 4c 8b 62 08 41 8b 85 24 07 00 00 49 83 c4 04 41 89 44 24 fc [40243.616888] RSP: 0018:ffffcc80c4d4b688 EFLAGS: 00010287 [40243.617773] RAX: 0000000000010026 RBX: 0000000000000001 RCX: 0000000000000000 [40243.618928] RDX: ffff8a773798dee0 RSI: 0000000000000000 RDI: 0000000000000000 [40243.620158] RBP: ffffcc80c4d4b6a0 R08: 0000000000000000 R09: 0000000000000000 [40243.621573] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8a75f3b58000 [40243.622907] R13: ffff8a75f3b58000 R14: 0000000000000080 R15: 000000000000bffd [40243.624054] FS: 0000000000000000(0000) GS:ffff8a787d1b4000(0000) knlGS:0000000000000000 [40243.625331] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [40243.626269] CR2: 000072f390b623c0 CR3: 000000011c02a003 CR4: 0000000000372ef0 [40243.627408] Call Trace: [40243.627839] [40243.628188] __prep_cap+0x3fd/0x4a0 [40243.628789] ? do_raw_spin_unlock+0x4e/0xe0 [40243.629474] ceph_check_caps+0x46a/0xc80 [40243.630094] ? __lock_acquire+0x4a2/0x2650 [40243.630773] ? find_held_lock+0x31/0x90 [40243.631347] ? handle_cap_grant+0x79f/0x1060 [40243.632068] ? lock_release+0xd9/0x300 [40243.632696] ? __mutex_unlock_slowpath+0x3e/0x340 [40243.633429] ? lock_release+0xd9/0x300 [40243.634052] handle_cap_grant+0xcf6/0x1060 [40243.634745] ceph_handle_caps+0x122b/0x2110 [40243.635415] mds_dispatch+0x5bd/0x2160 [40243.636034] ? ceph_con_process_message+0x65/0x190 [40243.636828] ? lock_release+0xd9/0x300 [40243.637431] ceph_con_process_message+0x7a/0x190 [40243.638184] ? kfree+0x311/0x4f0 [40243.638749] ? kfree+0x311/0x4f0 [40243.639268] process_message+0x16/0x1a0 [40243.639915] ? sg_free_table+0x39/0x90 [40243.640572] ceph_con_v2_try_read+0xf58/0x2120 [40243.641255] ? lock_acquire+0xc8/0x300 [40243.641863] ceph_con_workfn+0x151/0x820 [40243.642493] process_one_work+0x22f/0x630 [40243.643093] ? process_one_work+0x254/0x630 [40243.643770] worker_thread+0x1e2/0x400 [40243.644332] ? __pfx_worker_thread+0x10/0x10 [40243.645020] kthread+0x109/0x140 [40243.645560] ? __pfx_kthread+0x10/0x10 [40243.646125] ret_from_fork+0x3f8/0x480 [40243.646752] ? __pfx_kthread+0x10/0x10 [40243.647316] ? __pfx_kthread+0x10/0x10 [40243.647919] ret_from_fork_asm+0x1a/0x30 [40243.648556] [40243.648902] Modules linked in: overlay hctr2 libpolyval chacha libchacha adiantum libnh libpoly1305 essiv intel_rapl_msr intel_rapl_common intel_uncore_frequency_common skx_edac_common nfit kvm_intel kvm irqbypass joydev ghash_clmulni_intel aesni_intel rapl input_leds mac_hid psmouse vga16fb serio_raw vgastate floppy i2c_piix4 pata_acpi bochs qemu_fw_cfg i2c_smbus sch_fq_codel rbd dm_crypt msr parport_pc ppdev lp parport efi_pstore [40243.654766] ---[ end trace 0000000000000000 ]--- Commit d93231a6bc8a ("ceph: prevent a client from exceeding the MDS maximum xattr size") moved the required_blob_size computation to before the __build_xattrs() call, introducing a race. __build_xattrs() releases and reacquires i_ceph_lock during execution. In that window, handle_cap_grant() may update i_xattrs.blob with a newer MDS-provided blob and bump i_xattrs.version. When __build_xattrs() detects that index_version < version, it destroys and rebuilds the entire xattr rb-tree from the new blob, potentially increasing count, names_size, and vals_size. The prealloc_blob size check that follows still uses the stale required_blob_size computed before the rebuild, so it passes even when prealloc_blob is too small for the now-larger tree. After __set_xattr() adds one more xattr on top, __ceph_build_xattrs_blob() is called from the cap flush path and hits: BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len); Fix this by recomputing required_blob_size after __build_xattrs() returns, using the current tree state. Also re-validate against m_max_xattr_size to fall back to the sync path if the rebuilt tree now exceeds the MDS limit. Cc: stable@vger.kernel.org Fixes: d93231a6bc8a ("ceph: prevent a client from exceeding the MDS maximum xattr size") Signed-off-by: Viacheslav Dubeyko Reviewed-by: Alex Markuze Signed-off-by: Ilya Dryomov --- fs/ceph/xattr.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index c6fcbf4283177f..e773be07f7674a 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1254,6 +1254,22 @@ int __ceph_setxattr(struct inode *inode, const char *name, ceph_vinop(inode), name, ceph_cap_string(issued)); __build_xattrs(inode); + /* + * __build_xattrs() may have released and reacquired i_ceph_lock, + * during which handle_cap_grant() could have replaced i_xattrs.blob + * with a newer MDS-provided blob and bumped i_xattrs.version. If that + * caused __build_xattrs() to rebuild the rb-tree from the new blob, + * count/names_size/vals_size may now be larger than when + * required_blob_size was computed above. Recompute it here so the + * prealloc_blob size check below reflects the current tree state. + */ + required_blob_size = __get_required_blob_size(ci, name_len, val_len); + if (required_blob_size > mdsc->mdsmap->m_max_xattr_size) { + doutc(cl, "sync (size too large): %d > %llu\n", + required_blob_size, mdsc->mdsmap->m_max_xattr_size); + goto do_sync; + } + if (!ci->i_xattrs.prealloc_blob || required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { struct ceph_buffer *blob; From 821365487aa58d06bda65c676ba215d506ba9768 Mon Sep 17 00:00:00 2001 From: Raphael Zimmer Date: Tue, 28 Apr 2026 14:15:46 +0200 Subject: [PATCH 677/972] libceph: Fix potential out-of-bounds access in __ceph_x_decrypt() In __ceph_x_decrypt(), a part of the buffer p is interpreted as a ceph_x_encrypt_header, and the magic field of this struct is accessed. This happens without any guarantee that the buffer is large enough to hold this struct. The function parameter ciphertext_len represents the length of the ciphertext to decrypt and is guaranteed to be at most the remaining size of the allocated buffer p. However, this value is not necessarily greater than sizeof(ceph_x_encrypt_header). E.g., a message frame of type FRAME_TAG_AUTH_REPLY_MORE, that is just as long to hold the ciphertext at its end with a ciphertext_len of 8 or less, can trigger an out-of-bounds memory access when accessing hdr->magic. This patch fixes the issue by adding a check to ensure that the decrypted plaintext in the buffer is large enough to represent at least the ceph_x_encrypt_header. Cc: stable@vger.kernel.org Signed-off-by: Raphael Zimmer Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/auth_x.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 692e0b86882238..9e64e82d0b63bf 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -115,6 +115,11 @@ static int __ceph_x_decrypt(const struct ceph_crypto_key *key, int usage_slot, if (ret) return ret; + if (plaintext_len < sizeof(*hdr)) { + pr_err("%s plaintext too small %d\n", __func__, plaintext_len); + return -EINVAL; + } + hdr = p + ceph_crypt_data_offset(key); if (le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC) { pr_err("%s bad magic\n", __func__); From 10d9be401108a0fc8b3bc99ba07bdee8fff875ac Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Thu, 9 Apr 2026 11:33:23 -0700 Subject: [PATCH 678/972] ceph: add ceph_has_realms_with_quotas() check to ceph_quota_update_statfs() When MDS rejects a session, remove_session_caps() -> __ceph_remove_cap() -> ceph_change_snap_realm() clears i_snap_realm for every inode that loses its last cap. The realm is restored once caps are re-granted after reconnect. It is not a real error and this patch changes pr_err_ratelimited_client() on doutc(). Every quota methods ceph_quota_is_max_files_exceeded(), ceph_quota_is_max_bytes_exceeded(), ceph_quota_is_max_bytes_approaching() calls ceph_has_realms_with_quotas() check. This patch adds the missing ceph_has_realms_with_quotas() call into ceph_quota_update_statfs(). [ idryomov: add braces around both arms of multiline ifs ] Signed-off-by: Viacheslav Dubeyko Reviewed-by: Alex Markuze Signed-off-by: Ilya Dryomov --- fs/ceph/quota.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 4dc9426643e834..053d5bf0c9f07c 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -228,12 +228,19 @@ static int get_quota_realm(struct ceph_mds_client *mdsc, struct inode *inode, restart: realm = ceph_inode(inode)->i_snap_realm; - if (realm) + if (realm) { ceph_get_snap_realm(mdsc, realm); - else - pr_err_ratelimited_client(cl, - "%p %llx.%llx null i_snap_realm\n", - inode, ceph_vinop(inode)); + } else { + /* + * i_snap_realm is NULL when all caps have been released, e.g. + * after an MDS session rejection. This is a transient state; + * the realm will be restored once caps are re-granted. + * Treat it as "no quota realm found". + */ + doutc(cl, "%p %llx.%llx null i_snap_realm\n", + inode, ceph_vinop(inode)); + } + while (realm) { bool has_inode; @@ -340,12 +347,19 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op, down_read(&mdsc->snap_rwsem); restart: realm = ceph_inode(inode)->i_snap_realm; - if (realm) + if (realm) { ceph_get_snap_realm(mdsc, realm); - else - pr_err_ratelimited_client(cl, - "%p %llx.%llx null i_snap_realm\n", - inode, ceph_vinop(inode)); + } else { + /* + * i_snap_realm is NULL when all caps have been released, e.g. + * after an MDS session rejection. This is a transient state; + * the realm will be restored once caps are re-granted. + * Treat it as "quota not exceeded". + */ + doutc(cl, "%p %llx.%llx null i_snap_realm\n", + inode, ceph_vinop(inode)); + } + while (realm) { bool has_inode; @@ -496,6 +510,9 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) u64 total = 0, used, free; bool is_updated = false; + if (!ceph_has_realms_with_quotas(d_inode(fsc->sb->s_root))) + return false; + down_read(&mdsc->snap_rwsem); get_quota_realm(mdsc, d_inode(fsc->sb->s_root), QUOTA_GET_MAX_BYTES, &realm, true); From 544576f0f05c4a759806acddfaaeb686f14fb4b0 Mon Sep 17 00:00:00 2001 From: Hristo Venev Date: Mon, 4 May 2026 18:54:45 +0300 Subject: [PATCH 679/972] ceph: put folios not suitable for writeback The batch holds references to the folios (see `filemap_get_folios`, `folio_batch_release`), so we need to `folio_put` the folios we remove. Tested on v6.18. Cc: stable@vger.kernel.org Link: https://tracker.ceph.com/issues/74156 Signed-off-by: Hristo Venev Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1454760332ffce..0a86f672cc09ce 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1336,6 +1336,7 @@ void ceph_process_folio_batch(struct address_space *mapping, ceph_wbc, folio); if (rc == -ENODATA) { folio_unlock(folio); + folio_put(folio); ceph_wbc->fbatch.folios[i] = NULL; continue; } else if (rc == -E2BIG) { @@ -1346,6 +1347,7 @@ void ceph_process_folio_batch(struct address_space *mapping, if (!folio_clear_dirty_for_io(folio)) { doutc(cl, "%p !folio_clear_dirty_for_io\n", folio); folio_unlock(folio); + folio_put(folio); ceph_wbc->fbatch.folios[i] = NULL; continue; } From 86ecb1c1a1f5c1bf4a45b91f54f8220c3121bd3b Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 11 May 2026 10:31:30 +0200 Subject: [PATCH 680/972] sched_ext: Clear ops->priv on scx_alloc_and_add_sched() error paths scx_alloc_and_add_sched() can fail after @sch has been assigned to ops->priv. In those cases @sch is torn down (either via kfree() through the err_free_* chain or via kobject_put() -> scx_kobj_release() -> RCU work), but @ops->priv is left pointing at the about-to-be-freed pointer. With the recent -EBUSY gate in scx_root_enable_workfn() and scx_sub_enable_workfn() that rejects an attach when @ops->priv is still non-NULL, see commit bbf30b383cf6 ("sched_ext: Fix ops->priv clobber on concurrent attach/detach"), a dangling @ops->priv permanently locks the kdata out: every future attach attempt sees a stale binding and returns -EBUSY even though no scheduler is actually attached. Clear @ops->priv on the post-assign failure paths so that the kdata returns to its pre-attach state when the function returns ERR_PTR(). Fixes: bbf30b383cf6 ("sched_ext: Fix ops->priv clobber on concurrent attach/detach") Suggested-by: Tejun Heo Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8e06694094d78b..1efd5d82b08bba 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6670,6 +6670,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); if (ret < 0) { + RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); return ERR_PTR(ret); } @@ -6677,6 +6678,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, if (ops->sub_attach) { sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj); if (!sch->sub_kset) { + RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); return ERR_PTR(-ENOMEM); } @@ -6684,6 +6686,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, #else /* CONFIG_EXT_SUB_SCHED */ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); if (ret < 0) { + RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); return ERR_PTR(ret); } @@ -6692,6 +6695,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, #ifdef CONFIG_EXT_SUB_SCHED err_free_lb_resched: + RCU_INIT_POINTER(ops->priv, NULL); free_cpumask_var(sch->bypass_lb_resched_cpumask); #endif err_free_lb_cpumask: From 657b594b2084b39a4bc6d8493aa2140cb00cea49 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 7 May 2026 16:46:29 +0900 Subject: [PATCH 681/972] fprobe: Fix unregister_fprobe() to wait for RCU grace period Commit 4346ba1604093 ("fprobe: Rewrite fprobe on function-graph tracer") changed fprobe to register struct fprobe to an rcu-hlist, but it forgot to wait for RCU GP. Thus there can be use-after-free if the fprobe is released right after unregistering. This can be happened on fprobe event and sample module code. To fix this issue, add synchronize_rcu() in unregister_fprobe(). Note that BPF is OK because fprobe is used as a part of bpf_kprobe_multi_link. This unregisters its fprobe in bpf_kprobe_multi_link_release() and it is deallocated via bpf_kprobe_multi_link_dealloc(), which is invoked from bpf_link_defer_dealloc_rcu_gp() RCU callback. For BPF, this also introduced unregister_fprobe_async() which does NOT wait for RCU grace priod. Link: https://lore.kernel.org/all/177813998919.256460.2809243930741138224.stgit@mhiramat.tok.corp.google.com/ Fixes: 4346ba1604093 ("fprobe: Rewrite fprobe on function-graph tracer") Signed-off-by: Masami Hiramatsu (Google) --- include/linux/fprobe.h | 5 +++++ kernel/trace/bpf_trace.c | 3 ++- kernel/trace/fprobe.c | 23 +++++++++++++++++++++-- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 0a3bcd1718f379..be1b38c981d4d9 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -94,6 +94,7 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num); int register_fprobe_syms(struct fprobe *fp, const char **syms, int num); int unregister_fprobe(struct fprobe *fp); +int unregister_fprobe_async(struct fprobe *fp); bool fprobe_is_registered(struct fprobe *fp); int fprobe_count_ips_from_filter(const char *filter, const char *notfilter); #else @@ -113,6 +114,10 @@ static inline int unregister_fprobe(struct fprobe *fp) { return -EOPNOTSUPP; } +static inline int unregister_fprobe_async(struct fprobe *fp) +{ + return -EOPNOTSUPP; +} static inline bool fprobe_is_registered(struct fprobe *fp) { return false; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index af7079aa0f36d9..a02bd258677ee1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2384,7 +2384,8 @@ static void bpf_kprobe_multi_link_release(struct bpf_link *link) struct bpf_kprobe_multi_link *kmulti_link; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); - unregister_fprobe(&kmulti_link->fp); + /* Don't wait for RCU GP here. */ + unregister_fprobe_async(&kmulti_link->fp); kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt); } diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index cc49ebd2a7737b..f378613ad1209c 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -1093,14 +1093,15 @@ static int unregister_fprobe_nolock(struct fprobe *fp) } /** - * unregister_fprobe() - Unregister fprobe. + * unregister_fprobe_async() - Unregister fprobe without RCU GP wait * @fp: A fprobe data structure to be unregistered. * * Unregister fprobe (and remove ftrace hooks from the function entries). + * This function will NOT wait until the fprobe is no longer used. * * Return 0 if @fp is unregistered successfully, -errno if not. */ -int unregister_fprobe(struct fprobe *fp) +int unregister_fprobe_async(struct fprobe *fp) { guard(mutex)(&fprobe_mutex); if (!fp || !fprobe_registered(fp)) @@ -1108,6 +1109,24 @@ int unregister_fprobe(struct fprobe *fp) return unregister_fprobe_nolock(fp); } + +/** + * unregister_fprobe() - Unregister fprobe with RCU GP wait + * @fp: A fprobe data structure to be unregistered. + * + * Unregister fprobe (and remove ftrace hooks from the function entries). + * This function will block until the fprobe is no longer used. + * + * Return 0 if @fp is unregistered successfully, -errno if not. + */ +int unregister_fprobe(struct fprobe *fp) +{ + int ret = unregister_fprobe_async(fp); + + if (!ret) + synchronize_rcu(); + return ret; +} EXPORT_SYMBOL_GPL(unregister_fprobe); static int __init fprobe_initcall(void) From 5082d8835070fe63f3c61fe574cbb5319bd94575 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 May 2026 07:24:57 +0200 Subject: [PATCH 682/972] xfs: fix the "limiting open zones" message The xfs logging macros include a newline, remove the \n, which adds an extra one. Signed-off-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Reviewed-by: Andrey Albershteyn Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index c64f9ab743a60a..5e297b75a85f63 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -1170,7 +1170,7 @@ xfs_calc_open_zones( if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) { mp->m_max_open_zones = bdev_open_zones; - xfs_info(mp, "limiting open zones to %u due to hardware limit.\n", + xfs_info(mp, "limiting open zones to %u due to hardware limit.", bdev_open_zones); } From 509fdeb3326be0db055e88d0f689a3888f147f90 Mon Sep 17 00:00:00 2001 From: Md Shofiqul Islam Date: Wed, 6 May 2026 19:36:58 +0300 Subject: [PATCH 683/972] xfs: Fix typo in comment Fix spelling mistake in comment: - occured -> occurred Reviewed-by: Darrick J. Wong Signed-off-by: Md Shofiqul Islam Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_notify_failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index 64c8afb935c261..b994ff15d5e455 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -350,7 +350,7 @@ xfs_dax_notify_dev_failure( /* * Shutdown fs from a force umount in pre-remove case which won't fail, * so errors can be ignored. Otherwise, shutdown the filesystem with - * CORRUPT flag if error occured or notify.want_shutdown was set during + * CORRUPT flag if error occurred or notify.want_shutdown was set during * RMAP querying. */ if (mf_flags & MF_MEM_PRE_REMOVE) From 0c3887a134f191723b53e2a47e501b534c8723ee Mon Sep 17 00:00:00 2001 From: Rong Zhang Date: Sun, 10 May 2026 04:25:31 +0000 Subject: [PATCH 684/972] platform/x86: lenovo-wmi-helpers: Fix memory leak in lwmi_dev_evaluate_int() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit lwmi_dev_evaluate_int() leaks output.pointer when retval == NULL (found by sashiko.dev [1]). Fix it by moving `ret_obj = output.pointer' outside of the `if (retval)' block so that it is always freed by the __free cleanup callback. No functional change intended. Reviewed-by: Mark Pearson Fixes: e521d16e76cd ("platform/x86: Add lenovo-wmi-helpers") Cc: stable@vger.kernel.org Link: https://sashiko.dev/#/patchset/20260331181208.421552-1-derekjohn.clark%40gmail.com [1] Signed-off-by: Rong Zhang Signed-off-by: Derek J. Clark Link: https://patch.msgid.link/20260510042546.436874-2-derekjohn.clark@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/wmi-helpers.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/x86/lenovo/wmi-helpers.c b/drivers/platform/x86/lenovo/wmi-helpers.c index 7379defac50028..018d7642e2bd5b 100644 --- a/drivers/platform/x86/lenovo/wmi-helpers.c +++ b/drivers/platform/x86/lenovo/wmi-helpers.c @@ -46,7 +46,6 @@ int lwmi_dev_evaluate_int(struct wmi_device *wdev, u8 instance, u32 method_id, unsigned char *buf, size_t size, u32 *retval) { struct acpi_buffer output = { ACPI_ALLOCATE_BUFFER, NULL }; - union acpi_object *ret_obj __free(kfree) = NULL; struct acpi_buffer input = { size, buf }; acpi_status status; @@ -55,8 +54,9 @@ int lwmi_dev_evaluate_int(struct wmi_device *wdev, u8 instance, u32 method_id, if (ACPI_FAILURE(status)) return -EIO; + union acpi_object *ret_obj __free(kfree) = output.pointer; + if (retval) { - ret_obj = output.pointer; if (!ret_obj) return -ENODATA; From 55a279ae819adaea99a94c609f31970b70e0ec0c Mon Sep 17 00:00:00 2001 From: Rong Zhang Date: Sun, 10 May 2026 04:25:32 +0000 Subject: [PATCH 685/972] platform/x86: lenovo-wmi-other: Balance IDA id allocation and free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the IDA id is only freed on wmi-other device removal or failure to create firmware-attributes device, kset, or attributes. It leaks IDA ids if the wmi-other device is bound multiple times, as the unbind callback never frees the previously allocated IDA id. Additionally, if the wmi-other device has failed to create a firmware-attributes device before it gets removed, the wmi-device removal callback double frees the same IDA id. These bugs were found by sashiko.dev [1]. Fix them by moving ida_free() into lwmi_om_fw_attr_remove() so it is balanced with ida_alloc() in lwmi_om_fw_attr_add(). With them fixed, properly set and utilize the validity of priv->ida_id to balance firmware-attributes registration and removal, without relying on propagating the registration error to the component framework, which is more reliable and aligns with the hwmon device registration and removal sequences. No functional change intended. Reviewed-by: Mark Pearson Fixes: edc4b183b794 ("platform/x86: Add Lenovo Other Mode WMI Driver") Cc: stable@vger.kernel.org Link: https://sashiko.dev/#/patchset/20260331181208.421552-1-derekjohn.clark%40gmail.com [1] Signed-off-by: Rong Zhang Signed-off-by: Derek J. Clark Link: https://patch.msgid.link/20260510042546.436874-3-derekjohn.clark@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/wmi-other.c | 36 ++++++++++++++----------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/drivers/platform/x86/lenovo/wmi-other.c b/drivers/platform/x86/lenovo/wmi-other.c index 6c2febe1a595ca..ebef3649d0a78d 100644 --- a/drivers/platform/x86/lenovo/wmi-other.c +++ b/drivers/platform/x86/lenovo/wmi-other.c @@ -959,17 +959,17 @@ static struct capdata01_attr_group cd01_attr_groups[] = { /** * lwmi_om_fw_attr_add() - Register all firmware_attributes_class members * @priv: The Other Mode driver data. - * - * Return: Either 0, or an error code. */ -static int lwmi_om_fw_attr_add(struct lwmi_om_priv *priv) +static void lwmi_om_fw_attr_add(struct lwmi_om_priv *priv) { unsigned int i; int err; - priv->ida_id = ida_alloc(&lwmi_om_ida, GFP_KERNEL); - if (priv->ida_id < 0) - return priv->ida_id; + err = ida_alloc(&lwmi_om_ida, GFP_KERNEL); + if (err < 0) + goto err_no_ida; + + priv->ida_id = err; priv->fw_attr_dev = device_create(&firmware_attributes_class, NULL, MKDEV(0, 0), NULL, "%s-%u", @@ -995,7 +995,7 @@ static int lwmi_om_fw_attr_add(struct lwmi_om_priv *priv) cd01_attr_groups[i].tunable_attr->dev = &priv->wdev->dev; } - return 0; + return; err_remove_groups: while (i--) @@ -1009,7 +1009,12 @@ static int lwmi_om_fw_attr_add(struct lwmi_om_priv *priv) err_free_ida: ida_free(&lwmi_om_ida, priv->ida_id); - return err; + +err_no_ida: + priv->ida_id = -EIDRM; + + dev_warn(&priv->wdev->dev, + "failed to register firmware-attributes device: %d\n", err); } /** @@ -1018,12 +1023,17 @@ static int lwmi_om_fw_attr_add(struct lwmi_om_priv *priv) */ static void lwmi_om_fw_attr_remove(struct lwmi_om_priv *priv) { + if (priv->ida_id < 0) + return; + for (unsigned int i = 0; i < ARRAY_SIZE(cd01_attr_groups) - 1; i++) sysfs_remove_group(&priv->fw_attr_kset->kobj, cd01_attr_groups[i].attr_group); kset_unregister(priv->fw_attr_kset); device_unregister(priv->fw_attr_dev); + ida_free(&lwmi_om_ida, priv->ida_id); + priv->ida_id = -EIDRM; } /* ======== Self (master: lenovo-wmi-other) ======== */ @@ -1065,7 +1075,9 @@ static int lwmi_om_master_bind(struct device *dev) lwmi_om_fan_info_collect_cd00(priv); - return lwmi_om_fw_attr_add(priv); + lwmi_om_fw_attr_add(priv); + + return 0; } /** @@ -1117,13 +1129,7 @@ static int lwmi_other_probe(struct wmi_device *wdev, const void *context) static void lwmi_other_remove(struct wmi_device *wdev) { - struct lwmi_om_priv *priv = dev_get_drvdata(&wdev->dev); - component_master_del(&wdev->dev, &lwmi_om_master_ops); - - /* No IDA to free if the driver is never bound to its components. */ - if (priv->ida_id >= 0) - ida_free(&lwmi_om_ida, priv->ida_id); } static const struct wmi_device_id lwmi_other_id_table[] = { From 2fe2504abcfa4f82a4208e8d0c21ec0f22baca43 Mon Sep 17 00:00:00 2001 From: Rong Zhang Date: Sun, 10 May 2026 04:25:33 +0000 Subject: [PATCH 686/972] platform/x86: lenovo-wmi-other: Balance component bind and unbind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When lwmi_om_master_bind() fails, the master device's components are left bound, with the aggregate device destroyed due to the failure (found by sashiko.dev [1]). Balance calls to component_bind_all() and component_unbind_all() when an error is propagated to the component framework. No functional change intended. Reviewed-by: Mark Pearson Reviewed-by: Ilpo Järvinen Fixes: edc4b183b794 ("platform/x86: Add Lenovo Other Mode WMI Driver") Cc: stable@vger.kernel.org Link: https://sashiko.dev/#/patchset/20260331181208.421552-1-derekjohn.clark%40gmail.com [1] Signed-off-by: Rong Zhang Signed-off-by: Derek J. Clark Link: https://patch.msgid.link/20260510042546.436874-4-derekjohn.clark@gmail.com Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/wmi-other.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/lenovo/wmi-other.c b/drivers/platform/x86/lenovo/wmi-other.c index ebef3649d0a78d..70fcb8406c274f 100644 --- a/drivers/platform/x86/lenovo/wmi-other.c +++ b/drivers/platform/x86/lenovo/wmi-other.c @@ -1070,8 +1070,11 @@ static int lwmi_om_master_bind(struct device *dev) priv->cd00_list = binder.cd00_list; priv->cd01_list = binder.cd01_list; - if (!priv->cd00_list || !priv->cd01_list) + if (!priv->cd00_list || !priv->cd01_list) { + component_unbind_all(dev, NULL); + return -ENODEV; + } lwmi_om_fan_info_collect_cd00(priv); From 816fbd5dacee977ca56bab79bf97f71f2f7ac24e Mon Sep 17 00:00:00 2001 From: "Derek J. Clark" Date: Sun, 10 May 2026 04:25:34 +0000 Subject: [PATCH 687/972] platform/x86: lenovo-wmi-other: Zero initialize WMI arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds explicit initialization of wmi_method_args_32 declarations with zero values to prevent uninitialized data from being sent to the device BIOS when passed. No functional change intended. Reviewed-by: Mark Pearson Fixes: 22024ac5366f ("platform/x86: Add Lenovo Gamezone WMI Driver") Fixes: edc4b183b794 ("platform/x86: Add Lenovo Other Mode WMI Driver") Reported-by: Rong Zhang Closes: https://lore.kernel.org/platform-driver-x86/95c7e7b539dd0af41189c754fcd35cec5b6fe182.camel@rong.moe/ Cc: stable@vger.kernel.org Reviewed-by: Rong Zhang Tested-by: Rong Zhang Signed-off-by: Derek J. Clark Link: https://patch.msgid.link/20260510042546.436874-5-derekjohn.clark@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/wmi-gamezone.c | 2 +- drivers/platform/x86/lenovo/wmi-other.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/lenovo/wmi-gamezone.c b/drivers/platform/x86/lenovo/wmi-gamezone.c index c7fe7e3c9f1791..098239c9311a6b 100644 --- a/drivers/platform/x86/lenovo/wmi-gamezone.c +++ b/drivers/platform/x86/lenovo/wmi-gamezone.c @@ -201,7 +201,7 @@ static int lwmi_gz_profile_set(struct device *dev, enum platform_profile_option profile) { struct lwmi_gz_priv *priv = dev_get_drvdata(dev); - struct wmi_method_args_32 args; + struct wmi_method_args_32 args = {}; enum thermal_mode mode; int ret; diff --git a/drivers/platform/x86/lenovo/wmi-other.c b/drivers/platform/x86/lenovo/wmi-other.c index 70fcb8406c274f..c1b429269f8913 100644 --- a/drivers/platform/x86/lenovo/wmi-other.c +++ b/drivers/platform/x86/lenovo/wmi-other.c @@ -166,7 +166,7 @@ MODULE_PARM_DESC(relax_fan_constraint, */ static int lwmi_om_fan_get_set(struct lwmi_om_priv *priv, int channel, u32 *val, bool set) { - struct wmi_method_args_32 args; + struct wmi_method_args_32 args = {}; u32 method_id, retval; int err; @@ -775,7 +775,7 @@ static ssize_t attr_current_value_store(struct kobject *kobj, struct tunable_attr_01 *tunable_attr) { struct lwmi_om_priv *priv = dev_get_drvdata(tunable_attr->dev); - struct wmi_method_args_32 args; + struct wmi_method_args_32 args = {}; struct capdata01 capdata; enum thermal_mode mode; u32 attribute_id; @@ -838,7 +838,7 @@ static ssize_t attr_current_value_show(struct kobject *kobj, struct tunable_attr_01 *tunable_attr) { struct lwmi_om_priv *priv = dev_get_drvdata(tunable_attr->dev); - struct wmi_method_args_32 args; + struct wmi_method_args_32 args = {}; enum thermal_mode mode; u32 attribute_id; int retval; From 71f3843e0f81e3c097a088c1121154bb9a44da0a Mon Sep 17 00:00:00 2001 From: "Derek J. Clark" Date: Sun, 10 May 2026 04:25:35 +0000 Subject: [PATCH 688/972] platform/x86: lenovo-wmi-other: Fix tunable_attr_01 struct members MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In struct tunable_attr_01 the capdata pointer is unused and the size of the id members is u32 when it should be u8. Fix these prior to adding additional members. No functional change intended. Reviewed-by: Mark Pearson Cc: stable@vger.kernel.org Reviewed-by: Rong Zhang Tested-by: Rong Zhang Signed-off-by: Derek J. Clark Link: https://patch.msgid.link/20260510042546.436874-6-derekjohn.clark@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/wmi-other.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/lenovo/wmi-other.c b/drivers/platform/x86/lenovo/wmi-other.c index c1b429269f8913..526075ff52d069 100644 --- a/drivers/platform/x86/lenovo/wmi-other.c +++ b/drivers/platform/x86/lenovo/wmi-other.c @@ -548,11 +548,10 @@ static void lwmi_om_fan_info_collect_cd_fan(struct device *dev, struct cd_list * /* ======== fw_attributes (component: lenovo-wmi-capdata 01) ======== */ struct tunable_attr_01 { - struct capdata01 *capdata; struct device *dev; - u32 feature_id; - u32 device_id; - u32 type_id; + u8 feature_id; + u8 device_id; + u8 type_id; }; static struct tunable_attr_01 ppt_pl1_spl = { From e8d5460ad3fd22409f2566ecbe2c82d94aabc246 Mon Sep 17 00:00:00 2001 From: Rong Zhang Date: Sun, 10 May 2026 04:25:36 +0000 Subject: [PATCH 689/972] platform/x86: lenovo: Decouple lenovo-wmi-gamezone and lenovo-wmi-other MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, lenovo-wmi-gamezone depends on lenovo-wmi-other as the former imports symbols from the latter. The imported symbols are just used to register a notifier block. However, there is no runtime dependency between both drivers, and either of them can run without the other, which is the major purpose of using the notifier framework. Such a link-time dependency is non-optimal. A previous attempt to "fix" it made LENOVO_WMI_GAMEZONE select LENOVO_WMI_TUNING, which was fundamentally broken and resulted in undefined Kconfig behavior, as `select' cannot be used on a symbol with potentially unmet dependencies. Decouple both drivers by moving the thermal mode notifier chain to lenovo-wmi-helpers. Methods for notifier block (un)registration are exported for lenovo-wmi-gamezone, while a method for querying the current thermal mode are exported for lenovo-wmi-other. This turns the dependency graph from +------------ lenovo-wmi-gamezone | | v | lenovo-wmi-helpers | ^ | | V +------------ lenovo-wmi-other into +------------ lenovo-wmi-gamezone | v lenovo-wmi-helpers ^ | +------------ lenovo-wmi-other To make it clear, the name of the notifier chain is also renamed from `om_chain_head' to `tm_chain_head', indicating that it's used to query the current thermal mode. No functional change intended. Reviewed-by: Mark Pearson Fixes: 6e38b9fcbfa3 ("platform/x86: lenovo: gamezone needs "other mode"") Cc: stable@vger.kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202603252259.gHvJDyh3-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202603260302.X0NjQOda-lkp@intel.com/ Signed-off-by: Rong Zhang Signed-off-by: Derek J. Clark Link: https://patch.msgid.link/20260510042546.436874-7-derekjohn.clark@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/Kconfig | 1 - drivers/platform/x86/lenovo/wmi-gamezone.c | 4 +- drivers/platform/x86/lenovo/wmi-helpers.c | 101 ++++++++++++++++++++ drivers/platform/x86/lenovo/wmi-helpers.h | 8 ++ drivers/platform/x86/lenovo/wmi-other.c | 104 +-------------------- drivers/platform/x86/lenovo/wmi-other.h | 16 ---- 6 files changed, 112 insertions(+), 122 deletions(-) delete mode 100644 drivers/platform/x86/lenovo/wmi-other.h diff --git a/drivers/platform/x86/lenovo/Kconfig b/drivers/platform/x86/lenovo/Kconfig index f885127b007f19..09b1b055d2e014 100644 --- a/drivers/platform/x86/lenovo/Kconfig +++ b/drivers/platform/x86/lenovo/Kconfig @@ -252,7 +252,6 @@ config LENOVO_WMI_GAMEZONE select ACPI_PLATFORM_PROFILE select LENOVO_WMI_EVENTS select LENOVO_WMI_HELPERS - select LENOVO_WMI_TUNING help Say Y here if you have a WMI aware Lenovo Legion device and would like to use the platform-profile firmware interface to manage power usage. diff --git a/drivers/platform/x86/lenovo/wmi-gamezone.c b/drivers/platform/x86/lenovo/wmi-gamezone.c index 098239c9311a6b..c28785d25673df 100644 --- a/drivers/platform/x86/lenovo/wmi-gamezone.c +++ b/drivers/platform/x86/lenovo/wmi-gamezone.c @@ -23,7 +23,6 @@ #include "wmi-events.h" #include "wmi-gamezone.h" #include "wmi-helpers.h" -#include "wmi-other.h" #define LENOVO_GAMEZONE_GUID "887B54E3-DDDC-4B2C-8B88-68A26A8835D0" @@ -383,7 +382,7 @@ static int lwmi_gz_probe(struct wmi_device *wdev, const void *context) return ret; priv->mode_nb.notifier_call = lwmi_gz_mode_call; - return devm_lwmi_om_register_notifier(&wdev->dev, &priv->mode_nb); + return devm_lwmi_tm_register_notifier(&wdev->dev, &priv->mode_nb); } static const struct wmi_device_id lwmi_gz_id_table[] = { @@ -405,7 +404,6 @@ module_wmi_driver(lwmi_gz_driver); MODULE_IMPORT_NS("LENOVO_WMI_EVENTS"); MODULE_IMPORT_NS("LENOVO_WMI_HELPERS"); -MODULE_IMPORT_NS("LENOVO_WMI_OTHER"); MODULE_DEVICE_TABLE(wmi, lwmi_gz_id_table); MODULE_AUTHOR("Derek J. Clark "); MODULE_DESCRIPTION("Lenovo GameZone WMI Driver"); diff --git a/drivers/platform/x86/lenovo/wmi-helpers.c b/drivers/platform/x86/lenovo/wmi-helpers.c index 018d7642e2bd5b..7a198259e3933c 100644 --- a/drivers/platform/x86/lenovo/wmi-helpers.c +++ b/drivers/platform/x86/lenovo/wmi-helpers.c @@ -21,11 +21,15 @@ #include #include #include +#include #include #include #include "wmi-helpers.h" +/* Thermal mode notifier chain. */ +static BLOCKING_NOTIFIER_HEAD(tm_chain_head); + /** * lwmi_dev_evaluate_int() - Helper function for calling WMI methods that * return an integer. @@ -84,6 +88,103 @@ int lwmi_dev_evaluate_int(struct wmi_device *wdev, u8 instance, u32 method_id, }; EXPORT_SYMBOL_NS_GPL(lwmi_dev_evaluate_int, "LENOVO_WMI_HELPERS"); +/** + * lwmi_tm_register_notifier() - Add a notifier to the blocking notifier chain + * @nb: The notifier_block struct to register + * + * Call blocking_notifier_chain_register to register the notifier block to the + * thermal mode notifier chain. + * + * Return: 0 on success, %-EEXIST on error. + */ +int lwmi_tm_register_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&tm_chain_head, nb); +} +EXPORT_SYMBOL_NS_GPL(lwmi_tm_register_notifier, "LENOVO_WMI_HELPERS"); + +/** + * lwmi_tm_unregister_notifier() - Remove a notifier from the blocking notifier + * chain. + * @nb: The notifier_block struct to register + * + * Call blocking_notifier_chain_unregister to unregister the notifier block from the + * thermal mode notifier chain. + * + * Return: 0 on success, %-ENOENT on error. + */ +int lwmi_tm_unregister_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&tm_chain_head, nb); +} +EXPORT_SYMBOL_NS_GPL(lwmi_tm_unregister_notifier, "LENOVO_WMI_HELPERS"); + +/** + * devm_lwmi_tm_unregister_notifier() - Remove a notifier from the blocking + * notifier chain. + * @data: Void pointer to the notifier_block struct to register. + * + * Call lwmi_tm_unregister_notifier to unregister the notifier block from the + * thermal mode notifier chain. + * + * Return: 0 on success, %-ENOENT on error. + */ +static void devm_lwmi_tm_unregister_notifier(void *data) +{ + struct notifier_block *nb = data; + + lwmi_tm_unregister_notifier(nb); +} + +/** + * devm_lwmi_tm_register_notifier() - Add a notifier to the blocking notifier + * chain. + * @dev: The parent device of the notifier_block struct. + * @nb: The notifier_block struct to register + * + * Call lwmi_tm_register_notifier to register the notifier block to the + * thermal mode notifier chain. Then add devm_lwmi_tm_unregister_notifier + * as a device managed action to automatically unregister the notifier block + * upon parent device removal. + * + * Return: 0 on success, or an error code. + */ +int devm_lwmi_tm_register_notifier(struct device *dev, + struct notifier_block *nb) +{ + int ret; + + ret = lwmi_tm_register_notifier(nb); + if (ret < 0) + return ret; + + return devm_add_action_or_reset(dev, devm_lwmi_tm_unregister_notifier, + nb); +} +EXPORT_SYMBOL_NS_GPL(devm_lwmi_tm_register_notifier, "LENOVO_WMI_HELPERS"); + +/** + * lwmi_tm_notifier_call() - Call functions for the notifier call chain. + * @mode: Pointer to a thermal mode enum to retrieve the data from. + * + * Call blocking_notifier_call_chain to retrieve the thermal mode from the + * lenovo-wmi-gamezone driver. + * + * Return: 0 on success, or an error code. + */ +int lwmi_tm_notifier_call(enum thermal_mode *mode) +{ + int ret; + + ret = blocking_notifier_call_chain(&tm_chain_head, + LWMI_GZ_GET_THERMAL_MODE, &mode); + if ((ret & ~NOTIFY_STOP_MASK) != NOTIFY_OK) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(lwmi_tm_notifier_call, "LENOVO_WMI_HELPERS"); + MODULE_AUTHOR("Derek J. Clark "); MODULE_DESCRIPTION("Lenovo WMI Helpers Driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/platform/x86/lenovo/wmi-helpers.h b/drivers/platform/x86/lenovo/wmi-helpers.h index 20fd217498035d..651a039228ed90 100644 --- a/drivers/platform/x86/lenovo/wmi-helpers.h +++ b/drivers/platform/x86/lenovo/wmi-helpers.h @@ -7,6 +7,8 @@ #include +struct device; +struct notifier_block; struct wmi_device; struct wmi_method_args_32 { @@ -17,4 +19,10 @@ struct wmi_method_args_32 { int lwmi_dev_evaluate_int(struct wmi_device *wdev, u8 instance, u32 method_id, unsigned char *buf, size_t size, u32 *retval); +int lwmi_tm_register_notifier(struct notifier_block *nb); +int lwmi_tm_unregister_notifier(struct notifier_block *nb); +int devm_lwmi_tm_register_notifier(struct device *dev, + struct notifier_block *nb); +int lwmi_tm_notifier_call(enum thermal_mode *mode); + #endif /* !_LENOVO_WMI_HELPERS_H_ */ diff --git a/drivers/platform/x86/lenovo/wmi-other.c b/drivers/platform/x86/lenovo/wmi-other.c index 526075ff52d069..292fed8bcc807b 100644 --- a/drivers/platform/x86/lenovo/wmi-other.c +++ b/drivers/platform/x86/lenovo/wmi-other.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include @@ -49,7 +48,6 @@ #include "wmi-events.h" #include "wmi-gamezone.h" #include "wmi-helpers.h" -#include "wmi-other.h" #include "../firmware_attributes_class.h" #define LENOVO_OTHER_MODE_GUID "DC2A8805-3A8C-41BA-A6F7-092E0089CD3B" @@ -81,7 +79,6 @@ #define LWMI_OM_FW_ATTR_BASE_PATH "lenovo-wmi-other" #define LWMI_OM_HWMON_NAME "lenovo_wmi_other" -static BLOCKING_NOTIFIER_HEAD(om_chain_head); static DEFINE_IDA(lwmi_om_ida); enum attribute_property { @@ -109,7 +106,6 @@ struct lwmi_om_priv { struct device *hwmon_dev; struct device *fw_attr_dev; struct kset *fw_attr_kset; - struct notifier_block nb; struct wmi_device *wdev; int ida_id; @@ -577,102 +573,6 @@ struct capdata01_attr_group { struct tunable_attr_01 *tunable_attr; }; -/** - * lwmi_om_register_notifier() - Add a notifier to the blocking notifier chain - * @nb: The notifier_block struct to register - * - * Call blocking_notifier_chain_register to register the notifier block to the - * lenovo-wmi-other driver notifier chain. - * - * Return: 0 on success, %-EEXIST on error. - */ -int lwmi_om_register_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_register(&om_chain_head, nb); -} -EXPORT_SYMBOL_NS_GPL(lwmi_om_register_notifier, "LENOVO_WMI_OTHER"); - -/** - * lwmi_om_unregister_notifier() - Remove a notifier from the blocking notifier - * chain. - * @nb: The notifier_block struct to register - * - * Call blocking_notifier_chain_unregister to unregister the notifier block from the - * lenovo-wmi-other driver notifier chain. - * - * Return: 0 on success, %-ENOENT on error. - */ -int lwmi_om_unregister_notifier(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister(&om_chain_head, nb); -} -EXPORT_SYMBOL_NS_GPL(lwmi_om_unregister_notifier, "LENOVO_WMI_OTHER"); - -/** - * devm_lwmi_om_unregister_notifier() - Remove a notifier from the blocking - * notifier chain. - * @data: Void pointer to the notifier_block struct to register. - * - * Call lwmi_om_unregister_notifier to unregister the notifier block from the - * lenovo-wmi-other driver notifier chain. - * - * Return: 0 on success, %-ENOENT on error. - */ -static void devm_lwmi_om_unregister_notifier(void *data) -{ - struct notifier_block *nb = data; - - lwmi_om_unregister_notifier(nb); -} - -/** - * devm_lwmi_om_register_notifier() - Add a notifier to the blocking notifier - * chain. - * @dev: The parent device of the notifier_block struct. - * @nb: The notifier_block struct to register - * - * Call lwmi_om_register_notifier to register the notifier block to the - * lenovo-wmi-other driver notifier chain. Then add devm_lwmi_om_unregister_notifier - * as a device managed action to automatically unregister the notifier block - * upon parent device removal. - * - * Return: 0 on success, or an error code. - */ -int devm_lwmi_om_register_notifier(struct device *dev, - struct notifier_block *nb) -{ - int ret; - - ret = lwmi_om_register_notifier(nb); - if (ret < 0) - return ret; - - return devm_add_action_or_reset(dev, devm_lwmi_om_unregister_notifier, - nb); -} -EXPORT_SYMBOL_NS_GPL(devm_lwmi_om_register_notifier, "LENOVO_WMI_OTHER"); - -/** - * lwmi_om_notifier_call() - Call functions for the notifier call chain. - * @mode: Pointer to a thermal mode enum to retrieve the data from. - * - * Call blocking_notifier_call_chain to retrieve the thermal mode from the - * lenovo-wmi-gamezone driver. - * - * Return: 0 on success, or an error code. - */ -static int lwmi_om_notifier_call(enum thermal_mode *mode) -{ - int ret; - - ret = blocking_notifier_call_chain(&om_chain_head, - LWMI_GZ_GET_THERMAL_MODE, &mode); - if ((ret & ~NOTIFY_STOP_MASK) != NOTIFY_OK) - return -EINVAL; - - return 0; -} - /* Attribute Methods */ /** @@ -781,7 +681,7 @@ static ssize_t attr_current_value_store(struct kobject *kobj, u32 value; int ret; - ret = lwmi_om_notifier_call(&mode); + ret = lwmi_tm_notifier_call(&mode); if (ret) return ret; @@ -843,7 +743,7 @@ static ssize_t attr_current_value_show(struct kobject *kobj, int retval; int ret; - ret = lwmi_om_notifier_call(&mode); + ret = lwmi_tm_notifier_call(&mode); if (ret) return ret; diff --git a/drivers/platform/x86/lenovo/wmi-other.h b/drivers/platform/x86/lenovo/wmi-other.h deleted file mode 100644 index 8ebf5602bb9970..00000000000000 --- a/drivers/platform/x86/lenovo/wmi-other.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ - -/* Copyright (C) 2025 Derek J. Clark */ - -#ifndef _LENOVO_WMI_OTHER_H_ -#define _LENOVO_WMI_OTHER_H_ - -struct device; -struct notifier_block; - -int lwmi_om_register_notifier(struct notifier_block *nb); -int lwmi_om_unregister_notifier(struct notifier_block *nb); -int devm_lwmi_om_register_notifier(struct device *dev, - struct notifier_block *nb); - -#endif /* !_LENOVO_WMI_OTHER_H_ */ From 7e27896e16a1c450085c3fe020eeb1b223880f37 Mon Sep 17 00:00:00 2001 From: "Derek J. Clark" Date: Sun, 10 May 2026 04:25:37 +0000 Subject: [PATCH 690/972] platform/x86: lenovo-wmi-helpers: Move gamezone enums to wmi-helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In a later patch in the series the thermal mode enum will be accessed across three separate drivers (wmi-capdata, wmi-gamezonem and wmi-other). An additional patch in the series will also add a function prototype that needs to reference this enum in wmi-helpers.h. To avoid having all these drivers begin to import each others headers, and to avoid declaring an opaque enum to hande the second case, move the thermal mode enum to helpers where it can be safely accessed by everything that needs it from a single import. While at it, since the gamezone_events_type enum is the only remaining item in the header, move that as well and remove the gamezone header entirely. Cc: stable@vger.kernel.org Reviewed-by: Mark Pearson Reviewed-by: Rong Zhang Tested-by: Rong Zhang Signed-off-by: Derek J. Clark Link: https://patch.msgid.link/20260510042546.436874-8-derekjohn.clark@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/wmi-events.c | 2 +- drivers/platform/x86/lenovo/wmi-gamezone.c | 1 - drivers/platform/x86/lenovo/wmi-gamezone.h | 20 -------------------- drivers/platform/x86/lenovo/wmi-helpers.h | 13 +++++++++++++ drivers/platform/x86/lenovo/wmi-other.c | 1 - 5 files changed, 14 insertions(+), 23 deletions(-) delete mode 100644 drivers/platform/x86/lenovo/wmi-gamezone.h diff --git a/drivers/platform/x86/lenovo/wmi-events.c b/drivers/platform/x86/lenovo/wmi-events.c index 4a6a2c82413acf..fc25bba68a7c67 100644 --- a/drivers/platform/x86/lenovo/wmi-events.c +++ b/drivers/platform/x86/lenovo/wmi-events.c @@ -17,7 +17,7 @@ #include #include "wmi-events.h" -#include "wmi-gamezone.h" +#include "wmi-helpers.h" #define THERMAL_MODE_EVENT_GUID "D320289E-8FEA-41E0-86F9-911D83151B5F" diff --git a/drivers/platform/x86/lenovo/wmi-gamezone.c b/drivers/platform/x86/lenovo/wmi-gamezone.c index c28785d25673df..109c0b564a9f6b 100644 --- a/drivers/platform/x86/lenovo/wmi-gamezone.c +++ b/drivers/platform/x86/lenovo/wmi-gamezone.c @@ -21,7 +21,6 @@ #include #include "wmi-events.h" -#include "wmi-gamezone.h" #include "wmi-helpers.h" #define LENOVO_GAMEZONE_GUID "887B54E3-DDDC-4B2C-8B88-68A26A8835D0" diff --git a/drivers/platform/x86/lenovo/wmi-gamezone.h b/drivers/platform/x86/lenovo/wmi-gamezone.h deleted file mode 100644 index 6b163a5eeb959d..00000000000000 --- a/drivers/platform/x86/lenovo/wmi-gamezone.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ - -/* Copyright (C) 2025 Derek J. Clark */ - -#ifndef _LENOVO_WMI_GAMEZONE_H_ -#define _LENOVO_WMI_GAMEZONE_H_ - -enum gamezone_events_type { - LWMI_GZ_GET_THERMAL_MODE = 1, -}; - -enum thermal_mode { - LWMI_GZ_THERMAL_MODE_QUIET = 0x01, - LWMI_GZ_THERMAL_MODE_BALANCED = 0x02, - LWMI_GZ_THERMAL_MODE_PERFORMANCE = 0x03, - LWMI_GZ_THERMAL_MODE_EXTREME = 0xE0, /* Ver 6+ */ - LWMI_GZ_THERMAL_MODE_CUSTOM = 0xFF, -}; - -#endif /* !_LENOVO_WMI_GAMEZONE_H_ */ diff --git a/drivers/platform/x86/lenovo/wmi-helpers.h b/drivers/platform/x86/lenovo/wmi-helpers.h index 651a039228ed90..ed7db3ebba6cc1 100644 --- a/drivers/platform/x86/lenovo/wmi-helpers.h +++ b/drivers/platform/x86/lenovo/wmi-helpers.h @@ -16,6 +16,19 @@ struct wmi_method_args_32 { u32 arg1; }; +enum lwmi_event_type { + LWMI_GZ_GET_THERMAL_MODE = 0x01, +}; + +enum thermal_mode { + LWMI_GZ_THERMAL_MODE_NONE = 0x00, + LWMI_GZ_THERMAL_MODE_QUIET = 0x01, + LWMI_GZ_THERMAL_MODE_BALANCED = 0x02, + LWMI_GZ_THERMAL_MODE_PERFORMANCE = 0x03, + LWMI_GZ_THERMAL_MODE_EXTREME = 0xE0, /* Ver 6+ */ + LWMI_GZ_THERMAL_MODE_CUSTOM = 0xFF, +}; + int lwmi_dev_evaluate_int(struct wmi_device *wdev, u8 instance, u32 method_id, unsigned char *buf, size_t size, u32 *retval); diff --git a/drivers/platform/x86/lenovo/wmi-other.c b/drivers/platform/x86/lenovo/wmi-other.c index 292fed8bcc807b..19f48aeda22844 100644 --- a/drivers/platform/x86/lenovo/wmi-other.c +++ b/drivers/platform/x86/lenovo/wmi-other.c @@ -46,7 +46,6 @@ #include "wmi-capdata.h" #include "wmi-events.h" -#include "wmi-gamezone.h" #include "wmi-helpers.h" #include "../firmware_attributes_class.h" From 30a4ad208a7f7bdb790cd31d368595890045334f Mon Sep 17 00:00:00 2001 From: "Derek J. Clark" Date: Sun, 10 May 2026 04:25:38 +0000 Subject: [PATCH 691/972] platform/x86: lenovo-wmi-other: Add Attribute ID helper functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds lwmi_attr_id() function. In the same vein as LWMI_ATTR_ID_FAN_RPM(), but as a generic, to de-duplicate attribute_id assignment boilerplate. Adds tunable_attr_01_id() function that breaks out the members of a tunable_attr_01 struct and passes them to lwmi_attr_id(). No functional change intended. Cc: stable@vger.kernel.org Reviewed-by: Rong Zhang Tested-by: Rong Zhang Reviewed-by: Mark Pearson Signed-off-by: Derek J. Clark Link: https://patch.msgid.link/20260510042546.436874-9-derekjohn.clark@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/wmi-capdata.c | 8 ++-- drivers/platform/x86/lenovo/wmi-capdata.h | 20 +++++++++ drivers/platform/x86/lenovo/wmi-other.c | 49 +++++++++-------------- 3 files changed, 44 insertions(+), 33 deletions(-) diff --git a/drivers/platform/x86/lenovo/wmi-capdata.c b/drivers/platform/x86/lenovo/wmi-capdata.c index b73d378f0e8b3e..714aa6fd6f1fcb 100644 --- a/drivers/platform/x86/lenovo/wmi-capdata.c +++ b/drivers/platform/x86/lenovo/wmi-capdata.c @@ -27,7 +27,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include #include #include #include @@ -48,6 +47,7 @@ #include #include "wmi-capdata.h" +#include "wmi-helpers.h" #define LENOVO_CAPABILITY_DATA_00_GUID "362A3AFE-3D96-4665-8530-96DAD5BB300E" #define LENOVO_CAPABILITY_DATA_01_GUID "7A8F5407-CB67-4D6E-B547-39B3BE018154" @@ -57,9 +57,9 @@ #define LWMI_FEATURE_ID_FAN_TEST 0x05 -#define LWMI_ATTR_ID_FAN_TEST \ - (FIELD_PREP(LWMI_ATTR_DEV_ID_MASK, LWMI_DEVICE_ID_FAN) | \ - FIELD_PREP(LWMI_ATTR_FEAT_ID_MASK, LWMI_FEATURE_ID_FAN_TEST)) +#define LWMI_ATTR_ID_FAN_TEST \ + lwmi_attr_id(LWMI_DEVICE_ID_FAN, LWMI_FEATURE_ID_FAN_TEST, \ + LWMI_GZ_THERMAL_MODE_NONE, LWMI_TYPE_ID_NONE) enum lwmi_cd_type { LENOVO_CAPABILITY_DATA_00, diff --git a/drivers/platform/x86/lenovo/wmi-capdata.h b/drivers/platform/x86/lenovo/wmi-capdata.h index 8c1df3efcc5533..c3e760b8c3c3df 100644 --- a/drivers/platform/x86/lenovo/wmi-capdata.h +++ b/drivers/platform/x86/lenovo/wmi-capdata.h @@ -6,6 +6,7 @@ #define _LENOVO_WMI_CAPDATA_H_ #include +#include #include #define LWMI_SUPP_VALID BIT(0) @@ -19,6 +20,8 @@ #define LWMI_DEVICE_ID_FAN 0x04 +#define LWMI_TYPE_ID_NONE 0x00 + struct component_match; struct device; struct cd_list; @@ -57,6 +60,23 @@ struct lwmi_cd_binder { cd_list_cb_t cd_fan_list_cb; }; +/** + * lwmi_attr_id() - Formats a capability data attribute ID + * @dev_id: The u8 corresponding to the device ID. + * @feat_id: The u8 corresponding to the feature ID on the device. + * @mode_id: The u8 corresponding to the wmi-gamezone mode for set/get. + * @type_id: The u8 corresponding to the sub-device. + * + * Return: encoded capability data attribute ID. + */ +static inline u32 lwmi_attr_id(u8 dev_id, u8 feat_id, u8 mode_id, u8 type_id) +{ + return (FIELD_PREP(LWMI_ATTR_DEV_ID_MASK, dev_id) | + FIELD_PREP(LWMI_ATTR_FEAT_ID_MASK, feat_id) | + FIELD_PREP(LWMI_ATTR_MODE_ID_MASK, mode_id) | + FIELD_PREP(LWMI_ATTR_TYPE_ID_MASK, type_id)); +} + void lwmi_cd_match_add_all(struct device *master, struct component_match **matchptr); int lwmi_cd00_get_data(struct cd_list *list, u32 attribute_id, struct capdata00 *output); int lwmi_cd01_get_data(struct cd_list *list, u32 attribute_id, struct capdata01 *output); diff --git a/drivers/platform/x86/lenovo/wmi-other.c b/drivers/platform/x86/lenovo/wmi-other.c index 19f48aeda22844..a06b188eadacf2 100644 --- a/drivers/platform/x86/lenovo/wmi-other.c +++ b/drivers/platform/x86/lenovo/wmi-other.c @@ -59,8 +59,6 @@ #define LWMI_FEATURE_ID_FAN_RPM 0x03 -#define LWMI_TYPE_ID_NONE 0x00 - #define LWMI_FEATURE_VALUE_GET 17 #define LWMI_FEATURE_VALUE_SET 18 @@ -68,13 +66,12 @@ #define LWMI_FAN_NR 4 #define LWMI_FAN_ID(x) ((x) + LWMI_FAN_ID_BASE) -#define LWMI_ATTR_ID_FAN_RPM(x) \ - (FIELD_PREP(LWMI_ATTR_DEV_ID_MASK, LWMI_DEVICE_ID_FAN) | \ - FIELD_PREP(LWMI_ATTR_FEAT_ID_MASK, LWMI_FEATURE_ID_FAN_RPM) | \ - FIELD_PREP(LWMI_ATTR_TYPE_ID_MASK, LWMI_FAN_ID(x))) - #define LWMI_FAN_DIV 100 +#define LWMI_ATTR_ID_FAN_RPM(x) \ + lwmi_attr_id(LWMI_DEVICE_ID_FAN, LWMI_FEATURE_ID_FAN_RPM, \ + LWMI_GZ_THERMAL_MODE_NONE, LWMI_FAN_ID(x)) + #define LWMI_OM_FW_ATTR_BASE_PATH "lenovo-wmi-other" #define LWMI_OM_HWMON_NAME "lenovo_wmi_other" @@ -549,6 +546,18 @@ struct tunable_attr_01 { u8 type_id; }; +/** + * tunable_attr_01_id() - Formats a tunable_attr_01 to a capdata attribute ID + * @attr: The tunable_attr_01 to format. + * @mode: The u8 corresponding to the wmi-gamezone mode for set/get. + * + * Return: encoded capability data attribute ID. + */ +static u32 tunable_attr_01_id(struct tunable_attr_01 *attr, u8 mode) +{ + return lwmi_attr_id(attr->device_id, attr->feature_id, mode, attr->type_id); +} + static struct tunable_attr_01 ppt_pl1_spl = { .device_id = LWMI_DEVICE_ID_CPU, .feature_id = LWMI_FEATURE_ID_CPU_SPL, @@ -616,12 +625,7 @@ static ssize_t attr_capdata01_show(struct kobject *kobj, u32 attribute_id; int value, ret; - attribute_id = - FIELD_PREP(LWMI_ATTR_DEV_ID_MASK, tunable_attr->device_id) | - FIELD_PREP(LWMI_ATTR_FEAT_ID_MASK, tunable_attr->feature_id) | - FIELD_PREP(LWMI_ATTR_MODE_ID_MASK, - LWMI_GZ_THERMAL_MODE_CUSTOM) | - FIELD_PREP(LWMI_ATTR_TYPE_ID_MASK, tunable_attr->type_id); + attribute_id = tunable_attr_01_id(tunable_attr, LWMI_GZ_THERMAL_MODE_CUSTOM); ret = lwmi_cd01_get_data(priv->cd01_list, attribute_id, &capdata); if (ret) @@ -676,7 +680,6 @@ static ssize_t attr_current_value_store(struct kobject *kobj, struct wmi_method_args_32 args = {}; struct capdata01 capdata; enum thermal_mode mode; - u32 attribute_id; u32 value; int ret; @@ -687,13 +690,9 @@ static ssize_t attr_current_value_store(struct kobject *kobj, if (mode != LWMI_GZ_THERMAL_MODE_CUSTOM) return -EBUSY; - attribute_id = - FIELD_PREP(LWMI_ATTR_DEV_ID_MASK, tunable_attr->device_id) | - FIELD_PREP(LWMI_ATTR_FEAT_ID_MASK, tunable_attr->feature_id) | - FIELD_PREP(LWMI_ATTR_MODE_ID_MASK, mode) | - FIELD_PREP(LWMI_ATTR_TYPE_ID_MASK, tunable_attr->type_id); + args.arg0 = tunable_attr_01_id(tunable_attr, mode); - ret = lwmi_cd01_get_data(priv->cd01_list, attribute_id, &capdata); + ret = lwmi_cd01_get_data(priv->cd01_list, args.arg0, &capdata); if (ret) return ret; @@ -704,7 +703,6 @@ static ssize_t attr_current_value_store(struct kobject *kobj, if (value < capdata.min_value || value > capdata.max_value) return -EINVAL; - args.arg0 = attribute_id; args.arg1 = value; ret = lwmi_dev_evaluate_int(priv->wdev, 0x0, LWMI_FEATURE_VALUE_SET, @@ -738,7 +736,6 @@ static ssize_t attr_current_value_show(struct kobject *kobj, struct lwmi_om_priv *priv = dev_get_drvdata(tunable_attr->dev); struct wmi_method_args_32 args = {}; enum thermal_mode mode; - u32 attribute_id; int retval; int ret; @@ -746,13 +743,7 @@ static ssize_t attr_current_value_show(struct kobject *kobj, if (ret) return ret; - attribute_id = - FIELD_PREP(LWMI_ATTR_DEV_ID_MASK, tunable_attr->device_id) | - FIELD_PREP(LWMI_ATTR_FEAT_ID_MASK, tunable_attr->feature_id) | - FIELD_PREP(LWMI_ATTR_MODE_ID_MASK, mode) | - FIELD_PREP(LWMI_ATTR_TYPE_ID_MASK, tunable_attr->type_id); - - args.arg0 = attribute_id; + args.arg0 = tunable_attr_01_id(tunable_attr, mode); ret = lwmi_dev_evaluate_int(priv->wdev, 0x0, LWMI_FEATURE_VALUE_GET, (unsigned char *)&args, sizeof(args), From 03bb5147da083cb91e5c8c2599fcb2f8fd05cb8f Mon Sep 17 00:00:00 2001 From: "Derek J. Clark" Date: Sun, 10 May 2026 04:25:39 +0000 Subject: [PATCH 692/972] platform/x86: lenovo-wmi-other: Limit adding attributes to supported devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds lwmi_is_attr_01_supported, and only creates the attribute subfolder if the attribute is supported by the hardware. Due to some poorly implemented BIOS this is a multi-step sequence of events. This is because: - Some BIOS support getting the capability data from custom mode (0xff), while others only support it in no-mode (0x00). - Some BIOS support get/set for the current value from custom mode (0xff), while others only support it in no-mode (0x00). - Some BIOS report capability data for a method that is not fully implemented. - Some BIOS have methods fully implemented, but no complimentary capability data. To ensure we only expose fully implemented methods with corresponding capability data, we check each outcome before reporting that an attribute can be supported. Checking for lwmi_is_attr_01_supported during remove is not done to ensure that we don't attempt to call cd01 or send WMI events if one of the interfaces being removed was the cause of the driver unloading. Fixes: edc4b183b794 ("platform/x86: Add Lenovo Other Mode WMI Driver") Reported-by: Kurt Borja Closes: https://lore.kernel.org/platform-driver-x86/DG60P3SHXR8H.3NSEHMZ6J7XRC@gmail.com/ Cc: stable@vger.kernel.org Reviewed-by: Rong Zhang Tested-by: Rong Zhang Reviewed-by: Mark Pearson Signed-off-by: Derek J. Clark Link: https://patch.msgid.link/20260510042546.436874-10-derekjohn.clark@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/lenovo/wmi-other.c | 92 +++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/lenovo/wmi-other.c b/drivers/platform/x86/lenovo/wmi-other.c index a06b188eadacf2..d318ba432fdcc1 100644 --- a/drivers/platform/x86/lenovo/wmi-other.c +++ b/drivers/platform/x86/lenovo/wmi-other.c @@ -544,6 +544,8 @@ struct tunable_attr_01 { u8 feature_id; u8 device_id; u8 type_id; + u8 cd_mode_id; /* mode arg for searching capdata */ + u8 cv_mode_id; /* mode arg for set/get current_value */ }; /** @@ -625,7 +627,7 @@ static ssize_t attr_capdata01_show(struct kobject *kobj, u32 attribute_id; int value, ret; - attribute_id = tunable_attr_01_id(tunable_attr, LWMI_GZ_THERMAL_MODE_CUSTOM); + attribute_id = tunable_attr_01_id(tunable_attr, tunable_attr->cd_mode_id); ret = lwmi_cd01_get_data(priv->cd01_list, attribute_id, &capdata); if (ret) @@ -690,7 +692,7 @@ static ssize_t attr_current_value_store(struct kobject *kobj, if (mode != LWMI_GZ_THERMAL_MODE_CUSTOM) return -EBUSY; - args.arg0 = tunable_attr_01_id(tunable_attr, mode); + args.arg0 = tunable_attr_01_id(tunable_attr, tunable_attr->cd_mode_id); ret = lwmi_cd01_get_data(priv->cd01_list, args.arg0, &capdata); if (ret) @@ -703,6 +705,7 @@ static ssize_t attr_current_value_store(struct kobject *kobj, if (value < capdata.min_value || value > capdata.max_value) return -EINVAL; + args.arg0 = tunable_attr_01_id(tunable_attr, tunable_attr->cv_mode_id); args.arg1 = value; ret = lwmi_dev_evaluate_int(priv->wdev, 0x0, LWMI_FEATURE_VALUE_SET, @@ -743,6 +746,10 @@ static ssize_t attr_current_value_show(struct kobject *kobj, if (ret) return ret; + /* If "no-mode" is the supported mode, ensure we never send current mode */ + if (tunable_attr->cv_mode_id == LWMI_GZ_THERMAL_MODE_NONE) + mode = tunable_attr->cv_mode_id; + args.arg0 = tunable_attr_01_id(tunable_attr, mode); ret = lwmi_dev_evaluate_int(priv->wdev, 0x0, LWMI_FEATURE_VALUE_GET, @@ -754,6 +761,81 @@ static ssize_t attr_current_value_show(struct kobject *kobj, return sysfs_emit(buf, "%d\n", retval); } +/** + * lwmi_attr_01_is_supported() - Determine if the given attribute is supported. + * @tunable_attr: The attribute to verify. + * + * For an attribute to be supported it must have a functional get/set method, + * as well as associated capability data stored in the capdata01 table. + * + * First check if the attribute has a corresponding data table under custom mode + * (0xff), then under no mode (0x00). If either of those passes, check if the + * supported field of the capdata struct is > 0. If it is supported, store the + * successful mode in the cd_mode_id field of tunable_attr. + * + * If the attribute capdata shows it is supported, attempt to determine the mode + * for the current value property get/set methods using a similar pattern to the + * capdata table check. If the value returned by either mode is 0 or an error, + * assume that mode is not supported. Otherwise, store the successful mode in the + * cv_mode_id field of tunable_attr. + * + * If any of the above checks fail then the attribute is not fully supported. + * + * Return: true if capdata and set/get modes are found, otherwise false. + */ +static bool lwmi_attr_01_is_supported(struct tunable_attr_01 *tunable_attr) +{ + u8 modes[2] = { LWMI_GZ_THERMAL_MODE_CUSTOM, LWMI_GZ_THERMAL_MODE_NONE }; + struct lwmi_om_priv *priv = dev_get_drvdata(tunable_attr->dev); + struct wmi_method_args_32 args = {}; + bool cd_mode_found = false; + bool cv_mode_found = false; + struct capdata01 capdata; + int retval, ret, i; + + /* Determine tunable_attr->cd_mode_id */ + for (i = 0; i < ARRAY_SIZE(modes); i++) { + args.arg0 = tunable_attr_01_id(tunable_attr, modes[i]); + + ret = lwmi_cd01_get_data(priv->cd01_list, args.arg0, &capdata); + if (ret || !capdata.supported) + continue; + + tunable_attr->cd_mode_id = modes[i]; + cd_mode_found = true; + break; + } + + if (!cd_mode_found) + return cd_mode_found; + + dev_dbg(tunable_attr->dev, + "cd_mode_id: %#010x\n", args.arg0); + + /* Determine tunable_attr->cv_mode_id, returns 1 if supported */ + for (i = 0; i < ARRAY_SIZE(modes); i++) { + args.arg0 = tunable_attr_01_id(tunable_attr, modes[i]); + + ret = lwmi_dev_evaluate_int(priv->wdev, 0x0, LWMI_FEATURE_VALUE_GET, + (u8 *)&args, sizeof(args), + &retval); + if (ret || !retval) + continue; + + tunable_attr->cv_mode_id = modes[i]; + cv_mode_found = true; + break; + } + + if (!cv_mode_found) + return cv_mode_found; + + dev_dbg(tunable_attr->dev, "cv_mode_id: %#010x, attribute support level: %#010x\n", + args.arg0, capdata.supported); + + return capdata.supported > 0; +} + /* Lenovo WMI Other Mode Attribute macros */ #define __LWMI_ATTR_RO(_func, _name) \ { \ @@ -877,12 +959,14 @@ static void lwmi_om_fw_attr_add(struct lwmi_om_priv *priv) } for (i = 0; i < ARRAY_SIZE(cd01_attr_groups) - 1; i++) { + cd01_attr_groups[i].tunable_attr->dev = &priv->wdev->dev; + if (!lwmi_attr_01_is_supported(cd01_attr_groups[i].tunable_attr)) + continue; + err = sysfs_create_group(&priv->fw_attr_kset->kobj, cd01_attr_groups[i].attr_group); if (err) goto err_remove_groups; - - cd01_attr_groups[i].tunable_attr->dev = &priv->wdev->dev; } return; From b613e2b4075038501fadc1c2d1a5c2ce0f801655 Mon Sep 17 00:00:00 2001 From: Anupama Kunkulagunta Date: Thu, 7 May 2026 09:44:31 +0000 Subject: [PATCH 693/972] ASoC: rt5640: Handle nested IRQs On some Tegra platforms, the RT5640 codec interrupt line is connected via a GPIO controller that operates as a nested IRQ domain. Since nested IRQ controllers only support threaded handlers, request_irq() returns -EINVAL: rt5640 3-001c: Failed to request IRQ 186: -22 Switch to request_any_context_irq() to let the kernel pick the appropriate handler type based on the parent IRQ controller. Signed-off-by: Anupama Kunkulagunta Signed-off-by: Sheetal Link: https://patch.msgid.link/20260507094431.3316763-1-sheetal@nvidia.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt5640.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/sound/soc/codecs/rt5640.c b/sound/soc/codecs/rt5640.c index 1a5bb72b7d0301..f9ee8cc64ee5e2 100644 --- a/sound/soc/codecs/rt5640.c +++ b/sound/soc/codecs/rt5640.c @@ -2554,10 +2554,12 @@ static void rt5640_enable_jack_detect(struct snd_soc_component *component, rt5640->jd_gpio = jack_data->jd_gpio; rt5640->jd_gpio_irq = gpiod_to_irq(rt5640->jd_gpio); - ret = request_irq(rt5640->jd_gpio_irq, rt5640_jd_gpio_irq, - IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, - "rt5640-jd-gpio", rt5640); - if (ret) { + ret = request_any_context_irq(rt5640->jd_gpio_irq, + rt5640_jd_gpio_irq, + IRQF_TRIGGER_RISING | + IRQF_TRIGGER_FALLING, + "rt5640-jd-gpio", rt5640); + if (ret < 0) { dev_warn(component->dev, "Failed to request jd GPIO IRQ %d: %d\n", rt5640->jd_gpio_irq, ret); rt5640_disable_jack_detect(component); @@ -2569,10 +2571,10 @@ static void rt5640_enable_jack_detect(struct snd_soc_component *component, if (jack_data && jack_data->use_platform_clock) rt5640->use_platform_clock = jack_data->use_platform_clock; - ret = request_irq(rt5640->irq, rt5640_irq, - IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, - "rt5640", rt5640); - if (ret) { + ret = request_any_context_irq(rt5640->irq, rt5640_irq, + IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, + "rt5640", rt5640); + if (ret < 0) { dev_warn(component->dev, "Failed to request IRQ %d: %d\n", rt5640->irq, ret); rt5640_disable_jack_detect(component); return; @@ -2623,9 +2625,9 @@ static void rt5640_enable_hda_jack_detect( rt5640->jack = jack; - ret = request_irq(rt5640->irq, rt5640_irq, - IRQF_TRIGGER_RISING, "rt5640", rt5640); - if (ret) { + ret = request_any_context_irq(rt5640->irq, rt5640_irq, + IRQF_TRIGGER_RISING, "rt5640", rt5640); + if (ret < 0) { dev_warn(component->dev, "Failed to request IRQ %d: %d\n", rt5640->irq, ret); rt5640->jack = NULL; return; From dec85d2fbd20de3711a71e65397dfdb40c3fa953 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 6 May 2026 09:37:02 +0000 Subject: [PATCH 694/972] irqchip/gic-v5: Move LPI allocation into the LPI domain The IPI and ITS MSI domains currently allocate and release LPIs directly, then pass the selected LPI ID to the parent LPI domain. This leaks the LPI domain's allocation policy into its child domains and forces each child to duplicate part of the parent domain's teardown. Make the LPI domain allocate LPIs in its .alloc() callback and release them in a matching .free() callback. Child domains can then request a parent interrupt without passing an implementation-specific LPI ID, and the LPI lifetime is tied to the domain that owns the LPI namespace. Remove the gicv5_alloc_lpi() and gicv5_free_lpi() wrappers now that no external caller needs to manage LPIs directly. This is a preparatory change for an actual leakage problem in the allocation code and therefore tagged with the same Fixes tag. Fixes: 0f0101325876 ("irqchip/gic-v5: Add GICv5 LPI/IPI support") Signed-off-by: Sascha Bischoff Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Reviewed-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260506093634.382062-2-sascha.bischoff@arm.com --- drivers/irqchip/irq-gic-v5-its.c | 14 ++------ drivers/irqchip/irq-gic-v5.c | 53 +++++++++++++++--------------- include/linux/irqchip/arm-gic-v5.h | 3 -- 3 files changed, 28 insertions(+), 42 deletions(-) diff --git a/drivers/irqchip/irq-gic-v5-its.c b/drivers/irqchip/irq-gic-v5-its.c index 36a8d1368f0e44..36d03f82ef6847 100644 --- a/drivers/irqchip/irq-gic-v5-its.c +++ b/drivers/irqchip/irq-gic-v5-its.c @@ -929,8 +929,8 @@ static void gicv5_its_free_eventid(struct gicv5_its_dev *its_dev, u32 event_id_b static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { - u32 device_id, event_id_base, lpi; struct gicv5_its_dev *its_dev; + u32 device_id, event_id_base; msi_alloc_info_t *info = arg; irq_hw_number_t hwirq; struct irq_data *irqd; @@ -949,16 +949,8 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi device_id = its_dev->device_id; for (i = 0; i < nr_irqs; i++) { - ret = gicv5_alloc_lpi(); - if (ret < 0) { - pr_debug("Failed to find free LPI!\n"); - goto out_free_irqs; - } - lpi = ret; - - ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, &lpi); + ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, NULL); if (ret) { - gicv5_free_lpi(lpi); goto out_free_irqs; } @@ -983,7 +975,6 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi out_free_irqs: while (--i >= 0) { irqd = irq_domain_get_irq_data(domain, virq + i); - gicv5_free_lpi(irqd->parent_data->hwirq); irq_domain_reset_irq_data(irqd); irq_domain_free_irqs_parent(domain, virq + i, 1); } @@ -1013,7 +1004,6 @@ static void gicv5_its_irq_domain_free(struct irq_domain *domain, unsigned int vi for (i = 0; i < nr_irqs; i++) { d = irq_domain_get_irq_data(domain, virq + i); - gicv5_free_lpi(d->parent_data->hwirq); irq_domain_reset_irq_data(d); irq_domain_free_irqs_parent(domain, virq + i, 1); } diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c index 6b0903be8ebfd3..15a2a04398d255 100644 --- a/drivers/irqchip/irq-gic-v5.c +++ b/drivers/irqchip/irq-gic-v5.c @@ -59,16 +59,6 @@ static void release_lpi(u32 lpi) ida_free(&lpi_ida, lpi); } -int gicv5_alloc_lpi(void) -{ - return alloc_lpi(); -} - -void gicv5_free_lpi(u32 lpi) -{ - release_lpi(lpi); -} - static void gicv5_ppi_priority_init(void) { write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR0_EL1); @@ -806,18 +796,36 @@ static void gicv5_lpi_config_reset(struct irq_data *d) gicv5_lpi_irq_write_pending_state(d, false); } +static void gicv5_irq_lpi_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d; + + if (WARN_ON_ONCE(nr_irqs != 1)) + return; + + d = irq_domain_get_irq_data(domain, virq); + + release_lpi(d->hwirq); + + irq_set_handler(virq, NULL); + irq_domain_reset_irq_data(d); +} + static int gicv5_irq_lpi_domain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { irq_hw_number_t hwirq; struct irq_data *irqd; - u32 *lpi = arg; int ret; if (WARN_ON_ONCE(nr_irqs != 1)) return -EINVAL; - hwirq = *lpi; + ret = alloc_lpi(); + if (ret < 0) + return ret; + hwirq = ret; irqd = irq_domain_get_irq_data(domain, virq); @@ -826,8 +834,10 @@ static int gicv5_irq_lpi_domain_alloc(struct irq_domain *domain, unsigned int vi irqd_set_single_target(irqd); ret = gicv5_irs_iste_alloc(hwirq); - if (ret < 0) + if (ret < 0) { + release_lpi(hwirq); return ret; + } gicv5_hwirq_init(hwirq, GICV5_IRQ_PRI_MI, GICV5_HWIRQ_TYPE_LPI); gicv5_lpi_config_reset(irqd); @@ -837,7 +847,7 @@ static int gicv5_irq_lpi_domain_alloc(struct irq_domain *domain, unsigned int vi static const struct irq_domain_ops gicv5_irq_lpi_domain_ops = { .alloc = gicv5_irq_lpi_domain_alloc, - .free = gicv5_irq_domain_free, + .free = gicv5_irq_lpi_domain_free, }; void __init gicv5_init_lpi_domain(void) @@ -859,21 +869,12 @@ static int gicv5_irq_ipi_domain_alloc(struct irq_domain *domain, unsigned int vi { struct irq_data *irqd; int ret, i; - u32 lpi; for (i = 0; i < nr_irqs; i++) { - ret = gicv5_alloc_lpi(); - if (ret < 0) + ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, NULL); + if (ret) return ret; - lpi = ret; - - ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, &lpi); - if (ret) { - gicv5_free_lpi(lpi); - return ret; - } - irqd = irq_domain_get_irq_data(domain, virq + i); irq_domain_set_hwirq_and_chip(domain, virq + i, i, @@ -899,8 +900,6 @@ static void gicv5_irq_ipi_domain_free(struct irq_domain *domain, unsigned int vi if (!d) return; - gicv5_free_lpi(d->parent_data->hwirq); - irq_set_handler(virq + i, NULL); irq_domain_reset_irq_data(d); irq_domain_free_irqs_parent(domain, virq + i, 1); diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h index 40d2fce682940a..f78787e654f4c6 100644 --- a/include/linux/irqchip/arm-gic-v5.h +++ b/include/linux/irqchip/arm-gic-v5.h @@ -425,9 +425,6 @@ struct gicv5_its_itt_cfg { void gicv5_init_lpis(u32 max); void gicv5_deinit_lpis(void); -int gicv5_alloc_lpi(void); -void gicv5_free_lpi(u32 lpi); - void __init gicv5_its_of_probe(struct device_node *parent); void __init gicv5_its_acpi_probe(void); #endif From eb6f6d523813ead9dc2799194a2839d42c049734 Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 6 May 2026 09:37:23 +0000 Subject: [PATCH 695/972] irqchip/gic-v5: Support range allocation for LPIs The per-IPI parent allocation loop returns immediately on failure and leaks any parent interrupts allocated by earlier iterations. The GICv5 LPI domain now owns LPI allocation and teardown internally, but its irq_domain callbacks still reject requests where nr_irqs is greater than one. This forces child domains to allocate and free LPIs one at a time even when the interrupt core requests a contiguous range. Handle multi-interrupt allocation and teardown in the LPI domain by iterating over the requested range and unwinding any partially allocated state on failure. Allocate the parent LPIs for the IPI domain with a single range request as well, which cures the leakage problem. Fixes: 0f0101325876 ("irqchip/gic-v5: Add GICv5 LPI/IPI support") Signed-off-by: Sascha Bischoff Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Reviewed-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260506093634.382062-3-sascha.bischoff@arm.com --- drivers/irqchip/irq-gic-v5.c | 77 ++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c index 15a2a04398d255..c1af07083ceff3 100644 --- a/drivers/irqchip/irq-gic-v5.c +++ b/drivers/irqchip/irq-gic-v5.c @@ -801,15 +801,14 @@ static void gicv5_irq_lpi_domain_free(struct irq_domain *domain, unsigned int vi { struct irq_data *d; - if (WARN_ON_ONCE(nr_irqs != 1)) - return; - - d = irq_domain_get_irq_data(domain, virq); + for (unsigned int i = 0; i < nr_irqs; i++, virq++) { + d = irq_domain_get_irq_data(domain, virq); - release_lpi(d->hwirq); + release_lpi(d->hwirq); - irq_set_handler(virq, NULL); - irq_domain_reset_irq_data(d); + irq_set_handler(virq, NULL); + irq_domain_reset_irq_data(d); + } } static int gicv5_irq_lpi_domain_alloc(struct irq_domain *domain, unsigned int virq, @@ -817,32 +816,39 @@ static int gicv5_irq_lpi_domain_alloc(struct irq_domain *domain, unsigned int vi { irq_hw_number_t hwirq; struct irq_data *irqd; + unsigned int i; int ret; - if (WARN_ON_ONCE(nr_irqs != 1)) - return -EINVAL; - - ret = alloc_lpi(); - if (ret < 0) - return ret; - hwirq = ret; + for (i = 0; i < nr_irqs; i++) { + ret = alloc_lpi(); + if (ret < 0) + goto out_free_lpis; + hwirq = ret; + + ret = gicv5_irs_iste_alloc(hwirq); + if (ret < 0) { + /* Undo partial state first, then clean up the rest */ + release_lpi(hwirq); + goto out_free_lpis; + } - irqd = irq_domain_get_irq_data(domain, virq); + irqd = irq_domain_get_irq_data(domain, virq + i); - irq_domain_set_info(domain, virq, hwirq, &gicv5_lpi_irq_chip, NULL, - handle_fasteoi_irq, NULL, NULL); - irqd_set_single_target(irqd); + irq_domain_set_info(domain, virq + i, hwirq, &gicv5_lpi_irq_chip, + NULL, handle_fasteoi_irq, NULL, NULL); + irqd_set_single_target(irqd); - ret = gicv5_irs_iste_alloc(hwirq); - if (ret < 0) { - release_lpi(hwirq); - return ret; + gicv5_hwirq_init(hwirq, GICV5_IRQ_PRI_MI, GICV5_HWIRQ_TYPE_LPI); + gicv5_lpi_config_reset(irqd); } - gicv5_hwirq_init(hwirq, GICV5_IRQ_PRI_MI, GICV5_HWIRQ_TYPE_LPI); - gicv5_lpi_config_reset(irqd); - return 0; + +out_free_lpis: + if (i) + gicv5_irq_lpi_domain_free(domain, virq, i); + + return ret; } static const struct irq_domain_ops gicv5_irq_lpi_domain_ops = { @@ -868,21 +874,21 @@ static int gicv5_irq_ipi_domain_alloc(struct irq_domain *domain, unsigned int vi unsigned int nr_irqs, void *arg) { struct irq_data *irqd; - int ret, i; + int ret; - for (i = 0; i < nr_irqs; i++) { - ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, NULL); - if (ret) - return ret; + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret) + return ret; - irqd = irq_domain_get_irq_data(domain, virq + i); + for (unsigned int i = 0; i < nr_irqs; i++, virq++) { + irqd = irq_domain_get_irq_data(domain, virq); - irq_domain_set_hwirq_and_chip(domain, virq + i, i, - &gicv5_ipi_irq_chip, NULL); + irq_domain_set_hwirq_and_chip(domain, virq, i, + &gicv5_ipi_irq_chip, NULL); irqd_set_single_target(irqd); - irq_set_handler(virq + i, handle_percpu_irq); + irq_set_handler(virq, handle_percpu_irq); } return 0; @@ -902,8 +908,9 @@ static void gicv5_irq_ipi_domain_free(struct irq_domain *domain, unsigned int vi irq_set_handler(virq + i, NULL); irq_domain_reset_irq_data(d); - irq_domain_free_irqs_parent(domain, virq + i, 1); } + + irq_domain_free_irqs_parent(domain, virq, nr_irqs); } static const struct irq_domain_ops gicv5_irq_ipi_domain_ops = { From a7c7e42654b6a8676610ee09d22901432c4851af Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Wed, 6 May 2026 09:37:43 +0000 Subject: [PATCH 696/972] irqchip/gic-v5: Allocate ITS parent LPIs as a range The ITS MSI domain no longer manages LPI allocation directly. LPIs are allocated and freed by the parent LPI domain, which can now handle a full range of interrupts and unwind partial allocations internally. Make the ITS domain request and release the parent IRQs as a single range instead of iterating over each interrupt. The ITS allocation path then only needs to reserve EventIDs, allocate the parent range, and fill in the ITS irq_data for each MSI. Since no operation in the per-MSI loop can fail, the partial parent-free unwind becomes unnecessary. On teardown, reset the ITS irq_data for the range and then release the parent range in one call, leaving LPI teardown to the LPI domain. Fixes: 0f0101325876 ("irqchip/gic-v5: Add GICv5 LPI/IPI support") Signed-off-by: Sascha Bischoff Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Reviewed-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260506093634.382062-4-sascha.bischoff@arm.com --- drivers/irqchip/irq-gic-v5-its.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/drivers/irqchip/irq-gic-v5-its.c b/drivers/irqchip/irq-gic-v5-its.c index 36d03f82ef6847..28e39b065de0ee 100644 --- a/drivers/irqchip/irq-gic-v5-its.c +++ b/drivers/irqchip/irq-gic-v5-its.c @@ -937,6 +937,7 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi int ret, i; its_dev = info->scratchpad[0].ptr; + device_id = its_dev->device_id; ret = gicv5_its_alloc_eventid(its_dev, info, nr_irqs, &event_id_base); if (ret) @@ -946,14 +947,11 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi if (ret) goto out_eventid; - device_id = its_dev->device_id; + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, NULL); + if (ret) + goto out_eventid; for (i = 0; i < nr_irqs; i++) { - ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, NULL); - if (ret) { - goto out_free_irqs; - } - /* * Store eventid and deviceid into the hwirq for later use. * @@ -972,12 +970,6 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi return 0; -out_free_irqs: - while (--i >= 0) { - irqd = irq_domain_get_irq_data(domain, virq + i); - irq_domain_reset_irq_data(irqd); - irq_domain_free_irqs_parent(domain, virq + i, 1); - } out_eventid: gicv5_its_free_eventid(its_dev, event_id_base, nr_irqs); return ret; @@ -1000,14 +992,14 @@ static void gicv5_its_irq_domain_free(struct irq_domain *domain, unsigned int vi bitmap_release_region(its_dev->event_map, event_id_base, get_count_order(nr_irqs)); - /* Hierarchically free irq data */ for (i = 0; i < nr_irqs; i++) { d = irq_domain_get_irq_data(domain, virq + i); - irq_domain_reset_irq_data(d); - irq_domain_free_irqs_parent(domain, virq + i, 1); } + /* Hierarchically free irq data */ + irq_domain_free_irqs_parent(domain, virq, nr_irqs); + gicv5_its_syncr(its, its_dev); gicv5_irs_syncr(); } From 512718bbc51b851140380b7068ec7365bd039cba Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 7 May 2026 12:05:18 +0100 Subject: [PATCH 697/972] genirq/chip: Don't call add_interrupt_randomness() for NMIs Recently handle_percpu_devid_irq() was changed to call add_interrupt_randomness(). This introduced a potential deadlock when handle_percpu_devid_irq() is used to handle an NMI, which can be detected with lockdep, e.g. ================================ WARNING: inconsistent lock state 7.1.0-rc2-pnmi #465 Not tainted -------------------------------- inconsistent {INITIAL USE} -> {IN-NMI} usage. perf/695 [HC1[1]:SC0[0]:HE0:SE1] takes: ffff00837dfd3a18 (&base->lock){-.-.}-{2:2}, at: lock_timer_base+0x6c/0xac {INITIAL USE} state was registered at: _raw_spin_lock_irqsave+0x68/0xb0 lock_timer_base+0x6c/0xac __mod_timer+0x100/0x32c add_timer_global+0x2c/0x40 __queue_delayed_work+0xf0/0x140 queue_delayed_work_on+0x134/0x138 mem_cgroup_css_online+0x30c/0x310 online_css+0x34/0x10c cgroup_init_subsys+0x158/0x1c8 cgroup_init+0x440/0x524 start_kernel+0x888/0x998 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&base->lock); lock(&base->lock); *** DEADLOCK *** Call trace: _raw_spin_lock_irqsave+0x68/0xb0 lock_timer_base+0x6c/0xac add_timer_on+0x78/0x16c add_interrupt_randomness+0x124/0x134 handle_percpu_devid_irq+0xd4/0x16c handle_irq_desc+0x40/0x58 generic_handle_domain_nmi+0x28/0x50 __gic_handle_nmi.isra.0+0x4c/0xa0 gic_handle_irq+0x38/0x2bc call_on_irq_stack+0x30/0x48 do_interrupt_handler+0x80/0x98 el1_interrupt+0x90/0xac el1h_64_irq_handler+0x18/0x24 el1h_64_irq+0x80/0x84 [...] During review, Thomas pointed out it wouldn't be safe for handle_percpu_devid_irq() to call add_interrupt_randomness() if it was used to handle NMIs: https://lore.kernel.org/lkml/87bjgik042.ffs@tglx/ ... but evidently people missed that handle_percpu_devid_irq() *is* used for NMIs. While it might seem that NMIs should be handled with a separate handle_percpu_devid_nmi() function, for various structural reasons this was impractical, and handle_percpu_devid_irq() has been expected to be used for NMIs since commits: 21bbbc50f398f ("irqchip/gic-v3: Switch high priority PPIs over to handle_percpu_devid_irq()") 5ff78c8de9d83 ("genirq: Kill handle_percpu_devid_fasteoi_nmi()") Taking the above into account, avoid the deadlock by not calling add_interrupt_randomness() when handle_percpu_devid_irq() is called in an NMI context. This is consistent with other NNI handling flows, which do not call add_interrupt_randomness(). At the same time, update the kernel-doc comment to make it clear that handle_percpu_devid_irq() can be called in NMI context. The rest of handle_percpu_devid_irq() is currently NMI safe and doesn't need to change. Fixes: fd7400cfcbaa ("genirq/chip: Invoke add_interrupt_randomness() in handle_percpu_devid_irq()") Reported-by: Ada Couprie Diaz Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Reviewed-by: Jinjie Ruan Reviewed-by: Marc Zyngier Link: https://patch.msgid.link/20260507110518.3128248-1-mark.rutland@arm.com --- kernel/irq/chip.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6c9b1dc4e7d467..b635e3c5d5b6b9 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -893,7 +894,10 @@ void handle_percpu_irq(struct irq_desc *desc) * * action->percpu_dev_id is a pointer to percpu variables which * contain the real device id for the cpu on which this handler is - * called + * called. + * + * May be used for NMI interrupt lines, and so may be called in IRQ or NMI + * context. */ void handle_percpu_devid_irq(struct irq_desc *desc) { @@ -930,7 +934,8 @@ void handle_percpu_devid_irq(struct irq_desc *desc) enabled ? " and unmasked" : "", irq, cpu); } - add_interrupt_randomness(irq); + if (!in_nmi()) + add_interrupt_randomness(irq); if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); From 0fa10fb77069fb67aa51384868ef3702b7791465 Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Wed, 6 May 2026 01:55:22 -0700 Subject: [PATCH 698/972] irqchip/ath79-cpu: Remove unused function ath79_cpu_irq_init() was part of the legacy pre-OF code that got removed a while back. Remove it to get rid of a missing prototype warning, reported by the kernel test robot. [ tglx: Fix the subject prefix. Sigh ... ] Fixes: 51fa4f8912c0 ("MIPS: ath79: drop legacy IRQ code") Reported-by: kernel test robot Signed-off-by: Rosen Penev Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260506085522.1210143-1-rosenp@gmail.com Closes: https://lore.kernel.org/oe-kbuild-all/202412011509.kGQkDr1y-lkp@intel.com/ --- drivers/irqchip/irq-ath79-cpu.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/irqchip/irq-ath79-cpu.c b/drivers/irqchip/irq-ath79-cpu.c index 923e4bba377676..9b7273a7f8ced9 100644 --- a/drivers/irqchip/irq-ath79-cpu.c +++ b/drivers/irqchip/irq-ath79-cpu.c @@ -85,10 +85,3 @@ static int __init ar79_cpu_intc_of_init( } IRQCHIP_DECLARE(ar79_cpu_intc, "qca,ar7100-cpu-intc", ar79_cpu_intc_of_init); - -void __init ath79_cpu_irq_init(unsigned irq_wb_chan2, unsigned irq_wb_chan3) -{ - irq_wb_chan[2] = irq_wb_chan2; - irq_wb_chan[3] = irq_wb_chan3; - mips_cpu_irq_init(); -} From 5363b67ac8ebcc3e227dbf59fc8061949109841d Mon Sep 17 00:00:00 2001 From: Xianwei Zhao Date: Fri, 8 May 2026 07:36:54 +0000 Subject: [PATCH 699/972] irqchip/meson-gpio: Use the correct register in meson_s4_gpio_irq_set_type() meson_s4_gpio_irq_set_type() uses the both-edge trigger register for configuring level type and single edge mode interrupts, which is not correct. Use REG_EDGE_POL instead. Fixes: bbd6fcc76b39 ("irqchip: Add support for Amlogic A4 and A5 SoCs") Signed-off-by: Xianwei Zhao Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260508-a9-gpio-irqchip-v1-1-9dc5f3e022e0@amlogic.com --- drivers/irqchip/irq-meson-gpio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-meson-gpio.c b/drivers/irqchip/irq-meson-gpio.c index f722e9c57e2e40..74a376ef452e21 100644 --- a/drivers/irqchip/irq-meson-gpio.c +++ b/drivers/irqchip/irq-meson-gpio.c @@ -415,8 +415,7 @@ static int meson_s4_gpio_irq_set_type(struct meson_gpio_irq_controller *ctl, if (type & (IRQ_TYPE_EDGE_RISING | IRQ_TYPE_EDGE_FALLING)) val |= BIT(ctl->params->edge_single_offset + idx); - meson_gpio_irq_update_bits(ctl, params->edge_pol_reg, - BIT(idx) | BIT(12 + idx), val); + meson_gpio_irq_update_bits(ctl, REG_EDGE_POL, BIT(idx) | BIT(12 + idx), val); return 0; }; From cefafbd561402b0fe6447449364a30315b9b1570 Mon Sep 17 00:00:00 2001 From: Yong-Xuan Wang Date: Fri, 8 May 2026 02:31:21 -0700 Subject: [PATCH 700/972] irqchip/riscv-imsic: Clear interrupt move state during CPU offlining Affinity changes of IMSIC interrupts have to be careful to not lose an interrupt in the process. Each vector keeps track of an affinity change in progress with two pointers in struct imsic_vector. imsic_vector::move_prev points to the previous CPU target data and imsic_vector::move_next to the designated new CPU target data. imsic_vector::move_prev on the new CPU can only be cleared after the previous CPU has cleared imsic_vector::move_next, which ususally happens in __imsic_remote_sync(). In case of CPU hot-unplug __imsic_remote_sync() is not invoked because the CPU is already marked offline. That means imsic_vector::move_prev becomes stale until the CPU is onlined again. The stale pointer prevents further affinity changes for the affected interrupts. Solve this by clearing the imsic_vector::move_prev pointers in the CPU hotplug offline path. [ tglx: Replace word salad in change log ] Fixes: 0f67911e821c ("irqchip/riscv-imsic: Separate next and previous pointers in IMSIC vector") Signed-off-by: Yong-Xuan Wang Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260508-imsic-v2-1-e9f08dd46cf5@sifive.com --- drivers/irqchip/irq-riscv-imsic-early.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/irqchip/irq-riscv-imsic-early.c b/drivers/irqchip/irq-riscv-imsic-early.c index ba903fa689bd52..a7a1852b548c48 100644 --- a/drivers/irqchip/irq-riscv-imsic-early.c +++ b/drivers/irqchip/irq-riscv-imsic-early.c @@ -158,6 +158,8 @@ static int imsic_dying_cpu(unsigned int cpu) /* Cleanup IPIs */ imsic_ipi_dying_cpu(); + imsic_local_sync_all(false); + /* Mark per-CPU IMSIC state as offline */ imsic_state_offline(); From 3799c2570982577551023ae035f5a786cf39a76e Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Sun, 10 May 2026 16:41:19 +0800 Subject: [PATCH 701/972] io_uring/fdinfo: translate SqThread PID through caller's pid_ns SQPOLL stores current->pid (init_pid_ns view) in sqd->task_pid at thread creation. fdinfo prints it raw via seq_printf("SqThread:\t%d\n", sq_pid). A reader inside a non-initial pid_ns sees the host PID, not the kthread's PID in the reader's own pid_ns. The SQPOLL kthread is created with CLONE_THREAD and no CLONE_NEW*, so it lives in the submitter's pid_ns. An unprivileged user_ns + pid_ns submitter can read fdinfo and learn the host PID of a kthread whose in-namespace PID is different. Reproducer (mainline 7.0, KASAN): unshare CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNS, mount a private /proc, then have a grandchild that is pid 1 in the new pid_ns open an io_uring ring with IORING_SETUP_SQPOLL. /proc/self/task lists {1, 2}; the SQPOLL kthread is pid 2. Before: fdinfo prints SqThread = . After: SqThread = 2. Use task_pid_nr_ns() against the proc inode's pid_ns to compute sq_pid, instead of reading the stored sq->task_pid (which holds the init_pid_ns view). pidfd_show_fdinfo() in kernel/pid.c follows the same pattern. Signed-off-by: Maoyi Xie Link: https://patch.msgid.link/20260510084119.457578-1-maoyi.xie@ntu.edu.sg Signed-off-by: Jens Axboe --- io_uring/fdinfo.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index c2d3e45544bb4e..001fb542dc11a8 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -190,8 +190,9 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) get_task_struct(tsk); rcu_read_unlock(); usec = io_sq_cpu_usec(tsk); + sq_pid = task_pid_nr_ns(tsk, + proc_pid_ns(file_inode(m->file)->i_sb)); put_task_struct(tsk); - sq_pid = sq->task_pid; sq_cpu = sq->sq_cpu; sq_total_time = usec; sq_work_time = sq->work_time; From 1860c2f85922917d8a46f16a6f4bd2298ffa0fb5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 10 May 2026 22:48:43 +0800 Subject: [PATCH 702/972] ublk: reject max_sectors smaller than PAGE_SECTORS in parameter validation blk_validate_limits() requires max_hw_sectors >= PAGE_SECTORS and fires a WARN_ON_ONCE if this invariant is violated. ublk_validate_params() only checked the upper bound of max_sectors against max_io_buf_bytes, allowing userspace to pass small values (including zero) that trigger the warning when blk_mq_alloc_disk() is called from ublk_ctrl_start_dev(). Before 494ea040bcb5, ublk used blk_queue_max_hw_sectors() which silently clamped small values up to PAGE_SECTORS. The conversion to passing queue_limits directly to blk_mq_alloc_disk() lost that clamping and now hits blk_validate_limits()'s WARN_ON_ONCE instead. Validate that max_sectors is at least PAGE_SECTORS in ublk_validate_params() so invalid values are rejected early with -EINVAL instead of reaching the block layer. Fixes: 494ea040bcb5 ("ublk: pass queue_limits to blk_mq_alloc_disk") Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260510144843.769031-1-tom.leiming@gmail.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 6d13f1481de053..6c041eaebdb911 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -920,6 +920,9 @@ static int ublk_validate_params(const struct ublk_device *ub) if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9)) return -EINVAL; + if (p->max_sectors < PAGE_SECTORS) + return -EINVAL; + if (ublk_dev_is_zoned(ub) && !p->chunk_sectors) return -EINVAL; } else From 2997606dd17729404cef9821ce66dd037b6019eb Mon Sep 17 00:00:00 2001 From: Paolo Pisati Date: Fri, 8 May 2026 09:09:56 +0200 Subject: [PATCH 703/972] platform/x86: asus-nb-wmi: add DMI quirk for ASUS Zenbook Duo UX8407AA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the existing zenbook duo keyboard quirk for the UX8407AA model too. Signed-off-by: Paolo Pisati Reviewed-by: Denis Benato Link: https://patch.msgid.link/20260508070956.62201-1-p.pisati@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-nb-wmi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c index b4677c5bba5b44..8005c088e9eeed 100644 --- a/drivers/platform/x86/asus-nb-wmi.c +++ b/drivers/platform/x86/asus-nb-wmi.c @@ -544,6 +544,15 @@ static const struct dmi_system_id asus_quirks[] = { }, .driver_data = &quirk_asus_zenbook_duo_kbd, }, + { + .callback = dmi_matched, + .ident = "ASUS Zenbook Duo UX8407AA", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUS"), + DMI_MATCH(DMI_PRODUCT_NAME, "Zenbook Duo UX8407AA"), + }, + .driver_data = &quirk_asus_zenbook_duo_kbd, + }, { .callback = dmi_matched, .ident = "ASUS ROG Z13", From 91840be8f710370607f949a627e070896faeddb8 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Mon, 30 Mar 2026 15:32:29 +0800 Subject: [PATCH 704/972] irq_work: Fix use-after-free in irq_work_single() on PREEMPT_RT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On PREEMPT_RT, non-HARD irq_work runs in per-CPU kthreads via run_irq_workd(), so irq_work_sync() uses rcuwait() to wait for BUSY==0. After irq_work_single() clears BUSY via atomic_cmpxchg(), it still dereferences @work for irq_work_is_hard() and rcuwait_wake_up(). An irq_work_sync() caller on another CPU that enters after BUSY is cleared can observe BUSY==0 immediately, return, and free the work before those accesses complete — causing a use-after-free. Fix this by wrapping run_irq_workd() in guard(rcu)() so that the entire irq_work_single() execution is within an RCU read-side critical section. Then add synchronize_rcu() in irq_work_sync() after rcuwait_wait_event() to ensure the caller waits for the RCU grace period before returning, preventing premature frees. Fixes: 810979682ccc ("irq_work: Allow irq_work_sync() to sleep if irq_work() no IRQ support.") Suggested-by: Sebastian Andrzej Siewior Suggested-by: Steven Rostedt Signed-off-by: Jiayuan Chen Signed-off-by: Thomas Gleixner Reviewed-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20260330073234.303732-1-jiayuan.chen@linux.dev --- kernel/irq_work.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 120fd7365fbe29..f7e2dc2c30c621 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -292,6 +292,12 @@ void irq_work_sync(struct irq_work *work) !arch_irq_work_has_interrupt()) { rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), TASK_UNINTERRUPTIBLE); + /* + * Ensure irq_work_single() does not access @work + * after removing IRQ_WORK_BUSY. It is always + * accessed within a RCU-read section. + */ + synchronize_rcu(); return; } @@ -302,6 +308,7 @@ EXPORT_SYMBOL_GPL(irq_work_sync); static void run_irq_workd(unsigned int cpu) { + guard(rcu)(); irq_work_run_list(this_cpu_ptr(&lazy_list)); } From ce28b772c3c28fb1c62a81533045ae90ed3b496c Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 6 May 2026 06:05:14 -0700 Subject: [PATCH 705/972] nvme: make prp passthrough usage less scary The warning is a bit alarming, and it only prints for the very first non-sgl capable device that receives a passthrough command. Just log an informational message on initial discovery for every device. Reviewed-by: Jens Axboe Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 4 ++++ drivers/nvme/host/ioctl.c | 13 ++----------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dc388e24caadeb..1e7e42b43aa3f1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3749,6 +3749,10 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended) ret = nvme_hwmon_init(ctrl); if (ret == -EINTR) return ret; + + if (!nvme_ctrl_sgl_supported(ctrl)) + dev_info(ctrl->device, + "passthrough uses implicit buffer lengths\n"); } clear_bit(NVME_CTRL_DIRTY_CAPABILITY, &ctrl->flags); diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 9597a87cf05dc3..e232188ccd0268 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -120,21 +120,12 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, struct nvme_ns *ns = q->queuedata; struct block_device *bdev = ns ? ns->disk->part0 : NULL; bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk); - struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; bool has_metadata = meta_buffer && meta_len; struct bio *bio = NULL; int ret; - if (!nvme_ctrl_sgl_supported(ctrl)) - dev_warn_once(ctrl->device, "using unchecked data buffer\n"); - if (has_metadata) { - if (!supports_metadata) - return -EINVAL; - - if (!nvme_ctrl_meta_sgl_supported(ctrl)) - dev_warn_once(ctrl->device, - "using unchecked metadata buffer\n"); - } + if (has_metadata && !supports_metadata) + return -EINVAL; if (iter) ret = blk_rq_map_user_iov(q, req, NULL, iter, GFP_KERNEL); From 2279cd9c61a330e5de4d6eb0bc422820dd6fdf36 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 6 May 2026 06:16:02 -0700 Subject: [PATCH 706/972] nvme: fix bio leak on mapping failure The local bio is always NULL, so we'd leak the bio if the integrity mapping failed. Just get it directly from the request. Fixes: d0d1d522316e91f ("blk-map: provide the bdev to bio if one exists") Reviewed-by: Sagi Grimberg Reviewed-by: John Garry Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/ioctl.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index e232188ccd0268..08889b20e5d8c4 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -121,7 +121,6 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, struct block_device *bdev = ns ? ns->disk->part0 : NULL; bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk); bool has_metadata = meta_buffer && meta_len; - struct bio *bio = NULL; int ret; if (has_metadata && !supports_metadata) @@ -145,8 +144,8 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, return ret; out_unmap: - if (bio) - blk_rq_unmap_user(bio); + if (req->bio) + blk_rq_unmap_user(req->bio); return ret; } From a891962ae5e48fb5476c23631df759482e62ab16 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 30 Apr 2026 15:22:32 +0200 Subject: [PATCH 707/972] nvmet-auth: Do not print DH-HMAC-CHAP secrets From a security standpoint we should not allow to print out the DH-HMAC-CHAP secrets, but at the same time having them is useful for debugging authentication failures. So add a Kconfig option NVME_TARGET_AUTH_DEBUG to only enable debugging if explictly requested at build time. Reviewed-by: Sagi Grimberg Signed-off-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/target/Kconfig | 9 +++++++++ drivers/nvme/target/auth.c | 13 ++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 4904097dfd490f..69bde270115e29 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -117,6 +117,15 @@ config NVME_TARGET_AUTH If unsure, say N. +config NVME_TARGET_AUTH_DEBUG + bool "NVMe over Fabrics In-band Authentication debug messages" + depends on NVME_TARGET_AUTH + help + This enables additional debug messages including the generated + DH-HMAC-CHAP secrets to help debugging authentication failures. + + If unsure, say N. + config NVME_TARGET_PCI_EPF tristate "NVMe PCI Endpoint Function target support" depends on NVME_TARGET && PCI_ENDPOINT diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index 9a2eccdc8b1332..edb9627d97b096 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -144,7 +144,6 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, bool reset) goto out_unlock; list_for_each_entry(p, &ctrl->subsys->hosts, entry) { - pr_debug("check %s\n", nvmet_host_name(p->host)); if (strcmp(nvmet_host_name(p->host), ctrl->hostnqn)) continue; host = p->host; @@ -189,11 +188,12 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, bool reset) ctrl->host_key = NULL; goto out_free_hash; } +#ifdef CONFIG_NVME_TARGET_AUTH_DEBUG pr_debug("%s: using hash %s key %*ph\n", __func__, ctrl->host_key->hash > 0 ? nvme_auth_hmac_name(ctrl->host_key->hash) : "none", (int)ctrl->host_key->len, ctrl->host_key->key); - +#endif nvme_auth_free_key(ctrl->ctrl_key); if (!host->dhchap_ctrl_secret) { ctrl->ctrl_key = NULL; @@ -207,11 +207,12 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, bool reset) ctrl->ctrl_key = NULL; goto out_free_hash; } +#ifdef CONFIG_NVME_TARGET_AUTH_DEBUG pr_debug("%s: using ctrl hash %s key %*ph\n", __func__, ctrl->ctrl_key->hash > 0 ? nvme_auth_hmac_name(ctrl->ctrl_key->hash) : "none", (int)ctrl->ctrl_key->len, ctrl->ctrl_key->key); - +#endif out_free_hash: if (ret) { if (ctrl->host_key) { @@ -317,7 +318,6 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, if (ret) goto out_free_challenge; } - pr_debug("ctrl %d qid %d host response seq %u transaction %d\n", ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1, req->sq->dhchap_tid); @@ -434,8 +434,10 @@ int nvmet_auth_ctrl_exponential(struct nvmet_req *req, ret = -EINVAL; } else { memcpy(buf, ctrl->dh_key, buf_size); +#ifdef CONFIG_NVME_TARGET_AUTH_DEBUG pr_debug("%s: ctrl %d public key %*ph\n", __func__, ctrl->cntlid, (int)buf_size, buf); +#endif } return ret; @@ -458,11 +460,12 @@ int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, ctrl->shash_id); if (ret) pr_debug("failed to compute session key, err %d\n", ret); +#ifdef CONFIG_NVME_TARGET_AUTH_DEBUG else pr_debug("%s: session key %*ph\n", __func__, (int)req->sq->dhchap_skey_len, req->sq->dhchap_skey); - +#endif return ret; } From b35a13036755c5803168a7cb93bc66035c3e65b8 Mon Sep 17 00:00:00 2001 From: "Chia-Lin Kao (AceLan)" Date: Wed, 29 Apr 2026 16:11:16 +0800 Subject: [PATCH 708/972] nvme-pci: fix use-after-free in nvme_free_host_mem() nvme_free_host_mem() frees dev->hmb_sgt via dma_free_noncontiguous() but never clears the pointer afterward. This leads to a use-after-free if nvme_free_host_mem() is called twice in the same error path. This can happen during nvme_probe() when nvme_setup_host_mem() succeeds in allocating the HMB (setting dev->hmb_sgt) but nvme_set_host_mem() fails with an I/O error: nvme_setup_host_mem() nvme_alloc_host_mem_single() -> sets dev->hmb_sgt nvme_set_host_mem() -> fails with -EIO nvme_free_host_mem() -> frees hmb_sgt, but does NOT NULL it return error nvme_probe() error path: nvme_free_host_mem() -> dev->hmb_sgt is stale, use-after-free The second call dereferences the freed sgt, causing a NULL pointer dereference in iommu_dma_free_noncontiguous() when it accesses sgt->sgl->dma_address (the backing memory has been freed and zeroed). This is reproducible on Thunderbolt-attached NVMe devices (e.g., OWC Envoy Express behind a Dell WD22TB4 dock) where the device intermittently returns I/O errors during HMB setup due to PCIe link instability. BUG: kernel NULL pointer dereference, address: 0000000000000010 RIP: 0010:iommu_dma_free_noncontiguous+0x22/0x80 Call Trace: dma_free_noncontiguous+0x3b/0x130 nvme_free_host_mem+0x30/0xf0 [nvme] nvme_probe.cold+0xcc/0x275 [nvme] local_pci_probe+0x43/0xa0 pci_device_probe+0xeea/0x290 really_probe+0xf9/0x3b0 __driver_probe_device+0x8b/0x170 driver_probe_device+0x24/0xd0 __driver_attach_async_helper+0x6b/0x110 async_run_entry_fn+0x37/0x170 process_one_work+0x1ac/0x3d0 worker_thread+0x1b8/0x360 kthread+0xf7/0x130 ret_from_fork+0x2d8/0x3a0 ret_from_fork_asm+0x1a/0x30 Fix this by setting dev->hmb_sgt to NULL after freeing it, so the second call takes the multi-descriptor path which safely handles the already-cleaned-up state. Fixes: 63a5c7a4b4c4 ("nvme-pci: use dma_alloc_noncontigous if possible") Signed-off-by: Chia-Lin Kao (AceLan) Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9fd04cd7c5cb13..94c423bed947bc 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2533,11 +2533,13 @@ static void nvme_free_host_mem_multi(struct nvme_dev *dev) static void nvme_free_host_mem(struct nvme_dev *dev) { - if (dev->hmb_sgt) + if (dev->hmb_sgt) { dma_free_noncontiguous(dev->dev, dev->host_mem_size, dev->hmb_sgt, DMA_BIDIRECTIONAL); - else + dev->hmb_sgt = NULL; + } else { nvme_free_host_mem_multi(dev); + } dma_free_coherent(dev->dev, dev->host_mem_descs_size, dev->host_mem_descs, dev->host_mem_descs_dma); From dbbd07d0a7020b80f6a7028e561908f7b83b3d5a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 10 May 2026 23:30:29 +0300 Subject: [PATCH 709/972] nvmet-tcp: Fix potential UAF when ddgst mismatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shivam Kumar found via vulnerability testing: When data digest is enabled on an NVMe/TCP connection and a digest mismatch occurs on a non-final H2C_DATA PDU during an R2T-based data transfer, the digest error handler in nvmet_tcp_try_recv_ddgst() calls nvmet_req_uninit() — which performs percpu_ref_put() on the submission queue — but does NOT mark the command as completed. It does not set cqe->status, does not modify rbytes_done, and does not clear any flag. When the subsequent fatal error triggers queue teardown, nvmet_tcp_uninit_data_in_cmds() iterates all commands, checks nvmet_tcp_need_data_in() for each one, and finds that the already-uninited command still appears to need data (because rbytes_done < transfer_len and cqe->status == 0). It therefore calls nvmet_req_uninit() a second time on the same command — a double percpu_ref_put against a single percpu_ref_get. Reported-by: Shivam Kumar Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 164a564ba3b4e9..20f150d17a9625 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1321,8 +1321,10 @@ static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) queue->idx, cmd->req.cmd->common.command_id, queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst), le32_to_cpu(cmd->exp_ddgst)); - if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED)) + if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED)) { + cmd->req.cqe->status = NVME_SC_CMD_SEQ_ERROR; nvmet_req_uninit(&cmd->req); + } nvmet_tcp_free_cmd_buffers(cmd); ret = -EPROTO; goto out; From 40df2172853ffc0f84cca3906f5c4d9a6131195d Mon Sep 17 00:00:00 2001 From: AlanCui4080 Date: Fri, 8 May 2026 17:59:12 +0800 Subject: [PATCH 710/972] Revert "nvme: add quirk NVME_QUIRK_IGNORE_DEV_SUBNQN for 144d:a808" This reverts commit 7f991e3f9b8f044640bcb5fa8570350a68932843 ("nvme: add quirk NVME_QUIRK_IGNORE_DEV_SUBNQN for 144d:a808") The incorrect implementations of SUBNQN is a known issue in a massive number of NVMe units. However, the warning "nvme nvmex: missing or invalid SUBNQN field." is usually appropriate and will not affect performance or behavior etc. That is because the support for SUBNQN is mandatory if the controller supports NVMe revision 1.2.1 or greater, and it reported itself without a SUBNQN field which breaks compliance with the specification. It should be not quirked by the Linux Kernel. Signed-off-by: Alan Cui Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 94c423bed947bc..139a10cd687f9a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -4109,8 +4109,6 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x1c5f, 0x0555), /* Memblaze Pblaze5 adapter */ .driver_data = NVME_QUIRK_NO_NS_DESC_LIST, }, - { PCI_DEVICE(0x144d, 0xa808), /* Samsung PM981/983 */ - .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE(0x144d, 0xa821), /* Samsung PM1725 */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x144d, 0xa822), /* Samsung PM1725a */ From c1fa0bb633e4a6b11e83ffc57fa5abe8ebb87891 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 11 May 2026 08:55:11 -0700 Subject: [PATCH 711/972] exit: prevent preemption of oopsing TASK_DEAD task When an already-exiting task oopses, make_task_dead() currently calls do_task_dead() with preemption enabled. That is forbidden: do_task_dead() calls __schedule(), which has a comment saying "WARNING: must be called with preemption disabled!". If an oopsing task is preempted in do_task_dead(), between becoming TASK_DEAD and entering the scheduler explicitly, bad things happen: finish_task_switch() assumes that once the scheduler has switched away from a TASK_DEAD task, the task can never run again and its stack is no longer needed; but that assumption apparently doesn't hold if the dead task was preempted (the SM_PREEMPT case). This means that the scheduler ends up repeatedly dropping references on the dead task's stack, which can lead to use-after-free or double-free of the entire task stack; in other words, two tasks can end up running on the same stack, resulting in various kinds of memory corruption. (This does not just affect "recursively oopsing" tasks; it is enough to oops once during task exit, for example in a file_operations::release handler) Fixes: 7f80a2fd7db9 ("exit: Stop poorly open coding do_task_dead in make_task_dead") Cc: stable@kernel.org Signed-off-by: Jann Horn Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/exit.c b/kernel/exit.c index 25e9cb6de7e793..9a909993ab1d8b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1073,6 +1073,7 @@ void __noreturn make_task_dead(int signr) futex_exit_recursive(tsk); tsk->exit_state = EXIT_DEAD; refcount_inc(&tsk->rcu_users); + preempt_disable(); do_task_dead(); } From e4865a56d013e86e46ea6acea15bb6eae01898ff Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 8 May 2026 20:04:33 +0200 Subject: [PATCH 712/972] ACPI: driver: Check ACPI_COMPANION() against NULL during probe Since every platform driver can be forced to match a device that doesn't match its list of device IDs because of device_match_driver_override(), platform drivers that rely on the existence of a device's ACPI companion object should verify its presence. Accordingly, add requisite ACPI_COMPANION() or ACPI_HANDLE() checks against NULL to 13 platform drivers handling core ACPI devices. Also change the value returned by the ACPI thermal zone driver when the device's ACPI companion is not present to -ENODEV for consistency with the other drivers. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hans de Goede Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/4516068.ejJDZkT8p0@rafael.j.wysocki Cc: 7.0+ # 7.0+ --- drivers/acpi/ac.c | 6 +++++- drivers/acpi/acpi_pad.c | 6 +++++- drivers/acpi/acpi_tad.c | 6 +++++- drivers/acpi/battery.c | 6 +++++- drivers/acpi/button.c | 9 +++++++-- drivers/acpi/ec.c | 6 +++++- drivers/acpi/hed.c | 6 +++++- drivers/acpi/nfit/core.c | 6 +++++- drivers/acpi/pfr_telemetry.c | 6 +++++- drivers/acpi/pfr_update.c | 6 +++++- drivers/acpi/sbs.c | 6 +++++- drivers/acpi/sbshc.c | 6 +++++- drivers/acpi/thermal.c | 2 +- drivers/acpi/tiny-power-button.c | 6 +++++- 14 files changed, 68 insertions(+), 15 deletions(-) diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c index e9e970fd8f33e6..27f31744f29e23 100644 --- a/drivers/acpi/ac.c +++ b/drivers/acpi/ac.c @@ -192,11 +192,15 @@ static const struct dmi_system_id ac_dmi_table[] __initconst = { static int acpi_ac_probe(struct platform_device *pdev) { - struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); struct power_supply_config psy_cfg = {}; + struct acpi_device *adev; struct acpi_ac *ac; int result; + adev = ACPI_COMPANION(&pdev->dev); + if (!adev) + return -ENODEV; + ac = kzalloc_obj(struct acpi_ac); if (!ac) return -ENOMEM; diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index 0a8e02bc8c8b6a..ec94b09bb74716 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -423,7 +423,11 @@ static void acpi_pad_notify(acpi_handle handle, u32 event, void *data) static int acpi_pad_probe(struct platform_device *pdev) { - struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + struct acpi_device *adev; + + adev = ACPI_COMPANION(&pdev->dev); + if (!adev) + return -ENODEV; return acpi_dev_install_notify_handler(adev, ACPI_DEVICE_NOTIFY, acpi_pad_notify, adev); diff --git a/drivers/acpi/acpi_tad.c b/drivers/acpi/acpi_tad.c index cac07e997028a3..386fc1abcbdcb5 100644 --- a/drivers/acpi/acpi_tad.c +++ b/drivers/acpi/acpi_tad.c @@ -815,12 +815,16 @@ static void acpi_tad_remove(void *data) static int acpi_tad_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - acpi_handle handle = ACPI_HANDLE(dev); struct acpi_tad_driver_data *dd; + acpi_handle handle; acpi_status status; unsigned long long caps; int ret; + handle = ACPI_HANDLE(dev); + if (!handle) + return -ENODEV; + /* * Initialization failure messages are mostly about firmware issues, so * print them at the "info" level. diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index b4c25474f42f4d..d4ae21a7100796 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -1214,10 +1214,14 @@ static void sysfs_battery_cleanup(struct acpi_battery *battery) static int acpi_battery_probe(struct platform_device *pdev) { - struct acpi_device *device = ACPI_COMPANION(&pdev->dev); struct acpi_battery *battery; + struct acpi_device *device; int result; + device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + if (device->dep_unmet) return -EPROBE_DEFER; diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index dc064a388c23ea..b47301ee4c8a82 100644 --- a/drivers/acpi/button.c +++ b/drivers/acpi/button.c @@ -531,15 +531,20 @@ static int acpi_lid_input_open(struct input_dev *input) static int acpi_button_probe(struct platform_device *pdev) { - struct acpi_device *device = ACPI_COMPANION(&pdev->dev); acpi_notify_handler handler; + struct acpi_device *device; struct acpi_button *button; struct input_dev *input; - const char *hid = acpi_device_hid(device); acpi_status status; char *name, *class; + const char *hid; int error = 0; + device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + + hid = acpi_device_hid(device); if (!strcmp(hid, ACPI_BUTTON_HID_LID) && lid_init_state == ACPI_BUTTON_LID_INIT_DISABLED) return -ENODEV; diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 45204538ed874e..64ad4cfa6208bd 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -1676,10 +1676,14 @@ static int acpi_ec_setup(struct acpi_ec *ec, struct acpi_device *device, bool ca static int acpi_ec_probe(struct platform_device *pdev) { - struct acpi_device *device = ACPI_COMPANION(&pdev->dev); + struct acpi_device *device; struct acpi_ec *ec; int ret; + device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + if (boot_ec && (boot_ec->handle == device->handle || !strcmp(acpi_device_hid(device), ACPI_ECDT_HID))) { /* Fast path: this device corresponds to the boot EC. */ diff --git a/drivers/acpi/hed.c b/drivers/acpi/hed.c index 4d5e12ed6f3c25..060e8d670f5d37 100644 --- a/drivers/acpi/hed.c +++ b/drivers/acpi/hed.c @@ -50,9 +50,13 @@ static void acpi_hed_notify(acpi_handle handle, u32 event, void *data) static int acpi_hed_probe(struct platform_device *pdev) { - struct acpi_device *device = ACPI_COMPANION(&pdev->dev); + struct acpi_device *device; int err; + device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + /* Only one hardware error device */ if (hed_handle) return -EINVAL; diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index d13264fb9e026b..9304ac996d41ad 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -3341,12 +3341,16 @@ static int acpi_nfit_probe(struct platform_device *pdev) struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; struct acpi_nfit_desc *acpi_desc; struct device *dev = &pdev->dev; - struct acpi_device *adev = ACPI_COMPANION(dev); struct acpi_table_header *tbl; + struct acpi_device *adev; acpi_status status = AE_OK; acpi_size sz; int rc = 0; + adev = ACPI_COMPANION(&pdev->dev); + if (!adev) + return -ENODEV; + rc = acpi_dev_install_notify_handler(adev, ACPI_DEVICE_NOTIFY, acpi_nfit_notify, dev); if (rc) diff --git a/drivers/acpi/pfr_telemetry.c b/drivers/acpi/pfr_telemetry.c index 32bdf8cbe8f237..2387376832a1b6 100644 --- a/drivers/acpi/pfr_telemetry.c +++ b/drivers/acpi/pfr_telemetry.c @@ -360,10 +360,14 @@ static void pfrt_log_put_idx(void *data) static int acpi_pfrt_log_probe(struct platform_device *pdev) { - acpi_handle handle = ACPI_HANDLE(&pdev->dev); struct pfrt_log_device *pfrt_log_dev; + acpi_handle handle; int ret; + handle = ACPI_HANDLE(&pdev->dev); + if (!handle) + return -ENODEV; + if (!acpi_has_method(handle, "_DSM")) { dev_dbg(&pdev->dev, "Missing _DSM\n"); return -ENODEV; diff --git a/drivers/acpi/pfr_update.c b/drivers/acpi/pfr_update.c index 11b1c282800525..6283105bb0e8b2 100644 --- a/drivers/acpi/pfr_update.c +++ b/drivers/acpi/pfr_update.c @@ -538,10 +538,14 @@ static void pfru_put_idx(void *data) static int acpi_pfru_probe(struct platform_device *pdev) { - acpi_handle handle = ACPI_HANDLE(&pdev->dev); struct pfru_device *pfru_dev; + acpi_handle handle; int ret; + handle = ACPI_HANDLE(&pdev->dev); + if (!handle) + return -ENODEV; + if (!acpi_has_method(handle, "_DSM")) { dev_dbg(&pdev->dev, "Missing _DSM\n"); return -ENODEV; diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index 440f1d69aca86c..86b7c797585263 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -629,11 +629,15 @@ static void acpi_sbs_callback(void *context) static int acpi_sbs_probe(struct platform_device *pdev) { - struct acpi_device *device = ACPI_COMPANION(&pdev->dev); + struct acpi_device *device; struct acpi_sbs *sbs; int result = 0; int id; + device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + sbs = kzalloc_obj(struct acpi_sbs); if (!sbs) { result = -ENOMEM; diff --git a/drivers/acpi/sbshc.c b/drivers/acpi/sbshc.c index f413270415b687..c0ffa267f96cd5 100644 --- a/drivers/acpi/sbshc.c +++ b/drivers/acpi/sbshc.c @@ -237,11 +237,15 @@ static int smbus_alarm(void *context) static int acpi_smbus_hc_probe(struct platform_device *pdev) { - struct acpi_device *device = ACPI_COMPANION(&pdev->dev); + struct acpi_device *device; int status; unsigned long long val; struct acpi_smb_hc *hc; + device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + status = acpi_evaluate_integer(device->handle, "_EC", NULL, &val); if (ACPI_FAILURE(status)) { pr_err("error obtaining _EC.\n"); diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index b8b487d89d25b1..dfc7daa809b50f 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -789,7 +789,7 @@ static int acpi_thermal_probe(struct platform_device *pdev) int i; if (!device) - return -EINVAL; + return -ENODEV; tz = kzalloc_obj(struct acpi_thermal); if (!tz) diff --git a/drivers/acpi/tiny-power-button.c b/drivers/acpi/tiny-power-button.c index 531e65b01bcbe1..92516ef84b0216 100644 --- a/drivers/acpi/tiny-power-button.c +++ b/drivers/acpi/tiny-power-button.c @@ -38,9 +38,13 @@ static u32 acpi_tiny_power_button_event(void *not_used) static int acpi_tiny_power_button_probe(struct platform_device *pdev) { - struct acpi_device *device = ACPI_COMPANION(&pdev->dev); + struct acpi_device *device; acpi_status status; + device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + if (device->device_type == ACPI_BUS_TYPE_POWER_BUTTON) { status = acpi_install_fixed_event_handler(ACPI_EVENT_POWER_BUTTON, acpi_tiny_power_button_event, From 37953cec775ed34e59cf9a7d7bb9b0610daa3f3e Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Fri, 8 May 2026 15:33:29 +0200 Subject: [PATCH 713/972] nvme: fix race condition between connected uevent and STARTED_ONCE flag When a controller connects, nvme_start_ctrl() emits the "NVME_EVENT=connected" uevent and sets the NVME_CTRL_STARTED_ONCE flag. Currently, the uevent is emitted before the flag is set. This creates a race condition for userspace tools (like udev rules) that might rely on the "connected" event to configure other attributes. Swap the order of operations in nvme_start_ctrl() so that the NVME_CTRL_STARTED_ONCE flag is set before the uevent is sent. This guarantees that the admin_timeout can already be changed when userspace is notified. Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Daniel Wagner Signed-off-by: Maurizio Lombardi Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1e7e42b43aa3f1..c3032d6ad6b1e2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5045,8 +5045,8 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) nvme_mpath_update(ctrl); } - nvme_change_uevent(ctrl, "NVME_EVENT=connected"); set_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags); + nvme_change_uevent(ctrl, "NVME_EVENT=connected"); } EXPORT_SYMBOL_GPL(nvme_start_ctrl); From 20c39819a27646573dfa0ac0d01c38895298a6f6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 11 May 2026 10:58:38 -0600 Subject: [PATCH 714/972] io_uring: hold uring_lock when walking link chain in io_wq_free_work() io_wq_free_work() calls io_req_find_next() from io-wq worker context, which reads and clears req->link without holding any lock. This can potentially race with other paths that mutate the same chain under ctx->uring_lock. Take ctx->uring_lock around the io_req_find_next() call. Only requests with IO_REQ_LINK_FLAGS reach this path, which is not the hot path. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4ed998d60c09cb..2ebb0ba37c4f01 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1452,8 +1452,13 @@ struct io_wq_work *io_wq_free_work(struct io_wq_work *work) struct io_kiocb *nxt = NULL; if (req_ref_put_and_test_atomic(req)) { - if (req->flags & IO_REQ_LINK_FLAGS) + if (req->flags & IO_REQ_LINK_FLAGS) { + struct io_ring_ctx *ctx = req->ctx; + + mutex_lock(&ctx->uring_lock); nxt = io_req_find_next(req); + mutex_unlock(&ctx->uring_lock); + } io_free_req(req); } return nxt ? &nxt->work : NULL; From 49ae66eb8c27375075ffa308cfd4bf25af335d41 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 11 May 2026 10:58:50 -0600 Subject: [PATCH 715/972] io_uring: defer linked-timeout chain splice out of hrtimer context io_link_timeout_fn() is the hrtimer callback that fires when a linked timeout expires. It currently calls io_remove_next_linked(prev) under ctx->timeout_lock to splice the timeout request out of the link chain. This is the only chain-mutation site that runs without ctx->uring_lock, because hrtimer callbacks cannot take a mutex. Defer the splicing until the task_work callback. Signed-off-by: Jens Axboe --- io_uring/timeout.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index e2595cae2b073e..6353a4d979dc00 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -284,6 +284,10 @@ static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, struct io_timeout *timeout = io_kiocb_to_cmd(link, struct io_timeout); io_remove_next_linked(req); + + /* If this is NULL, then timer already claimed it and will complete it */ + if (!timeout->head) + return NULL; timeout->head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { list_del(&timeout->list); @@ -367,6 +371,14 @@ static void io_req_task_link_timeout(struct io_tw_req tw_req, io_tw_token_t tw) int ret; if (prev) { + /* + * splice the linked timeout out of prev's chain if the regular + * completion path didn't already do it. + */ + if (prev->link == req) + prev->link = req->link; + req->link = NULL; + if (!tw.cancel) { struct io_cancel_data cd = { .ctx = req->ctx, @@ -401,10 +413,10 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) /* * We don't expect the list to be empty, that will only happen if we - * race with the completion of the linked work. + * race with the completion of the linked work. Splice of prev is + * done in io_req_task_link_timeout(), if needed. */ if (prev) { - io_remove_next_linked(prev); if (!req_ref_inc_not_zero(prev)) prev = NULL; } From a65855ec34aed84e1e5b4aea0323cc1745f83a5c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 11 May 2026 10:58:56 -0600 Subject: [PATCH 716/972] io_uring: hold uring_lock across io_kill_timeouts() in cancel path io_uring_try_cancel_requests() dropped ctx->uring_lock before calling io_kill_timeouts(), which walks each timeout's link chain via io_match_task() to test REQ_F_INFLIGHT. With chain mutation now serialized by ctx->uring_lock, that walk needs the lock too. Signed-off-by: Jens Axboe --- io_uring/cancel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 5e5eb9cfc7cd6f..4aa3103ba9c39c 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -561,8 +561,8 @@ __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, ret |= io_waitid_remove_all(ctx, tctx, cancel_all); ret |= io_futex_remove_all(ctx, tctx, cancel_all); ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all); - mutex_unlock(&ctx->uring_lock); ret |= io_kill_timeouts(ctx, tctx, cancel_all); + mutex_unlock(&ctx->uring_lock); if (tctx) ret |= io_run_task_work() > 0; else From 35d0ed82d03e5ee77ea4f31f20e29562a7721649 Mon Sep 17 00:00:00 2001 From: Raphael Zimmer Date: Tue, 5 May 2026 11:08:12 +0200 Subject: [PATCH 717/972] libceph: Fix potential out-of-bounds access in osdmap_decode() When decoding osd_state and osd_weight from an incoming osdmap in osdmap_decode(), both are decoded for each osd, i.e., map->max_osd times. The ceph_decode_need() check only accounts for sizeof(*map->osd_weight) once. This can potentially result in an out-of-bounds memory access if the incoming message is corrupted such that the max_osd value exceeds the actual content of the osdmap message. This patch fixes the issue by changing the corresponding part in the ceph_decode_need() check to account for map->max_osd*sizeof(*map->osd_weight). Cc: stable@vger.kernel.org Fixes: dcbc919a5dc8 ("libceph: switch osdmap decoding to use ceph_decode_entity_addr") Signed-off-by: Raphael Zimmer Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/osdmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 669348d883f09b..2095e73ccf6c93 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1705,7 +1705,7 @@ static int osdmap_decode(void **p, void *end, bool msgr2, ceph_decode_need(p, end, 3*sizeof(u32) + map->max_osd*(struct_v >= 5 ? sizeof(u32) : sizeof(u8)) + - sizeof(*map->osd_weight), e_inval); + map->max_osd*sizeof(*map->osd_weight), e_inval); if (ceph_decode_32(p) != map->max_osd) goto e_inval; From 5dd74441cbf42c22e874450eb6a6bbb19390a216 Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Sat, 9 May 2026 18:20:31 +0800 Subject: [PATCH 718/972] cgroup/cpuset: Reserve DL bandwidth only for root-domain moves cpuset_can_attach() currently adds the bandwidth of all migrating SCHED_DEADLINE tasks to sum_migrate_dl_bw. If the source and destination cpuset effective CPU masks do not overlap, the whole sum is then reserved in the destination root domain. set_cpus_allowed_dl(), however, subtracts bandwidth from the source root domain only when the affinity change really moves the task between root domains. A DL task can move between cpusets that are still in the same root domain, so including that task in sum_migrate_dl_bw can reserve destination bandwidth without a matching source-side subtraction. Share the root-domain move test with set_cpus_allowed_dl(). Keep nr_migrate_dl_tasks counting all migrating deadline tasks for cpuset DL task accounting, but add to sum_migrate_dl_bw only for tasks that need a root-domain bandwidth move. Keep using the destination cpuset effective CPU mask and leave the broader can_attach()/attach() transaction model unchanged. Fixes: 2ef269ef1ac0 ("cgroup/cpuset: Free DL BW in case can_attach() fails") Cc: stable@vger.kernel.org # v6.10+ Signed-off-by: Guopeng Zhang Reviewed-by: Waiman Long Acked-by: Juri Lelli Tested-by: Juri Lelli Signed-off-by: Tejun Heo --- include/linux/sched/deadline.h | 9 +++++++++ kernel/cgroup/cpuset-internal.h | 1 + kernel/cgroup/cpuset.c | 33 ++++++++++++++++++--------------- kernel/sched/deadline.c | 13 ++++++++++--- 4 files changed, 38 insertions(+), 18 deletions(-) diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 1198138cb839eb..273538200a4459 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -33,6 +33,15 @@ struct root_domain; extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); extern void dl_clear_root_domain_cpu(int cpu); +/* + * Return whether moving DL task @p to @new_mask requires moving DL + * bandwidth accounting between root domains. This helper is specific to + * DL bandwidth move accounting semantics and is shared by + * cpuset_can_attach() and set_cpus_allowed_dl() so both paths use the + * same source root-domain test. + */ +extern bool dl_task_needs_bw_move(struct task_struct *p, + const struct cpumask *new_mask); extern u64 dl_cookie; extern bool dl_bw_visited(int cpu, u64 cookie); diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index bb4e692bea300c..f7aaf01f7cd5e3 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -167,6 +167,7 @@ struct cpuset { */ int nr_deadline_tasks; int nr_migrate_dl_tasks; + /* DL bandwidth that needs destination reservation for this attach. */ u64 sum_migrate_dl_bw; /* * CPU used for temporary DL bandwidth allocation during attach; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 3fbf6e7f68c311..e84e801e22cf42 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2993,7 +2993,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) struct cpuset *cs, *oldcs; struct task_struct *task; bool setsched_check; - int ret; + int cpu, ret; /* used later by cpuset_attach() */ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); @@ -3038,28 +3038,31 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) } if (dl_task(task)) { + /* + * Count all migrating DL tasks for cpuset task accounting. + * Only tasks that need a root-domain bandwidth move + * contribute to sum_migrate_dl_bw. + */ cs->nr_migrate_dl_tasks++; - cs->sum_migrate_dl_bw += task->dl.dl_bw; + if (dl_task_needs_bw_move(task, cs->effective_cpus)) + cs->sum_migrate_dl_bw += task->dl.dl_bw; } } - if (!cs->nr_migrate_dl_tasks) + if (!cs->sum_migrate_dl_bw) goto out_success; - if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { - int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); - - if (unlikely(cpu >= nr_cpu_ids)) { - ret = -EINVAL; - goto out_unlock; - } + cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); + if (unlikely(cpu >= nr_cpu_ids)) { + ret = -EINVAL; + goto out_unlock; + } - ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); - if (ret) - goto out_unlock; + ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); + if (ret) + goto out_unlock; - cs->dl_bw_cpu = cpu; - } + cs->dl_bw_cpu = cpu; out_success: /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index edca7849b165d9..7db4c87df83b0a 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -3107,20 +3107,18 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) static void set_cpus_allowed_dl(struct task_struct *p, struct affinity_context *ctx) { - struct root_domain *src_rd; struct rq *rq; WARN_ON_ONCE(!dl_task(p)); rq = task_rq(p); - src_rd = rq->rd; /* * Migrating a SCHED_DEADLINE task between exclusive * cpusets (different root_domains) entails a bandwidth * update. We already made space for us in the destination * domain (see cpuset_can_attach()). */ - if (!cpumask_intersects(src_rd->span, ctx->new_mask)) { + if (dl_task_needs_bw_move(p, ctx->new_mask)) { struct dl_bw *src_dl_b; src_dl_b = dl_bw_of(cpu_of(rq)); @@ -3137,6 +3135,15 @@ static void set_cpus_allowed_dl(struct task_struct *p, set_cpus_allowed_common(p, ctx); } +bool dl_task_needs_bw_move(struct task_struct *p, + const struct cpumask *new_mask) +{ + if (!dl_task(p)) + return false; + + return !cpumask_intersects(task_rq(p)->rd->span, new_mask); +} + /* Assumes rq->lock is held */ static void rq_online_dl(struct rq *rq) { From 92f3403a7ca5d186b2bad342603410cc6adc95a3 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 6 May 2026 18:50:27 +0530 Subject: [PATCH 719/972] drm/xe/madvise: Track purgeability with BO-local counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xe_bo_recompute_purgeable_state() walks all VMAs of a BO to determine whether the BO can be made purgeable. This makes VMA create/destroy and madvise updates O(n) in the number of mappings. Replace the walk with BO-local counters protected by the BO dma-resv lock: - vma_count tracks the number of VMAs mapping the BO. - willneed_count tracks active WILLNEED holders, including WILLNEED VMAs and active dma-buf exports for non-imported BOs. A DONTNEED BO is promoted back to WILLNEED on a 0->1 transition of willneed_count. A BO is demoted to DONTNEED on a 1->0 transition only when it still has VMAs, preserving the previous behaviour where a BO with no mappings keeps its current madvise state. PURGED remains terminal, preserving the existing "once purged, always purged" rule. Fixes: 4f44961eab84 ("drm/xe/vm: Prevent binding of purged buffer objects") v2: - Use early return for imported BOs in all four helpers to avoid nesting (Matt B). - Group purgeability state into a purgeable sub-struct on struct xe_bo (Matt B). - Reword xe_bo_willneed_put_locked() kernel-doc to explain that a 1->0 transition means all remaining active VMAs are DONTNEED (Matt B). v3: - Move DONTNEED/PURGED reject from vma_lock_and_validate() into xe_vma_create(), gated on attr->purgeable_state == WILLNEED. Fixes vm_bind bypass and partial-unbind rejection on DONTNEED BOs (Matt B). - Drop .check_purged from MAP and REMAP; keep it for PREFETCH and add a comment why (Matt B). - Skip BO validation in vma_lock_and_validate() for non-WILLNEED VMA remnants so cleanup/remap paths do not repopulate DONTNEED/PURGED BOs. Suggested-by: Thomas Hellström Cc: Matthew Brost Cc: Thomas Hellström Cc: Himal Prasad Ghimiray Signed-off-by: Arvind Yadav Reviewed-by: Matthew Brost Link: https://patch.msgid.link/20260506132027.2556046-1-arvind.yadav@intel.com Signed-off-by: Himal Prasad Ghimiray (cherry picked from commit 23fb2ea56cb4fa2587bc072b04e4e698687a48e4) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_bo.c | 6 +- drivers/gpu/drm/xe/xe_bo.h | 88 +++++++++++++++- drivers/gpu/drm/xe/xe_bo_types.h | 28 ++++- drivers/gpu/drm/xe/xe_dma_buf.c | 28 ++++- drivers/gpu/drm/xe/xe_vm.c | 51 +++++++-- drivers/gpu/drm/xe/xe_vm_madvise.c | 162 ++--------------------------- drivers/gpu/drm/xe/xe_vm_madvise.h | 2 - 7 files changed, 190 insertions(+), 175 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 4075edf974216c..6b518858538f57 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -897,10 +897,10 @@ void xe_bo_set_purgeable_state(struct xe_bo *bo, new_state == XE_MADV_PURGEABLE_PURGED); /* Once purged, always purged - cannot transition out */ - xe_assert(xe, !(bo->madv_purgeable == XE_MADV_PURGEABLE_PURGED && + xe_assert(xe, !(bo->purgeable.state == XE_MADV_PURGEABLE_PURGED && new_state != XE_MADV_PURGEABLE_PURGED)); - bo->madv_purgeable = new_state; + bo->purgeable.state = new_state; xe_bo_set_purgeable_shrinker(bo, new_state); } @@ -2368,7 +2368,7 @@ struct xe_bo *xe_bo_init_locked(struct xe_device *xe, struct xe_bo *bo, INIT_LIST_HEAD(&bo->vram_userfault_link); /* Initialize purge advisory state */ - bo->madv_purgeable = XE_MADV_PURGEABLE_WILLNEED; + bo->purgeable.state = XE_MADV_PURGEABLE_WILLNEED; drm_gem_private_object_init(&xe->drm, &bo->ttm.base, size); diff --git a/drivers/gpu/drm/xe/xe_bo.h b/drivers/gpu/drm/xe/xe_bo.h index 68dea7d25a6b7c..6340317f7d2e6a 100644 --- a/drivers/gpu/drm/xe/xe_bo.h +++ b/drivers/gpu/drm/xe/xe_bo.h @@ -251,7 +251,7 @@ static inline bool xe_bo_is_protected(const struct xe_bo *bo) static inline bool xe_bo_is_purged(struct xe_bo *bo) { xe_bo_assert_held(bo); - return bo->madv_purgeable == XE_MADV_PURGEABLE_PURGED; + return bo->purgeable.state == XE_MADV_PURGEABLE_PURGED; } /** @@ -268,11 +268,95 @@ static inline bool xe_bo_is_purged(struct xe_bo *bo) static inline bool xe_bo_madv_is_dontneed(struct xe_bo *bo) { xe_bo_assert_held(bo); - return bo->madv_purgeable == XE_MADV_PURGEABLE_DONTNEED; + return bo->purgeable.state == XE_MADV_PURGEABLE_DONTNEED; } void xe_bo_set_purgeable_state(struct xe_bo *bo, enum xe_madv_purgeable_state new_state); +/** + * xe_bo_willneed_get_locked() - Acquire a WILLNEED holder on a BO + * @bo: Buffer object + * + * Increments willneed_count and, on a 0->1 transition, promotes the BO + * from DONTNEED to WILLNEED. PURGED is terminal and is never modified. + * + * Caller must hold the BO's dma-resv lock. + */ +static inline void xe_bo_willneed_get_locked(struct xe_bo *bo) +{ + xe_bo_assert_held(bo); + + /* Imported BOs are owned externally; do not track purgeability. */ + if (drm_gem_is_imported(&bo->ttm.base)) + return; + + if (bo->purgeable.willneed_count++ == 0 && xe_bo_madv_is_dontneed(bo)) + xe_bo_set_purgeable_state(bo, XE_MADV_PURGEABLE_WILLNEED); +} + +/** + * xe_bo_willneed_put_locked() - Release a WILLNEED holder on a BO + * @bo: Buffer object + * + * Decrements willneed_count and, on a 1->0 transition, marks the BO + * DONTNEED only if it still has VMAs (implying all active VMAs are + * DONTNEED). If the last VMA is being removed, preserve the current BO + * state to match the previous VMA-walk semantics. + * + * PURGED is terminal and the BO state is never modified. + * + * Caller must hold the BO's dma-resv lock. + */ +static inline void xe_bo_willneed_put_locked(struct xe_bo *bo) +{ + xe_bo_assert_held(bo); + + if (drm_gem_is_imported(&bo->ttm.base)) + return; + + xe_assert(xe_bo_device(bo), bo->purgeable.willneed_count > 0); + if (--bo->purgeable.willneed_count == 0 && bo->purgeable.vma_count > 0 && + !xe_bo_is_purged(bo)) + xe_bo_set_purgeable_state(bo, XE_MADV_PURGEABLE_DONTNEED); +} + +/** + * xe_bo_vma_count_inc_locked() - Account a new VMA on a BO + * @bo: Buffer object + * + * Increments vma_count. + * + * Caller must hold the BO's dma-resv lock. + */ +static inline void xe_bo_vma_count_inc_locked(struct xe_bo *bo) +{ + xe_bo_assert_held(bo); + + if (drm_gem_is_imported(&bo->ttm.base)) + return; + + bo->purgeable.vma_count++; +} + +/** + * xe_bo_vma_count_dec_locked() - Account a VMA removal on a BO + * @bo: Buffer object + * + * Decrements vma_count. + * + * Caller must hold the BO's dma-resv lock. + */ +static inline void xe_bo_vma_count_dec_locked(struct xe_bo *bo) +{ + xe_bo_assert_held(bo); + + if (drm_gem_is_imported(&bo->ttm.base)) + return; + + xe_assert(xe_bo_device(bo), bo->purgeable.vma_count > 0); + bo->purgeable.vma_count--; +} + static inline void xe_bo_unpin_map_no_vm(struct xe_bo *bo) { if (likely(bo)) { diff --git a/drivers/gpu/drm/xe/xe_bo_types.h b/drivers/gpu/drm/xe/xe_bo_types.h index 9d19940b8fc035..077e35b4cdce75 100644 --- a/drivers/gpu/drm/xe/xe_bo_types.h +++ b/drivers/gpu/drm/xe/xe_bo_types.h @@ -111,10 +111,32 @@ struct xe_bo { u64 min_align; /** - * @madv_purgeable: user space advise on BO purgeability, protected - * by BO's dma-resv lock. + * @purgeable: Purgeability state and accounting. + * + * All fields are protected by the BO's dma-resv lock. */ - u32 madv_purgeable; + struct { + /** + * @purgeable.state: BO purgeability state + * (WILLNEED/DONTNEED/PURGED). + */ + u32 state; + + /** + * @purgeable.vma_count: Number of VMAs currently mapping this BO. + */ + u32 vma_count; + + /** + * @purgeable.willneed_count: Number of active WILLNEED holders. + * + * Counts WILLNEED VMAs plus active dma-buf exports for + * non-imported BOs. The BO flips to DONTNEED on a 1->0 + * transition only when VMAs still exist; if the last VMA is + * removed, the previous BO state is preserved. + */ + u32 willneed_count; + } purgeable; }; #endif diff --git a/drivers/gpu/drm/xe/xe_dma_buf.c b/drivers/gpu/drm/xe/xe_dma_buf.c index b9828da1589723..855d32ba314dd5 100644 --- a/drivers/gpu/drm/xe/xe_dma_buf.c +++ b/drivers/gpu/drm/xe/xe_dma_buf.c @@ -193,6 +193,18 @@ static int xe_dma_buf_begin_cpu_access(struct dma_buf *dma_buf, return 0; } +static void xe_dma_buf_release(struct dma_buf *dmabuf) +{ + struct drm_gem_object *obj = dmabuf->priv; + struct xe_bo *bo = gem_to_xe_bo(obj); + + xe_bo_lock(bo, false); + xe_bo_willneed_put_locked(bo); + xe_bo_unlock(bo); + + drm_gem_dmabuf_release(dmabuf); +} + static const struct dma_buf_ops xe_dmabuf_ops = { .attach = xe_dma_buf_attach, .detach = xe_dma_buf_detach, @@ -200,7 +212,7 @@ static const struct dma_buf_ops xe_dmabuf_ops = { .unpin = xe_dma_buf_unpin, .map_dma_buf = xe_dma_buf_map, .unmap_dma_buf = xe_dma_buf_unmap, - .release = drm_gem_dmabuf_release, + .release = xe_dma_buf_release, .begin_cpu_access = xe_dma_buf_begin_cpu_access, .mmap = drm_gem_dmabuf_mmap, .vmap = drm_gem_dmabuf_vmap, @@ -241,18 +253,26 @@ struct dma_buf *xe_gem_prime_export(struct drm_gem_object *obj, int flags) ret = -EINVAL; goto out_unlock; } + + xe_bo_willneed_get_locked(bo); xe_bo_unlock(bo); ret = ttm_bo_setup_export(&bo->ttm, &ctx); if (ret) - return ERR_PTR(ret); + goto out_put; buf = drm_gem_prime_export(obj, flags); - if (!IS_ERR(buf)) - buf->ops = &xe_dmabuf_ops; + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_put; + } + buf->ops = &xe_dmabuf_ops; return buf; +out_put: + xe_bo_lock(bo, false); + xe_bo_willneed_put_locked(bo); out_unlock: xe_bo_unlock(bo); return ERR_PTR(ret); diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index a717a2b8dea3c9..ab6cc1f0a78949 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -1120,6 +1120,25 @@ static struct xe_vma *xe_vma_create(struct xe_vm *vm, xe_bo_assert_held(bo); + /* + * Reject only WILLNEED mappings on DONTNEED/PURGED BOs. This + * gates new vm_bind ioctls (user supplies WILLNEED) while + * still allowing partial-unbind / remap splits whose new VMAs + * inherit the parent's DONTNEED attr. It must also run before + * xe_bo_willneed_get_locked() below so a 0->1 holder bump + * cannot silently promote DONTNEED back to WILLNEED. + */ + if (vma->attr.purgeable_state == XE_MADV_PURGEABLE_WILLNEED) { + if (xe_bo_madv_is_dontneed(bo)) { + xe_vma_free(vma); + return ERR_PTR(-EBUSY); + } + if (xe_bo_is_purged(bo)) { + xe_vma_free(vma); + return ERR_PTR(-EINVAL); + } + } + vm_bo = drm_gpuvm_bo_obtain_locked(vma->gpuva.vm, &bo->ttm.base); if (IS_ERR(vm_bo)) { xe_vma_free(vma); @@ -1131,6 +1150,10 @@ static struct xe_vma *xe_vma_create(struct xe_vm *vm, vma->gpuva.gem.offset = bo_offset_or_userptr; drm_gpuva_link(&vma->gpuva, vm_bo); drm_gpuvm_bo_put(vm_bo); + + xe_bo_vma_count_inc_locked(bo); + if (vma->attr.purgeable_state == XE_MADV_PURGEABLE_WILLNEED) + xe_bo_willneed_get_locked(bo); } else /* userptr or null */ { if (!is_null && !is_cpu_addr_mirror) { struct xe_userptr_vma *uvma = to_userptr_vma(vma); @@ -1208,7 +1231,10 @@ static void xe_vma_destroy(struct xe_vma *vma, struct dma_fence *fence) xe_bo_assert_held(bo); drm_gpuva_unlink(&vma->gpuva); - xe_bo_recompute_purgeable_state(bo); + + xe_bo_vma_count_dec_locked(bo); + if (vma->attr.purgeable_state == XE_MADV_PURGEABLE_WILLNEED) + xe_bo_willneed_put_locked(bo); } xe_vm_assert_held(vm); @@ -3016,7 +3042,7 @@ static void vm_bind_ioctl_ops_unwind(struct xe_vm *vm, * @res_evict: Allow evicting resources during validation * @validate: Perform BO validation * @request_decompress: Request BO decompression - * @check_purged: Reject operation if BO is purged + * @check_purged: Reject operation if BO is DONTNEED or PURGED */ struct xe_vma_lock_and_validate_flags { u32 res_evict : 1; @@ -3030,6 +3056,7 @@ static int vma_lock_and_validate(struct drm_exec *exec, struct xe_vma *vma, { struct xe_bo *bo = xe_vma_bo(vma); struct xe_vm *vm = xe_vma_vm(vma); + bool validate_bo = flags.validate; int err = 0; if (bo) { @@ -3044,7 +3071,11 @@ static int vma_lock_and_validate(struct drm_exec *exec, struct xe_vma *vma, err = -EINVAL; /* BO already purged */ } - if (!err && flags.validate) + /* Don't validate the BO for DONTNEED/PURGED remap remnants. */ + if (vma->attr.purgeable_state != XE_MADV_PURGEABLE_WILLNEED) + validate_bo = false; + + if (!err && validate_bo) err = xe_bo_validate(bo, vm, xe_vm_allow_vm_eviction(vm) && flags.res_evict, exec); @@ -3152,7 +3183,7 @@ static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm, op->map.immediate, .request_decompress = op->map.request_decompress, - .check_purged = true, + .check_purged = false, }); break; case DRM_GPUVA_OP_REMAP: @@ -3174,7 +3205,7 @@ static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm, .res_evict = res_evict, .validate = true, .request_decompress = false, - .check_purged = true, + .check_purged = false, }); if (!err && op->remap.next) err = vma_lock_and_validate(exec, op->remap.next, @@ -3182,7 +3213,7 @@ static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm, .res_evict = res_evict, .validate = true, .request_decompress = false, - .check_purged = true, + .check_purged = false, }); break; case DRM_GPUVA_OP_UNMAP: @@ -3211,9 +3242,11 @@ static int op_lock_and_prep(struct drm_exec *exec, struct xe_vm *vm, } /* - * Prefetch attempts to migrate BO's backing store without - * repopulating it first. Purged BOs have no backing store - * to migrate, so reject the operation. + * PREFETCH is the only op that still gates on BO purge state. + * MAP/REMAP handle this inside xe_vma_create() so partial + * unbind on a DONTNEED BO still works. PREFETCH skips + * xe_vma_create() and would migrate a BO with no backing + * store, so reject DONTNEED/PURGED here. */ err = vma_lock_and_validate(exec, gpuva_to_vma(op->base.prefetch.va), diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.c b/drivers/gpu/drm/xe/xe_vm_madvise.c index c78906dea82bec..c4fb290041956d 100644 --- a/drivers/gpu/drm/xe/xe_vm_madvise.c +++ b/drivers/gpu/drm/xe/xe_vm_madvise.c @@ -185,147 +185,6 @@ static void madvise_pat_index(struct xe_device *xe, struct xe_vm *vm, } } -/** - * xe_bo_is_dmabuf_shared() - Check if BO is shared via dma-buf - * @bo: Buffer object - * - * Prevent marking imported or exported dma-bufs as purgeable. - * For imported BOs, Xe doesn't own the backing store and cannot - * safely reclaim pages (exporter or other devices may still be - * using them). For exported BOs, external devices may have active - * mappings we cannot track. - * - * Return: true if BO is imported or exported, false otherwise - */ -static bool xe_bo_is_dmabuf_shared(struct xe_bo *bo) -{ - struct drm_gem_object *obj = &bo->ttm.base; - - /* Imported: exporter owns backing store */ - if (drm_gem_is_imported(obj)) - return true; - - /* Exported: external devices may be accessing */ - if (obj->dma_buf) - return true; - - return false; -} - -/** - * enum xe_bo_vmas_purge_state - VMA purgeable state aggregation - * - * Distinguishes whether a BO's VMAs are all DONTNEED, have at least - * one WILLNEED, or have no VMAs at all. - * - * Enum values align with XE_MADV_PURGEABLE_* states for consistency. - */ -enum xe_bo_vmas_purge_state { - /** @XE_BO_VMAS_STATE_WILLNEED: At least one VMA is WILLNEED */ - XE_BO_VMAS_STATE_WILLNEED = 0, - /** @XE_BO_VMAS_STATE_DONTNEED: All VMAs are DONTNEED */ - XE_BO_VMAS_STATE_DONTNEED = 1, - /** @XE_BO_VMAS_STATE_NO_VMAS: BO has no VMAs */ - XE_BO_VMAS_STATE_NO_VMAS = 2, -}; - -/* - * xe_bo_recompute_purgeable_state() casts between xe_bo_vmas_purge_state and - * xe_madv_purgeable_state. Enforce that WILLNEED=0 and DONTNEED=1 match across - * both enums so the single-line cast is always valid. - */ -static_assert(XE_BO_VMAS_STATE_WILLNEED == (int)XE_MADV_PURGEABLE_WILLNEED, - "VMA purge state WILLNEED must equal madv purgeable WILLNEED"); -static_assert(XE_BO_VMAS_STATE_DONTNEED == (int)XE_MADV_PURGEABLE_DONTNEED, - "VMA purge state DONTNEED must equal madv purgeable DONTNEED"); - -/** - * xe_bo_all_vmas_dontneed() - Determine BO VMA purgeable state - * @bo: Buffer object - * - * Check all VMAs across all VMs to determine aggregate purgeable state. - * Shared BOs require unanimous DONTNEED state from all mappings. - * - * Caller must hold BO dma-resv lock. - * - * Return: XE_BO_VMAS_STATE_DONTNEED if all VMAs are DONTNEED, - * XE_BO_VMAS_STATE_WILLNEED if at least one VMA is not DONTNEED, - * XE_BO_VMAS_STATE_NO_VMAS if BO has no VMAs - */ -static enum xe_bo_vmas_purge_state xe_bo_all_vmas_dontneed(struct xe_bo *bo) -{ - struct drm_gpuvm_bo *vm_bo; - struct drm_gpuva *gpuva; - struct drm_gem_object *obj = &bo->ttm.base; - bool has_vmas = false; - - xe_bo_assert_held(bo); - - /* Shared dma-bufs cannot be purgeable */ - if (xe_bo_is_dmabuf_shared(bo)) - return XE_BO_VMAS_STATE_WILLNEED; - - drm_gem_for_each_gpuvm_bo(vm_bo, obj) { - drm_gpuvm_bo_for_each_va(gpuva, vm_bo) { - struct xe_vma *vma = gpuva_to_vma(gpuva); - - has_vmas = true; - - /* Any non-DONTNEED VMA prevents purging */ - if (vma->attr.purgeable_state != XE_MADV_PURGEABLE_DONTNEED) - return XE_BO_VMAS_STATE_WILLNEED; - } - } - - /* - * No VMAs => preserve existing BO purgeable state. - * Avoids incorrectly flipping DONTNEED -> WILLNEED when last VMA unmapped. - */ - if (!has_vmas) - return XE_BO_VMAS_STATE_NO_VMAS; - - return XE_BO_VMAS_STATE_DONTNEED; -} - -/** - * xe_bo_recompute_purgeable_state() - Recompute BO purgeable state from VMAs - * @bo: Buffer object - * - * Walk all VMAs to determine if BO should be purgeable or not. - * Shared BOs require unanimous DONTNEED state from all mappings. - * If the BO has no VMAs the existing state is preserved. - * - * Locking: Caller must hold BO dma-resv lock. When iterating GPUVM lists, - * VM lock must also be held (write) to prevent concurrent VMA modifications. - * This is satisfied at both call sites: - * - xe_vma_destroy(): holds vm->lock write - * - madvise_purgeable(): holds vm->lock write (from madvise ioctl path) - * - * Return: nothing - */ -void xe_bo_recompute_purgeable_state(struct xe_bo *bo) -{ - enum xe_bo_vmas_purge_state vma_state; - - if (!bo) - return; - - xe_bo_assert_held(bo); - - /* - * Once purged, always purged. Cannot transition back to WILLNEED. - * This matches i915 semantics where purged BOs are permanently invalid. - */ - if (bo->madv_purgeable == XE_MADV_PURGEABLE_PURGED) - return; - - vma_state = xe_bo_all_vmas_dontneed(bo); - - if (vma_state != (enum xe_bo_vmas_purge_state)bo->madv_purgeable && - vma_state != XE_BO_VMAS_STATE_NO_VMAS) - xe_bo_set_purgeable_state(bo, (enum xe_madv_purgeable_state)vma_state); -} - /** * madvise_purgeable - Handle purgeable buffer object advice * @xe: XE device @@ -359,12 +218,6 @@ static void madvise_purgeable(struct xe_device *xe, struct xe_vm *vm, /* BO must be locked before modifying madv state */ xe_bo_assert_held(bo); - /* Skip shared dma-bufs - no PTEs to zap */ - if (xe_bo_is_dmabuf_shared(bo)) { - vmas[i]->skip_invalidation = true; - continue; - } - /* * Once purged, always purged. Cannot transition back to WILLNEED. * This matches i915 semantics where purged BOs are permanently invalid. @@ -377,13 +230,14 @@ static void madvise_purgeable(struct xe_device *xe, struct xe_vm *vm, switch (op->purge_state_val.val) { case DRM_XE_VMA_PURGEABLE_STATE_WILLNEED: - vmas[i]->attr.purgeable_state = XE_MADV_PURGEABLE_WILLNEED; vmas[i]->skip_invalidation = true; - - xe_bo_recompute_purgeable_state(bo); + /* Only act on a real DONTNEED -> WILLNEED transition. */ + if (vmas[i]->attr.purgeable_state == XE_MADV_PURGEABLE_DONTNEED) { + vmas[i]->attr.purgeable_state = XE_MADV_PURGEABLE_WILLNEED; + xe_bo_willneed_get_locked(bo); + } break; case DRM_XE_VMA_PURGEABLE_STATE_DONTNEED: - vmas[i]->attr.purgeable_state = XE_MADV_PURGEABLE_DONTNEED; /* * Don't zap PTEs at DONTNEED time -- pages are still * alive. The zap happens in xe_bo_move_notify() right @@ -391,7 +245,11 @@ static void madvise_purgeable(struct xe_device *xe, struct xe_vm *vm, */ vmas[i]->skip_invalidation = true; - xe_bo_recompute_purgeable_state(bo); + /* Only act on a real WILLNEED -> DONTNEED transition. */ + if (vmas[i]->attr.purgeable_state == XE_MADV_PURGEABLE_WILLNEED) { + vmas[i]->attr.purgeable_state = XE_MADV_PURGEABLE_DONTNEED; + xe_bo_willneed_put_locked(bo); + } break; default: /* Should never hit - values validated in madvise_args_are_sane() */ diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.h b/drivers/gpu/drm/xe/xe_vm_madvise.h index 39acd2689ca0d0..a3078f634c7e95 100644 --- a/drivers/gpu/drm/xe/xe_vm_madvise.h +++ b/drivers/gpu/drm/xe/xe_vm_madvise.h @@ -13,6 +13,4 @@ struct xe_bo; int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *file); -void xe_bo_recompute_purgeable_state(struct xe_bo *bo); - #endif From 4568dfbb8363a153e7cef384d23cc6d6563ff316 Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Thu, 7 May 2026 14:00:15 -0700 Subject: [PATCH 720/972] drm/xe: Make decision to use Xe2-style blitter instructions a feature flag The blitter engines' MEM_COPY and MEM_SET instructions were added as part of the same hardware change that introduced service copy engines (i.e., BCS1-BCS8) which is why the driver checks for service copy engine presence when deciding whether to use these instructions or the older XY_* instructions. However when making this decision the driver should consider which engines are part of the hardware architecture, not which engines are present/usable on the current device. For graphics IP versions that architecturally include service copy engines (i.e., everything Xe2 and later, plus PVC's Xe_HPC) we should use MEM_SET and MEM_COPY even in if all of the service copy engines wind up getting fused off. I.e., we need to decide based on whether the platform's graphics descriptor contains these engines, rather than whether the usable engine mask contains them. This logic got broken when gt->info.__engine_mask was removed, although in practice that mistake has been harmless so far because there haven't been any hardware SKUs that fuse off all of the service copy engines yet. Replace the incorrect has_service_copy_support() function with a GT feature flag that tracks more accurately whether the new blitter instructions are usable. In addition to fixing incorrect logic if all service copies are fused off, the flag also makes it more obvious what the calling code is trying to do; previously it wasn't terribly obvious why "has service copy engines" was being used as the condition for using different instructions on all copy engine types. The new feature flag is named 'has_xe2_blt_instructions' because we expect this flag to be set for all Xe2 and later platforms (i.e., everything officially supported by the Xe driver). Technically there's also one Xe1-era platform (PVC) that supports these engines/instructions and will set this flag, but this still seems to be the most clear and understandable name for the flag. Fixes: 61549a2ee594 ("drm/xe: Drop __engine_mask") Cc: Balasubramani Vivekanandan Reviewed-by: Balasubramani Vivekanandan Link: https://patch.msgid.link/20260507-xe2_copy-v1-1-26506381b821@intel.com Signed-off-by: Matt Roper (cherry picked from commit 09b399842907565a64e351fb22da790b4c673ffb) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_gt_types.h | 7 +++++++ drivers/gpu/drm/xe/xe_migrate.c | 18 ++---------------- drivers/gpu/drm/xe/xe_pci.c | 9 +++++++++ 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_types.h b/drivers/gpu/drm/xe/xe_gt_types.h index 8b55cf25a75fa1..fffb5d631b69b5 100644 --- a/drivers/gpu/drm/xe/xe_gt_types.h +++ b/drivers/gpu/drm/xe/xe_gt_types.h @@ -144,6 +144,13 @@ struct xe_gt { u8 id; /** @info.has_indirect_ring_state: GT has indirect ring state support */ u8 has_indirect_ring_state:1; + /** + * @info.has_xe2_blt_instructions: GT supports Xe2-style MEM_SET + * and MEM_COPY blitter functionality. Note that despite the + * name, some Xe1 platforms may also support this "Xe2-style" + * feature. + */ + u8 has_xe2_blt_instructions:1; /** * @info.num_geometry_xecore_fuse_regs: Number of 32b-bit fuse * registers the geometry XeCore mask spans. diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index 5fdc89ed5256b3..a22413f892a096 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -1524,23 +1524,9 @@ static void emit_clear_main_copy(struct xe_gt *gt, struct xe_bb *bb, bb->len += len; } -static bool has_service_copy_support(struct xe_gt *gt) -{ - /* - * What we care about is whether the architecture was designed with - * service copy functionality (specifically the new MEM_SET / MEM_COPY - * instructions) so check the architectural engine list rather than the - * actual list since these instructions are usable on BCS0 even if - * all of the actual service copy engines (BCS1-BCS8) have been fused - * off. - */ - return gt->info.engine_mask & GENMASK(XE_HW_ENGINE_BCS8, - XE_HW_ENGINE_BCS1); -} - static u32 emit_clear_cmd_len(struct xe_gt *gt) { - if (has_service_copy_support(gt)) + if (gt->info.has_xe2_blt_instructions) return PVC_MEM_SET_CMD_LEN_DW; else return XY_FAST_COLOR_BLT_DW; @@ -1549,7 +1535,7 @@ static u32 emit_clear_cmd_len(struct xe_gt *gt) static void emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs, u32 size, u32 pitch, bool is_vram) { - if (has_service_copy_support(gt)) + if (gt->info.has_xe2_blt_instructions) emit_clear_link_copy(gt, bb, src_ofs, size, pitch); else emit_clear_main_copy(gt, bb, src_ofs, size, pitch, diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c index 9f98d033416490..c2ecd27ec770a2 100644 --- a/drivers/gpu/drm/xe/xe_pci.c +++ b/drivers/gpu/drm/xe/xe_pci.c @@ -851,6 +851,15 @@ static struct xe_gt *alloc_primary_gt(struct xe_tile *tile, gt->info.num_geometry_xecore_fuse_regs = graphics_desc->num_geometry_xecore_fuse_regs; gt->info.num_compute_xecore_fuse_regs = graphics_desc->num_compute_xecore_fuse_regs; + /* + * Even if the service copy engines wind up being fused off, their + * presence in the IP descriptor indicates that the platform supports + * Xe2-style MEM_SET and MEM_COPY functionality. + */ + if (graphics_desc->hw_engine_mask & GENMASK(XE_HW_ENGINE_BCS8, + XE_HW_ENGINE_BCS1)) + gt->info.has_xe2_blt_instructions = true; + /* * Before media version 13, the media IP was part of the primary GT * so we need to add the media engines to the primary GT's engine list. From 981bedbbe61364fcc3a3b87ebaf648a66cd07108 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Fri, 8 May 2026 11:26:36 +0100 Subject: [PATCH 721/972] drm/xe/dma-buf: handle empty bo and UAF races MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There look to be some nasty races here when triggering the invalidate_mappings hook: 1) We do xe_bo_alloc() followed by the attach, before the actual full bo init step in xe_dma_buf_init_obj(). However the bo is visible on the attachments list after the attach. This is bad since exporter driver, say amdgpu, can at any time call back into our invalidate_mappings hook, with an empty/bogus bo, leading to potential bugs/crashes. 2) Similar to 1) but here we get a UAF, when the invalidate_mappings hook is triggered. For example, we get as far as xe_bo_init_locked() but this fails in some way. But here the bo will be freed on error, but we still have it attached from dma-buf pov, so if the invalidate_mappings is now triggered then the bo we access is gone and we trigger UAF and more bugs/crashes. To fix this, move the attach step until after we actually have a fully set up buffer object. Note that the bo is not published to userspace until later, so not sure what the comment "Don't publish the bo until we have a valid attachment", is referring to. We have at least two different customers reporting hitting a NULL ptr deref in evict_flags when importing something from amdgpu, followed by triggering the evict flow. Hit rate is also pretty low, which would hint at some kind of race, so something like 1) or 2) might explain this. v2: - Shuffle the order of the ops slightly (no functional change) - Improve the comment to better explain the ordering (Matt B) Assisted-by: Gemini:gemini-3 #debug Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/work_items/7903 Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/work_items/4055 Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Matthew Auld Cc: Thomas Hellström Cc: Matthew Brost Cc: # v6.8+ Reviewed-by: Matthew Brost Acked-by: Thomas Hellström Link: https://patch.msgid.link/20260508102635.149172-3-matthew.auld@intel.com (cherry picked from commit af1f2ad0c59fe4e2f924c526f66e968289d77971) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_dma_buf.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_dma_buf.c b/drivers/gpu/drm/xe/xe_dma_buf.c index 855d32ba314dd5..f7b47e9d1a4cf6 100644 --- a/drivers/gpu/drm/xe/xe_dma_buf.c +++ b/drivers/gpu/drm/xe/xe_dma_buf.c @@ -377,15 +377,25 @@ struct drm_gem_object *xe_gem_prime_import(struct drm_device *dev, } } - /* - * Don't publish the bo until we have a valid attachment, and a - * valid attachment needs the bo address. So pre-create a bo before - * creating the attachment and publish. - */ bo = xe_bo_alloc(); if (IS_ERR(bo)) return ERR_CAST(bo); + /* + * xe_dma_buf_init_obj() takes ownership of the raw bo, so do not touch + * on fail, since it will already take care of cleanup. On success we + * still need to drop the ref, if something later fails. + * + * In addition this needs to happen before the attach, since + * it will create a new attachment for this, and add it to the list of + * attachments, at which point it is globally visible, and at any point + * the export side can call into on invalidate_mappings callback, which + * require a working object. + */ + obj = xe_dma_buf_init_obj(dev, bo, dma_buf); + if (IS_ERR(obj)) + return obj; + attach_ops = &xe_dma_buf_attach_ops; #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) if (test) @@ -398,21 +408,12 @@ struct drm_gem_object *xe_gem_prime_import(struct drm_device *dev, goto out_err; } - /* - * xe_dma_buf_init_obj() takes ownership of bo on both success - * and failure, so we must not touch bo after this call. - */ - obj = xe_dma_buf_init_obj(dev, bo, dma_buf); - if (IS_ERR(obj)) { - dma_buf_detach(dma_buf, attach); - return obj; - } get_dma_buf(dma_buf); obj->import_attach = attach; return obj; out_err: - xe_bo_free(bo); + xe_bo_put(bo); return obj; } From 155a372a1cc50fa93387c5d3cdfd614a61e1afd1 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Fri, 8 May 2026 11:26:37 +0100 Subject: [PATCH 722/972] drm/xe/dma-buf: fix UAF with retry loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Retry doesn't work here, since bo will be freed on error, leading to UAF. However, now that we do the alloc & init before the attach, we can now combine this as one unit and have the init do the alloc for us. This should make the retry safe. Reported by Sashiko. v2: Fix up the error unwind (CI) Closes: https://sashiko.dev/#/patchset/20260506184332.86743-2-matthew.auld%40intel.com Fixes: eb289a5f6cc6 ("drm/xe: Convert xe_dma_buf.c for exhaustive eviction") Signed-off-by: Matthew Auld Cc: Thomas Hellström Cc: Matthew Brost Cc: # v6.18+ Reviewed-by: Thomas Hellström Link: https://patch.msgid.link/20260508102635.149172-4-matthew.auld@intel.com (cherry picked from commit 479669418253e0f27f8cf5db01a731352ea592e7) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_dma_buf.c | 49 ++++++++------------------------- 1 file changed, 12 insertions(+), 37 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_dma_buf.c b/drivers/gpu/drm/xe/xe_dma_buf.c index f7b47e9d1a4cf6..8a920e58245cd7 100644 --- a/drivers/gpu/drm/xe/xe_dma_buf.c +++ b/drivers/gpu/drm/xe/xe_dma_buf.c @@ -278,16 +278,8 @@ struct dma_buf *xe_gem_prime_export(struct drm_gem_object *obj, int flags) return ERR_PTR(ret); } -/* - * Takes ownership of @storage: on success it is transferred to the returned - * drm_gem_object; on failure it is freed before returning the error. - * This matches the contract of xe_bo_init_locked() which frees @storage on - * its error paths, so callers need not (and must not) free @storage after - * this call. - */ static struct drm_gem_object * -xe_dma_buf_init_obj(struct drm_device *dev, struct xe_bo *storage, - struct dma_buf *dma_buf) +xe_dma_buf_create_obj(struct drm_device *dev, struct dma_buf *dma_buf) { struct dma_resv *resv = dma_buf->resv; struct xe_device *xe = to_xe_device(dev); @@ -298,10 +290,8 @@ xe_dma_buf_init_obj(struct drm_device *dev, struct xe_bo *storage, int ret = 0; dummy_obj = drm_gpuvm_resv_object_alloc(&xe->drm); - if (!dummy_obj) { - xe_bo_free(storage); + if (!dummy_obj) return ERR_PTR(-ENOMEM); - } dummy_obj->resv = resv; xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {}, ret) { @@ -310,8 +300,7 @@ xe_dma_buf_init_obj(struct drm_device *dev, struct xe_bo *storage, if (ret) break; - /* xe_bo_init_locked() frees storage on error */ - bo = xe_bo_init_locked(xe, storage, NULL, resv, NULL, dma_buf->size, + bo = xe_bo_init_locked(xe, NULL, NULL, resv, NULL, dma_buf->size, 0, /* Will require 1way or 2way for vm_bind */ ttm_bo_type_sg, XE_BO_FLAG_SYSTEM, &exec); drm_exec_retry_on_contention(&exec); @@ -362,7 +351,6 @@ struct drm_gem_object *xe_gem_prime_import(struct drm_device *dev, const struct dma_buf_attach_ops *attach_ops; struct dma_buf_attachment *attach; struct drm_gem_object *obj; - struct xe_bo *bo; if (dma_buf->ops == &xe_dmabuf_ops) { obj = dma_buf->priv; @@ -377,22 +365,14 @@ struct drm_gem_object *xe_gem_prime_import(struct drm_device *dev, } } - bo = xe_bo_alloc(); - if (IS_ERR(bo)) - return ERR_CAST(bo); - /* - * xe_dma_buf_init_obj() takes ownership of the raw bo, so do not touch - * on fail, since it will already take care of cleanup. On success we - * still need to drop the ref, if something later fails. - * - * In addition this needs to happen before the attach, since - * it will create a new attachment for this, and add it to the list of - * attachments, at which point it is globally visible, and at any point - * the export side can call into on invalidate_mappings callback, which - * require a working object. + * This needs to happen before the attach, since it will create a new + * attachment for this, and add it to the list of attachments, at which + * point it is globally visible, and at any point the export side can + * call into on invalidate_mappings callback, which require a working + * object. */ - obj = xe_dma_buf_init_obj(dev, bo, dma_buf); + obj = xe_dma_buf_create_obj(dev, dma_buf); if (IS_ERR(obj)) return obj; @@ -402,20 +382,15 @@ struct drm_gem_object *xe_gem_prime_import(struct drm_device *dev, attach_ops = test->attach_ops; #endif - attach = dma_buf_dynamic_attach(dma_buf, dev->dev, attach_ops, &bo->ttm.base); + attach = dma_buf_dynamic_attach(dma_buf, dev->dev, attach_ops, obj); if (IS_ERR(attach)) { - obj = ERR_CAST(attach); - goto out_err; + xe_bo_put(gem_to_xe_bo(obj)); + return ERR_CAST(attach); } get_dma_buf(dma_buf); obj->import_attach = attach; return obj; - -out_err: - xe_bo_put(bo); - - return obj; } #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) From d5971c5c34303a00bf841a902ca00a703602c500 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 20 Apr 2026 20:18:43 +0200 Subject: [PATCH 723/972] drm/amdgpu: remove deadlocks from amdgpu_userq_pre_reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The purpose of a GPU reset is to make sure that fence can be signaled again and the signal and resume workers can make progress again. So waiting for the resume worker or any fence in the GPU reset path is just utterly nonsense. Signed-off-by: Christian König Reviewed-by: Prike Liang Signed-off-by: Alex Deucher (cherry picked from commit fcd5f065eab46993af43442fd77ee8d9eb9c5bdf) --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 26 +++++++++++------------ 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index de140a8ed1354a..692f7e3513dfca 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -1504,23 +1504,21 @@ void amdgpu_userq_pre_reset(struct amdgpu_device *adev) { const struct amdgpu_userq_funcs *userq_funcs; struct amdgpu_usermode_queue *queue; - struct amdgpu_userq_mgr *uqm; unsigned long queue_id; + /* TODO: We probably need a new lock for the queue state */ xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) { - uqm = queue->userq_mgr; - cancel_delayed_work_sync(&uqm->resume_work); - if (queue->state == AMDGPU_USERQ_STATE_MAPPED) { - amdgpu_userq_wait_for_last_fence(queue); - userq_funcs = adev->userq_funcs[queue->queue_type]; - userq_funcs->unmap(queue); - /* just mark all queues as hung at this point. - * if unmap succeeds, we could map again - * in amdgpu_userq_post_reset() if vram is not lost - */ - queue->state = AMDGPU_USERQ_STATE_HUNG; - amdgpu_userq_fence_driver_force_completion(queue); - } + if (queue->state != AMDGPU_USERQ_STATE_MAPPED) + continue; + + userq_funcs = adev->userq_funcs[queue->queue_type]; + userq_funcs->unmap(queue); + /* just mark all queues as hung at this point. + * if unmap succeeds, we could map again + * in amdgpu_userq_post_reset() if vram is not lost + */ + queue->state = AMDGPU_USERQ_STATE_HUNG; + amdgpu_userq_fence_driver_force_completion(queue); } } From 44e5bc73bdcbec115cfe1c4b4394d25eb3deaff2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 16 Apr 2026 15:32:11 +0200 Subject: [PATCH 724/972] drm/amdgpu: rework amdgpu_userq_signal_ioctl v3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This one was fortunately not looking so bad as the wait ioctl path, but there were still a few things which could be fixed/improved: 1. Allocating with GFP_ATOMIC was quite unnecessary, we can do that before taking the userq_lock. 2. Use a new mutex as protection for the fence_drv_xa so that we can do memory allocations while holding it. 3. Starting the reset timer is unnecessary when the fence is already signaled when we create it. 4. Cleanup error handling, avoid trying to free the queue when we don't even got one. v2: fix incorrect usage of xa_find, destroy the new mutex on error v3: cleanup ref ordering Signed-off-by: Christian König Reviewed-by: Sunil Khatri Signed-off-by: Alex Deucher (cherry picked from commit 1609eb0f81a609d350169839128cecf298c84e7a) --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 2 + drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 12 + .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 226 ++++++++---------- 3 files changed, 119 insertions(+), 121 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 692f7e3513dfca..2d4f159f33623e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -800,6 +800,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) } queue->doorbell_index = index; + mutex_init(&queue->fence_drv_lock); xa_init_flags(&queue->fence_drv_xa, XA_FLAGS_ALLOC); r = amdgpu_userq_fence_driver_alloc(adev, &queue->fence_drv); if (r) { @@ -873,6 +874,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) amdgpu_bo_reserve(fpriv->vm.root.bo, true); amdgpu_userq_buffer_vas_list_cleanup(adev, queue); amdgpu_bo_unreserve(fpriv->vm.root.bo); + mutex_destroy(&queue->fence_drv_lock); free_queue: kfree(queue); err_pm_runtime: diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h index 8b8f345b60b6b1..843ea8ecc5d7d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h @@ -66,6 +66,18 @@ struct amdgpu_usermode_queue { struct amdgpu_userq_obj db_obj; struct amdgpu_userq_obj fw_obj; struct amdgpu_userq_obj wptr_obj; + + /** + * @fence_drv_lock: Protecting @fence_drv_xa. + */ + struct mutex fence_drv_lock; + + /** + * @fence_drv_xa: + * + * References to the external fence drivers returned by wait_ioctl. + * Dropped on the next signaled dma_fence or queue destruction. + */ struct xarray fence_drv_xa; struct amdgpu_userq_fence_driver *fence_drv; struct dma_fence *last_fence; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index e2d5f04296e1c0..c9dbf03c4eea9c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -121,6 +121,7 @@ amdgpu_userq_fence_driver_free(struct amdgpu_usermode_queue *userq) userq->last_fence = NULL; amdgpu_userq_walk_and_drop_fence_drv(&userq->fence_drv_xa); xa_destroy(&userq->fence_drv_xa); + mutex_destroy(&userq->fence_drv_lock); /* Drop the queue's ownership reference to fence_drv explicitly */ amdgpu_userq_fence_driver_put(userq->fence_drv); } @@ -209,80 +210,84 @@ void amdgpu_userq_fence_driver_put(struct amdgpu_userq_fence_driver *fence_drv) kref_put(&fence_drv->refcount, amdgpu_userq_fence_driver_destroy); } -static int amdgpu_userq_fence_alloc(struct amdgpu_userq_fence **userq_fence) +static int amdgpu_userq_fence_alloc(struct amdgpu_usermode_queue *userq, + struct amdgpu_userq_fence **pfence) { - *userq_fence = kmalloc(sizeof(**userq_fence), GFP_KERNEL); - return *userq_fence ? 0 : -ENOMEM; + struct amdgpu_userq_fence_driver *fence_drv = userq->fence_drv; + struct amdgpu_userq_fence *userq_fence; + void *entry; + + userq_fence = kmalloc(sizeof(*userq_fence), GFP_KERNEL); + if (!userq_fence) + return -ENOMEM; + + /* + * Get the next unused entry, since we fill from the start this can be + * used as size to allocate the array. + */ + mutex_lock(&userq->fence_drv_lock); + XA_STATE(xas, &userq->fence_drv_xa, 0); + + rcu_read_lock(); + do { + entry = xas_find_marked(&xas, ULONG_MAX, XA_FREE_MARK); + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + userq_fence->fence_drv_array = kvmalloc_array(xas.xa_index, + sizeof(fence_drv), + GFP_KERNEL); + if (!userq_fence->fence_drv_array) { + mutex_unlock(&userq->fence_drv_lock); + kfree(userq_fence); + return -ENOMEM; + } + + userq_fence->fence_drv_array_count = xas.xa_index; + xa_extract(&userq->fence_drv_xa, (void **)userq_fence->fence_drv_array, + 0, ULONG_MAX, xas.xa_index, XA_PRESENT); + xa_destroy(&userq->fence_drv_xa); + + mutex_unlock(&userq->fence_drv_lock); + + amdgpu_userq_fence_driver_get(fence_drv); + userq_fence->fence_drv = fence_drv; + + *pfence = userq_fence; + return 0; } -static int amdgpu_userq_fence_create(struct amdgpu_usermode_queue *userq, - struct amdgpu_userq_fence *userq_fence, - u64 seq, struct dma_fence **f) +static void amdgpu_userq_fence_init(struct amdgpu_usermode_queue *userq, + struct amdgpu_userq_fence *fence, + u64 seq) { - struct amdgpu_userq_fence_driver *fence_drv; - struct dma_fence *fence; + struct amdgpu_userq_fence_driver *fence_drv = userq->fence_drv; unsigned long flags; bool signaled = false; - fence_drv = userq->fence_drv; - if (!fence_drv) - return -EINVAL; - - spin_lock_init(&userq_fence->lock); - INIT_LIST_HEAD(&userq_fence->link); - fence = &userq_fence->base; - userq_fence->fence_drv = fence_drv; - - dma_fence_init64(fence, &amdgpu_userq_fence_ops, &userq_fence->lock, + spin_lock_init(&fence->lock); + dma_fence_init64(&fence->base, &amdgpu_userq_fence_ops, &fence->lock, fence_drv->context, seq); - amdgpu_userq_fence_driver_get(fence_drv); - dma_fence_get(fence); - - if (!xa_empty(&userq->fence_drv_xa)) { - struct amdgpu_userq_fence_driver *stored_fence_drv; - unsigned long index, count = 0; - int i = 0; - - xa_lock(&userq->fence_drv_xa); - xa_for_each(&userq->fence_drv_xa, index, stored_fence_drv) - count++; - - userq_fence->fence_drv_array = - kvmalloc_objs(struct amdgpu_userq_fence_driver *, count, - GFP_ATOMIC); - - if (userq_fence->fence_drv_array) { - xa_for_each(&userq->fence_drv_xa, index, stored_fence_drv) { - userq_fence->fence_drv_array[i] = stored_fence_drv; - __xa_erase(&userq->fence_drv_xa, index); - i++; - } - } - - userq_fence->fence_drv_array_count = i; - xa_unlock(&userq->fence_drv_xa); - } else { - userq_fence->fence_drv_array = NULL; - userq_fence->fence_drv_array_count = 0; - } + /* Make sure the fence is visible to the hang detect worker */ + dma_fence_put(userq->last_fence); + userq->last_fence = dma_fence_get(&fence->base); - /* Check if hardware has already processed the job */ + /* Check if hardware has already processed the fence */ spin_lock_irqsave(&fence_drv->fence_list_lock, flags); - if (!dma_fence_is_signaled(fence)) { - list_add_tail(&userq_fence->link, &fence_drv->fences); + if (!dma_fence_is_signaled(&fence->base)) { + dma_fence_get(&fence->base); + list_add_tail(&fence->link, &fence_drv->fences); } else { + INIT_LIST_HEAD(&fence->link); signaled = true; - dma_fence_put(fence); } spin_unlock_irqrestore(&fence_drv->fence_list_lock, flags); if (signaled) - amdgpu_userq_fence_put_fence_drv_array(userq_fence); - - *f = fence; - - return 0; + amdgpu_userq_fence_put_fence_drv_array(fence); + else + amdgpu_userq_start_hang_detect_work(userq); } static const char *amdgpu_userq_fence_get_driver_name(struct dma_fence *f) @@ -403,11 +408,6 @@ static int amdgpu_userq_fence_read_wptr(struct amdgpu_device *adev, return r; } -static void amdgpu_userq_fence_cleanup(struct dma_fence *fence) -{ - dma_fence_put(fence); -} - static void amdgpu_userq_fence_driver_set_error(struct amdgpu_userq_fence *fence, int error) @@ -451,13 +451,14 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, const unsigned int num_read_bo_handles = args->num_bo_read_handles; struct amdgpu_fpriv *fpriv = filp->driver_priv; struct amdgpu_userq_mgr *userq_mgr = &fpriv->userq_mgr; + struct drm_gem_object **gobj_write, **gobj_read; u32 *syncobj_handles, num_syncobj_handles; - struct amdgpu_userq_fence *userq_fence; - struct amdgpu_usermode_queue *queue = NULL; - struct drm_syncobj **syncobj = NULL; - struct dma_fence *fence; + struct amdgpu_usermode_queue *queue; + struct amdgpu_userq_fence *fence; + struct drm_syncobj **syncobj; struct drm_exec exec; + void __user *ptr; int r, i, entry; u64 wptr; @@ -469,13 +470,14 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, return -EINVAL; num_syncobj_handles = args->num_syncobj_handles; - syncobj_handles = memdup_array_user(u64_to_user_ptr(args->syncobj_handles), - num_syncobj_handles, sizeof(u32)); + ptr = u64_to_user_ptr(args->syncobj_handles); + syncobj_handles = memdup_array_user(ptr, num_syncobj_handles, + sizeof(u32)); if (IS_ERR(syncobj_handles)) return PTR_ERR(syncobj_handles); - /* Array of pointers to the looked up syncobjs */ - syncobj = kmalloc_array(num_syncobj_handles, sizeof(*syncobj), GFP_KERNEL); + syncobj = kmalloc_array(num_syncobj_handles, sizeof(*syncobj), + GFP_KERNEL); if (!syncobj) { r = -ENOMEM; goto free_syncobj_handles; @@ -489,21 +491,17 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, } } - r = drm_gem_objects_lookup(filp, - u64_to_user_ptr(args->bo_read_handles), - num_read_bo_handles, - &gobj_read); + ptr = u64_to_user_ptr(args->bo_read_handles); + r = drm_gem_objects_lookup(filp, ptr, num_read_bo_handles, &gobj_read); if (r) goto free_syncobj; - r = drm_gem_objects_lookup(filp, - u64_to_user_ptr(args->bo_write_handles), - num_write_bo_handles, + ptr = u64_to_user_ptr(args->bo_write_handles); + r = drm_gem_objects_lookup(filp, ptr, num_write_bo_handles, &gobj_write); if (r) goto put_gobj_read; - /* Retrieve the user queue */ queue = amdgpu_userq_get(userq_mgr, args->queue_id); if (!queue) { r = -ENOENT; @@ -512,73 +510,61 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, r = amdgpu_userq_fence_read_wptr(adev, queue, &wptr); if (r) - goto put_gobj_write; + goto put_queue; - r = amdgpu_userq_fence_alloc(&userq_fence); + r = amdgpu_userq_fence_alloc(queue, &fence); if (r) - goto put_gobj_write; + goto put_queue; /* We are here means UQ is active, make sure the eviction fence is valid */ amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr); - /* Create a new fence */ - r = amdgpu_userq_fence_create(queue, userq_fence, wptr, &fence); - if (r) { - mutex_unlock(&userq_mgr->userq_mutex); - kfree(userq_fence); - goto put_gobj_write; - } + /* Create the new fence */ + amdgpu_userq_fence_init(queue, fence, wptr); - dma_fence_put(queue->last_fence); - queue->last_fence = dma_fence_get(fence); - amdgpu_userq_start_hang_detect_work(queue); mutex_unlock(&userq_mgr->userq_mutex); + /* + * This needs to come after the fence is created since + * amdgpu_userq_ensure_ev_fence() can't be called while holding the resv + * locks. + */ drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, (num_read_bo_handles + num_write_bo_handles)); - /* Lock all BOs with retry handling */ drm_exec_until_all_locked(&exec) { - r = drm_exec_prepare_array(&exec, gobj_read, num_read_bo_handles, 1); + r = drm_exec_prepare_array(&exec, gobj_read, + num_read_bo_handles, 1); drm_exec_retry_on_contention(&exec); - if (r) { - amdgpu_userq_fence_cleanup(fence); + if (r) goto exec_fini; - } - r = drm_exec_prepare_array(&exec, gobj_write, num_write_bo_handles, 1); + r = drm_exec_prepare_array(&exec, gobj_write, + num_write_bo_handles, 1); drm_exec_retry_on_contention(&exec); - if (r) { - amdgpu_userq_fence_cleanup(fence); + if (r) goto exec_fini; - } } - for (i = 0; i < num_read_bo_handles; i++) { - if (!gobj_read || !gobj_read[i]->resv) - continue; - - dma_resv_add_fence(gobj_read[i]->resv, fence, + /* And publish the new fence in the BOs and syncobj */ + for (i = 0; i < num_read_bo_handles; i++) + dma_resv_add_fence(gobj_read[i]->resv, &fence->base, DMA_RESV_USAGE_READ); - } - for (i = 0; i < num_write_bo_handles; i++) { - if (!gobj_write || !gobj_write[i]->resv) - continue; - - dma_resv_add_fence(gobj_write[i]->resv, fence, + for (i = 0; i < num_write_bo_handles; i++) + dma_resv_add_fence(gobj_write[i]->resv, &fence->base, DMA_RESV_USAGE_WRITE); - } - /* Add the created fence to syncobj/BO's */ for (i = 0; i < num_syncobj_handles; i++) - drm_syncobj_replace_fence(syncobj[i], fence); + drm_syncobj_replace_fence(syncobj[i], &fence->base); +exec_fini: /* drop the reference acquired in fence creation function */ - dma_fence_put(fence); + dma_fence_put(&fence->base); -exec_fini: drm_exec_fini(&exec); +put_queue: + amdgpu_userq_put(queue); put_gobj_write: for (i = 0; i < num_write_bo_handles; i++) drm_gem_object_put(gobj_write[i]); @@ -589,15 +575,11 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, kvfree(gobj_read); free_syncobj: while (entry-- > 0) - if (syncobj[entry]) - drm_syncobj_put(syncobj[entry]); + drm_syncobj_put(syncobj[entry]); kfree(syncobj); free_syncobj_handles: kfree(syncobj_handles); - if (queue) - amdgpu_userq_put(queue); - return r; } @@ -872,8 +854,10 @@ amdgpu_userq_wait_return_fence_info(struct drm_file *filp, * Otherwise, we would gather those references until we don't * have any more space left and crash. */ + mutex_lock(&waitq->fence_drv_lock); r = xa_alloc(&waitq->fence_drv_xa, &index, fence_drv, xa_limit_32b, GFP_KERNEL); + mutex_unlock(&waitq->fence_drv_lock); if (r) goto put_waitq; From d0053441ad7eaf7920b71e2263097ece53c5af34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 20 Apr 2026 15:13:57 +0200 Subject: [PATCH 725/972] drm/amdgpu: remove almost all calls to amdgpu_userq_detect_and_reset_queues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Well the reset handling seems broken on multiple levels. As first step of fixing this remove most calls to the hang detection. That function should only be called after we run into a timeout! And *NOT* as random check spread over the code in multiple places. Signed-off-by: Christian König Reviewed-by: Sunil Khatri Signed-off-by: Alex Deucher (cherry picked from commit 71bea36b54ccfb14cbc90f94267af6369af4e702) --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 38 +++++++++-------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 2d4f159f33623e..ba03d2a42e1ef9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -345,23 +345,18 @@ static int amdgpu_userq_preempt_helper(struct amdgpu_usermode_queue *queue) struct amdgpu_device *adev = uq_mgr->adev; const struct amdgpu_userq_funcs *userq_funcs = adev->userq_funcs[queue->queue_type]; - bool found_hung_queue = false; - int r = 0; + int r; if (queue->state == AMDGPU_USERQ_STATE_MAPPED) { r = userq_funcs->preempt(queue); if (r) { queue->state = AMDGPU_USERQ_STATE_HUNG; - found_hung_queue = true; + return r; } else { queue->state = AMDGPU_USERQ_STATE_PREEMPTED; } } - - if (found_hung_queue) - amdgpu_userq_detect_and_reset_queues(uq_mgr); - - return r; + return 0; } static int amdgpu_userq_restore_helper(struct amdgpu_usermode_queue *queue) @@ -390,24 +385,21 @@ static int amdgpu_userq_unmap_helper(struct amdgpu_usermode_queue *queue) struct amdgpu_device *adev = uq_mgr->adev; const struct amdgpu_userq_funcs *userq_funcs = adev->userq_funcs[queue->queue_type]; - bool found_hung_queue = false; - int r = 0; + int r; if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) || - (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) { + (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) { + r = userq_funcs->unmap(queue); if (r) { queue->state = AMDGPU_USERQ_STATE_HUNG; - found_hung_queue = true; + return r; } else { queue->state = AMDGPU_USERQ_STATE_UNMAPPED; } } - if (found_hung_queue) - amdgpu_userq_detect_and_reset_queues(uq_mgr); - - return r; + return 0; } static int amdgpu_userq_map_helper(struct amdgpu_usermode_queue *queue) @@ -416,19 +408,19 @@ static int amdgpu_userq_map_helper(struct amdgpu_usermode_queue *queue) struct amdgpu_device *adev = uq_mgr->adev; const struct amdgpu_userq_funcs *userq_funcs = adev->userq_funcs[queue->queue_type]; - int r = 0; + int r; if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) { r = userq_funcs->map(queue); if (r) { queue->state = AMDGPU_USERQ_STATE_HUNG; - amdgpu_userq_detect_and_reset_queues(uq_mgr); + return r; } else { queue->state = AMDGPU_USERQ_STATE_MAPPED; } } - return r; + return 0; } static void amdgpu_userq_wait_for_last_fence(struct amdgpu_usermode_queue *queue) @@ -654,7 +646,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que #if defined(CONFIG_DEBUG_FS) debugfs_remove_recursive(queue->debugfs_queue); #endif - amdgpu_userq_detect_and_reset_queues(uq_mgr); r = amdgpu_userq_unmap_helper(queue); atomic_dec(&uq_mgr->userq_count[queue->queue_type]); amdgpu_userq_cleanup(queue); @@ -1264,7 +1255,6 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr) unsigned long queue_id; int ret = 0, r; - amdgpu_userq_detect_and_reset_queues(uq_mgr); /* Try to unmap all the queues in this process ctx */ xa_for_each(&uq_mgr->userq_xa, queue_id, queue) { r = amdgpu_userq_preempt_helper(queue); @@ -1272,9 +1262,11 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr) ret = r; } - if (ret) + if (ret) { drm_file_err(uq_mgr->file, "Couldn't unmap all the queues, eviction failed ret=%d\n", ret); + amdgpu_userq_detect_and_reset_queues(uq_mgr); + } return ret; } @@ -1374,7 +1366,6 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev) uqm = queue->userq_mgr; cancel_delayed_work_sync(&uqm->resume_work); guard(mutex)(&uqm->userq_mutex); - amdgpu_userq_detect_and_reset_queues(uqm); if (adev->in_s0ix) r = amdgpu_userq_preempt_helper(queue); else @@ -1433,7 +1424,6 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev, if (((queue->queue_type == AMDGPU_HW_IP_GFX) || (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) && (queue->xcp_id == idx)) { - amdgpu_userq_detect_and_reset_queues(uqm); r = amdgpu_userq_preempt_helper(queue); if (r) ret = r; From 0071e01c61aa73f22edd5252f0a3e2c2eff744d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 20 Apr 2026 16:08:35 +0200 Subject: [PATCH 726/972] drm/amdgpu: fix userq hang detection and reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix lock inversions pointed out by Prike and Sunil. The hang detection timeout *CAN'T* grab locks under which we wait for fences, especially not the userq_mutex lock. Then instead of this completely broken handling with the hang_detect_fence just cancel the work when fences are processed and re-start if necessary. Signed-off-by: Christian König Reviewed-by: Sunil Khatri Signed-off-by: Alex Deucher (cherry picked from commit 1b62077f045ac6ffde7c97005c6659569ac5c1ec) --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 65 ++++++++----------- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 1 - .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 17 +++-- .../gpu/drm/amd/amdgpu/amdgpu_userq_fence.h | 2 +- 4 files changed, 40 insertions(+), 45 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index ba03d2a42e1ef9..70d74f04d2ddcb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -106,9 +106,6 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr) int r = 0; int i; - /* Warning if current process mutex is not held */ - WARN_ON(!mutex_is_locked(&uq_mgr->userq_mutex)); - if (unlikely(adev->debug_disable_gpu_ring_reset)) { dev_err(adev->dev, "userq reset disabled by debug mask\n"); return 0; @@ -127,9 +124,11 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr) */ for (i = 0; i < num_queue_types; i++) { int ring_type = queue_types[i]; - const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type]; + const struct amdgpu_userq_funcs *funcs = + adev->userq_funcs[ring_type]; - if (!amdgpu_userq_is_reset_type_supported(adev, ring_type, AMDGPU_RESET_TYPE_PER_QUEUE)) + if (!amdgpu_userq_is_reset_type_supported(adev, ring_type, + AMDGPU_RESET_TYPE_PER_QUEUE)) continue; if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 && @@ -150,38 +149,22 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr) static void amdgpu_userq_hang_detect_work(struct work_struct *work) { - struct amdgpu_usermode_queue *queue = container_of(work, - struct amdgpu_usermode_queue, - hang_detect_work.work); - struct dma_fence *fence; - struct amdgpu_userq_mgr *uq_mgr; - - if (!queue->userq_mgr) - return; - - uq_mgr = queue->userq_mgr; - fence = READ_ONCE(queue->hang_detect_fence); - /* Fence already signaled – no action needed */ - if (!fence || dma_fence_is_signaled(fence)) - return; + struct amdgpu_usermode_queue *queue = + container_of(work, struct amdgpu_usermode_queue, + hang_detect_work.work); - mutex_lock(&uq_mgr->userq_mutex); - amdgpu_userq_detect_and_reset_queues(uq_mgr); - mutex_unlock(&uq_mgr->userq_mutex); + amdgpu_userq_detect_and_reset_queues(queue->userq_mgr); } /* * Start hang detection for a user queue fence. A delayed work will be scheduled - * to check if the fence is still pending after the timeout period. -*/ + * to reset the queues when the fence doesn't signal in time. + */ void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue) { struct amdgpu_device *adev; unsigned long timeout_ms; - if (!queue || !queue->userq_mgr || !queue->userq_mgr->adev) - return; - adev = queue->userq_mgr->adev; /* Determine timeout based on queue type */ switch (queue->queue_type) { @@ -199,8 +182,6 @@ void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue) break; } - /* Store the fence to monitor and schedule hang detection */ - WRITE_ONCE(queue->hang_detect_fence, queue->last_fence); schedule_delayed_work(&queue->hang_detect_work, msecs_to_jiffies(timeout_ms)); } @@ -210,18 +191,24 @@ void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 doorbell) struct xarray *xa = &adev->userq_doorbell_xa; struct amdgpu_usermode_queue *queue; unsigned long flags; + int r; xa_lock_irqsave(xa, flags); queue = xa_load(xa, doorbell); - if (queue) - amdgpu_userq_fence_driver_process(queue->fence_drv); - xa_unlock_irqrestore(xa, flags); -} + if (queue) { + r = amdgpu_userq_fence_driver_process(queue->fence_drv); + /* + * We are in interrupt context here, this *can't* wait for + * reset work to finish. + */ + if (r >= 0) + cancel_delayed_work(&queue->hang_detect_work); -static void amdgpu_userq_init_hang_detect_work(struct amdgpu_usermode_queue *queue) -{ - INIT_DELAYED_WORK(&queue->hang_detect_work, amdgpu_userq_hang_detect_work); - queue->hang_detect_fence = NULL; + /* Restart the timer when there are still fences pending */ + if (r == 1) + amdgpu_userq_start_hang_detect_work(queue); + } + xa_unlock_irqrestore(xa, flags); } static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue, @@ -640,7 +627,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que amdgpu_bo_unreserve(vm->root.bo); mutex_lock(&uq_mgr->userq_mutex); - queue->hang_detect_fence = NULL; amdgpu_userq_wait_for_last_fence(queue); #if defined(CONFIG_DEBUG_FS) @@ -847,7 +833,8 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) up_read(&adev->reset_domain->sem); amdgpu_debugfs_userq_init(filp, queue, qid); - amdgpu_userq_init_hang_detect_work(queue); + INIT_DELAYED_WORK(&queue->hang_detect_work, + amdgpu_userq_hang_detect_work); args->out.queue_id = qid; atomic_inc(&uq_mgr->userq_count[queue->queue_type]); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h index 843ea8ecc5d7d8..85f460e7c31b24 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h @@ -85,7 +85,6 @@ struct amdgpu_usermode_queue { int priority; struct dentry *debugfs_queue; struct delayed_work hang_detect_work; - struct dma_fence *hang_detect_fence; struct kref refcount; struct list_head userq_va_list; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index c9dbf03c4eea9c..53a8944bab05ac 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -135,7 +135,14 @@ amdgpu_userq_fence_put_fence_drv_array(struct amdgpu_userq_fence *userq_fence) userq_fence->fence_drv_array_count = 0; } -void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv) +/* + * Returns: + * -ENOENT when no fences were processes + * 1 when more fences are pending + * 0 when no fences are pending any more + */ +int +amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv) { struct amdgpu_userq_fence *userq_fence, *tmp; LIST_HEAD(to_be_signaled); @@ -143,9 +150,6 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d unsigned long flags; u64 rptr; - if (!fence_drv) - return; - spin_lock_irqsave(&fence_drv->fence_list_lock, flags); rptr = amdgpu_userq_fence_read(fence_drv); @@ -158,6 +162,9 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d &userq_fence->link); spin_unlock_irqrestore(&fence_drv->fence_list_lock, flags); + if (list_empty(&to_be_signaled)) + return -ENOENT; + list_for_each_entry_safe(userq_fence, tmp, &to_be_signaled, link) { fence = &userq_fence->base; list_del_init(&userq_fence->link); @@ -169,6 +176,8 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d dma_fence_put(fence); } + /* That doesn't need to be accurate so no locking */ + return list_empty(&fence_drv->fences) ? 0 : 1; } void amdgpu_userq_fence_driver_destroy(struct kref *ref) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h index d355a0eecc07fa..0bd51616cef1bb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h @@ -63,7 +63,7 @@ void amdgpu_userq_fence_driver_put(struct amdgpu_userq_fence_driver *fence_drv); int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, struct amdgpu_userq_fence_driver **fence_drv_req); void amdgpu_userq_fence_driver_free(struct amdgpu_usermode_queue *userq); -void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv); +int amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv); void amdgpu_userq_fence_driver_force_completion(struct amdgpu_usermode_queue *userq); void amdgpu_userq_fence_driver_destroy(struct kref *ref); int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, From 183182235f6d53bac62c6c39014738a54a68dfa6 Mon Sep 17 00:00:00 2001 From: Mikhail Gavrilov Date: Tue, 5 May 2026 09:05:37 +0800 Subject: [PATCH 727/972] drm/amd/display: Wrap DCN32 phantom-plane allocation in DC_RUN_WITH_PREEMPTION_ENABLED [Why] dcn32_validate_bandwidth() wraps dcn32_internal_validate_bw() with DC_FP_START()/DC_FP_END(). In x86 non-RT, DC_FP_START takes fpregs_lock(), which disables local softirqs. The DML1 path through dcn32_enable_phantom_plane() calls kvzalloc() to allocate ~335 KiB for dc_plane_state. This triggers the vmalloc path, which calls BUG_ON(in_interrupt()) because it's invoked within the FPU-enabled (softirq disabled) region, leading to a kernel crash. [How] Wrap the dc_state_create_phantom_plane() call with the DC_RUN_WITH_PREEMPTION_ENABLED() macro to allow preemption during this memory allocation. Fixes: 235c67634230 ("drm/amd/display: add DCN32/321 specific files for Display Core") Closes: https://gitlab.freedesktop.org/drm/amd/-/work_items/4470 Reviewed-by: Aurabindo Pillai Signed-off-by: Mikhail Gavrilov Signed-off-by: James Lin Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 885ccbef7b94a8b38f69c4211c679021aa27ad11) Cc: stable@vger.kernel.org --- .../drm/amd/display/dc/resource/dcn32/dcn32_resource.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c index 82f81b58698660..3751f7a94a059e 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c @@ -92,9 +92,14 @@ #include "dml/dcn32/dcn32_fpu.h" #include "dc_state_priv.h" +#include "dc_fpu.h" #include "dml2_0/dml2_wrapper.h" +#if !defined(DC_RUN_WITH_PREEMPTION_ENABLED) +#define DC_RUN_WITH_PREEMPTION_ENABLED(code) code +#endif + #define DC_LOGGER_INIT(logger) enum dcn32_clk_src_array_id { @@ -1684,7 +1689,8 @@ static void dcn32_enable_phantom_plane(struct dc *dc, if (curr_pipe->top_pipe && curr_pipe->top_pipe->plane_state == curr_pipe->plane_state) phantom_plane = prev_phantom_plane; else - phantom_plane = dc_state_create_phantom_plane(dc, context, curr_pipe->plane_state); + DC_RUN_WITH_PREEMPTION_ENABLED(phantom_plane = + dc_state_create_phantom_plane(dc, context, curr_pipe->plane_state)); if (!phantom_plane) continue; From 6bbede02dc62a1021aeeae87ab243bd7a93c61d2 Mon Sep 17 00:00:00 2001 From: Xiang Liu Date: Thu, 7 May 2026 20:56:15 +0800 Subject: [PATCH 728/972] drm/amd/ras: Fix CPER ring debugfs read overflow The legacy CPER debugfs reader can reach the payload path without a valid pointer snapshot. The remaining user byte count is also treated as the ring occupancy in dwords, so reads past the header can copy more than requested. Take the CPER lock before sampling pointers. Resample rptr/wptr for payload reads, bound the payload copy by available dwords and the remaining user size, and advance the file position for each dword copied. Signed-off-by: Xiang Liu Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher (cherry picked from commit 1e40ef87ffdc291e05ccdade8b9170cc9c1c4249) --- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 29 +++++++++++++++++------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 66e8a2f7afcf6f..d6bee5c30073dc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -552,8 +552,9 @@ static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { struct amdgpu_ring *ring = file_inode(f)->i_private; - uint32_t value, result, early[3]; + u32 value, result, early[3] = { 0 }; uint64_t p; + u32 avail_dw, start_dw, read_dw; loff_t i; int r; @@ -565,10 +566,10 @@ static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf, result = 0; - if (*pos < 12) { - if (ring->funcs->type == AMDGPU_RING_TYPE_CPER) - mutex_lock(&ring->adev->cper.ring_lock); + if (ring->funcs->type == AMDGPU_RING_TYPE_CPER) + mutex_lock(&ring->adev->cper.ring_lock); + if (*pos < 12) { early[0] = amdgpu_ring_get_rptr(ring) & ring->buf_mask; early[1] = amdgpu_ring_get_wptr(ring) & ring->buf_mask; early[2] = ring->wptr & ring->buf_mask; @@ -600,13 +601,24 @@ static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf, *pos += 4; } } else { + early[0] = amdgpu_ring_get_rptr(ring) & ring->buf_mask; + early[1] = amdgpu_ring_get_wptr(ring) & ring->buf_mask; + p = early[0]; if (early[0] <= early[1]) - size = (early[1] - early[0]); + avail_dw = early[1] - early[0]; else - size = ring->ring_size - (early[0] - early[1]); + avail_dw = ring->buf_mask + 1 - (early[0] - early[1]); - while (size) { + start_dw = (*pos > 12) ? ((*pos - 12) >> 2) : 0; + if (start_dw >= avail_dw) + goto out; + + p = (p + start_dw) & ring->ptr_mask; + avail_dw -= start_dw; + read_dw = min_t(u32, avail_dw, size >> 2); + + while (read_dw) { if (p == early[1]) goto out; @@ -619,9 +631,10 @@ static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf, buf += 4; result += 4; - size--; + read_dw--; p++; p &= ring->ptr_mask; + *pos += 4; } } From 5d08559c910cc37673b965a0d4e8d004444d0332 Mon Sep 17 00:00:00 2001 From: Jesse Zhang Date: Fri, 3 Apr 2026 15:58:31 +0800 Subject: [PATCH 729/972] drm/amdgpu/gfx_v12_0: set gfx.rs64_enable from PFP header on GFX12 gfx_v12_0_init_microcode() always loads RS64 CP ucode but never set adev->gfx.rs64_enable, so it stayed false and code that branches on it (e.g. MEC pipe reset) used the legacy CP_MEC_CNTL path incorrectly. Match GFX11: derive RS64 mode from the PFP firmware header (v2.0) via amdgpu_ucode_hdr_version(). Log at debug when RS64 is enabled. Reviewed-by: Alex Deucher Signed-off-by: Jesse Zhang Signed-off-by: Alex Deucher (cherry picked from commit b03d53598b0d2048e8fa7303b8d0784768ec4fa6) --- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index 0e0b1e5b88fce8..c35372e21261d2 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -602,6 +602,13 @@ static int gfx_v12_0_init_microcode(struct amdgpu_device *adev) "amdgpu/%s_pfp.bin", ucode_prefix); if (err) goto out; + + adev->gfx.rs64_enable = amdgpu_ucode_hdr_version( + (union amdgpu_firmware_header *) + adev->gfx.pfp_fw->data, 2, 0); + if (adev->gfx.rs64_enable) + dev_dbg(adev->dev, "CP RS64 enable\n"); + amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_RS64_PFP); amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_RS64_PFP_P0_STACK); From 9a415cc53711f2238e0f0ca8a6bcc796c003b127 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 11 May 2026 12:05:48 -1000 Subject: [PATCH 730/972] sched_ext: Avoid UAF in scx_root_enable_workfn() init failure path In scx_root_enable_workfn(), put_task_struct(p) is called before scx_error() dereferences p->comm and p->pid. If the iterator's reference is the last drop, the task is freed synchronously and the deref becomes a UAF. Move put_task_struct() past scx_error(). Reported-by: Sashiko Closes: https://lore.kernel.org/all/20260511214031.AF5E9C2BCB0@smtp.kernel.org/ Fixes: f0e1a0643a59 ("sched_ext: Implement BPF extensible scheduler class") Cc: stable@vger.kernel.org # v6.12+ Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1efd5d82b08bba..9354da79e162a7 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6973,10 +6973,10 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (scx_get_task_state(p) != SCX_TASK_DEAD) scx_set_task_state(p, SCX_TASK_NONE); task_rq_unlock(rq, p, &rf); - put_task_struct(p); scx_task_iter_stop(&sti); scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", ret, p->comm, p->pid); + put_task_struct(p); goto err_disable_unlock_all; } From e174929793195e0cd6a4adb0cad731b39f9019b4 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Tue, 5 May 2026 16:43:36 -0700 Subject: [PATCH 731/972] net/rds: reset op_nents when zerocopy page pin fails When iov_iter_get_pages2() fails in rds_message_zcopy_from_user(), the pinned pages are released with put_page(), and rm->data.op_mmp_znotifier is cleared. But we fail to properly clear rm->data.op_nents. Later when rds_message_purge() is called from rds_sendmsg() the cleanup loop iterates over the incorrectly non zero number of op_nents and frees them again. Fix this by properly resetting op_nents when it should be in rds_message_zcopy_from_user(). Fixes: 0cebaccef3ac ("rds: zerocopy Tx support.") Signed-off-by: Allison Henderson Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260505234336.2132721-1-achender@kernel.org Signed-off-by: Jakub Kicinski --- net/rds/message.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/rds/message.c b/net/rds/message.c index 25fedcb3cd00ec..7feb0eb6537db8 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -448,6 +448,7 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter * for (i = 0; i < rm->data.op_nents; i++) put_page(sg_page(&rm->data.op_sg[i])); + rm->data.op_nents = 0; mmp = &rm->data.op_mmp_znotifier->z_mmp; mm_unaccount_pinned_pages(mmp); ret = -EFAULT; From 24a08d7d6218d60c033015cf4870b6096446e734 Mon Sep 17 00:00:00 2001 From: Arthur Kiyanovski Date: Thu, 7 May 2026 00:35:15 +0000 Subject: [PATCH 732/972] net: ena: PHC: Check return code before setting timestamp output ena_phc_gettimex64() is setting the output parameter regardless of whether ena_com_phc_get_timestamp() succeeded or failed. When ena_com_phc_get_timestamp() returns an error, the timestamp parameter may contain uninitialized stack memory (e.g., when PHC is disabled or in blocked state) or invalid hardware values. Passing these to userspace via the PTP ioctl is both a security issue (information leak) and a correctness bug. Fix by checking the return code after releasing the lock and only setting the output timestamp on success. Fixes: e0ea34158ee8 ("net: ena: Add PHC support in the ENA driver") Cc: stable@vger.kernel.org Signed-off-by: Arthur Kiyanovski Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20260507003518.22554-1-akiyano@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amazon/ena/ena_phc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_phc.c b/drivers/net/ethernet/amazon/ena/ena_phc.c index 7867e893fd15f9..c2a3ff1ef645c5 100644 --- a/drivers/net/ethernet/amazon/ena/ena_phc.c +++ b/drivers/net/ethernet/amazon/ena/ena_phc.c @@ -46,9 +46,12 @@ static int ena_phc_gettimex64(struct ptp_clock_info *clock_info, spin_unlock_irqrestore(&phc_info->lock, flags); + if (rc) + return rc; + *ts = ns_to_timespec64(timestamp_nsec); - return rc; + return 0; } static int ena_phc_settime64(struct ptp_clock_info *clock_info, From 03cb001ef87b3f8d859cf7f96329acf3d6235d29 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 8 May 2026 12:08:46 +0000 Subject: [PATCH 733/972] tcp: Fix out-of-bounds access for twsk in tcp_ao_established_key(). lockdep_sock_is_held() was added in tcp_ao_established_key() by the cited commit. It can be called from tcp_v[46]_timewait_ack() with twsk. Since it does not have sk->sk_lock, the lockdep annotation results in out-of-bound access. $ pahole -C tcp_timewait_sock vmlinux | grep size /* size: 288, cachelines: 5, members: 8 */ $ pahole -C sock vmlinux | grep sk_lock socket_lock_t sk_lock; /* 440 192 */ Let's not use lockdep_sock_is_held() for TCP_TIME_WAIT. Fixes: 6b2d11e2d8fc ("net/tcp: Add missing lockdep annotations for TCP-AO hlist traversals") Reported-by: Damiano Melotti Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260508120853.4098365-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ao.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index a97cdf3e6af4cf..0a4b38b315fed4 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -116,7 +116,8 @@ struct tcp_ao_key *tcp_ao_established_key(const struct sock *sk, { struct tcp_ao_key *key; - hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) { + hlist_for_each_entry_rcu(key, &ao->head, node, + sk_fullsock(sk) && lockdep_sock_is_held(sk)) { if ((sndid >= 0 && key->sndid != sndid) || (rcvid >= 0 && key->rcvid != rcvid)) continue; From f8e64e956a635b054227f56e9586a1997add0646 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 8 May 2026 19:05:10 +0200 Subject: [PATCH 734/972] net/sched: dualpi2: initialize timer earlier in dualpi2_init() 'pi2_timer' needs to be initialized in all error paths of dualpi2_init(): otherwise, a failure in qdisc_create_dflt() causes the following crash in dualpi2_destroy(): # tc qdisc add dev crash0 handle 1: root dualpi2 BUG: kernel NULL pointer dereference, address: 0000000000000010 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP PTI CPU: 4 UID: 0 PID: 471 Comm: tc Tainted: G E 7.1.0-rc1-virtme #2 PREEMPT(full) Tainted: [E]=UNSIGNED_MODULE Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 RIP: 0010:hrtimer_active+0x39/0x60 Code: f9 eb 23 0f b6 41 38 3c 01 0f 87 87 64 c0 ff 83 e0 01 75 33 48 39 4a 50 74 28 44 3b 42 10 75 06 48 3b 51 30 74 21 48 8b 51 30 <44> 8b 42 10 41 f6 c0 01 74 cf f3 90 44 8b 42 10 41 f6 c0 01 74 c3 RSP: 0018:ffffd0db80b93620 EFLAGS: 00010282 RAX: ffffffffc0400320 RBX: ffff8cf24a4c86b8 RCX: ffff8cf24a4c86b8 RDX: 0000000000000000 RSI: ffff8cf2429c2ab0 RDI: ffff8cf24a4c86b8 RBP: 00000000fffffff4 R08: 0000000000000003 R09: 0000000000000000 R10: 0000000000000001 R11: ffff8cf24a39c500 R12: ffff8cf24822c000 R13: ffffd0db80b936c0 R14: ffffffffc02cf360 R15: 00000000ffffffff FS: 00007fbc01706580(0000) GS:ffff8cf2dc759000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000010 CR3: 0000000008e02003 CR4: 0000000000172ef0 Call Trace: hrtimer_cancel+0x15/0x40 dualpi2_destroy+0x20/0x40 [sch_dualpi2] qdisc_create+0x230/0x570 tc_modify_qdisc+0x716/0xc10 rtnetlink_rcv_msg+0x188/0x780 netlink_rcv_skb+0xcd/0x150 netlink_unicast+0x1ba/0x290 netlink_sendmsg+0x242/0x4d0 ____sys_sendmsg+0x39e/0x3e0 ___sys_sendmsg+0xe1/0x130 __sys_sendmsg+0xad/0x110 do_syscall_64+0x14f/0xf80 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fbc0188b08e Code: 4d 89 d8 e8 94 bd 00 00 4c 8b 5d f8 41 8b 93 08 03 00 00 59 5e 48 83 f8 fc 74 11 c9 c3 0f 1f 80 00 00 00 00 48 8b 45 10 0f 05 c3 83 e2 39 83 fa 08 75 e7 e8 03 ff ff ff 0f 1f 00 f3 0f 1e fa RSP: 002b:00007fff593260e0 EFLAGS: 00000202 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fbc0188b08e RDX: 0000000000000000 RSI: 00007fff59326190 RDI: 0000000000000003 RBP: 00007fff593260f0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000202 R12: 000055f06124f260 R13: 0000000069fca043 R14: 000055f061255640 R15: 000055f06124d3f8 Modules linked in: sch_dualpi2(E) CR2: 0000000000000010 [1] https://lore.kernel.org/netdev/2e78e01c504c633ebdff18d041833cf2e079a3a4.1607020450.git.dcaratti@redhat.com/ [2] https://lore.kernel.org/netdev/20200725201707.16909-1-xiyou.wangcong@gmail.com/ v2: - rebased on top of latest net.git Fixes: 320d031ad6e4 ("sched: Struct definition and parsing of dualpi2 qdisc") Signed-off-by: Davide Caratti Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/1faca91179702b31da5d87653e1e036543e32722.1778259798.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/sch_dualpi2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index 241e6a46bd00e3..a22489c14458e6 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -938,6 +938,8 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, int err; sch->flags |= TCQ_F_DEQUEUE_DROPS; + hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_SOFT); q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, 1), extack); @@ -950,8 +952,6 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, q->sch = sch; dualpi2_reset_default(sch); - hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED_SOFT); if (opt && nla_len(opt)) { err = dualpi2_change(sch, opt, extack); From be48e5fe51a5864566307998286a699d6b986934 Mon Sep 17 00:00:00 2001 From: Evgenii Burenchev Date: Thu, 7 May 2026 17:55:17 +0300 Subject: [PATCH 735/972] qed: fix division by zero in qed_init_wfq_param when all vports are configured In qed_init_wfq_param(), variable non_requested_count can become zero when the number of vports with the configured flag set (including the current vport being configured) equals total num_vports. This happens when configuring the last unconfigured vport or when re-configuring an already configured vport. The function then calculates left_rate_per_vp = total_left_rate / non_requested_count, which causes division by zero. Fix this by skipping the division when non_requested_count is zero. In that case, there is no remaining bandwidth to distribute, so just record the configuration for the current vport and return success. Fixes: bcd197c81f63 ("qed: Add vport WFQ configuration APIs") Signed-off-by: Evgenii Burenchev Link: https://patch.msgid.link/20260507145520.23106-1-evg28bur@yandex.ru Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qed/qed_dev.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 42c6dcfb1f0fa5..dd75c47758e1a3 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -5103,6 +5103,13 @@ static int qed_init_wfq_param(struct qed_hwfn *p_hwfn, return -EINVAL; } + /* All vports are already or become configured, nothing to distribute */ + if (non_requested_count == 0) { + p_hwfn->qm_info.wfq_data[vport_id].min_speed = req_rate; + p_hwfn->qm_info.wfq_data[vport_id].configured = true; + return 0; + } + total_left_rate = min_pf_rate - total_req_min_rate; left_rate_per_vp = total_left_rate / non_requested_count; From 2c7b1227e582e88db7917412dca4e752c1aff691 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 11 May 2026 10:36:36 -0500 Subject: [PATCH 736/972] ASoC: SOF: amd: Fix error code handling in psp_send_cmd() The smn_read_register() helper returns negative error codes on failure or the register value on success. When used with read_poll_timeout(), the return value is stored in the 'data' variable. Currently 'data' is declared as u32, which causes negative error codes to be cast to large positive values. This makes the condition 'data > 0' incorrectly treat errors as success. Fix by changing 'data' from u32 to int, matching the pattern used in psp_mbox_ready() which correctly handles the same helper function. Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-sound/agGES8vWrLOrBu28@stanley.mountain/ Fixes: f120cf33d232 ("ASoC: SOF: amd: Use AMD_NODE") Signed-off-by: Mario Limonciello Link: https://patch.msgid.link/20260511153638.724810-1-mario.limonciello@amd.com Signed-off-by: Mark Brown --- sound/soc/sof/amd/acp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/sof/amd/acp.c b/sound/soc/sof/amd/acp.c index 71a18f156de23b..f615b8d1c80208 100644 --- a/sound/soc/sof/amd/acp.c +++ b/sound/soc/sof/amd/acp.c @@ -223,7 +223,7 @@ static int psp_send_cmd(struct acp_dev_data *adata, int cmd) { struct snd_sof_dev *sdev = adata->dev; int ret; - u32 data; + int data; if (!cmd) return -EINVAL; From 8b7c5cc7f6e655087164eb902131de356f6d5984 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 11 May 2026 12:42:39 +0100 Subject: [PATCH 737/972] ASoC: cs35l56: Check for successful runtime-resume in cs35l56_dsp_work() In cs35l56_dsp_work() check that the request for runtime-resume was successful instead of assuming that it can't fail. We may as well do this using the new PM_RUNTIME_ACQUIRE*() macros and remove the manual pm_runtime_put_autosuspend() and associated gotos. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20260511114239.44970-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l56.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sound/soc/codecs/cs35l56.c b/sound/soc/codecs/cs35l56.c index 849d70ca23d6f5..4fbbdcc8715162 100644 --- a/sound/soc/codecs/cs35l56.c +++ b/sound/soc/codecs/cs35l56.c @@ -867,11 +867,16 @@ static void cs35l56_dsp_work(struct work_struct *work) if (!cs35l56->base.init_done) return; - pm_runtime_get_sync(cs35l56->base.dev); + PM_RUNTIME_ACQUIRE(cs35l56->base.dev, pm); + ret = PM_RUNTIME_ACQUIRE_ERR(&pm); + if (ret) { + dev_err(cs35l56->base.dev, "dsp_work failed to runtime-resume: %d\n", ret); + return; + } ret = cs35l56_read_prot_status(&cs35l56->base, &firmware_missing, &firmware_version); if (ret) - goto err; + return; /* Populate fw file qualifier with the revision and security state */ kfree(cs35l56->dsp.fwf_name); @@ -887,7 +892,7 @@ static void cs35l56_dsp_work(struct work_struct *work) } if (!cs35l56->dsp.fwf_name) - goto err; + return; dev_dbg(cs35l56->base.dev, "DSP fwf name: '%s' system name: '%s'\n", cs35l56->dsp.fwf_name, cs35l56->dsp.system_name); @@ -905,8 +910,6 @@ static void cs35l56_dsp_work(struct work_struct *work) cs35l56_patch(cs35l56, firmware_missing); cs35l56_log_tuning(&cs35l56->base, &cs35l56->dsp.cs_dsp); -err: - pm_runtime_put_autosuspend(cs35l56->base.dev); } static struct snd_soc_dapm_context *cs35l56_power_up_for_cal(struct cs35l56_private *cs35l56) From 53597deca0e38c30e6cd4ba2114fa42d2bcd85bb Mon Sep 17 00:00:00 2001 From: Guangshuo Li Date: Thu, 7 May 2026 18:06:03 +0800 Subject: [PATCH 738/972] drm/bridge: imx8qxp-pxl2dpi: avoid ERR_PTR with device_node cleanup imx8qxp_pxl2dpi_get_available_ep_from_port() returns ERR_PTR() on errors. imx8qxp_pxl2dpi_find_next_bridge() stores its return value in a __free(device_node) variable before checking IS_ERR(). When the function returns on the error path, the cleanup action calls of_node_put() on the ERR_PTR() value. Do not let a device_node cleanup variable hold error pointers. Change imx8qxp_pxl2dpi_get_available_ep_from_port() to return an int and pass the endpoint node through an output argument. Initialize the output argument to NULL so callers hold either NULL on error paths or a valid device_node pointer on successful path. Fixes: ceea3f7806a10 ("drm/bridge: imx8qxp-pxl2dpi: simplify put of device_node pointers") Cc: stable@vger.kernel.org Reviewed-by: Liu Ying Signed-off-by: Guangshuo Li Link: https://patch.msgid.link/20260507100604.667731-1-lgs201920130244@gmail.com Signed-off-by: Liu Ying --- drivers/gpu/drm/bridge/imx/imx8qxp-pxl2dpi.c | 40 +++++++++++--------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/bridge/imx/imx8qxp-pxl2dpi.c b/drivers/gpu/drm/bridge/imx/imx8qxp-pxl2dpi.c index 441fd32dc91c7a..d64e328bf542f3 100644 --- a/drivers/gpu/drm/bridge/imx/imx8qxp-pxl2dpi.c +++ b/drivers/gpu/drm/bridge/imx/imx8qxp-pxl2dpi.c @@ -222,52 +222,58 @@ static const struct drm_bridge_funcs imx8qxp_pxl2dpi_bridge_funcs = { imx8qxp_pxl2dpi_bridge_atomic_get_output_bus_fmts, }; -static struct device_node * +static int imx8qxp_pxl2dpi_get_available_ep_from_port(struct imx8qxp_pxl2dpi *p2d, - u32 port_id) + u32 port_id, + struct device_node **ep) { - struct device_node *port, *ep; + struct device_node *port; + int ret = 0; int ep_cnt; + *ep = NULL; + port = of_graph_get_port_by_id(p2d->dev->of_node, port_id); if (!port) { DRM_DEV_ERROR(p2d->dev, "failed to get port@%u\n", port_id); - return ERR_PTR(-ENODEV); + return -ENODEV; } ep_cnt = of_get_available_child_count(port); if (ep_cnt == 0) { DRM_DEV_ERROR(p2d->dev, "no available endpoints of port@%u\n", port_id); - ep = ERR_PTR(-ENODEV); + ret = -ENODEV; goto out; } else if (ep_cnt > 1) { DRM_DEV_ERROR(p2d->dev, "invalid available endpoints of port@%u\n", port_id); - ep = ERR_PTR(-EINVAL); + ret = -EINVAL; goto out; } - ep = of_get_next_available_child(port, NULL); - if (!ep) { + *ep = of_get_next_available_child(port, NULL); + if (!*ep) { DRM_DEV_ERROR(p2d->dev, "failed to get available endpoint of port@%u\n", port_id); - ep = ERR_PTR(-ENODEV); + ret = -ENODEV; goto out; } out: of_node_put(port); - return ep; + return ret; } static int imx8qxp_pxl2dpi_find_next_bridge(struct imx8qxp_pxl2dpi *p2d) { - struct device_node *ep __free(device_node) = - imx8qxp_pxl2dpi_get_available_ep_from_port(p2d, 1); - if (IS_ERR(ep)) - return PTR_ERR(ep); + struct device_node *ep __free(device_node) = NULL; + int ret; + + ret = imx8qxp_pxl2dpi_get_available_ep_from_port(p2d, 1, &ep); + if (ret) + return ret; struct device_node *remote __free(device_node) = of_graph_get_remote_port_parent(ep); if (!remote || !of_device_is_available(remote)) { @@ -291,9 +297,9 @@ static int imx8qxp_pxl2dpi_set_pixel_link_sel(struct imx8qxp_pxl2dpi *p2d) struct of_endpoint endpoint; int ret; - ep = imx8qxp_pxl2dpi_get_available_ep_from_port(p2d, 0); - if (IS_ERR(ep)) - return PTR_ERR(ep); + ret = imx8qxp_pxl2dpi_get_available_ep_from_port(p2d, 0, &ep); + if (ret) + return ret; ret = of_graph_parse_endpoint(ep, &endpoint); if (ret) { From c21b90f77687075115d989e53a8ec5e2bb427ab1 Mon Sep 17 00:00:00 2001 From: Prathyushi Nangia Date: Tue, 9 Dec 2025 10:01:33 -0600 Subject: [PATCH 739/972] x86/CPU/AMD: Prevent improper isolation of shared resources in Zen2's op cache Make sure resources are not improperly shared in the op cache and cause instruction corruption this way. Signed-off-by: Prathyushi Nangia Co-developed-by: Borislav Petkov (AMD) Signed-off-by: Borislav Petkov (AMD) Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- arch/x86/include/asm/msr-index.h | 3 ++- arch/x86/kernel/cpu/amd.c | 3 +++ tools/arch/x86/include/asm/msr-index.h | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index a14a0f43e04ae8..86554de9a3f522 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -803,9 +803,10 @@ #define MSR_AMD64_LBR_SELECT 0xc000010e /* Zen4 */ -#define MSR_ZEN4_BP_CFG 0xc001102e +#define MSR_ZEN4_BP_CFG 0xc001102e #define MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT 4 #define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5 +#define MSR_ZEN2_BP_CFG_BUG_FIX_BIT 33 /* Fam 19h MSRs */ #define MSR_F19H_UMC_PERF_CTL 0xc0010800 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 2d9ae6ab1701c0..2f8e8ff2d000a7 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -989,6 +989,9 @@ static void init_amd_zen2(struct cpuinfo_x86 *c) /* Correct misconfigured CPUID on some clients. */ clear_cpu_cap(c, X86_FEATURE_INVLPGB); + + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN2_BP_CFG_BUG_FIX_BIT); } static void init_amd_zen3(struct cpuinfo_x86 *c) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 6673601246b382..eff29645719bc7 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -793,9 +793,10 @@ #define MSR_AMD64_LBR_SELECT 0xc000010e /* Zen4 */ -#define MSR_ZEN4_BP_CFG 0xc001102e +#define MSR_ZEN4_BP_CFG 0xc001102e #define MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT 4 #define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5 +#define MSR_ZEN2_BP_CFG_BUG_FIX_BIT 33 /* Fam 19h MSRs */ #define MSR_F19H_UMC_PERF_CTL 0xc0010800 From 8d57bb61734b23f6342e9de781173f1d83f90d3a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 5 May 2026 20:47:56 +0200 Subject: [PATCH 740/972] powerpc/g5: Enable all windfarms by default The G5 defconfig is clearly intended for the G5 Powermac series, and that should enable all the available windfarm drivers, or the machine will overheat a short while after booting and shut itself down, which is annoying. Signed-off-by: Linus Walleij Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260505-powermac-g5-config-v3-1-7747bf72f874@kernel.org --- arch/powerpc/configs/g5_defconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/configs/g5_defconfig b/arch/powerpc/configs/g5_defconfig index e9996711b362d7..5ca1676e6058d2 100644 --- a/arch/powerpc/configs/g5_defconfig +++ b/arch/powerpc/configs/g5_defconfig @@ -85,6 +85,8 @@ CONFIG_PMAC_SMU=y CONFIG_MAC_EMUMOUSEBTN=y CONFIG_WINDFARM=y CONFIG_WINDFARM_PM81=y +CONFIG_WINDFARM_PM72=y +CONFIG_WINDFARM_RM31=y CONFIG_WINDFARM_PM91=y CONFIG_WINDFARM_PM112=y CONFIG_WINDFARM_PM121=y From acd1e47db03d4b528fd5efb8565dd0de1c79f62a Mon Sep 17 00:00:00 2001 From: Ally Heev Date: Sun, 16 Nov 2025 19:55:44 +0530 Subject: [PATCH 741/972] powerpc: 82xx: fix uninitialized pointers with free attribute Uninitialized pointers with `__free` attribute can cause undefined behavior as the memory allocated to the pointer is freed automatically when the pointer goes out of scope. powerpc/km82xx doesn't have any bugs related to this as of now, but, it is better to initialize and assign pointers with `__free` attribute in one statement to ensure proper scope-based cleanup Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/aPiG_F5EBQUjZqsl@stanley.mountain/ Signed-off-by: Ally Heev Fixes: 4aa5cc1e0012 ("powerpc-km82xx.c: replace of_node_put() with __free") Reviewed-by: Christophe Leroy (CS GROUP) Reviewed-by: Krzysztof Kozlowski Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20251116-aheev-uninitialized-free-attr-km82xx-v2-1-4307e2b5300d@gmail.com --- arch/powerpc/platforms/82xx/km82xx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/82xx/km82xx.c b/arch/powerpc/platforms/82xx/km82xx.c index 99f0f0f4187672..4ad223525e893c 100644 --- a/arch/powerpc/platforms/82xx/km82xx.c +++ b/arch/powerpc/platforms/82xx/km82xx.c @@ -27,8 +27,8 @@ static void __init km82xx_pic_init(void) { - struct device_node *np __free(device_node); - np = of_find_compatible_node(NULL, NULL, "fsl,pq2-pic"); + struct device_node *np __free(device_node) = of_find_compatible_node(NULL, + NULL, "fsl,pq2-pic"); if (!np) { pr_err("PIC init: can not find cpm-pic node\n"); From 108d7f951271cbd36ca36efc5e5d106966f5180c Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Sun, 16 Nov 2025 10:44:11 +0800 Subject: [PATCH 742/972] powerpc/warp: Fix error handling in pika_dtm_thread pika_dtm_thread() acquires client through of_find_i2c_device_by_node() but fails to release it in error handling path. This could result in a reference count leak, preventing proper cleanup and potentially leading to resource exhaustion. Add put_device() to release the reference in the error handling path. Found by code review. Cc: stable@vger.kernel.org Fixes: 3984114f0562 ("powerpc/warp: Platform fix for i2c change") Signed-off-by: Ma Ke Reviewed-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20251116024411.21968-1-make24@iscas.ac.cn --- arch/powerpc/platforms/44x/warp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/44x/warp.c b/arch/powerpc/platforms/44x/warp.c index a5001d32f978d7..6f674f86dc853c 100644 --- a/arch/powerpc/platforms/44x/warp.c +++ b/arch/powerpc/platforms/44x/warp.c @@ -293,6 +293,8 @@ static int pika_dtm_thread(void __iomem *fpga) schedule_timeout(HZ); } + put_device(&client->dev); + return 0; } From f98020c22ad9be07d6feefd0496e70f9f36a2f63 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 16 Mar 2026 10:47:42 -0700 Subject: [PATCH 743/972] powerpc/powermac: Remove pmac_low_i2c_{lock,unlock}() Commit a28d3af2a26c ("[PATCH] 2/5 powerpc: Rework PowerMac i2c part 2") removed the last calls to the pmac_low_i2c_{lock,unlock}() functions. Hence, remove these two functions. Reviewed-by: Christophe Leroy (CS GROUP) Signed-off-by: Bart Van Assche Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260316174747.3871924-1-bvanassche@acm.org --- arch/powerpc/include/asm/pmac_low_i2c.h | 4 --- arch/powerpc/platforms/powermac/low_i2c.c | 34 ----------------------- 2 files changed, 38 deletions(-) diff --git a/arch/powerpc/include/asm/pmac_low_i2c.h b/arch/powerpc/include/asm/pmac_low_i2c.h index 21bd7297c87f65..fead8fae08ab95 100644 --- a/arch/powerpc/include/asm/pmac_low_i2c.h +++ b/arch/powerpc/include/asm/pmac_low_i2c.h @@ -79,10 +79,6 @@ extern int pmac_i2c_match_adapter(struct device_node *dev, struct i2c_adapter *adapter); -/* (legacy) Locking functions exposed to i2c-keywest */ -extern int pmac_low_i2c_lock(struct device_node *np); -extern int pmac_low_i2c_unlock(struct device_node *np); - /* Access functions for platform code */ extern int pmac_i2c_open(struct pmac_i2c_bus *bus, int polled); extern void pmac_i2c_close(struct pmac_i2c_bus *bus); diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c index 73b7f4e8c04756..da72a30ab8657e 100644 --- a/arch/powerpc/platforms/powermac/low_i2c.c +++ b/arch/powerpc/platforms/powermac/low_i2c.c @@ -1058,40 +1058,6 @@ int pmac_i2c_match_adapter(struct device_node *dev, struct i2c_adapter *adapter) } EXPORT_SYMBOL_GPL(pmac_i2c_match_adapter); -int pmac_low_i2c_lock(struct device_node *np) -{ - struct pmac_i2c_bus *bus, *found = NULL; - - list_for_each_entry(bus, &pmac_i2c_busses, link) { - if (np == bus->controller) { - found = bus; - break; - } - } - if (!found) - return -ENODEV; - return pmac_i2c_open(bus, 0); -} -EXPORT_SYMBOL_GPL(pmac_low_i2c_lock); - -int pmac_low_i2c_unlock(struct device_node *np) -{ - struct pmac_i2c_bus *bus, *found = NULL; - - list_for_each_entry(bus, &pmac_i2c_busses, link) { - if (np == bus->controller) { - found = bus; - break; - } - } - if (!found) - return -ENODEV; - pmac_i2c_close(bus); - return 0; -} -EXPORT_SYMBOL_GPL(pmac_low_i2c_unlock); - - int pmac_i2c_open(struct pmac_i2c_bus *bus, int polled) { int rc; From aef656a0e6c01796190bb5bd2bdba1c644ed7811 Mon Sep 17 00:00:00 2001 From: Julian Braha Date: Sun, 5 Apr 2026 17:15:45 +0100 Subject: [PATCH 744/972] powerpc: fix dead default for GUEST_STATE_BUFFER_TEST The GUEST_STATE_BUFFER_TEST config option should default to KUNIT_ALL_TESTS so that if all tests are enabled then it is included, but currently the 'default KUNIT_ALL_TESTS' statement is shadowed by 'def_tristate n', meaning that this second default statement is currently dead code. It looks to me like the commit 6ccbbc33f06a ("KVM: PPC: Add helper library for Guest State Buffers") intended to set the default to KUNIT_ALL_TESTS, but mistakenly missed the def_tristate. This dead code was found by kconfirm, a static analysis tool for Kconfig. Fixes: 6ccbbc33f06a ("KVM: PPC: Add helper library for Guest State Buffers") Signed-off-by: Julian Braha Tested-by: Gautam Menghani Reviewed-by: Amit Machhiwal Reviewed-by: Harsh Prateek Bora Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260405161545.161006-1-julianbraha@gmail.com --- arch/powerpc/Kconfig.debug | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index f15e5920080ba5..e8718bc13eeb1b 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -83,11 +83,10 @@ config MSI_BITMAP_SELFTEST depends on DEBUG_KERNEL config GUEST_STATE_BUFFER_TEST - def_tristate n + def_tristate KUNIT_ALL_TESTS prompt "Enable Guest State Buffer unit tests" depends on KUNIT depends on KVM_BOOK3S_HV_POSSIBLE - default KUNIT_ALL_TESTS help The Guest State Buffer is a data format specified in the PAPR. It is by hcalls to communicate the state of L2 guests between From dbc30a57bd8e026995e9fa8e8c31cffd18542c01 Mon Sep 17 00:00:00 2001 From: Aboorva Devarajan Date: Fri, 8 May 2026 09:42:56 +0530 Subject: [PATCH 745/972] powerpc/hv-gpci: fix preempt count leak in sysfs show paths Four sysfs show() callbacks in hv-gpci take get_cpu_var(hv_gpci_reqb) (which calls preempt_disable()) but only call the matching put_cpu_var() on the error path under the 'out:' label. Every successful read leaks one preempt_disable(): processor_bus_topology_show() processor_config_show() affinity_domain_via_virtual_processor_show() affinity_domain_via_domain_show() (affinity_domain_via_partition_show() was already correct.) On a CONFIG_PREEMPT=y kernel, repeated reads raise preempt_count and eventually return to userspace with preemption still disabled. The next user-mode page fault then hits faulthandler_disabled() == 1, gets forced to SIGSEGV, and the resulting coredump trips 'BUG: scheduling while atomic' in call_usermodehelper_exec -> wait_for_completion_state -> schedule: BUG: scheduling while atomic: //0x00000004 ... __schedule_bug+0x6c/0x90 __schedule+0x58c/0x13a0 schedule+0x48/0x1a0 schedule_timeout+0x104/0x170 wait_for_completion_state+0x16c/0x330 call_usermodehelper_exec+0x254/0x2d0 vfs_coredump+0x1050/0x2590 get_signal+0xb9c/0xc80 do_notify_resume+0xf8/0x470 Add an out_success label that calls put_cpu_var() before returning the byte count, mirroring affinity_domain_via_partition_show(). Fixes: 71f1c39647d8 ("powerpc/hv_gpci: Add sysfs file inside hv_gpci device to show processor bus topology information") Fixes: 1a160c2a13c6 ("powerpc/hv_gpci: Add sysfs file inside hv_gpci device to show processor config information") Fixes: 71a7ccb478fc ("powerpc/hv_gpci: Add sysfs file inside hv_gpci device to show affinity domain via virtual processor information") Fixes: a69a57cac1ec ("powerpc/hv_gpci: Add sysfs file inside hv_gpci device to show affinity domain via domain information") Signed-off-by: Aboorva Devarajan Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260508041256.3447113-1-aboorvad@linux.ibm.com --- arch/powerpc/perf/hv-gpci.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c index 5cac2cf3bd1e57..10c82cf8f5b393 100644 --- a/arch/powerpc/perf/hv-gpci.c +++ b/arch/powerpc/perf/hv-gpci.c @@ -210,7 +210,7 @@ static ssize_t processor_bus_topology_show(struct device *dev, struct device_att 0, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; @@ -244,12 +244,14 @@ static ssize_t processor_bus_topology_show(struct device *dev, struct device_att starting_index, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; } +out_success: + put_cpu_var(hv_gpci_reqb); return n; out: @@ -278,7 +280,7 @@ static ssize_t processor_config_show(struct device *dev, struct device_attribute 0, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; @@ -312,12 +314,14 @@ static ssize_t processor_config_show(struct device *dev, struct device_attribute starting_index, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; } +out_success: + put_cpu_var(hv_gpci_reqb); return n; out: @@ -346,7 +350,7 @@ static ssize_t affinity_domain_via_virtual_processor_show(struct device *dev, 0, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; @@ -382,12 +386,14 @@ static ssize_t affinity_domain_via_virtual_processor_show(struct device *dev, starting_index, secondary_index, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; } +out_success: + put_cpu_var(hv_gpci_reqb); return n; out: @@ -416,7 +422,7 @@ static ssize_t affinity_domain_via_domain_show(struct device *dev, struct device 0, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; @@ -448,12 +454,14 @@ static ssize_t affinity_domain_via_domain_show(struct device *dev, struct device starting_index, 0, buf, &n, arg); if (!ret) - return n; + goto out_success; if (ret != H_PARAMETER) goto out; } +out_success: + put_cpu_var(hv_gpci_reqb); return n; out: From 4cfe4c0efbdcde742a47813180cc69b132d7598e Mon Sep 17 00:00:00 2001 From: Sebastian Brzezinka Date: Thu, 16 Apr 2026 13:31:18 +0200 Subject: [PATCH 746/972] drm/i915: skip __i915_request_skip() for already signaled requests After a GPU reset the HWSP is zeroed, so previously completed requests appear incomplete. If such a request is picked up during reset_rewind() and marked guilty, i915_request_set_error_once() returns early (fence already signaled), leaving fence.error without a fatal error code. The subsequent __i915_request_skip() then hits: ``` GEM_BUG_ON(!fatal_error(rq->fence.error)) ``` Fixes a kernel BUG observed on Sandy Bridge (Gen6) during heartbeat-triggered engine resets. ``` kernel BUG at drivers/gpu/drm/i915/i915_request.c:556! RIP: __i915_request_skip+0x15e/0x1d0 [i915] ... __i915_request_reset+0x212/0xa70 [i915] reset_rewind+0xe4/0x280 [i915] intel_gt_reset+0x30d/0x5b0 [i915] heartbeat+0x516/0x530 [i915] ``` Guard __i915_request_skip() with i915_request_signaled(), if the fence is already signaled, the ring content is committed and there is nothing left to skip. Fixes: 36e191f0644b ("drm/i915: Apply i915_request_skip() on submission") Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/work_items/13729 Signed-off-by: Sebastian Brzezinka Cc: stable@vger.kernel.org # v5.7+ Reviewed-by: Krzysztof Karas Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/fe76921d35b6ae85aa651822726d0d9815aa5362.1776339012.git.sebastian.brzezinka@intel.com (cherry picked from commit 5ba54393dcd7adf75a9f39f5a933b1538349cad5) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/intel_reset.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index 984d0056c01c25..adff482a6c9cd9 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -132,7 +132,8 @@ void __i915_request_reset(struct i915_request *rq, bool guilty) rcu_read_lock(); /* protect the GEM context */ if (guilty) { i915_request_set_error_once(rq, -EIO); - __i915_request_skip(rq); + if (!i915_request_signaled(rq)) + __i915_request_skip(rq); banned = mark_guilty(rq); } else { i915_request_set_error_once(rq, -EAGAIN); From 1ae15b6c7965d137eef21f2cc7d367b29cb88369 Mon Sep 17 00:00:00 2001 From: Chaitanya Kumar Borah Date: Tue, 5 May 2026 14:39:20 +0530 Subject: [PATCH 747/972] drm/i915/dp: Fix VSC dynamic range signaling for RGB formats For RGB, set dynamic_range to CTA or VESA based on crtc_state->limited_color_range so sinks apply correct quantization. YCbCr remains limited (CTA) range. (DP v1.4, Table 5-1) v2: - Added Reported-by and Tested-by tags v3: - Add back YCbCr comment(Suraj) Cc: stable@vger.kernel.org #v5.8+ Reported-by: DeepChirp Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/work_items/15874 Tested-by: DeepChirp Fixes: 9799c4c3b76e ("drm/i915/dp: Add compute routine for DP VSC SDP") Assisted-by: GitHub-Copilot:GPT-5.4 Signed-off-by: Chaitanya Kumar Borah Reviewed-by: Suraj Kandpal Signed-off-by: Suraj Kandpal Link: https://patch.msgid.link/20260505090920.2479112-1-chaitanya.kumar.borah@intel.com (cherry picked from commit 38e10ddae6f8d42a2e8437fcd25a1cac51106c64) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_dp.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index 4955bd8b11d7ad..50d34b4fdf82a9 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -3119,8 +3119,13 @@ static void intel_dp_compute_vsc_colorimetry(const struct intel_crtc_state *crtc drm_WARN_ON(display->drm, vsc->bpc == 6 && vsc->pixelformat != DP_PIXELFORMAT_RGB); - /* all YCbCr are always limited range */ - vsc->dynamic_range = DP_DYNAMIC_RANGE_CTA; + /* All YCbCr formats are always limited range. */ + if (vsc->pixelformat == DP_PIXELFORMAT_RGB) + vsc->dynamic_range = crtc_state->limited_color_range ? + DP_DYNAMIC_RANGE_CTA : DP_DYNAMIC_RANGE_VESA; + else + vsc->dynamic_range = DP_DYNAMIC_RANGE_CTA; + vsc->content_type = DP_CONTENT_TYPE_NOT_DEFINED; } From 911f54771ca97947cfdca360e9e9b4147a330740 Mon Sep 17 00:00:00 2001 From: Quan Sun <2022090917019@std.uestc.edu.cn> Date: Fri, 8 May 2026 20:46:36 +0800 Subject: [PATCH 748/972] net: hsr: fix NULL pointer dereference in hsr_get_node_data() In the HSR (High-availability Seamless Redundancy) protocol, node information is maintained in the node_db. When a supervision frame is received, node->addr_B_port is updated to track the receiving port type (e.g., HSR_PT_SLAVE_B). If the underlying physical interface associated with this slave port is removed (e.g., via `ip link del`), hsr_del_port() frees the hsr_port object. However, the stale node->addr_B_port reference is kept in the node_db until the node ages out. Subsequently, if userspace queries the node status via the Netlink command HSR_C_GET_NODE_STATUS, the kernel calls hsr_get_node_data(). This function unconditionally dereferences the pointer returned by hsr_port_get_hsr(): if (node->addr_B_port != HSR_PT_NONE) { port = hsr_port_get_hsr(hsr, node->addr_B_port); *addr_b_ifindex = port->dev->ifindex; // <-- NULL deref } If the slave port has been deleted, hsr_port_get_hsr() returns NULL, resulting in a kernel panic. Oops: general protection fault, probably for non-canonical address KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] RIP: 0010:hsr_get_node_data+0x7b6/0x9e0 Call Trace: hsr_get_node_status+0x445/0xa40 Fix this by adding a proper NULL pointer check. If the port lookup fails due to a stale port type, gracefully treat it as if no valid port exists and assign -1 to the interface index. Steps to reproduce: 1. Create an HSR interface with two slave devices. 2. Receive a supervision frame to populate node_db with addr_B_port assigned to SLAVE_B. 3. Delete the underlying slave device B. 4. Send an HSR_C_GET_NODE_STATUS Netlink message. Fixes: c5a759117210 ("net/hsr: Use list_head (and rcu) instead of array for slave devices.") Signed-off-by: Quan Sun <2022090917019@std.uestc.edu.cn> Link: https://patch.msgid.link/20260508124636.1462346-1-2022090917019@std.uestc.edu.cn Signed-off-by: Paolo Abeni --- net/hsr/hsr_framereg.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index d09875b335885e..124619920d3863 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -889,7 +889,10 @@ int hsr_get_node_data(struct hsr_priv *hsr, if (node->addr_B_port != HSR_PT_NONE) { port = hsr_port_get_hsr(hsr, node->addr_B_port); - *addr_b_ifindex = port->dev->ifindex; + if (port) + *addr_b_ifindex = port->dev->ifindex; + else + *addr_b_ifindex = -1; } else { *addr_b_ifindex = -1; } From 3492e8b494c18028044d4a2e03db5c7331fbd789 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 May 2026 11:30:05 +0100 Subject: [PATCH 749/972] soundwire: Add a helper function to wait for device initialisation Add a new helper function to wait for the device to enumerate and be initialised by the SoundWire core. Most of the SoundWire drivers have very similar boiler plate code in their runtime resume, and that boiler plate tends to access various internals of the SoundWire structs which is a mild layering violation. Adding a new core helper function greatly eases both of these issues. Acked-by: Vinod Koul Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260512103022.1154645-2-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- drivers/soundwire/bus.c | 31 +++++++++++++++++++++++++++++++ include/linux/soundwire/sdw.h | 8 ++++++++ 2 files changed, 39 insertions(+) diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c index fe5316d93fefe7..ea3a24f805c003 100644 --- a/drivers/soundwire/bus.c +++ b/drivers/soundwire/bus.c @@ -1372,6 +1372,37 @@ int sdw_slave_get_current_bank(struct sdw_slave *slave) } EXPORT_SYMBOL_GPL(sdw_slave_get_current_bank); +/** + * sdw_slave_wait_for_init - Wait for device initialisation + * @slave: Pointer to the SoundWire peripheral. + * @timeout_ms: Timeout in milliseconds. + * + * Wait for a peripheral device to enumerate and be initialised by the + * SoundWire core. + * + * Return: Zero on success, and a negative error code on failure. + */ +int sdw_slave_wait_for_init(struct sdw_slave *slave, int timeout_ms) +{ + unsigned long time; + + if (!slave->unattach_request) + return 0; + + time = wait_for_completion_timeout(&slave->initialization_complete, + msecs_to_jiffies(timeout_ms)); + if (!time) { + dev_err(&slave->dev, "Initialization not complete\n"); + sdw_show_ping_status(slave->bus, true); + return -ETIMEDOUT; + } + + slave->unattach_request = 0; + + return 0; +} +EXPORT_SYMBOL_GPL(sdw_slave_wait_for_init); + static int sdw_slave_set_frequency(struct sdw_slave *slave) { int scale_index; diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 6147eb1fb210d6..a46cbaec594912 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -1093,6 +1093,8 @@ int sdw_slave_get_current_bank(struct sdw_slave *sdev); int sdw_slave_get_scale_index(struct sdw_slave *slave, u8 *base); +int sdw_slave_wait_for_init(struct sdw_slave *slave, int timeout_ms); + /* messaging and data APIs */ int sdw_read(struct sdw_slave *slave, u32 addr); int sdw_write(struct sdw_slave *slave, u32 addr, u8 value); @@ -1136,6 +1138,12 @@ static inline int sdw_slave_get_current_bank(struct sdw_slave *sdev) return -EINVAL; } +static inline int sdw_slave_wait_for_init(struct sdw_slave *slave, int timeout_ms) +{ + WARN_ONCE(1, "SoundWire API is disabled"); + return -EINVAL; +} + /* messaging and data APIs */ static inline int sdw_read(struct sdw_slave *slave, u32 addr) { From 9dc2b0d599c6e89379bed40f23fdb72a088503f2 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 May 2026 11:30:06 +0100 Subject: [PATCH 750/972] ASoC: cs35l56: Use new SoundWire enumeration helper Update the driver to use the new core helper that waits for the device to enumerate on SoundWire and be initialised by the SoundWire core. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260512103022.1154645-3-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l56-sdw.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sound/soc/codecs/cs35l56-sdw.c b/sound/soc/codecs/cs35l56-sdw.c index 9dc47fec1ea048..105d38b1c1878b 100644 --- a/sound/soc/codecs/cs35l56-sdw.c +++ b/sound/soc/codecs/cs35l56-sdw.c @@ -436,6 +436,7 @@ static const struct sdw_slave_ops cs35l56_sdw_ops = { static int __maybe_unused cs35l56_sdw_handle_unattach(struct cs35l56_private *cs35l56) { struct sdw_slave *peripheral = cs35l56->sdw_peripheral; + int ret; dev_dbg(cs35l56->base.dev, "attached:%u unattach_request:%u in_clock_stop_1:%u\n", cs35l56->sdw_attached, peripheral->unattach_request, cs35l56->sdw_in_clock_stop_1); @@ -443,13 +444,10 @@ static int __maybe_unused cs35l56_sdw_handle_unattach(struct cs35l56_private *cs if (cs35l56->sdw_in_clock_stop_1 || peripheral->unattach_request) { /* Cannot access registers until bus is re-initialized. */ dev_dbg(cs35l56->base.dev, "Wait for initialization_complete\n"); - if (!wait_for_completion_timeout(&peripheral->initialization_complete, - msecs_to_jiffies(5000))) { - dev_err(cs35l56->base.dev, "initialization_complete timed out\n"); - return -ETIMEDOUT; - } + ret = sdw_slave_wait_for_init(peripheral, 5000); + if (ret) + return ret; - peripheral->unattach_request = 0; cs35l56->sdw_in_clock_stop_1 = false; /* From 47e2f687b0c50102d487c06f11157389d28846ba Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 May 2026 11:30:07 +0100 Subject: [PATCH 751/972] ASoC: cs42l42: Use new SoundWire enumeration helper Update the driver to use the new core helper that waits for the device to enumerate on SoundWire and be initialised by the SoundWire core. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260512103022.1154645-4-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs42l42-sdw.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/sound/soc/codecs/cs42l42-sdw.c b/sound/soc/codecs/cs42l42-sdw.c index d5999ad9ff9b48..b8256ce0b8fbee 100644 --- a/sound/soc/codecs/cs42l42-sdw.c +++ b/sound/soc/codecs/cs42l42-sdw.c @@ -433,19 +433,16 @@ static const struct reg_sequence cs42l42_soft_reboot_seq[] = { static int cs42l42_sdw_handle_unattach(struct cs42l42_private *cs42l42) { struct sdw_slave *peripheral = cs42l42->sdw_peripheral; + int ret; if (!peripheral->unattach_request) return 0; /* Cannot access registers until master re-attaches. */ dev_dbg(&peripheral->dev, "Wait for initialization_complete\n"); - if (!wait_for_completion_timeout(&peripheral->initialization_complete, - msecs_to_jiffies(5000))) { - dev_err(&peripheral->dev, "initialization_complete timed out\n"); - return -ETIMEDOUT; - } - - peripheral->unattach_request = 0; + ret = sdw_slave_wait_for_init(peripheral, 5000); + if (ret) + return ret; /* * After a bus reset there must be a reconfiguration reset to From 7fa0ff88f88203e137aed8dea9a497b611e5be82 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 May 2026 11:30:08 +0100 Subject: [PATCH 752/972] ASoC: max98363: Use new SoundWire enumeration helper Update the driver to use the new core helper that waits for the device to enumerate on SoundWire and be initialised by the SoundWire core. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260512103022.1154645-5-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/max98363.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/sound/soc/codecs/max98363.c b/sound/soc/codecs/max98363.c index 25af78ab30d5c5..099dc5bf6195f1 100644 --- a/sound/soc/codecs/max98363.c +++ b/sound/soc/codecs/max98363.c @@ -90,24 +90,15 @@ static int max98363_resume(struct device *dev) { struct sdw_slave *slave = dev_to_sdw_dev(dev); struct max98363_priv *max98363 = dev_get_drvdata(dev); - unsigned long time; + int ret; if (!max98363->first_hw_init) return 0; - if (!slave->unattach_request) - goto regmap_sync; - - time = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(MAX98363_PROBE_TIMEOUT)); - if (!time) { - dev_err(dev, "Initialization not complete, timed out\n"); - return -ETIMEDOUT; - } - -regmap_sync: + ret = sdw_slave_wait_for_init(slave, MAX98363_PROBE_TIMEOUT); + if (ret) + return ret; - slave->unattach_request = 0; regcache_cache_only(max98363->regmap, false); regcache_sync(max98363->regmap); From 8e5768eb4eaaaaa2703c456e8e96a3c4391d1e21 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 May 2026 11:30:09 +0100 Subject: [PATCH 753/972] ASoC: max98373: Use new SoundWire enumeration helper Update the driver to use the new core helper that waits for the device to enumerate on SoundWire and be initialised by the SoundWire core. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260512103022.1154645-6-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/max98373-sdw.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/sound/soc/codecs/max98373-sdw.c b/sound/soc/codecs/max98373-sdw.c index 16673440218cb5..6829fa07c9ecbe 100644 --- a/sound/soc/codecs/max98373-sdw.c +++ b/sound/soc/codecs/max98373-sdw.c @@ -266,25 +266,15 @@ static int max98373_resume(struct device *dev) { struct sdw_slave *slave = dev_to_sdw_dev(dev); struct max98373_priv *max98373 = dev_get_drvdata(dev); - unsigned long time; + int ret; if (!max98373->first_hw_init) return 0; - if (!slave->unattach_request) - goto regmap_sync; - - time = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(MAX98373_PROBE_TIMEOUT)); - if (!time) { - dev_err(dev, "Initialization not complete, timed out\n"); - sdw_show_ping_status(slave->bus, true); - - return -ETIMEDOUT; - } + ret = sdw_slave_wait_for_init(slave, MAX98373_PROBE_TIMEOUT); + if (ret) + return ret; -regmap_sync: - slave->unattach_request = 0; regcache_cache_only(max98373->regmap, false); regcache_sync(max98373->regmap); From c5b0783d18a693546ab1a6736e39b0361c527c2b Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 May 2026 11:30:10 +0100 Subject: [PATCH 754/972] ASoC: rt700: Use new SoundWire enumeration helper Update the driver to use the new core helper that waits for the device to enumerate on SoundWire and be initialised by the SoundWire core. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260512103022.1154645-7-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt700-sdw.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/sound/soc/codecs/rt700-sdw.c b/sound/soc/codecs/rt700-sdw.c index 9ce36a66fae1d1..30fcca210f0518 100644 --- a/sound/soc/codecs/rt700-sdw.c +++ b/sound/soc/codecs/rt700-sdw.c @@ -522,25 +522,15 @@ static int rt700_dev_resume(struct device *dev) { struct sdw_slave *slave = dev_to_sdw_dev(dev); struct rt700_priv *rt700 = dev_get_drvdata(dev); - unsigned long time; + int ret; if (!rt700->first_hw_init) return 0; - if (!slave->unattach_request) - goto regmap_sync; - - time = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(RT700_PROBE_TIMEOUT)); - if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); - sdw_show_ping_status(slave->bus, true); - - return -ETIMEDOUT; - } + ret = sdw_slave_wait_for_init(slave, RT700_PROBE_TIMEOUT); + if (ret) + return ret; -regmap_sync: - slave->unattach_request = 0; regcache_cache_only(rt700->regmap, false); regcache_sync_region(rt700->regmap, 0x3000, 0x8fff); regcache_sync_region(rt700->regmap, 0x752010, 0x75206b); From 953ee481561b56d099a0fead0c644273e7d9b672 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 May 2026 11:30:11 +0100 Subject: [PATCH 755/972] ASoC: rt711: Use new SoundWire enumeration helper Update the driver to use the new core helper that waits for the device to enumerate on SoundWire and be initialised by the SoundWire core. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260512103022.1154645-8-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt711-sdca-sdw.c | 16 ++++------------ sound/soc/codecs/rt711-sdw.c | 14 ++++---------- 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/sound/soc/codecs/rt711-sdca-sdw.c b/sound/soc/codecs/rt711-sdca-sdw.c index 49dacceddf8150..a8164fc3979ab8 100644 --- a/sound/soc/codecs/rt711-sdca-sdw.c +++ b/sound/soc/codecs/rt711-sdca-sdw.c @@ -438,7 +438,7 @@ static int rt711_sdca_dev_resume(struct device *dev) { struct sdw_slave *slave = dev_to_sdw_dev(dev); struct rt711_sdca_priv *rt711 = dev_get_drvdata(dev); - unsigned long time; + int ret; if (!rt711->first_hw_init) return 0; @@ -451,20 +451,12 @@ static int rt711_sdca_dev_resume(struct device *dev) rt711->disable_irq = false; } mutex_unlock(&rt711->disable_irq_lock); - goto regmap_sync; } - time = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(RT711_PROBE_TIMEOUT)); - if (!time) { - dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); - sdw_show_ping_status(slave->bus, true); + ret = sdw_slave_wait_for_init(slave, RT711_PROBE_TIMEOUT); + if (ret) + return ret; - return -ETIMEDOUT; - } - -regmap_sync: - slave->unattach_request = 0; regcache_cache_only(rt711->regmap, false); regcache_sync(rt711->regmap); regcache_cache_only(rt711->mbq_regmap, false); diff --git a/sound/soc/codecs/rt711-sdw.c b/sound/soc/codecs/rt711-sdw.c index 72ddf4cebdf366..df3c43f2ab6b83 100644 --- a/sound/soc/codecs/rt711-sdw.c +++ b/sound/soc/codecs/rt711-sdw.c @@ -530,7 +530,7 @@ static int rt711_dev_resume(struct device *dev) { struct sdw_slave *slave = dev_to_sdw_dev(dev); struct rt711_priv *rt711 = dev_get_drvdata(dev); - unsigned long time; + int ret; if (!rt711->first_hw_init) return 0; @@ -542,18 +542,12 @@ static int rt711_dev_resume(struct device *dev) rt711->disable_irq = false; } mutex_unlock(&rt711->disable_irq_lock); - goto regmap_sync; } - time = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(RT711_PROBE_TIMEOUT)); - if (!time) { - dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); - return -ETIMEDOUT; - } + ret = sdw_slave_wait_for_init(slave, RT711_PROBE_TIMEOUT); + if (ret) + return ret; -regmap_sync: - slave->unattach_request = 0; regcache_cache_only(rt711->regmap, false); regcache_sync_region(rt711->regmap, 0x3000, 0x8fff); regcache_sync_region(rt711->regmap, 0x752009, 0x752091); From d5bd2f7239f601821bfb24c81b4caeda276c7e30 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 May 2026 11:30:12 +0100 Subject: [PATCH 756/972] ASoC: rt712: Use new SoundWire enumeration helper Update the driver to use the new core helper that waits for the device to enumerate on SoundWire and be initialised by the SoundWire core. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260512103022.1154645-9-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt712-sdca-dmic.c | 19 ++++--------------- sound/soc/codecs/rt712-sdca-sdw.c | 16 ++++------------ 2 files changed, 8 insertions(+), 27 deletions(-) diff --git a/sound/soc/codecs/rt712-sdca-dmic.c b/sound/soc/codecs/rt712-sdca-dmic.c index 4d83544ef2049e..4c5c2f5ba5edff 100644 --- a/sound/soc/codecs/rt712-sdca-dmic.c +++ b/sound/soc/codecs/rt712-sdca-dmic.c @@ -905,26 +905,15 @@ static int rt712_sdca_dmic_dev_resume(struct device *dev) { struct sdw_slave *slave = dev_to_sdw_dev(dev); struct rt712_sdca_dmic_priv *rt712 = dev_get_drvdata(dev); - unsigned long time; + int ret; if (!rt712->first_hw_init) return 0; - if (!slave->unattach_request) - goto regmap_sync; - - time = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(RT712_PROBE_TIMEOUT)); - if (!time) { - dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", - __func__); - sdw_show_ping_status(slave->bus, true); - - return -ETIMEDOUT; - } + ret = sdw_slave_wait_for_init(slave, RT712_PROBE_TIMEOUT); + if (ret) + return ret; -regmap_sync: - slave->unattach_request = 0; regcache_cache_only(rt712->regmap, false); regcache_sync(rt712->regmap); regcache_cache_only(rt712->mbq_regmap, false); diff --git a/sound/soc/codecs/rt712-sdca-sdw.c b/sound/soc/codecs/rt712-sdca-sdw.c index 8c82887174db29..5817321804736d 100644 --- a/sound/soc/codecs/rt712-sdca-sdw.c +++ b/sound/soc/codecs/rt712-sdca-sdw.c @@ -450,7 +450,7 @@ static int rt712_sdca_dev_resume(struct device *dev) { struct sdw_slave *slave = dev_to_sdw_dev(dev); struct rt712_sdca_priv *rt712 = dev_get_drvdata(dev); - unsigned long time; + int ret; if (!rt712->first_hw_init) return 0; @@ -464,20 +464,12 @@ static int rt712_sdca_dev_resume(struct device *dev) rt712->disable_irq = false; } mutex_unlock(&rt712->disable_irq_lock); - goto regmap_sync; } - time = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(RT712_PROBE_TIMEOUT)); - if (!time) { - dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); - sdw_show_ping_status(slave->bus, true); + ret = sdw_slave_wait_for_init(slave, RT712_PROBE_TIMEOUT); + if (ret) + return ret; - return -ETIMEDOUT; - } - -regmap_sync: - slave->unattach_request = 0; regcache_cache_only(rt712->regmap, false); regcache_sync(rt712->regmap); regcache_cache_only(rt712->mbq_regmap, false); From e154b2472f6b99e1f56df6ccdad4606b79563970 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 May 2026 11:30:13 +0100 Subject: [PATCH 757/972] ASoC: rt715: Use new SoundWire enumeration helper Update the driver to use the new core helper that waits for the device to enumerate on SoundWire and be initialised by the SoundWire core. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260512103022.1154645-10-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt715-sdca-sdw.c | 18 ++++-------------- sound/soc/codecs/rt715-sdw.c | 18 ++++-------------- 2 files changed, 8 insertions(+), 28 deletions(-) diff --git a/sound/soc/codecs/rt715-sdca-sdw.c b/sound/soc/codecs/rt715-sdca-sdw.c index 968bc183b8d8ce..4b9815b5628dbe 100644 --- a/sound/soc/codecs/rt715-sdca-sdw.c +++ b/sound/soc/codecs/rt715-sdca-sdw.c @@ -224,25 +224,15 @@ static int rt715_dev_resume(struct device *dev) { struct sdw_slave *slave = dev_to_sdw_dev(dev); struct rt715_sdca_priv *rt715 = dev_get_drvdata(dev); - unsigned long time; + int ret; if (!rt715->first_hw_init) return 0; - if (!slave->unattach_request) - goto regmap_sync; + ret = sdw_slave_wait_for_init(slave, RT715_PROBE_TIMEOUT); + if (ret) + return ret; - time = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(RT715_PROBE_TIMEOUT)); - if (!time) { - dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); - sdw_show_ping_status(slave->bus, true); - - return -ETIMEDOUT; - } - -regmap_sync: - slave->unattach_request = 0; regcache_cache_only(rt715->regmap, false); regcache_sync_region(rt715->regmap, SDW_SDCA_CTL(FUN_JACK_CODEC, RT715_SDCA_ST_EN, RT715_SDCA_ST_CTRL, diff --git a/sound/soc/codecs/rt715-sdw.c b/sound/soc/codecs/rt715-sdw.c index 49c91d015be4d3..7f83a8f1a06e98 100644 --- a/sound/soc/codecs/rt715-sdw.c +++ b/sound/soc/codecs/rt715-sdw.c @@ -501,25 +501,15 @@ static int rt715_dev_resume(struct device *dev) { struct sdw_slave *slave = dev_to_sdw_dev(dev); struct rt715_priv *rt715 = dev_get_drvdata(dev); - unsigned long time; + int ret; if (!rt715->first_hw_init) return 0; - if (!slave->unattach_request) - goto regmap_sync; - - time = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(RT715_PROBE_TIMEOUT)); - if (!time) { - dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); - sdw_show_ping_status(slave->bus, true); - - return -ETIMEDOUT; - } + ret = sdw_slave_wait_for_init(slave, RT715_PROBE_TIMEOUT); + if (ret) + return ret; -regmap_sync: - slave->unattach_request = 0; regcache_cache_only(rt715->regmap, false); regcache_sync_region(rt715->regmap, 0x3000, 0x8fff); regcache_sync_region(rt715->regmap, 0x752039, 0x752039); From cc05ab0a664ed8aa055ec41515d1be3da0679353 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 May 2026 11:30:14 +0100 Subject: [PATCH 758/972] ASoc: rt721: Use new SoundWire enumeration helper Update the driver to use the new core helper that waits for the device to enumerate on SoundWire and be initialised by the SoundWire core. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260512103022.1154645-11-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt721-sdca-sdw.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/sound/soc/codecs/rt721-sdca-sdw.c b/sound/soc/codecs/rt721-sdca-sdw.c index 6eb8512975b85b..58606209316a4a 100644 --- a/sound/soc/codecs/rt721-sdca-sdw.c +++ b/sound/soc/codecs/rt721-sdca-sdw.c @@ -489,7 +489,7 @@ static int rt721_sdca_dev_resume(struct device *dev) { struct sdw_slave *slave = dev_to_sdw_dev(dev); struct rt721_sdca_priv *rt721 = dev_get_drvdata(dev); - unsigned long time; + int ret; if (!rt721->first_hw_init) return 0; @@ -502,20 +502,12 @@ static int rt721_sdca_dev_resume(struct device *dev) rt721->disable_irq = false; } mutex_unlock(&rt721->disable_irq_lock); - goto regmap_sync; } - time = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(RT721_PROBE_TIMEOUT)); - if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); - sdw_show_ping_status(slave->bus, true); + ret = sdw_slave_wait_for_init(slave, RT721_PROBE_TIMEOUT); + if (ret) + return ret; - return -ETIMEDOUT; - } - -regmap_sync: - slave->unattach_request = 0; regcache_cache_only(rt721->regmap, false); regcache_sync(rt721->regmap); regcache_cache_only(rt721->mbq_regmap, false); From 0b64017495866d8e8b8ca4269294aa677c233188 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 May 2026 11:30:15 +0100 Subject: [PATCH 759/972] ASoC: rt722: Use new SoundWire enumeration helper Update the driver to use the new core helper that waits for the device to enumerate on SoundWire and be initialised by the SoundWire core. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260512103022.1154645-12-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt722-sdca-sdw.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/sound/soc/codecs/rt722-sdca-sdw.c b/sound/soc/codecs/rt722-sdca-sdw.c index 0a5b3ffa90daf2..a5feba3d0c182d 100644 --- a/sound/soc/codecs/rt722-sdca-sdw.c +++ b/sound/soc/codecs/rt722-sdca-sdw.c @@ -501,7 +501,7 @@ static int rt722_sdca_dev_resume(struct device *dev) { struct sdw_slave *slave = dev_to_sdw_dev(dev); struct rt722_sdca_priv *rt722 = dev_get_drvdata(dev); - unsigned long time; + int ret; if (!rt722->first_hw_init) return 0; @@ -514,20 +514,12 @@ static int rt722_sdca_dev_resume(struct device *dev) rt722->disable_irq = false; } mutex_unlock(&rt722->disable_irq_lock); - goto regmap_sync; } - time = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(RT722_PROBE_TIMEOUT)); - if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); - sdw_show_ping_status(slave->bus, true); + ret = sdw_slave_wait_for_init(slave, RT722_PROBE_TIMEOUT); + if (ret) + return ret; - return -ETIMEDOUT; - } - -regmap_sync: - slave->unattach_request = 0; regcache_cache_only(rt722->regmap, false); regcache_sync(rt722->regmap); return 0; From ab579acef4bf10460630d85041f5ff465ffaefbb Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 May 2026 11:30:16 +0100 Subject: [PATCH 760/972] ASoC: rt1017: Use new SoundWire enumeration helper Update the driver to use the new core helper that waits for the device to enumerate on SoundWire and be initialised by the SoundWire core. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260512103022.1154645-13-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt1017-sdca-sdw.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/sound/soc/codecs/rt1017-sdca-sdw.c b/sound/soc/codecs/rt1017-sdca-sdw.c index 148b36173a2573..d62e8a25367679 100644 --- a/sound/soc/codecs/rt1017-sdca-sdw.c +++ b/sound/soc/codecs/rt1017-sdca-sdw.c @@ -773,25 +773,15 @@ static int rt1017_sdca_dev_resume(struct device *dev) { struct sdw_slave *slave = dev_to_sdw_dev(dev); struct rt1017_sdca_priv *rt1017 = dev_get_drvdata(dev); - unsigned long time; + int ret; if (!rt1017->first_hw_init) return 0; - if (!slave->unattach_request) - goto regmap_sync; - - time = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(RT1017_PROBE_TIMEOUT)); - if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); - sdw_show_ping_status(slave->bus, true); - - return -ETIMEDOUT; - } + ret = sdw_slave_wait_for_init(slave, RT1017_PROBE_TIMEOUT); + if (ret) + return ret; -regmap_sync: - slave->unattach_request = 0; regcache_cache_only(rt1017->regmap, false); regcache_sync(rt1017->regmap); From 1ab950b47b3ae7fe10634044b7fc3e5c080bf4f3 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 May 2026 11:30:17 +0100 Subject: [PATCH 761/972] ASoC: rt1308: Use new SoundWire enumeration helper Update the driver to use the new core helper that waits for the device to enumerate on SoundWire and be initialised by the SoundWire core. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260512103022.1154645-14-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt1308-sdw.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/sound/soc/codecs/rt1308-sdw.c b/sound/soc/codecs/rt1308-sdw.c index e077d096bc239c..39e06a3a75609a 100644 --- a/sound/soc/codecs/rt1308-sdw.c +++ b/sound/soc/codecs/rt1308-sdw.c @@ -768,25 +768,15 @@ static int rt1308_dev_resume(struct device *dev) { struct sdw_slave *slave = dev_to_sdw_dev(dev); struct rt1308_sdw_priv *rt1308 = dev_get_drvdata(dev); - unsigned long time; + int ret; if (!rt1308->first_hw_init) return 0; - if (!slave->unattach_request) - goto regmap_sync; - - time = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(RT1308_PROBE_TIMEOUT)); - if (!time) { - dev_err(&slave->dev, "Initialization not complete, timed out\n"); - sdw_show_ping_status(slave->bus, true); - - return -ETIMEDOUT; - } + ret = sdw_slave_wait_for_init(slave, RT1308_PROBE_TIMEOUT); + if (ret) + return ret; -regmap_sync: - slave->unattach_request = 0; regcache_cache_only(rt1308->regmap, false); regcache_sync_region(rt1308->regmap, 0xc000, 0xcfff); From b7997fcacf94dbb332339b7a9ceb4caeba70aa47 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 May 2026 11:30:18 +0100 Subject: [PATCH 762/972] ASoC: rt1316: Use new SoundWire enumeration helper Update the driver to use the new core helper that waits for the device to enumerate on SoundWire and be initialised by the SoundWire core. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260512103022.1154645-15-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt1316-sdw.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/sound/soc/codecs/rt1316-sdw.c b/sound/soc/codecs/rt1316-sdw.c index 20fc1579eb9cf2..1828fd9d5af6ab 100644 --- a/sound/soc/codecs/rt1316-sdw.c +++ b/sound/soc/codecs/rt1316-sdw.c @@ -745,25 +745,15 @@ static int rt1316_dev_resume(struct device *dev) { struct sdw_slave *slave = dev_to_sdw_dev(dev); struct rt1316_sdw_priv *rt1316 = dev_get_drvdata(dev); - unsigned long time; + int ret; if (!rt1316->first_hw_init) return 0; - if (!slave->unattach_request) - goto regmap_sync; - - time = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(RT1316_PROBE_TIMEOUT)); - if (!time) { - dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); - sdw_show_ping_status(slave->bus, true); - - return -ETIMEDOUT; - } + ret = sdw_slave_wait_for_init(slave, RT1316_PROBE_TIMEOUT); + if (ret) + return ret; -regmap_sync: - slave->unattach_request = 0; regcache_cache_only(rt1316->regmap, false); regcache_sync(rt1316->regmap); From 210f109efdfdd60fd6bec0e3b60af9a53fe998d6 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 May 2026 11:30:19 +0100 Subject: [PATCH 763/972] ASoC: rt1318: Use new SoundWire enumeration helper Update the driver to use the new core helper that waits for the device to enumerate on SoundWire and be initialised by the SoundWire core. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260512103022.1154645-16-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt1318-sdw.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/sound/soc/codecs/rt1318-sdw.c b/sound/soc/codecs/rt1318-sdw.c index d28f1afe68f184..51bd11b92a554c 100644 --- a/sound/soc/codecs/rt1318-sdw.c +++ b/sound/soc/codecs/rt1318-sdw.c @@ -821,23 +821,15 @@ static int rt1318_dev_resume(struct device *dev) { struct sdw_slave *slave = dev_to_sdw_dev(dev); struct rt1318_sdw_priv *rt1318 = dev_get_drvdata(dev); - unsigned long time; + int ret; if (!rt1318->first_hw_init) return 0; - if (!slave->unattach_request) - goto regmap_sync; - - time = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(RT1318_PROBE_TIMEOUT)); - if (!time) { - dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); - return -ETIMEDOUT; - } + ret = sdw_slave_wait_for_init(slave, RT1318_PROBE_TIMEOUT); + if (ret) + return ret; -regmap_sync: - slave->unattach_request = 0; regcache_cache_only(rt1318->regmap, false); regcache_sync(rt1318->regmap); From 4461603eda139f72708cf69e77396655294a3f5e Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 May 2026 11:30:20 +0100 Subject: [PATCH 764/972] ASoC: rt1320: Use new SoundWire enumeration helper Update the driver to use the new core helper that waits for the device to enumerate on SoundWire and be initialised by the SoundWire core. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260512103022.1154645-17-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt1320-sdw.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/sound/soc/codecs/rt1320-sdw.c b/sound/soc/codecs/rt1320-sdw.c index 192faa431b5e9c..13493b85f3c95a 100644 --- a/sound/soc/codecs/rt1320-sdw.c +++ b/sound/soc/codecs/rt1320-sdw.c @@ -3053,23 +3053,15 @@ static int rt1320_dev_resume(struct device *dev) { struct sdw_slave *slave = dev_to_sdw_dev(dev); struct rt1320_sdw_priv *rt1320 = dev_get_drvdata(dev); - unsigned long time; + int ret; if (!rt1320->first_hw_init) return 0; - if (!slave->unattach_request) - goto regmap_sync; - - time = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(RT1320_PROBE_TIMEOUT)); - if (!time) { - dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); - return -ETIMEDOUT; - } + ret = sdw_slave_wait_for_init(slave, RT1320_PROBE_TIMEOUT); + if (ret) + return ret; -regmap_sync: - slave->unattach_request = 0; regcache_cache_only(rt1320->regmap, false); regcache_sync(rt1320->regmap); regcache_cache_only(rt1320->mbq_regmap, false); From fee0a8b4f0da44c7218144d57b72f1035c4ba782 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 May 2026 11:30:21 +0100 Subject: [PATCH 765/972] ASoC: rt5682: Use new SoundWire enumeration helper Update the driver to use the new core helper that waits for the device to enumerate on SoundWire and be initialised by the SoundWire core. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260512103022.1154645-18-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt5682-sdw.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/sound/soc/codecs/rt5682-sdw.c b/sound/soc/codecs/rt5682-sdw.c index fc464538ceffb0..ec2a35a0cacdea 100644 --- a/sound/soc/codecs/rt5682-sdw.c +++ b/sound/soc/codecs/rt5682-sdw.c @@ -754,7 +754,7 @@ static int rt5682_dev_resume(struct device *dev) { struct sdw_slave *slave = dev_to_sdw_dev(dev); struct rt5682_priv *rt5682 = dev_get_drvdata(dev); - unsigned long time; + int ret; if (!rt5682->first_hw_init) return 0; @@ -766,20 +766,12 @@ static int rt5682_dev_resume(struct device *dev) rt5682->disable_irq = false; } mutex_unlock(&rt5682->disable_irq_lock); - goto regmap_sync; } - time = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(RT5682_PROBE_TIMEOUT)); - if (!time) { - dev_err(&slave->dev, "%s: Initialization not complete, timed out\n", __func__); - sdw_show_ping_status(slave->bus, true); - - return -ETIMEDOUT; - } + ret = sdw_slave_wait_for_init(slave, RT5682_PROBE_TIMEOUT); + if (ret) + return ret; -regmap_sync: - slave->unattach_request = 0; regcache_cache_only(rt5682->sdw_regmap, false); regcache_cache_only(rt5682->regmap, false); regcache_sync(rt5682->regmap); From ac6d4f298160bebf6979e63c2758414af5266f28 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 12 May 2026 11:30:22 +0100 Subject: [PATCH 766/972] ASoC: tas2783: Use new SoundWire enumeration helper Update the driver to use the new core helper that waits for the device to enumerate on SoundWire and be initialised by the SoundWire core. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20260512103022.1154645-19-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/tas2783-sdw.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/sound/soc/codecs/tas2783-sdw.c b/sound/soc/codecs/tas2783-sdw.c index 90008d2d06e2c7..1bc6da668bdc45 100644 --- a/sound/soc/codecs/tas2783-sdw.c +++ b/sound/soc/codecs/tas2783-sdw.c @@ -1082,22 +1082,12 @@ static s32 tas2783_sdca_dev_resume(struct device *dev) { struct sdw_slave *slave = dev_to_sdw_dev(dev); struct tas2783_prv *tas_dev = dev_get_drvdata(dev); - unsigned long t; - - if (!slave->unattach_request) - goto regmap_sync; - - t = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(TAS2783_PROBE_TIMEOUT)); - if (!t) { - dev_err(&slave->dev, "resume: initialization timed out\n"); - sdw_show_ping_status(slave->bus, true); - return -ETIMEDOUT; - } + int ret; - slave->unattach_request = 0; + ret = sdw_slave_wait_for_init(slave, TAS2783_PROBE_TIMEOUT); + if (ret) + return ret; -regmap_sync: regcache_cache_only(tas_dev->regmap, false); regcache_sync(tas_dev->regmap); return 0; From 5f344d809e015fba3709e5219428c00b8ac5d7df Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 8 May 2026 18:44:10 +0200 Subject: [PATCH 767/972] vsock/virtio: fix length and offset in tap skb for split packets virtio_transport_build_skb() builds a new skb to be delivered to the vsockmon tap device. To build the new skb, it uses the original skb data length as payload length, but as the comment notes, the original packet stored in the skb may have been split in multiple packets, so we need to use the length in the header, which is correctly updated before the packet is delivered to the tap, and the offset for the data. This was also similar to what we did before commit 71dc9ec9ac7d ("virtio/vsock: replace virtio_vsock_pkt with sk_buff") where we probably missed something during the skb conversion. Also update the comment above, which was left stale by the skb conversion and still mentioned a buffer pointer that no longer exists. Fixes: 71dc9ec9ac7d ("virtio/vsock: replace virtio_vsock_pkt with sk_buff") Signed-off-by: Stefano Garzarella Reviewed-by: Bobby Eshleman Reviewed-by: Arseniy Krasnov Link: https://patch.msgid.link/20260508164411.261440-2-sgarzare@redhat.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- net/vmw_vsock/virtio_transport_common.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 9b8014516f4fb1..a678d5d7570466 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -166,12 +166,12 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) struct sk_buff *skb; size_t payload_len; - /* A packet could be split to fit the RX buffer, so we can retrieve - * the payload length from the header and the buffer pointer taking - * care of the offset in the original packet. + /* A packet could be split to fit the RX buffer, so we use + * the payload length from the header, which has been updated + * by the sender to reflect the fragment size. */ pkt_hdr = virtio_vsock_hdr(pkt); - payload_len = pkt->len; + payload_len = le32_to_cpu(pkt_hdr->len); skb = alloc_skb(sizeof(*hdr) + sizeof(*pkt_hdr) + payload_len, GFP_ATOMIC); @@ -219,7 +219,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) virtio_transport_copy_nonlinear_skb(pkt, data, payload_len); } else { - skb_put_data(skb, pkt->data, payload_len); + skb_put_data(skb, pkt->data + VIRTIO_VSOCK_SKB_CB(pkt)->offset, + payload_len); } } From 3a3e3d90cbc79600544536723911657730759af3 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 8 May 2026 18:44:11 +0200 Subject: [PATCH 768/972] vsock/virtio: fix empty payload in tap skb for non-linear buffers For non-linear skbs, virtio_transport_build_skb() goes through virtio_transport_copy_nonlinear_skb() to copy the original payload in the new skb to be delivered to the vsockmon tap device. This manually initializes an iov_iter but does not set iov_iter.count. Since the iov_iter is zero-initialized, the copy length is zero and no payload is actually copied to the monitor interface, leaving data un-initialized. Fix this by removing the linear vs non-linear split and using skb_copy_datagram_iter() with iov_iter_kvec() for all cases, as vhost-vsock already does. This handles both linear and non-linear skbs, properly initializes the iov_iter, and removes the now unused virtio_transport_copy_nonlinear_skb(). While touching this code, let's also check the return value of skb_copy_datagram_iter(), even though it's unlikely to fail. Fixes: 4b0bf10eb077 ("vsock/virtio: non-linear skb handling for tap") Reported-by: Yiqi Sun Signed-off-by: Stefano Garzarella Reviewed-by: Bobby Eshleman Reviewed-by: Arseniy Krasnov Link: https://patch.msgid.link/20260508164411.261440-3-sgarzare@redhat.com Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- net/vmw_vsock/virtio_transport_common.c | 40 ++++++++----------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index a678d5d7570466..989cc252d3d34c 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -136,27 +136,6 @@ static void virtio_transport_init_hdr(struct sk_buff *skb, hdr->fwd_cnt = cpu_to_le32(0); } -static void virtio_transport_copy_nonlinear_skb(const struct sk_buff *skb, - void *dst, - size_t len) -{ - struct iov_iter iov_iter = { 0 }; - struct kvec kvec; - size_t to_copy; - - kvec.iov_base = dst; - kvec.iov_len = len; - - iov_iter.iter_type = ITER_KVEC; - iov_iter.kvec = &kvec; - iov_iter.nr_segs = 1; - - to_copy = min_t(size_t, len, skb->len); - - skb_copy_datagram_iter(skb, VIRTIO_VSOCK_SKB_CB(skb)->offset, - &iov_iter, to_copy); -} - /* Packet capture */ static struct sk_buff *virtio_transport_build_skb(void *opaque) { @@ -214,13 +193,18 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) skb_put_data(skb, pkt_hdr, sizeof(*pkt_hdr)); if (payload_len) { - if (skb_is_nonlinear(pkt)) { - void *data = skb_put(skb, payload_len); - - virtio_transport_copy_nonlinear_skb(pkt, data, payload_len); - } else { - skb_put_data(skb, pkt->data + VIRTIO_VSOCK_SKB_CB(pkt)->offset, - payload_len); + struct iov_iter iov_iter; + struct kvec kvec; + void *data = skb_put(skb, payload_len); + + kvec.iov_base = data; + kvec.iov_len = payload_len; + iov_iter_kvec(&iov_iter, ITER_DEST, &kvec, 1, payload_len); + + if (skb_copy_datagram_iter(pkt, VIRTIO_VSOCK_SKB_CB(pkt)->offset, + &iov_iter, payload_len)) { + kfree_skb(skb); + return NULL; } } From 2cb156213093a62b80cf40b1ec71738e93491971 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 9 May 2026 00:13:36 +0200 Subject: [PATCH 769/972] net: ethernet: cortina: No mapping is a dropped rx Increase stats.rx_dropped++ even if this is the first fragment (skb == NULL) so we are doing proper accounting. Fixes: b266bacba796 ("net: ethernet: cortina: Drop half-assembled SKB") Link: https://sashiko.dev/#/patchset/20260505-gemini-ethernet-fix-v2-1-997c31d06079%40kernel.org Signed-off-by: Linus Walleij Link: https://patch.msgid.link/20260509-gemini-ethernet-fixes-v1-1-6c5d20ddc35b@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/cortina/gemini.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 065cbbf52686cb..466445c9e08b47 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -1491,9 +1491,9 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) gpage = gmac_get_queue_page(geth, port, mapping + PAGE_SIZE); if (!gpage) { dev_err(geth->dev, "could not find mapping\n"); + port->stats.rx_dropped++; if (skb) { napi_free_frags(&port->napi); - port->stats.rx_dropped++; skb = NULL; } continue; From 06937db21ee311ed07eba47954447245041a982d Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 9 May 2026 00:13:37 +0200 Subject: [PATCH 770/972] net: ethernet: cortina: Make RX SKB per-port The SKB used to assemble packets from fragments in gmac_rx() is static local, but the Gemini has two ethernet ports, meaning there can be races between the ports on a bad day if a device is using both. Make the RX SKB a per-port variable and carry it over between invocations in the port struct instead. Zero the pointer once we call napi_gro_frags(), on error (after calling napi_free_frags()) or if the port is stopped. Zero it in some place where not strictly necessary just to emphasize what is going on. This was found by Sashiko during normal patch review. Fixes: 4d5ae32f5e1e ("net: ethernet: Add a driver for Gemini gigabit ethernet") Link: https://sashiko.dev/#/patchset/20260505-gemini-ethernet-fix-v2-1-997c31d06079%40kernel.org Signed-off-by: Linus Walleij Link: https://patch.msgid.link/20260509-gemini-ethernet-fixes-v1-2-6c5d20ddc35b@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/cortina/gemini.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 466445c9e08b47..d5a56366fb432e 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -122,6 +122,8 @@ struct gemini_ethernet_port { struct napi_struct napi; struct hrtimer rx_coalesce_timer; unsigned int rx_coalesce_nsecs; + struct sk_buff *rx_skb; + unsigned int freeq_refill; struct gmac_txq txq[TX_QUEUE_NUM]; unsigned int txq_order; @@ -1442,10 +1444,10 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) unsigned short m = (1 << port->rxq_order) - 1; struct gemini_ethernet *geth = port->geth; void __iomem *ptr_reg = port->rxq_rwptr; + struct sk_buff *skb = port->rx_skb; unsigned int frame_len, frag_len; struct gmac_rxdesc *rx = NULL; struct gmac_queue_page *gpage; - static struct sk_buff *skb; union gmac_rxdesc_0 word0; union gmac_rxdesc_1 word1; union gmac_rxdesc_3 word3; @@ -1504,6 +1506,7 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) if (skb) { napi_free_frags(&port->napi); port->stats.rx_dropped++; + skb = NULL; } skb = gmac_skb_if_good_frame(port, word0, frame_len); @@ -1554,6 +1557,7 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) port->stats.rx_dropped++; } + port->rx_skb = skb; writew(r, ptr_reg); return budget; } @@ -1881,6 +1885,7 @@ static int gmac_stop(struct net_device *netdev) gmac_disable_tx_rx(netdev); gmac_stop_dma(port); napi_disable(&port->napi); + port->rx_skb = NULL; gmac_enable_irq(netdev, 0); gmac_cleanup_rxq(netdev); From ebd8ec2b309e3a447851b456ccaf8fb39f3661e7 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 9 May 2026 00:13:38 +0200 Subject: [PATCH 771/972] net: ethernet: cortina: Carry over frag counter The gmac_rx() NAPI poll function assembles packets in an SKB from a ring buffer. If the ring buffer gets completely emptied during a poll cycle, we exit gmac_rx(), but the packet is not yet completely assembled in the SKB, yet the fragment counter frag_nr is reset to zero on the next invocation. Solve this by making the RX fragment counter a part of the port struct, and carry it over between invocations. Reset the fragment counter only right after calling napi_gro_frags(), on error (after calling napi_free_frags()) or if stopping the port. Reset it in some place where not strictly necessary just to emphasize what is going on. This was found by Sashiko during normal patch review. Fixes: 4d5ae32f5e1e ("net: ethernet: Add a driver for Gemini gigabit ethernet") Link: https://sashiko.dev/#/patchset/20260505-gemini-ethernet-fix-v2-1-997c31d06079%40kernel.org Signed-off-by: Linus Walleij Link: https://patch.msgid.link/20260509-gemini-ethernet-fixes-v1-3-6c5d20ddc35b@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/cortina/gemini.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index d5a56366fb432e..4c762229ce420f 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -123,6 +123,7 @@ struct gemini_ethernet_port { struct hrtimer rx_coalesce_timer; unsigned int rx_coalesce_nsecs; struct sk_buff *rx_skb; + unsigned int rx_frag_nr; unsigned int freeq_refill; struct gmac_txq txq[TX_QUEUE_NUM]; @@ -1444,6 +1445,7 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) unsigned short m = (1 << port->rxq_order) - 1; struct gemini_ethernet *geth = port->geth; void __iomem *ptr_reg = port->rxq_rwptr; + unsigned int frag_nr = port->rx_frag_nr; struct sk_buff *skb = port->rx_skb; unsigned int frame_len, frag_len; struct gmac_rxdesc *rx = NULL; @@ -1457,7 +1459,6 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) unsigned short r, w; union dma_rwptr rw; dma_addr_t mapping; - int frag_nr = 0; spin_lock_irqsave(&geth->irq_lock, flags); rw.bits32 = readl(ptr_reg); @@ -1497,6 +1498,7 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) if (skb) { napi_free_frags(&port->napi); skb = NULL; + frag_nr = 0; } continue; } @@ -1507,6 +1509,7 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) napi_free_frags(&port->napi); port->stats.rx_dropped++; skb = NULL; + frag_nr = 0; } skb = gmac_skb_if_good_frame(port, word0, frame_len); @@ -1541,6 +1544,7 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) if (word3.bits32 & EOF_BIT) { napi_gro_frags(&port->napi); skb = NULL; + frag_nr = 0; --budget; } continue; @@ -1549,6 +1553,7 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) if (skb) { napi_free_frags(&port->napi); skb = NULL; + frag_nr = 0; } if (mapping) @@ -1558,6 +1563,7 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) } port->rx_skb = skb; + port->rx_frag_nr = frag_nr; writew(r, ptr_reg); return budget; } @@ -1886,6 +1892,7 @@ static int gmac_stop(struct net_device *netdev) gmac_stop_dma(port); napi_disable(&port->napi); port->rx_skb = NULL; + port->rx_frag_nr = 0; gmac_enable_irq(netdev, 0); gmac_cleanup_rxq(netdev); From 7d572b75d54edeb48135f8b029cfdec41277e30f Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 12 May 2026 13:31:26 +0100 Subject: [PATCH 772/972] ASoC: cs35l56: Abort TDM mask loop at maximum channel shift Exit the for_each_set_bit() loop in cs35l56_make_tdm_config_word() after all possible channel shifts have been done. This prevents going around the loop with out-of-range shift values, which is technically undefined behaviour. It also shuts up code analysis tools. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20260512123126.260148-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l56.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/codecs/cs35l56.c b/sound/soc/codecs/cs35l56.c index 849d70ca23d6f5..0318cfd87943f5 100644 --- a/sound/soc/codecs/cs35l56.c +++ b/sound/soc/codecs/cs35l56.c @@ -426,6 +426,8 @@ static unsigned int cs35l56_make_tdm_config_word(unsigned int reg_val, unsigned reg_val &= ~(0x3f << channel_shift); reg_val |= bit_num << channel_shift; channel_shift += 8; + if (channel_shift > 24) + break; } return reg_val; From 36a8d04a8293afcb9304cf0cd3741f67698f2a1a Mon Sep 17 00:00:00 2001 From: Ethan Nelson-Moore Date: Fri, 8 May 2026 19:37:28 -0700 Subject: [PATCH 773/972] net: ethernet: cs89x0: remove stale CONFIG_MACH_MX31ADS reference The legacy ARM board file for MACH_MX31ADS was removed in commit c93197b0041d ("ARM: imx: Remove i.MX31 board files"), but a reference to it remained in the cs89x0 driver. Drop this unused code. Signed-off-by: Ethan Nelson-Moore Fixes: c93197b0041d ("ARM: imx: Remove i.MX31 board files") Link: https://patch.msgid.link/20260509023732.42256-1-enelsonmoore@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/cirrus/cs89x0.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/cirrus/cs89x0.c b/drivers/net/ethernet/cirrus/cs89x0.c index fa5857923db4c2..b4bfd6c174e78e 100644 --- a/drivers/net/ethernet/cirrus/cs89x0.c +++ b/drivers/net/ethernet/cirrus/cs89x0.c @@ -1271,7 +1271,6 @@ static const struct net_device_ops net_ops = { static void __init reset_chip(struct net_device *dev) { -#if !defined(CONFIG_MACH_MX31ADS) struct net_local *lp = netdev_priv(dev); unsigned long reset_start_time; @@ -1298,7 +1297,6 @@ static void __init reset_chip(struct net_device *dev) while ((readreg(dev, PP_SelfST) & INIT_DONE) == 0 && time_before(jiffies, reset_start_time + 2)) ; -#endif /* !CONFIG_MACH_MX31ADS */ } /* This is the real probe routine. From 7cee43fcb0c3f71441d2faaa8c2202b6a88b6bef Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:28:55 -0700 Subject: [PATCH 774/972] net: shaper: flip the polarity of the valid flag The usual way of inserting entries which are not yet fully ready into XArray is to have a VALID flag. The shaper code has a NOT_VALID flag. Since XArray code does not let us create entries with marks already set - the creation of entries is currently not atomic. Flip the polarity of the VALID flag. This closes the tiny race in net_shaper_pre_insert() of entries being created without the NOT_VALID flag. Fixes: 93954b40f6a4 ("net-shapers: implement NL set and delete operations") Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-2-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/shaper/shaper.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 1069fa4eb9f606..d2b8f1f951b196 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -275,11 +275,13 @@ static void net_shaper_default_parent(const struct net_shaper_handle *handle, parent->id = 0; } -/* - * MARK_0 is already in use due to XA_FLAGS_ALLOC, can't reuse such flag as - * it's cleared by xa_store(). +/* MARK_0 is already in use due to XA_FLAGS_ALLOC. The VALID mark is set on + * an entry only after the device-side configuration has completed + * successfully (see net_shaper_commit()). Lookups and dumps must filter on + * this mark to avoid exposing tentative entries inserted by + * net_shaper_pre_insert() while the driver call is still in flight. */ -#define NET_SHAPER_NOT_VALID XA_MARK_1 +#define NET_SHAPER_VALID XA_MARK_1 static struct net_shaper * net_shaper_lookup(struct net_shaper_binding *binding, @@ -289,8 +291,8 @@ net_shaper_lookup(struct net_shaper_binding *binding, struct net_shaper_hierarchy *hierarchy; hierarchy = net_shaper_hierarchy_rcu(binding); - if (!hierarchy || xa_get_mark(&hierarchy->shapers, index, - NET_SHAPER_NOT_VALID)) + if (!hierarchy || !xa_get_mark(&hierarchy->shapers, index, + NET_SHAPER_VALID)) return NULL; return xa_load(&hierarchy->shapers, index); @@ -370,13 +372,10 @@ static int net_shaper_pre_insert(struct net_shaper_binding *binding, goto free_id; } - /* Mark 'tentative' shaper inside the hierarchy container. - * xa_set_mark is a no-op if the previous store fails. + /* Insert as 'tentative' (no VALID mark). The mark will be set by + * net_shaper_commit() once the driver-side configuration succeeds. */ - xa_lock(&hierarchy->shapers); - prev = __xa_store(&hierarchy->shapers, index, cur, GFP_KERNEL); - __xa_set_mark(&hierarchy->shapers, index, NET_SHAPER_NOT_VALID); - xa_unlock(&hierarchy->shapers); + prev = xa_store(&hierarchy->shapers, index, cur, GFP_KERNEL); if (xa_err(prev)) { NL_SET_ERR_MSG(extack, "Can't insert shaper into device store"); kfree_rcu(cur, rcu); @@ -413,8 +412,7 @@ static void net_shaper_commit(struct net_shaper_binding *binding, /* Successful update: drop the tentative mark * and update the hierarchy container. */ - __xa_clear_mark(&hierarchy->shapers, index, - NET_SHAPER_NOT_VALID); + __xa_set_mark(&hierarchy->shapers, index, NET_SHAPER_VALID); *cur = shapers[i]; } xa_unlock(&hierarchy->shapers); @@ -431,8 +429,9 @@ static void net_shaper_rollback(struct net_shaper_binding *binding) return; xa_lock(&hierarchy->shapers); - xa_for_each_marked(&hierarchy->shapers, index, cur, - NET_SHAPER_NOT_VALID) { + xa_for_each(&hierarchy->shapers, index, cur) { + if (xa_get_mark(&hierarchy->shapers, index, NET_SHAPER_VALID)) + continue; __xa_erase(&hierarchy->shapers, index); kfree(cur); } @@ -836,7 +835,8 @@ int net_shaper_nl_get_dumpit(struct sk_buff *skb, goto out_unlock; for (; (shaper = xa_find(&hierarchy->shapers, &ctx->start_index, - U32_MAX, XA_PRESENT)); ctx->start_index++) { + U32_MAX, NET_SHAPER_VALID)); + ctx->start_index++) { ret = net_shaper_fill_one(skb, binding, shaper, info); if (ret) break; From 235fb5376139c3419f2218349f1fa2f06f24f7ad Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:28:56 -0700 Subject: [PATCH 775/972] net: shaper: fix trivial ordering issue in net_shaper_commit() We should update the entry before we mark it as valid. Fixes: 93954b40f6a4 ("net-shapers: implement NL set and delete operations") Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-3-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/shaper/shaper.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index d2b8f1f951b196..86319ddbf29058 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -295,6 +295,10 @@ net_shaper_lookup(struct net_shaper_binding *binding, NET_SHAPER_VALID)) return NULL; + /* Pairs with smp_wmb() in net_shaper_commit(): if the entry is + * valid, its contents must be visible too. + */ + smp_rmb(); return xa_load(&hierarchy->shapers, index); } @@ -412,8 +416,9 @@ static void net_shaper_commit(struct net_shaper_binding *binding, /* Successful update: drop the tentative mark * and update the hierarchy container. */ - __xa_set_mark(&hierarchy->shapers, index, NET_SHAPER_VALID); *cur = shapers[i]; + smp_wmb(); + __xa_set_mark(&hierarchy->shapers, index, NET_SHAPER_VALID); } xa_unlock(&hierarchy->shapers); } @@ -837,6 +842,10 @@ int net_shaper_nl_get_dumpit(struct sk_buff *skb, for (; (shaper = xa_find(&hierarchy->shapers, &ctx->start_index, U32_MAX, NET_SHAPER_VALID)); ctx->start_index++) { + /* Pairs with smp_wmb() in net_shaper_commit(): the entry + * is marked VALID, so its contents must be visible too. + */ + smp_rmb(); ret = net_shaper_fill_one(skb, binding, shaper, info); if (ret) break; From a9a2fa1da619f276580b0d4c5d12efac89e8642b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:28:57 -0700 Subject: [PATCH 776/972] net: shaper: reject duplicate leaves in GROUP request net_shaper_nl_group_doit() does not deduplicate NET_SHAPER_A_LEAVES entries. When userspace supplies the same leaf handle twice, the same old-parent pointer lands twice in old_nodes[]. The cleanup loop double frees the parent. Of course the same parent may still be in old_nodes[] twice if we are moving multiple of its leaves. Note that this patch also implicitly fixes the fact that the i >= leaves_count path forgets to set ret. Fixes: 5d5d4700e75d ("net-shapers: implement NL group operation") Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-4-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/shaper/shaper.c | 60 +++++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 15 deletions(-) diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 86319ddbf29058..c8960821cf236f 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -941,6 +941,46 @@ static int net_shaper_handle_cmp(const struct net_shaper_handle *a, return memcmp(a, b, sizeof(*a)); } +static int net_shaper_parse_leaves(struct net_shaper_binding *binding, + struct genl_info *info, + const struct net_shaper *node, + struct net_shaper *leaves, + int leaves_count) +{ + struct nlattr *attr; + int i, j, ret, rem; + + i = 0; + nla_for_each_attr_type(attr, NET_SHAPER_A_LEAVES, + genlmsg_data(info->genlhdr), + genlmsg_len(info->genlhdr), rem) { + if (WARN_ON_ONCE(i >= leaves_count)) + return -EINVAL; + + ret = net_shaper_parse_leaf(binding, attr, info, + node, &leaves[i]); + if (ret) + return ret; + + /* Reject duplicates */ + for (j = 0; j < i; j++) { + if (net_shaper_handle_cmp(&leaves[i].handle, + &leaves[j].handle)) + continue; + + NL_SET_ERR_MSG_ATTR_FMT(info->extack, attr, + "Duplicate leaf shaper %d:%d", + leaves[i].handle.scope, + leaves[i].handle.id); + return -EINVAL; + } + + i++; + } + + return 0; +} + static int net_shaper_parent_from_leaves(int leaves_count, const struct net_shaper *leaves, struct net_shaper *node, @@ -1197,10 +1237,9 @@ int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info) struct net_shaper **old_nodes, *leaves, node = {}; struct net_shaper_hierarchy *hierarchy; struct net_shaper_binding *binding; - int i, ret, rem, leaves_count; + int i, ret, leaves_count; int old_nodes_count = 0; struct sk_buff *msg; - struct nlattr *attr; if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_LEAVES)) return -EINVAL; @@ -1228,19 +1267,10 @@ int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info) if (ret) goto free_leaves; - i = 0; - nla_for_each_attr_type(attr, NET_SHAPER_A_LEAVES, - genlmsg_data(info->genlhdr), - genlmsg_len(info->genlhdr), rem) { - if (WARN_ON_ONCE(i >= leaves_count)) - goto free_leaves; - - ret = net_shaper_parse_leaf(binding, attr, info, - &node, &leaves[i]); - if (ret) - goto free_leaves; - i++; - } + ret = net_shaper_parse_leaves(binding, info, &node, + leaves, leaves_count); + if (ret) + goto free_leaves; /* Prepare the msg reply in advance, to avoid device operation * rollback on allocation failure. From 6e8ae9d805d4b9ecec49bb9e457d9bae0b21b540 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:28:58 -0700 Subject: [PATCH 777/972] selftests: drv-net: add shaper test for duplicate leaves Add test exercising duplicate leaves. Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-5-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/testing/selftests/drivers/net/shaper.py | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/drivers/net/shaper.py b/tools/testing/selftests/drivers/net/shaper.py index 11310f19bfa028..e39d270e688df2 100755 --- a/tools/testing/selftests/drivers/net/shaper.py +++ b/tools/testing/selftests/drivers/net/shaper.py @@ -1,7 +1,10 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 -from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_true, KsftSkipEx +import errno + +from lib.py import ksft_run, ksft_exit +from lib.py import ksft_eq, ksft_raises, ksft_true, KsftSkipEx from lib.py import EthtoolFamily, NetshaperFamily from lib.py import NetDrvEnv from lib.py import NlError @@ -438,6 +441,21 @@ def queue_update(cfg, nl_shaper) -> None: nl_shaper.delete({'ifindex': cfg.ifindex, 'handle': {'scope': 'queue', 'id': i}}) +def dup_leaves(cfg, nl_shaper) -> None: + """ Ensure that the kernel rejects duplicate leaves. """ + if not cfg.groups: + raise KsftSkipEx("device does not support node scope") + + with ksft_raises(NlError) as cm: + nl_shaper.group({ + 'ifindex': cfg.ifindex, + 'leaves':[{'handle': {'scope': 'queue', 'id': 0}}, + {'handle': {'scope': 'queue', 'id': 0}}], + 'handle': {'scope':'node'}, + 'metric': 'bps', + 'bw-max': 10000}) + ksft_eq(cm.exception.error, errno.EINVAL) + def main() -> None: with NetDrvEnv(__file__, queue_count=4) as cfg: cfg.queues = False @@ -453,7 +471,9 @@ def main() -> None: basic_groups, qgroups, delegation, - queue_update], args=(cfg, NetshaperFamily())) + dup_leaves, + queue_update], + args=(cfg, NetshaperFamily())) ksft_exit() From 8054f85b83f42a37d482fc77ea7c9ff06a9407d9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:28:59 -0700 Subject: [PATCH 778/972] net: shaper: set ret to -ENOMEM when genlmsg_new() fails in group_doit genlmsg_new() alloc failure path in net_shaper_nl_group_doit() forgets to set ret before jumping to error handling. Fixes: 5d5d4700e75d ("net-shapers: implement NL group operation") Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-6-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/shaper/shaper.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index c8960821cf236f..12e5e0c18643b9 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -1276,8 +1276,10 @@ int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info) * rollback on allocation failure. */ msg = genlmsg_new(net_shaper_handle_size(), GFP_KERNEL); - if (!msg) + if (!msg) { + ret = -ENOMEM; goto free_leaves; + } hierarchy = net_shaper_hierarchy_setup(binding); if (!hierarchy) { From 0f9a857e34d0f8c018a3e4435c6f0e92e8d2f38c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:29:00 -0700 Subject: [PATCH 779/972] net: shaper: fix undersized reply skb allocation in GROUP command net_shaper_group_send_reply() writes both the NET_SHAPER_A_IFINDEX attribute (via net_shaper_fill_binding()) and the nested NET_SHAPER_A_HANDLE attribute (via net_shaper_fill_handle()), but the reply skb at the call site in net_shaper_nl_group_doit() is allocated using net_shaper_handle_size(), which only accounts for the nested handle. The allocation is therefore short by nla_total_size(sizeof(u32)) (8 bytes) for the IFINDEX attribute. In practice the slab allocator rounds up the small allocation so the bug is latent, but the size accounting is wrong and could bite if the reply grew further. Introduce net_shaper_group_reply_size() that accounts for the full reply payload and use it both at the genlmsg_new() call site and in the defensive WARN_ONCE message. Fixes: 5d5d4700e75d ("net-shapers: implement NL group operation") Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-7-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/shaper/shaper.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 12e5e0c18643b9..08fde2d9e8aa8e 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -90,6 +90,12 @@ static int net_shaper_handle_size(void) nla_total_size(sizeof(u32))); } +static int net_shaper_group_reply_size(void) +{ + return nla_total_size(sizeof(u32)) + /* NET_SHAPER_A_IFINDEX */ + net_shaper_handle_size(); /* NET_SHAPER_A_HANDLE */ +} + static int net_shaper_fill_binding(struct sk_buff *msg, const struct net_shaper_binding *binding, u32 type) @@ -1227,7 +1233,7 @@ static int net_shaper_group_send_reply(struct net_shaper_binding *binding, free_msg: /* Should never happen as msg is pre-allocated with enough space. */ WARN_ONCE(true, "calculated message payload length (%d)", - net_shaper_handle_size()); + net_shaper_group_reply_size()); nlmsg_free(msg); return -EMSGSIZE; } @@ -1275,7 +1281,7 @@ int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info) /* Prepare the msg reply in advance, to avoid device operation * rollback on allocation failure. */ - msg = genlmsg_new(net_shaper_handle_size(), GFP_KERNEL); + msg = genlmsg_new(net_shaper_group_reply_size(), GFP_KERNEL); if (!msg) { ret = -ENOMEM; goto free_leaves; From fbf5df34a4dbcd09d433dd4f0916bf9b2ddb16de Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:29:01 -0700 Subject: [PATCH 780/972] tools: ynl: add scope qualifier for definitions Using definitions in kernel policies is awkward right now. On one hand we want defines for max values and such. On the other we don't have a way of adding kernel-only defines. Adding unnecessary defines to uAPI is a bad idea, we won't be able to delete them. And when it comes to policy user space should just query it via the policy dump, not use hard coded defines. Add a "scope" property to definitions, which will let us tell the codegen that a definition is for kernel use only. Support following values: - uapi: render into the uAPI header (default, today's behavior) - kernel: render to kernel header only - user: same as kernel but for the user-side generated header Definitions may have a header property (definition is "external", provided by existing header). Extend the scope to headers, too. If definition has both scope and header properties we will only generate the includes in the right scope. Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-8-kuba@kernel.org Signed-off-by: Paolo Abeni --- Documentation/netlink/genetlink-c.yaml | 9 ++++++ Documentation/netlink/genetlink-legacy.yaml | 9 ++++++ Documentation/netlink/genetlink.yaml | 9 ++++++ Documentation/netlink/netlink-raw.yaml | 9 ++++++ tools/net/ynl/pyynl/ynl_gen_c.py | 31 +++++++++++++++++++-- 5 files changed, 65 insertions(+), 2 deletions(-) diff --git a/Documentation/netlink/genetlink-c.yaml b/Documentation/netlink/genetlink-c.yaml index 57f59fe23e3f8c..4ea31e8fc4d19d 100644 --- a/Documentation/netlink/genetlink-c.yaml +++ b/Documentation/netlink/genetlink-c.yaml @@ -69,6 +69,15 @@ properties: header: description: For C-compatible languages, header which already defines this value. type: string + scope: + description: | + Visibility of this definition. "uapi" (default) renders into + the uAPI header, "kernel" renders into the kernel-side + generated header, "user" renders into the user-side + generated header. When combined with `header:`, the + definition is not rendered, and the named header is + included only by code matching the scope. + enum: [ uapi, kernel, user ] type: enum: [ const, enum, flags ] doc: diff --git a/Documentation/netlink/genetlink-legacy.yaml b/Documentation/netlink/genetlink-legacy.yaml index 66fb8653a34423..f9c44747729ae3 100644 --- a/Documentation/netlink/genetlink-legacy.yaml +++ b/Documentation/netlink/genetlink-legacy.yaml @@ -83,6 +83,15 @@ properties: header: description: For C-compatible languages, header which already defines this value. type: string + scope: + description: | + Visibility of this definition. "uapi" (default) renders into + the uAPI header, "kernel" renders into the kernel-side + generated header, "user" renders into the user-side + generated header. When combined with `header:`, the + definition is not rendered, and the named header is + included only by code matching the scope. + enum: [ uapi, kernel, user ] type: enum: [ const, enum, flags, struct ] # Trim doc: diff --git a/Documentation/netlink/genetlink.yaml b/Documentation/netlink/genetlink.yaml index a1194d5d93fc3c..d3f3f3399ddf85 100644 --- a/Documentation/netlink/genetlink.yaml +++ b/Documentation/netlink/genetlink.yaml @@ -55,6 +55,15 @@ properties: header: description: For C-compatible languages, header which already defines this value. type: string + scope: + description: | + Visibility of this definition. "uapi" (default) renders into + the uAPI header, "kernel" renders into the kernel-side + generated header, "user" renders into the user-side + generated header. When combined with `header:`, the + definition is not rendered, and the named header is + included only by code matching the scope. + enum: [ uapi, kernel, user ] type: enum: [ const, enum, flags ] doc: diff --git a/Documentation/netlink/netlink-raw.yaml b/Documentation/netlink/netlink-raw.yaml index dd98dda55bd0fd..4c436b59a34be5 100644 --- a/Documentation/netlink/netlink-raw.yaml +++ b/Documentation/netlink/netlink-raw.yaml @@ -87,6 +87,15 @@ properties: header: description: For C-compatible languages, header which already defines this value. type: string + scope: + description: | + Visibility of this definition. "uapi" (default) renders into + the uAPI header, "kernel" renders into the kernel-side + generated header, "user" renders into the user-side + generated header. When combined with `header:`, the + definition is not rendered, and the named header is + included only by code matching the scope. + enum: [ uapi, kernel, user ] type: enum: [ const, enum, flags, struct ] # Trim doc: diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 0e1e486c1185a4..cdc3646f2642c4 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -3212,6 +3212,8 @@ def render_uapi(family, cw): for const in family['definitions']: if const.get('header'): continue + if const.get('scope', 'uapi') != 'uapi': + continue if const['type'] != 'const': cw.writes_defines(defines) @@ -3339,6 +3341,25 @@ def render_uapi(family, cw): cw.p(f'#endif /* {hdr_prot} */') +def render_scoped_consts(family, cw, scope): + defines = [] + for const in family['definitions']: + if const['type'] != 'const': + continue + if const.get('header'): + continue + if const.get('scope') != scope: + continue + name_pfx = const.get('name-prefix', f"{family.ident_name}-") + defines.append([ + c_upper(family.get('c-define-name', + f"{name_pfx}{const['name']}")), + const['value']]) + if defines: + cw.writes_defines(defines) + cw.nl() + + def _render_user_ntf_entry(ri, op): if not ri.family.is_classic(): ri.cw.block_start(line=f"[{op.enum_name}] = ") @@ -3504,8 +3525,12 @@ def main(): cw.p('#include "ynl.h"') headers = [] for definition in parsed['definitions'] + parsed['attribute-sets']: - if 'header' in definition: - headers.append(definition['header']) + if 'header' not in definition: + continue + scope = definition.get('scope', 'uapi') + if scope != 'uapi' and scope != args.mode: + continue + headers.append(definition['header']) if args.mode == 'user': headers.append(parsed.uapi_header) seen_header = [] @@ -3522,6 +3547,7 @@ def main(): for one in args.user_header: cw.p(f'#include "{one}"') else: + render_scoped_consts(parsed, cw, 'user') cw.p('struct ynl_sock;') cw.nl() render_user_family(parsed, cw, True) @@ -3529,6 +3555,7 @@ def main(): if args.mode == "kernel": if args.header: + render_scoped_consts(parsed, cw, 'kernel') for _, struct in sorted(parsed.pure_nested_structs.items()): if struct.request: cw.p('/* Common nested types */') From 8d5806c600fddb907ebe378f9c366d4b52ac3a39 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:29:02 -0700 Subject: [PATCH 781/972] net: shaper: reject handle IDs exceeding internal bit-width net_shaper_parse_handle() reads the user-supplied handle ID via nla_get_u32(), accepting the full u32 range. However, the xarray key is built by net_shaper_handle_to_index() using FIELD_PREP(NET_SHAPER_ID_MASK, handle->id), where NET_SHAPER_ID_MASK is GENMASK(25, 0) - only 26 bits wide. FIELD_PREP silently masks off the upper bits at runtime. A user-supplied NODE id like 0x04000123 becomes id 0x123. Additionally, a user-supplied id equal to NET_SHAPER_ID_UNSPEC (0x03FFFFFF, which is NET_SHAPER_ID_MASK itself) would collide with the sentinel used internally by the group operation to signal "allocate a new NODE id". Reject user-supplied IDs >= NET_SHAPER_ID_MASK (i.e., >= 0x03FFFFFF) in the policy. Fixes: 4b623f9f0f59 ("net-shapers: implement NL get operation") Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-9-kuba@kernel.org Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/net_shaper.yaml | 7 +++++++ net/shaper/shaper.c | 4 +++- net/shaper/shaper_nl_gen.c | 7 ++++++- net/shaper/shaper_nl_gen.h | 2 ++ 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/Documentation/netlink/specs/net_shaper.yaml b/Documentation/netlink/specs/net_shaper.yaml index 3f2ad772b64b15..de01f922040a5c 100644 --- a/Documentation/netlink/specs/net_shaper.yaml +++ b/Documentation/netlink/specs/net_shaper.yaml @@ -33,6 +33,11 @@ doc: | @cap-get operation. definitions: + - + type: const + name: max-handle-id + value: 0x3fffffe + scope: kernel - type: enum name: scope @@ -140,6 +145,8 @@ attribute-sets: - name: id type: u32 + checks: + max: max-handle-id doc: | Numeric identifier of a shaper. The id semantic depends on the scope. For @queue scope it's the queue id and for @node diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 08fde2d9e8aa8e..eb049847fed65e 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -21,6 +21,8 @@ #define NET_SHAPER_ID_UNSPEC NET_SHAPER_ID_MASK +static_assert(NET_SHAPER_ID_UNSPEC == NET_SHAPER_MAX_HANDLE_ID + 1); + struct net_shaper_hierarchy { struct xarray shapers; }; @@ -360,7 +362,7 @@ static int net_shaper_pre_insert(struct net_shaper_binding *binding, handle->id == NET_SHAPER_ID_UNSPEC) { u32 min, max; - handle->id = NET_SHAPER_ID_MASK - 1; + handle->id = NET_SHAPER_MAX_HANDLE_ID; max = net_shaper_handle_to_index(handle); handle->id = 0; min = net_shaper_handle_to_index(handle); diff --git a/net/shaper/shaper_nl_gen.c b/net/shaper/shaper_nl_gen.c index 9b29be3ef19a85..76eff85ec66dfc 100644 --- a/net/shaper/shaper_nl_gen.c +++ b/net/shaper/shaper_nl_gen.c @@ -11,10 +11,15 @@ #include +/* Integer value ranges */ +static const struct netlink_range_validation net_shaper_a_handle_id_range = { + .max = NET_SHAPER_MAX_HANDLE_ID, +}; + /* Common nested types */ const struct nla_policy net_shaper_handle_nl_policy[NET_SHAPER_A_HANDLE_ID + 1] = { [NET_SHAPER_A_HANDLE_SCOPE] = NLA_POLICY_MAX(NLA_U32, 3), - [NET_SHAPER_A_HANDLE_ID] = { .type = NLA_U32, }, + [NET_SHAPER_A_HANDLE_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &net_shaper_a_handle_id_range), }; const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGHT + 1] = { diff --git a/net/shaper/shaper_nl_gen.h b/net/shaper/shaper_nl_gen.h index 42c46c52c77513..2406652a9014a6 100644 --- a/net/shaper/shaper_nl_gen.h +++ b/net/shaper/shaper_nl_gen.h @@ -12,6 +12,8 @@ #include +#define NET_SHAPER_MAX_HANDLE_ID 67108862 + /* Common nested types */ extern const struct nla_policy net_shaper_handle_nl_policy[NET_SHAPER_A_HANDLE_ID + 1]; extern const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGHT + 1]; From b62b29e6de6711f5918940aa6ff2bbab6d6af502 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:29:03 -0700 Subject: [PATCH 782/972] net: shaper: enforce singleton NETDEV scope with id 0 The NETDEV scope represents a singleton root shaper in the per-device hierarchy. All code assumes NETDEV shapers have id 0: net_shaper_default_parent() hardcodes parent->id = 0 when returning the NETDEV parent for QUEUE/NODE children, and the UAPI documentation describes NETDEV scope as "the main shaper" (singular, not plural). Make sure we reject non-0 IDs. Fixes: 4b623f9f0f59 ("net-shapers: implement NL get operation") Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-10-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/shaper/shaper.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index eb049847fed65e..4ae3ee6764a0a4 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -482,6 +482,12 @@ static int net_shaper_parse_handle(const struct nlattr *attr, else if (handle->scope == NET_SHAPER_SCOPE_NODE) id = NET_SHAPER_ID_UNSPEC; + if (id && handle->scope == NET_SHAPER_SCOPE_NETDEV) { + NL_SET_ERR_MSG_ATTR(info->extack, id_attr, + "Netdev scope is a singleton, must use ID 0"); + return -EINVAL; + } + handle->id = id; return 0; } From ce372e869f9f492f3d5aa9a0ae75ed52c61d2d6f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 10 May 2026 12:29:04 -0700 Subject: [PATCH 783/972] net: shaper: reject QUEUE scope handle with missing id net_shaper_parse_handle() does not enforce that the user provides the handle ID. For NODE the ID defaults to UNSPEC for all other cases it defaults to 0. For NETDEV 0 is the only option. For QUEUE defaulting to 0 makes less intuitive sense. Specifically because the behavior should (IMHO) be the same for all cases where there may be more than one ID (QUEUE and NODE). We should either document this as intentional or reject. I picked the latter with no strong conviction. Fixes: 4b623f9f0f59 ("net-shapers: implement NL get operation") Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260510192904.3987113-11-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/shaper/shaper.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 4ae3ee6764a0a4..b1c65110f04d37 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -477,10 +477,15 @@ static int net_shaper_parse_handle(const struct nlattr *attr, * shaper (any other value). */ id_attr = tb[NET_SHAPER_A_HANDLE_ID]; - if (id_attr) + if (id_attr) { id = nla_get_u32(id_attr); - else if (handle->scope == NET_SHAPER_SCOPE_NODE) + } else if (handle->scope == NET_SHAPER_SCOPE_NODE) { id = NET_SHAPER_ID_UNSPEC; + } else if (handle->scope == NET_SHAPER_SCOPE_QUEUE) { + NL_SET_ERR_ATTR_MISS(info->extack, attr, + NET_SHAPER_A_HANDLE_ID); + return -EINVAL; + } if (id && handle->scope == NET_SHAPER_SCOPE_NETDEV) { NL_SET_ERR_MSG_ATTR(info->extack, id_attr, From 603ab5ea6482c723216b59cb733e8ba248619ee9 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 11 May 2026 21:55:23 -0500 Subject: [PATCH 784/972] SMB3.1.1: add missing QUERY_DIR info levels New Infolevels for QUERY_DIR (and QUERY_INFO) levels 78 through 81 are now being used by Windows clients and were added to the documentation. Add defines for them (and correct some typos in documentation). See MS-SMB2 2.2.33 and MS-FSCC 2.4 Signed-off-by: Steve French --- fs/smb/common/fscc.h | 4 ++-- fs/smb/common/smb2pdu.h | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/smb/common/fscc.h b/fs/smb/common/fscc.h index b4ccddca925656..bc3012cc295da6 100644 --- a/fs/smb/common/fscc.h +++ b/fs/smb/common/fscc.h @@ -260,12 +260,12 @@ typedef struct { char FileName[]; } __packed FILE_DIRECTORY_INFO; /* level 0x101 FF resp data */ -/* See MS-FSCC 2.4.13 */ +/* See MS-FSCC 2.4.14 */ struct smb2_file_eof_info { /* encoding of request for level 10 */ __le64 EndOfFile; /* new end of file value */ } __packed; /* level 20 Set */ -/* See MS-FSCC 2.4.14 */ +/* See MS-FSCC 2.4.15 */ typedef struct { __le32 NextEntryOffset; __u32 FileIndex; diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index a4b12eb8df81ec..aeb0a245c5324e 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -1566,6 +1566,10 @@ struct validate_negotiate_info_rsp { #define FILE_STANDARD_LINK_INFORMATION 54 #define FILE_ID_INFORMATION 59 #define FILE_ID_EXTD_DIRECTORY_INFORMATION 60 /* also for QUERY_DIR */ +#define FileId64ExtdDirectoryInformation 78 /* also for QUERY_DIR */ +#define FileId64ExtdBothDirectoryInformation 79 /* also for QUERY_DIR */ +#define FileIdAllExtdDirectoryInformation 80 /* also for QUERY_DIR */ +#define FileIdAllExtdBothDirectoryInformation 81 /* also for QUERY_DIR */ /* Used for Query Info and Find File POSIX Info for SMB3.1.1 and SMB1 */ #define SMB_FIND_FILE_POSIX_INFO 0x064 From 17ee873dba04d05090dfc5b2b9e08cfc8e4f147f Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Tue, 3 Mar 2026 10:48:54 +0100 Subject: [PATCH 785/972] HID: hid-sjoy: race between init and usage The driver uses an initial IO to set the device to a default state. That initialization is currently being done after the device node has been created. That means that the single buffer used for output can be altered while IO is in progress. Move the intialization before announcement to user space. Fixes: fac733f029251 ("HID: force feedback support for SmartJoy PLUS PS2/USB adapter") Signed-off-by: Oliver Neukum Signed-off-by: Jiri Kosina --- drivers/hid/hid-sjoy.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/hid/hid-sjoy.c b/drivers/hid/hid-sjoy.c index bab93d71b7608f..963c451132045b 100644 --- a/drivers/hid/hid-sjoy.c +++ b/drivers/hid/hid-sjoy.c @@ -91,17 +91,17 @@ static int sjoyff_init(struct hid_device *hid) set_bit(FF_RUMBLE, dev->ffbit); - error = input_ff_create_memless(dev, sjoyff, hid_sjoyff_play); - if (error) { - kfree(sjoyff); - return error; - } - sjoyff->report = report; sjoyff->report->field[0]->value[0] = 0x01; sjoyff->report->field[0]->value[1] = 0x00; sjoyff->report->field[0]->value[2] = 0x00; hid_hw_request(hid, sjoyff->report, HID_REQ_SET_REPORT); + + error = input_ff_create_memless(dev, sjoyff, hid_sjoyff_play); + if (error) { + kfree(sjoyff); + return error; + } } hid_info(hid, "Force feedback for SmartJoy PLUS PS2/USB adapter\n"); From 637ad3a56a3b889527d1dacea6fea2a8bd648140 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Mon, 11 May 2026 22:51:51 +0100 Subject: [PATCH 786/972] block: don't overwrite bip_vcnt in bio_integrity_copy_user() bio_integrity_add_page() already sets bip_vcnt to 1 for the bounce segment. Overwriting it with nr_vecs breaks bip_vcnt <= bip_max_vcnt on WRITE (bip_max_vcnt is 1), so the gap-merge checks in block/blk.h read past the bip_vec[] flex array. On READ the read is in bounds but lands on a saved user bvec instead of the bounce. The line was added for split propagation, but bio_integrity_clone() doesn't copy bip_vcnt and BIP_CLONE_FLAGS excludes BIP_COPY_USER. Fixes: 3991657ae707 ("block: set bip_vcnt correctly") Signed-off-by: David Carlier Reviewed-by: Christoph Hellwig Link: https://patch.msgid.link/20260511215151.346228-1-devnexen@gmail.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index e54c6e06e1cbba..78e678d4104b69 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -308,7 +308,6 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec, } bip->bip_flags |= BIP_COPY_USER; - bip->bip_vcnt = nr_vecs; return 0; free_bip: bio_integrity_free(bio); From 2c6e6a18a37b905cb584eb0dda3ae482162a81ca Mon Sep 17 00:00:00 2001 From: Casey Chen Date: Mon, 11 May 2026 15:22:30 -0600 Subject: [PATCH 787/972] block: recompute nr_integrity_segments in blk_insert_cloned_request blk_insert_cloned_request() already recomputes nr_phys_segments against the bottom queue, because "the queue settings related to segment counting may differ from the original queue." The exact same reasoning applies to integrity segments: a stacked driver's underlying queue can have tighter virt_boundary_mask, seg_boundary_mask, or max_segment_size than the top queue, in which case blk_rq_count_integrity_sg() against the bottom queue produces a different count than the cached rq->nr_integrity_segments inherited from the source request by blk_rq_prep_clone(). When the cached count is lower than the bottom queue's actual count, blk_rq_map_integrity_sg() trips BUG_ON(segments > rq->nr_integrity_segments); on dispatch. The same families of stacked setups that motivated the existing nr_phys_segments recompute -- dm-multipath fanning out to nvme-rdma in particular -- can produce this. Mirror the nr_phys_segments handling: when the request carries integrity, recompute nr_integrity_segments against the bottom queue and reject the request if it exceeds the bottom queue's max_integrity_segments. blk_rq_count_integrity_sg() and queue_max_integrity_segments() are both already available via , which blk-mq.c includes. This closes a latent gap in the stacking contract and brings the integrity-segment accounting in line with the existing phys-segment accounting. Fixes: 76c313f658d2 ("blk-integrity: improved sg segment mapping") Signed-off-by: Casey Chen Reviewed-by: Christoph Hellwig Link: https://patch.msgid.link/20260511212230.27511-1-cachen@purestorage.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 4c5c16cce4f8f3..d0c37daf568f24 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3307,6 +3307,25 @@ blk_status_t blk_insert_cloned_request(struct request *rq) return BLK_STS_IOERR; } + /* + * Integrity segment counting depends on the same queue limits + * (virt_boundary_mask, seg_boundary_mask, max_segment_size) that + * vary across stacked queues, so recompute against the bottom + * queue just like nr_phys_segments above. + */ + if (blk_integrity_rq(rq) && rq->bio) { + unsigned short max_int_segs = queue_max_integrity_segments(q); + + rq->nr_integrity_segments = + blk_rq_count_integrity_sg(rq->q, rq->bio); + if (rq->nr_integrity_segments > max_int_segs) { + printk(KERN_ERR "%s: over max integrity segments limit. (%u > %u)\n", + __func__, rq->nr_integrity_segments, + max_int_segs); + return BLK_STS_IOERR; + } + } + if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; From 5f90dcfa8dc32a488581b78e575cdd7808ba5c78 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 5 Feb 2026 09:11:31 +0100 Subject: [PATCH 788/972] HID: quirks: really enable the intended work around for appledisplay Commit c7fabe4ad921 ("HID: quirks: work around VID/PID conflict for appledisplay") intends to add a quirk for kernels built with Apple Cinema Display support, but it refers to the non-existing config option CONFIG_APPLEDISPLAY, whereas the config option for Apple Cinema Display support is named CONFIG_USB_APPLEDISPLAY. Refer to the intended config option CONFIG_USB_APPLEDISPLAY in the ifdef directive. Fixes: c7fabe4ad921 ("HID: quirks: work around VID/PID conflict for appledisplay") Signed-off-by: Lukas Bulwahn Signed-off-by: Jiri Kosina --- drivers/hid/hid-quirks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index 9e88c9d6c6dc0b..512049963978a7 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -235,7 +235,7 @@ static const struct hid_device_id hid_quirks[] = { * used as a driver. See hid_scan_report(). */ static const struct hid_device_id hid_have_special_driver[] = { -#if IS_ENABLED(CONFIG_APPLEDISPLAY) +#if IS_ENABLED(CONFIG_USB_APPLEDISPLAY) { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, 0x9218) }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, 0x9219) }, { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, 0x921c) }, From 8582792cf23b3d94674d4d838f7cde9a28d0fcaf Mon Sep 17 00:00:00 2001 From: Sungwoo Kim Date: Tue, 12 May 2026 01:09:29 -0400 Subject: [PATCH 789/972] block: bio-integrity: Fix null-ptr-deref in bio_integrity_map_user() pin_user_pages_fast() can partially succeed and return the number of pages that were actually pinned. However, the bio_integrity_map_user() does not handle this partial pinning. This leads to a general protection fault since bvec_from_pages() dereferences an unpinned page address, which is 0. To fix this, add a check to verify that all requested memory is pinned. If partial pinning occurs, unpin the memory and return -EFAULT. Kernel Oops: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 0 UID: 0 PID: 1061 Comm: nvme-passthroug Not tainted 7.0.0-11783-g90957f9314e8-dirty #16 PREEMPT(lazy) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.17.0-0-gb52ca86e094d-prebuilt.qemu.org 04/01/2014 RIP: 0010:bio_integrity_map_user.cold+0x1b0/0x9d6 Fixes: 492c5d455969 ("block: bio-integrity: directly map user buffers") Acked-by: Chao Shi Acked-by: Weidong Zhu Acked-by: Dave Tian Signed-off-by: Sungwoo Kim Tested-by: Shin'ichiro Kawasaki Link: https://github.com/linux-blktests/blktests/pull/244 Link: https://patch.msgid.link/20260512050929.541397-2-iam@sung-woo.kim Signed-off-by: Jens Axboe --- block/bio-integrity.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 78e678d4104b69..e796de1a749e33 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -402,6 +402,24 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) if (unlikely(ret < 0)) goto free_bvec; + /* + * Handle partial pinning. This can happen when pin_user_pages_fast() + * returns fewer pages than requested. + */ + if (user_backed_iter(iter) && unlikely(ret != bytes)) { + if (ret > 0) { + int npinned = DIV_ROUND_UP(offset + ret, PAGE_SIZE); + int i; + + for (i = 0; i < npinned; i++) + unpin_user_page(pages[i]); + } + if (pages != stack_pages) + kvfree(pages); + ret = -EFAULT; + goto free_bvec; + } + nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset, &is_p2p); if (pages != stack_pages) From 11f152c0acaa924d93339000cb785d34e003aff5 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Tue, 21 Apr 2026 16:27:01 +0200 Subject: [PATCH 790/972] xen/arm: Replace __ASSEMBLY__ with __ASSEMBLER__ in interface.h While the GCC and Clang compilers already define __ASSEMBLER__ automatically when compiling assembly code, __ASSEMBLY__ is a macro that only gets defined by the Makefiles in the kernel. This can be very confusing when switching between userspace and kernelspace coding, or when dealing with uapi headers that rather should use __ASSEMBLER__ instead. So let's standardize now on the __ASSEMBLER__ macro that is provided by the compilers. Signed-off-by: Thomas Huth Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross Message-ID: <20260421142701.548978-1-thuth@redhat.com> --- include/xen/arm/interface.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/xen/arm/interface.h b/include/xen/arm/interface.h index c3eada2642aa9a..61360b89da405e 100644 --- a/include/xen/arm/interface.h +++ b/include/xen/arm/interface.h @@ -30,7 +30,7 @@ #define __HYPERVISOR_platform_op_raw __HYPERVISOR_platform_op -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Explicitly size integers that represent pfns in the interface with * Xen so that we can have one ABI that works for 32 and 64 bit guests. * Note that this means that the xen_pfn_t type may be capable of From f097d246677b03db814c5862f368cea341b76a00 Mon Sep 17 00:00:00 2001 From: Florian Pradines Date: Sat, 9 May 2026 09:45:17 +0000 Subject: [PATCH 791/972] HID: mcp2221: fix OOB write in mcp2221_raw_event() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mcp2221_raw_event() copies device-supplied data into mcp->rxbuf at offset rxbuf_idx without checking that the copy fits within the destination buffer. A device responding with up to 60 bytes to a small I2C/SMBus read can overflow the buffer. Add a rxbuf_size field to struct mcp2221, set it alongside rxbuf in mcp_i2c_smbus_read(), and check rxbuf_idx + data[3] <= rxbuf_size before the memcpy. Reported-by: Benoît Sevens Signed-off-by: Florian Pradines Signed-off-by: Jiri Kosina --- drivers/hid/hid-mcp2221.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/hid/hid-mcp2221.c b/drivers/hid/hid-mcp2221.c index be80970ab48e27..e4ddd8e9293b68 100644 --- a/drivers/hid/hid-mcp2221.c +++ b/drivers/hid/hid-mcp2221.c @@ -128,6 +128,7 @@ struct mcp2221 { u8 *rxbuf; u8 txbuf[64]; int rxbuf_idx; + int rxbuf_size; int status; u8 cur_i2c_clk_div; struct gpio_chip *gc; @@ -330,12 +331,14 @@ static int mcp_i2c_smbus_read(struct mcp2221 *mcp, mcp->txbuf[3] = (u8)(msg->addr << 1); total_len = msg->len; mcp->rxbuf = msg->buf; + mcp->rxbuf_size = msg->len; } else { mcp->txbuf[1] = smbus_len; mcp->txbuf[2] = 0; mcp->txbuf[3] = (u8)(smbus_addr << 1); total_len = smbus_len; mcp->rxbuf = smbus_buf; + mcp->rxbuf_size = smbus_len; } ret = mcp_send_data_req_status(mcp, mcp->txbuf, 4); @@ -919,6 +922,10 @@ static int mcp2221_raw_event(struct hid_device *hdev, mcp->status = -EINVAL; break; } + if (mcp->rxbuf_idx + data[3] > mcp->rxbuf_size) { + mcp->status = -EINVAL; + break; + } buf = mcp->rxbuf; memcpy(&buf[mcp->rxbuf_idx], &data[4], data[3]); mcp->rxbuf_idx = mcp->rxbuf_idx + data[3]; From d93ba918a185aca2594da63e92fdc5495b559c0f Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 16 Apr 2026 14:16:54 +0100 Subject: [PATCH 792/972] HID: magicmouse: Prevent out-of-bounds (OOB) read during DOUBLE_REPORT_ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is currently possible for a malicious or misconfigured USB device to cause an out-of-bounds (OOB) read when submitting reports using DOUBLE_REPORT_ID by specifying a large report length and providing a smaller one. Let's prevent that by comparing the specified report length with the actual size of the data read in from userspace. If the actual data length ends up being smaller than specified, we'll politely warn the user and prevent any further processing. Signed-off-by: Lee Jones Reviewed-by: Günther Noack Signed-off-by: Jiri Kosina --- drivers/hid/hid-magicmouse.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/hid/hid-magicmouse.c b/drivers/hid/hid-magicmouse.c index e70bd3dc07ab72..802a3479e24b92 100644 --- a/drivers/hid/hid-magicmouse.c +++ b/drivers/hid/hid-magicmouse.c @@ -390,6 +390,10 @@ static int magicmouse_raw_event(struct hid_device *hdev, struct input_dev *input = msc->input; int x = 0, y = 0, ii, clicks = 0, npoints; + /* Protect against zero sized recursive calls from DOUBLE_REPORT_ID */ + if (size < 1) + return 0; + switch (data[0]) { case TRACKPAD_REPORT_ID: case TRACKPAD2_BT_REPORT_ID: @@ -490,6 +494,18 @@ static int magicmouse_raw_event(struct hid_device *hdev, /* Sometimes the trackpad sends two touch reports in one * packet. */ + + /* Ensure that we have at least 2 elements (report type and size) */ + if (size < 2) + return 0; + + if (size < data[1] + 2) { + hid_warn(hdev, + "received report length (%d) was smaller than specified (%d)", + size, data[1] + 2); + return 0; + } + magicmouse_raw_event(hdev, report, data + 2, data[1]); magicmouse_raw_event(hdev, report, data + 2 + data[1], size - 2 - data[1]); From cac61b58a3b6340c52afa06bb15eac033158db2f Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Fri, 17 Apr 2026 08:47:02 -0700 Subject: [PATCH 793/972] HID: playstation: Clamp num_touch_reports A device would never lie about the number of touch reports would it? If it does the loop in dualshock4_parse_report will read off the end of the touch_reports array, up to about 2 KiB for the maximum number of 256 loop iteraions. The data that is read is emitted via evdev if the DS4_TOUCH_POINT_INACTIVE bit happens to be set. Protect against this by clamping the num_touch_reports value provided by the device to the maximum size of the touch_reports array. Fixes: 752038248808 ("HID: playstation: add DualShock4 touchpad support.") Cc: stable@vger.kernel.org Reported-by: Xingyu Jin Signed-off-by: T.J. Mercier Signed-off-by: Jiri Kosina --- drivers/hid/hid-playstation.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/hid/hid-playstation.c b/drivers/hid/hid-playstation.c index c43caac20b61b1..e485373316756a 100644 --- a/drivers/hid/hid-playstation.c +++ b/drivers/hid/hid-playstation.c @@ -2384,7 +2384,8 @@ static int dualshock4_parse_report(struct ps_device *ps_dev, struct hid_report * } ds4_report = &usb->common; - num_touch_reports = usb->num_touch_reports; + num_touch_reports = min_t(u8, usb->num_touch_reports, + ARRAY_SIZE(usb->touch_reports)); touch_reports = usb->touch_reports; } else if (hdev->bus == BUS_BLUETOOTH && report->id == DS4_INPUT_REPORT_BT && size == DS4_INPUT_REPORT_BT_SIZE) { @@ -2404,7 +2405,8 @@ static int dualshock4_parse_report(struct ps_device *ps_dev, struct hid_report * } ds4_report = &bt->common; - num_touch_reports = bt->num_touch_reports; + num_touch_reports = min_t(u8, bt->num_touch_reports, + ARRAY_SIZE(bt->touch_reports)); touch_reports = bt->touch_reports; } else if (hdev->bus == BUS_BLUETOOTH && report->id == DS4_INPUT_REPORT_BT_MINIMAL && From 4db2af929279c799b5653a39eb0795c72baffca4 Mon Sep 17 00:00:00 2001 From: Sangyun Kim Date: Mon, 20 Apr 2026 14:13:17 +0900 Subject: [PATCH 794/972] HID: appletb-kbd: fix UAF in inactivity-timer cleanup path Commit 38224c472a03 ("HID: appletb-kbd: fix slab use-after-free bug in appletb_kbd_probe") added timer_delete_sync(&kbd->inactivity_timer) to both the probe close_hw error path and appletb_kbd_remove(), but the way it was wired in left the inactivity timer reachable during driver tear-down via two distinct windows. Window A -- put_device() before timer_delete_sync(): put_device(&kbd->backlight_dev->dev); timer_delete_sync(&kbd->inactivity_timer); The inactivity_timer softirq reads kbd->backlight_dev and calls backlight_device_set_brightness() -> mutex_lock(&ops_lock). If a concurrent hid_appletb_bl unbind drops the last devm reference between these two calls, the backlight_device is freed and the mutex_lock() touches freed memory. Window B -- backlight cleanup before hid_hw_stop(): if (kbd->backlight_dev) { timer_delete_sync(...); put_device(...); } hid_hw_close(hdev); hid_hw_stop(hdev); Even after Window A is closed, hid_hw_close()/hid_hw_stop() still run afterwards, so a late ".event" callback from the HID core (USB URB completion on real Apple hardware) can arrive after timer_delete_sync() drained the softirq but before put_device() drops the reference. That callback reaches reset_inactivity_timer(), which calls mod_timer() and re-arms the timer. The freshly re-armed timer can then fire on the about-to-be-freed backlight_device. Both windows produce the same KASAN slab-use-after-free: BUG: KASAN: slab-use-after-free in __mutex_lock+0x1aab/0x21c0 Read of size 8 at addr ffff88803ee9a108 by task swapper/0/0 Call Trace: __mutex_lock backlight_device_set_brightness appletb_inactivity_timer call_timer_fn run_timer_softirq handle_softirqs Allocated by task N: devm_backlight_device_register appletb_bl_probe Freed by task M: (concurrent hid_appletb_bl unbind path) Close both windows at once by reworking the tear-down in appletb_kbd_remove() and in the probe close_hw error path so that 1) hid_hw_close()/hid_hw_stop() run before the backlight cleanup, guaranteeing no further .event callback can fire and re-arm the timer, and 2) inside the "if (kbd->backlight_dev)" block, timer_delete_sync() runs before put_device(), so the softirq is drained before the final reference is dropped. Fixes: 38224c472a03 ("HID: appletb-kbd: fix slab use-after-free bug in appletb_kbd_probe") Cc: stable@vger.kernel.org Signed-off-by: Sangyun Kim Signed-off-by: Jiri Kosina --- drivers/hid/hid-appletb-kbd.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/hid/hid-appletb-kbd.c b/drivers/hid/hid-appletb-kbd.c index 0fdc0968b9ef26..8feac9e3589b83 100644 --- a/drivers/hid/hid-appletb-kbd.c +++ b/drivers/hid/hid-appletb-kbd.c @@ -440,13 +440,13 @@ static int appletb_kbd_probe(struct hid_device *hdev, const struct hid_device_id unregister_handler: input_unregister_handler(&kbd->inp_handler); close_hw: - if (kbd->backlight_dev) { - put_device(&kbd->backlight_dev->dev); - timer_delete_sync(&kbd->inactivity_timer); - } hid_hw_close(hdev); stop_hw: hid_hw_stop(hdev); + if (kbd->backlight_dev) { + timer_delete_sync(&kbd->inactivity_timer); + put_device(&kbd->backlight_dev->dev); + } return ret; } @@ -457,13 +457,13 @@ static void appletb_kbd_remove(struct hid_device *hdev) appletb_kbd_set_mode(kbd, APPLETB_KBD_MODE_OFF); input_unregister_handler(&kbd->inp_handler); + hid_hw_close(hdev); + hid_hw_stop(hdev); + if (kbd->backlight_dev) { - put_device(&kbd->backlight_dev->dev); timer_delete_sync(&kbd->inactivity_timer); + put_device(&kbd->backlight_dev->dev); } - - hid_hw_close(hdev); - hid_hw_stop(hdev); } static int appletb_kbd_suspend(struct hid_device *hdev, pm_message_t msg) From 1654e53349d4e657b331de354313461f401f5063 Mon Sep 17 00:00:00 2001 From: Sangyun Kim Date: Mon, 20 Apr 2026 14:13:18 +0900 Subject: [PATCH 795/972] HID: appletb-kbd: run inactivity autodim from workqueues The autodim code in hid-appletb-kbd takes backlight_device->ops_lock via backlight_device_set_brightness() -> mutex_lock() from two different atomic contexts: * appletb_inactivity_timer() is a struct timer_list callback, so it runs in softirq context. Every expiry triggers BUG: sleeping function called from invalid context at kernel/locking/mutex.c:591 Call Trace: __might_resched __mutex_lock backlight_device_set_brightness appletb_inactivity_timer call_timer_fn run_timer_softirq * reset_inactivity_timer() is called from appletb_kbd_hid_event() and appletb_kbd_inp_event(). On real USB hardware these run in softirq/IRQ context (URB completion and input-event dispatch). When the Touch Bar has already been dimmed or turned off, the reset path calls backlight_device_set_brightness() directly to restore brightness, producing the same warning. Both call sites hit the same mutex_lock()-from-atomic bug. Fix them together by moving the blocking work onto the system workqueue: * Convert the inactivity timer from struct timer_list to struct delayed_work; the callback (appletb_inactivity_work) now runs in process context where mutex_lock() is legal. * Add a dedicated struct work_struct restore_brightness_work and have reset_inactivity_timer() schedule it instead of calling backlight_device_set_brightness() directly. Cancel both works synchronously during driver tear-down alongside the existing backlight reference drop. The semantics are unchanged (same delays, same state transitions on dim, turn-off and user activity); only the execution context of the sleeping call changes. The timer field and callback are renamed to match their new type; reset_inactivity_timer() keeps its name because it is invoked from input event paths that read naturally as "reset the inactivity timer". Fixes: 93a0fc489481 ("HID: hid-appletb-kbd: add support for automatic brightness control while using the touchbar") Cc: stable@vger.kernel.org Signed-off-by: Sangyun Kim Signed-off-by: Jiri Kosina --- drivers/hid/hid-appletb-kbd.c | 44 ++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/drivers/hid/hid-appletb-kbd.c b/drivers/hid/hid-appletb-kbd.c index 8feac9e3589b83..462010a758993e 100644 --- a/drivers/hid/hid-appletb-kbd.c +++ b/drivers/hid/hid-appletb-kbd.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include "hid-ids.h" @@ -62,7 +62,8 @@ struct appletb_kbd { struct input_handle kbd_handle; struct input_handle tpd_handle; struct backlight_device *backlight_dev; - struct timer_list inactivity_timer; + struct delayed_work inactivity_work; + struct work_struct restore_brightness_work; bool has_dimmed; bool has_turned_off; u8 saved_mode; @@ -164,16 +165,18 @@ static int appletb_tb_key_to_slot(unsigned int code) } } -static void appletb_inactivity_timer(struct timer_list *t) +static void appletb_inactivity_work(struct work_struct *work) { - struct appletb_kbd *kbd = timer_container_of(kbd, t, inactivity_timer); + struct appletb_kbd *kbd = container_of(to_delayed_work(work), + struct appletb_kbd, + inactivity_work); if (kbd->backlight_dev && appletb_tb_autodim) { if (!kbd->has_dimmed) { backlight_device_set_brightness(kbd->backlight_dev, 1); kbd->has_dimmed = true; - mod_timer(&kbd->inactivity_timer, - jiffies + secs_to_jiffies(appletb_tb_idle_timeout)); + mod_delayed_work(system_wq, &kbd->inactivity_work, + secs_to_jiffies(appletb_tb_idle_timeout)); } else if (!kbd->has_turned_off) { backlight_device_set_brightness(kbd->backlight_dev, 0); kbd->has_turned_off = true; @@ -181,16 +184,25 @@ static void appletb_inactivity_timer(struct timer_list *t) } } +static void appletb_restore_brightness_work(struct work_struct *work) +{ + struct appletb_kbd *kbd = container_of(work, struct appletb_kbd, + restore_brightness_work); + + if (kbd->backlight_dev) + backlight_device_set_brightness(kbd->backlight_dev, 2); +} + static void reset_inactivity_timer(struct appletb_kbd *kbd) { if (kbd->backlight_dev && appletb_tb_autodim) { if (kbd->has_dimmed || kbd->has_turned_off) { - backlight_device_set_brightness(kbd->backlight_dev, 2); kbd->has_dimmed = false; kbd->has_turned_off = false; + schedule_work(&kbd->restore_brightness_work); } - mod_timer(&kbd->inactivity_timer, - jiffies + secs_to_jiffies(appletb_tb_dim_timeout)); + mod_delayed_work(system_wq, &kbd->inactivity_work, + secs_to_jiffies(appletb_tb_dim_timeout)); } } @@ -408,9 +420,11 @@ static int appletb_kbd_probe(struct hid_device *hdev, const struct hid_device_id dev_err_probe(dev, -ENODEV, "Failed to get backlight device\n"); } else { backlight_device_set_brightness(kbd->backlight_dev, 2); - timer_setup(&kbd->inactivity_timer, appletb_inactivity_timer, 0); - mod_timer(&kbd->inactivity_timer, - jiffies + secs_to_jiffies(appletb_tb_dim_timeout)); + INIT_DELAYED_WORK(&kbd->inactivity_work, appletb_inactivity_work); + INIT_WORK(&kbd->restore_brightness_work, + appletb_restore_brightness_work); + mod_delayed_work(system_wq, &kbd->inactivity_work, + secs_to_jiffies(appletb_tb_dim_timeout)); } kbd->inp_handler.event = appletb_kbd_inp_event; @@ -444,7 +458,8 @@ static int appletb_kbd_probe(struct hid_device *hdev, const struct hid_device_id stop_hw: hid_hw_stop(hdev); if (kbd->backlight_dev) { - timer_delete_sync(&kbd->inactivity_timer); + cancel_delayed_work_sync(&kbd->inactivity_work); + cancel_work_sync(&kbd->restore_brightness_work); put_device(&kbd->backlight_dev->dev); } return ret; @@ -461,7 +476,8 @@ static void appletb_kbd_remove(struct hid_device *hdev) hid_hw_stop(hdev); if (kbd->backlight_dev) { - timer_delete_sync(&kbd->inactivity_timer); + cancel_delayed_work_sync(&kbd->inactivity_work); + cancel_work_sync(&kbd->restore_brightness_work); put_device(&kbd->backlight_dev->dev); } } From b08665fe80fab0956e64741c07d9bbcec635c34d Mon Sep 17 00:00:00 2001 From: Myeonghun Pak Date: Fri, 24 Apr 2026 21:50:41 +0900 Subject: [PATCH 796/972] HID: google: hammer: stop hardware on devres action failure hammer_probe() starts the HID hardware before registering the devres action that stops it. If devm_add_action() fails, probe returns an error with the hardware still started because the cleanup action was never registered and the driver's remove callback is not called after a failed probe. Use devm_add_action_or_reset() so the stop action runs immediately on registration failure while preserving the existing devres-managed cleanup path for later probe failures and remove. Signed-off-by: Myeonghun Pak Signed-off-by: Jiri Kosina --- drivers/hid/hid-google-hammer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/hid-google-hammer.c b/drivers/hid/hid-google-hammer.c index 1af477e58480b7..c99c3c0d442e1b 100644 --- a/drivers/hid/hid-google-hammer.c +++ b/drivers/hid/hid-google-hammer.c @@ -496,7 +496,7 @@ static int hammer_probe(struct hid_device *hdev, if (error) return error; - error = devm_add_action(&hdev->dev, hammer_stop, hdev); + error = devm_add_action_or_reset(&hdev->dev, hammer_stop, hdev); if (error) return error; From 2c85c61d1332e1e16f020d76951baf167dcb6f7a Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 4 May 2026 10:47:22 +0200 Subject: [PATCH 797/972] HID: pass the buffer size to hid_report_raw_event commit 0a3fe972a7cb ("HID: core: Mitigate potential OOB by removing bogus memset()") enforced the provided data to be at least the size of the declared buffer in the report descriptor to prevent a buffer overflow. However, we can try to be smarter by providing both the buffer size and the data size, meaning that hid_report_raw_event() can make better decision whether we should plaining reject the buffer (buffer overflow attempt) or if we can safely memset it to 0 and pass it to the rest of the stack. Fixes: 0a3fe972a7cb ("HID: core: Mitigate potential OOB by removing bogus memset()") Cc: stable@vger.kernel.org Signed-off-by: Benjamin Tissoires Acked-by: Johan Hovold Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jiri Kosina --- drivers/hid/bpf/hid_bpf_dispatch.c | 6 +++-- drivers/hid/hid-core.c | 42 ++++++++++++++++++++---------- drivers/hid/hid-gfrm.c | 4 +-- drivers/hid/hid-logitech-hidpp.c | 2 +- drivers/hid/hid-multitouch.c | 2 +- drivers/hid/hid-primax.c | 2 +- drivers/hid/hid-vivaldi-common.c | 2 +- drivers/hid/wacom_sys.c | 6 ++--- drivers/staging/greybus/hid.c | 2 +- include/linux/hid.h | 4 +-- include/linux/hid_bpf.h | 14 ++++++---- 11 files changed, 53 insertions(+), 33 deletions(-) diff --git a/drivers/hid/bpf/hid_bpf_dispatch.c b/drivers/hid/bpf/hid_bpf_dispatch.c index 50c7b45c59e3fb..d0130658091b02 100644 --- a/drivers/hid/bpf/hid_bpf_dispatch.c +++ b/drivers/hid/bpf/hid_bpf_dispatch.c @@ -24,7 +24,8 @@ EXPORT_SYMBOL(hid_ops); u8 * dispatch_hid_bpf_device_event(struct hid_device *hdev, enum hid_report_type type, u8 *data, - u32 *size, int interrupt, u64 source, bool from_bpf) + size_t *buf_size, u32 *size, int interrupt, u64 source, + bool from_bpf) { struct hid_bpf_ctx_kern ctx_kern = { .ctx = { @@ -74,6 +75,7 @@ dispatch_hid_bpf_device_event(struct hid_device *hdev, enum hid_report_type type *size = ret; } + *buf_size = ctx_kern.ctx.allocated_size; return ctx_kern.data; } EXPORT_SYMBOL_GPL(dispatch_hid_bpf_device_event); @@ -505,7 +507,7 @@ __hid_bpf_input_report(struct hid_bpf_ctx *ctx, enum hid_report_type type, u8 *b if (ret) return ret; - return hid_ops->hid_input_report(ctx->hid, type, buf, size, 0, (u64)(long)ctx, true, + return hid_ops->hid_input_report(ctx->hid, type, buf, size, size, 0, (u64)(long)ctx, true, lock_already_taken); } diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 61afec5915ecfa..a806820df7e553 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -2033,24 +2033,32 @@ int __hid_request(struct hid_device *hid, struct hid_report *report, } EXPORT_SYMBOL_GPL(__hid_request); -int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, - int interrupt) +int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, + size_t bufsize, u32 size, int interrupt) { struct hid_report_enum *report_enum = hid->report_enum + type; struct hid_report *report; struct hid_driver *hdrv; int max_buffer_size = HID_MAX_BUFFER_SIZE; u32 rsize, csize = size; + size_t bsize = bufsize; u8 *cdata = data; int ret = 0; report = hid_get_report(report_enum, data); if (!report) - goto out; + return 0; + + if (unlikely(bsize < csize)) { + hid_warn_ratelimited(hid, "Event data for report %d is incorrect (%d vs %ld)\n", + report->id, csize, bsize); + return -EINVAL; + } if (report_enum->numbered) { cdata++; csize--; + bsize--; } rsize = hid_compute_report_size(report); @@ -2063,11 +2071,16 @@ int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 * else if (rsize > max_buffer_size) rsize = max_buffer_size; + if (bsize < rsize) { + hid_warn_ratelimited(hid, "Event data for report %d was too short (%d vs %ld)\n", + report->id, rsize, bsize); + return -EINVAL; + } + if (csize < rsize) { - hid_warn_ratelimited(hid, "Event data for report %d was too short (%d vs %d)\n", - report->id, rsize, csize); - ret = -EINVAL; - goto out; + dbg_hid("report %d is too short, (%d < %d)\n", report->id, + csize, rsize); + memset(cdata + csize, 0, rsize - csize); } if ((hid->claimed & HID_CLAIMED_HIDDEV) && hid->hiddev_report_event) @@ -2075,7 +2088,7 @@ int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 * if (hid->claimed & HID_CLAIMED_HIDRAW) { ret = hidraw_report_event(hid, data, size); if (ret) - goto out; + return ret; } if (hid->claimed != HID_CLAIMED_HIDRAW && report->maxfield) { @@ -2087,15 +2100,15 @@ int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 * if (hid->claimed & HID_CLAIMED_INPUT) hidinput_report_event(hid, report); -out: + return ret; } EXPORT_SYMBOL_GPL(hid_report_raw_event); static int __hid_input_report(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 size, int interrupt, u64 source, bool from_bpf, - bool lock_already_taken) + u8 *data, size_t bufsize, u32 size, int interrupt, u64 source, + bool from_bpf, bool lock_already_taken) { struct hid_report_enum *report_enum; struct hid_driver *hdrv; @@ -2120,7 +2133,8 @@ static int __hid_input_report(struct hid_device *hid, enum hid_report_type type, report_enum = hid->report_enum + type; hdrv = hid->driver; - data = dispatch_hid_bpf_device_event(hid, type, data, &size, interrupt, source, from_bpf); + data = dispatch_hid_bpf_device_event(hid, type, data, &bufsize, &size, interrupt, + source, from_bpf); if (IS_ERR(data)) { ret = PTR_ERR(data); goto unlock; @@ -2149,7 +2163,7 @@ static int __hid_input_report(struct hid_device *hid, enum hid_report_type type, goto unlock; } - ret = hid_report_raw_event(hid, type, data, size, interrupt); + ret = hid_report_raw_event(hid, type, data, bufsize, size, interrupt); unlock: if (!lock_already_taken) @@ -2171,7 +2185,7 @@ static int __hid_input_report(struct hid_device *hid, enum hid_report_type type, int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt) { - return __hid_input_report(hid, type, data, size, interrupt, 0, + return __hid_input_report(hid, type, data, size, size, interrupt, 0, false, /* from_bpf */ false /* lock_already_taken */); } diff --git a/drivers/hid/hid-gfrm.c b/drivers/hid/hid-gfrm.c index 699186ff2349e9..d2a56bf92b416e 100644 --- a/drivers/hid/hid-gfrm.c +++ b/drivers/hid/hid-gfrm.c @@ -66,7 +66,7 @@ static int gfrm_raw_event(struct hid_device *hdev, struct hid_report *report, switch (data[1]) { case GFRM100_SEARCH_KEY_DOWN: ret = hid_report_raw_event(hdev, HID_INPUT_REPORT, search_key_dn, - sizeof(search_key_dn), 1); + sizeof(search_key_dn), sizeof(search_key_dn), 1); break; case GFRM100_SEARCH_KEY_AUDIO_DATA: @@ -74,7 +74,7 @@ static int gfrm_raw_event(struct hid_device *hdev, struct hid_report *report, case GFRM100_SEARCH_KEY_UP: ret = hid_report_raw_event(hdev, HID_INPUT_REPORT, search_key_up, - sizeof(search_key_up), 1); + sizeof(search_key_up), sizeof(search_key_up), 1); break; default: diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c index b1330d23bd2d03..b3ff9265377b95 100644 --- a/drivers/hid/hid-logitech-hidpp.c +++ b/drivers/hid/hid-logitech-hidpp.c @@ -3673,7 +3673,7 @@ static int hidpp10_consumer_keys_raw_event(struct hidpp_device *hidpp, memcpy(&consumer_report[1], &data[3], 4); /* We are called from atomic context */ hid_report_raw_event(hidpp->hid_dev, HID_INPUT_REPORT, - consumer_report, 5, 1); + consumer_report, sizeof(consumer_report), 5, 1); return 1; } diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index e82a3c4e5b44ef..eeab0b6e32ccce 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -533,7 +533,7 @@ static void mt_get_feature(struct hid_device *hdev, struct hid_report *report) } ret = hid_report_raw_event(hdev, HID_FEATURE_REPORT, buf, - size, 0); + size, size, 0); if (ret) dev_warn(&hdev->dev, "failed to report feature\n"); } diff --git a/drivers/hid/hid-primax.c b/drivers/hid/hid-primax.c index e44d79dff8de63..8db054280afbcd 100644 --- a/drivers/hid/hid-primax.c +++ b/drivers/hid/hid-primax.c @@ -44,7 +44,7 @@ static int px_raw_event(struct hid_device *hid, struct hid_report *report, data[0] |= (1 << (data[idx] - 0xE0)); data[idx] = 0; } - hid_report_raw_event(hid, HID_INPUT_REPORT, data, size, 0); + hid_report_raw_event(hid, HID_INPUT_REPORT, data, size, size, 0); return 1; default: /* unknown report */ diff --git a/drivers/hid/hid-vivaldi-common.c b/drivers/hid/hid-vivaldi-common.c index bf734055d4b69d..b12bb5cc091aa3 100644 --- a/drivers/hid/hid-vivaldi-common.c +++ b/drivers/hid/hid-vivaldi-common.c @@ -85,7 +85,7 @@ void vivaldi_feature_mapping(struct hid_device *hdev, } ret = hid_report_raw_event(hdev, HID_FEATURE_REPORT, report_data, - report_len, 0); + report_len, report_len, 0); if (ret) { dev_warn(&hdev->dev, "failed to report feature %d\n", field->report->id); diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index 0d1c6d90fe21c5..a32320b351e3ee 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -90,7 +90,7 @@ static void wacom_wac_queue_flush(struct hid_device *hdev, kfree(buf); continue; } - err = hid_report_raw_event(hdev, HID_INPUT_REPORT, buf, size, false); + err = hid_report_raw_event(hdev, HID_INPUT_REPORT, buf, size, size, false); if (err) { hid_warn(hdev, "%s: unable to flush event due to error %d\n", __func__, err); @@ -334,7 +334,7 @@ static void wacom_feature_mapping(struct hid_device *hdev, data, n, WAC_CMD_RETRIES); if (ret == n && features->type == HID_GENERIC) { ret = hid_report_raw_event(hdev, - HID_FEATURE_REPORT, data, n, 0); + HID_FEATURE_REPORT, data, n, n, 0); } else if (ret == 2 && features->type != HID_GENERIC) { features->touch_max = data[1]; } else { @@ -395,7 +395,7 @@ static void wacom_feature_mapping(struct hid_device *hdev, data, n, WAC_CMD_RETRIES); if (ret == n) { ret = hid_report_raw_event(hdev, HID_FEATURE_REPORT, - data, n, 0); + data, n, n, 0); } else { hid_warn(hdev, "%s: could not retrieve sensor offsets\n", __func__); diff --git a/drivers/staging/greybus/hid.c b/drivers/staging/greybus/hid.c index 1f58c907c03683..f1f9f6fbc00e52 100644 --- a/drivers/staging/greybus/hid.c +++ b/drivers/staging/greybus/hid.c @@ -201,7 +201,7 @@ static void gb_hid_init_report(struct gb_hid *ghid, struct hid_report *report) * we just need to setup the input fields, so using * hid_report_raw_event is safe. */ - hid_report_raw_event(ghid->hid, report->type, ghid->inbuf, size, 1); + hid_report_raw_event(ghid->hid, report->type, ghid->inbuf, ghid->bufsize, size, 1); } static void gb_hid_init_reports(struct gb_hid *ghid) diff --git a/include/linux/hid.h b/include/linux/hid.h index 442a80d79e899d..ac432a2ef415aa 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1298,8 +1298,8 @@ static inline u32 hid_report_len(struct hid_report *report) return DIV_ROUND_UP(report->size, 8) + (report->id > 0); } -int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, - int interrupt); +int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, + size_t bufsize, u32 size, int interrupt); /* HID quirks API */ unsigned long hid_lookup_quirk(const struct hid_device *hdev); diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index a2e47dbcf82c8b..19fffa4574a47c 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -72,8 +72,8 @@ struct hid_ops { int (*hid_hw_output_report)(struct hid_device *hdev, __u8 *buf, size_t len, u64 source, bool from_bpf); int (*hid_input_report)(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 size, int interrupt, u64 source, bool from_bpf, - bool lock_already_taken); + u8 *data, size_t bufsize, u32 size, int interrupt, u64 source, + bool from_bpf, bool lock_already_taken); struct module *owner; const struct bus_type *bus_type; }; @@ -200,7 +200,8 @@ struct hid_bpf { #ifdef CONFIG_HID_BPF u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, u8 *data, - u32 *size, int interrupt, u64 source, bool from_bpf); + size_t *buf_size, u32 *size, int interrupt, u64 source, + bool from_bpf); int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, u32 size, enum hid_report_type rtype, @@ -215,8 +216,11 @@ int hid_bpf_device_init(struct hid_device *hid); const u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, const u8 *rdesc, unsigned int *size); #else /* CONFIG_HID_BPF */ static inline u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, - u8 *data, u32 *size, int interrupt, - u64 source, bool from_bpf) { return data; } + u8 *data, size_t *buf_size, u32 *size, + int interrupt, u64 source, bool from_bpf) +{ + return data; +} static inline int dispatch_hid_bpf_raw_requests(struct hid_device *hdev, unsigned char reportnum, u8 *buf, u32 size, enum hid_report_type rtype, From 206342541fc887ae919774a43942dc883161fece Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Mon, 4 May 2026 10:47:23 +0200 Subject: [PATCH 798/972] HID: core: introduce hid_safe_input_report() hid_input_report() is used in too many places to have a commit that doesn't cross subsystem borders. Instead of changing the API, introduce a new one when things matters in the transport layers: - usbhid - i2chid This effectively revert to the old behavior for those two transport layers. Fixes: 0a3fe972a7cb ("HID: core: Mitigate potential OOB by removing bogus memset()") Cc: stable@vger.kernel.org Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 25 +++++++++++++++++++++++++ drivers/hid/i2c-hid/i2c-hid-core.c | 7 ++++--- drivers/hid/usbhid/hid-core.c | 11 ++++++----- include/linux/hid.h | 2 ++ 4 files changed, 37 insertions(+), 8 deletions(-) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index a806820df7e553..b3596851c7191a 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -2181,6 +2181,7 @@ static int __hid_input_report(struct hid_device *hid, enum hid_report_type type, * @interrupt: distinguish between interrupt and control transfers * * This is data entry for lower layers. + * Legacy, please use hid_safe_input_report() instead. */ int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt) @@ -2191,6 +2192,30 @@ int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data } EXPORT_SYMBOL_GPL(hid_input_report); +/** + * hid_safe_input_report - report data from lower layer (usb, bt...) + * + * @hid: hid device + * @type: HID report type (HID_*_REPORT) + * @data: report contents + * @bufsize: allocated size of the data buffer + * @size: useful size of data parameter + * @interrupt: distinguish between interrupt and control transfers + * + * This is data entry for lower layers. + * Please use this function instead of the non safe version because we provide + * here the size of the buffer, allowing hid-core to make smarter decisions + * regarding the incoming buffer. + */ +int hid_safe_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, + size_t bufsize, u32 size, int interrupt) +{ + return __hid_input_report(hid, type, data, bufsize, size, interrupt, 0, + false, /* from_bpf */ + false /* lock_already_taken */); +} +EXPORT_SYMBOL_GPL(hid_safe_input_report); + bool hid_match_one_id(const struct hid_device *hdev, const struct hid_device_id *id) { diff --git a/drivers/hid/i2c-hid/i2c-hid-core.c b/drivers/hid/i2c-hid/i2c-hid-core.c index 5a183af3d5c6a6..e0a302544cef4b 100644 --- a/drivers/hid/i2c-hid/i2c-hid-core.c +++ b/drivers/hid/i2c-hid/i2c-hid-core.c @@ -574,9 +574,10 @@ static void i2c_hid_get_input(struct i2c_hid *ihid) if (ihid->hid->group != HID_GROUP_RMI) pm_wakeup_event(&ihid->client->dev, 0); - hid_input_report(ihid->hid, HID_INPUT_REPORT, - ihid->inbuf + sizeof(__le16), - ret_size - sizeof(__le16), 1); + hid_safe_input_report(ihid->hid, HID_INPUT_REPORT, + ihid->inbuf + sizeof(__le16), + ihid->bufsize - sizeof(__le16), + ret_size - sizeof(__le16), 1); } return; diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index fbbfc0f60829be..5af93b9b1fb560 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -283,9 +283,9 @@ static void hid_irq_in(struct urb *urb) break; usbhid_mark_busy(usbhid); if (!test_bit(HID_RESUME_RUNNING, &usbhid->iofl)) { - hid_input_report(urb->context, HID_INPUT_REPORT, - urb->transfer_buffer, - urb->actual_length, 1); + hid_safe_input_report(urb->context, HID_INPUT_REPORT, + urb->transfer_buffer, urb->transfer_buffer_length, + urb->actual_length, 1); /* * autosuspend refused while keys are pressed * because most keyboards don't wake up when @@ -482,9 +482,10 @@ static void hid_ctrl(struct urb *urb) switch (status) { case 0: /* success */ if (usbhid->ctrl[usbhid->ctrltail].dir == USB_DIR_IN) - hid_input_report(urb->context, + hid_safe_input_report(urb->context, usbhid->ctrl[usbhid->ctrltail].report->type, - urb->transfer_buffer, urb->actual_length, 0); + urb->transfer_buffer, urb->transfer_buffer_length, + urb->actual_length, 0); break; case -ESHUTDOWN: /* unplug */ unplug = 1; diff --git a/include/linux/hid.h b/include/linux/hid.h index ac432a2ef415aa..bfb9859f391ee5 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1030,6 +1030,8 @@ struct hid_field *hid_find_field(struct hid_device *hdev, unsigned int report_ty int hid_set_field(struct hid_field *, unsigned, __s32); int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt); +int hid_safe_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, + size_t bufsize, u32 size, int interrupt); struct hid_field *hidinput_get_led_field(struct hid_device *hid); unsigned int hidinput_count_leds(struct hid_device *hid); __s32 hidinput_calc_abs_res(const struct hid_field *field, __u16 code); From a991aa5e89365ba1959fae6847fd288125b209e5 Mon Sep 17 00:00:00 2001 From: Xu Rao Date: Sat, 9 May 2026 16:21:32 +0800 Subject: [PATCH 799/972] HID: i2c-hid: add reset quirk for BLTP7853 touchpad The BLTP7853 I2C HID touchpad may fail to probe after reboot or reprobe because reset completion is not signalled to the host. The driver then waits for the reset-complete interrupt until it times out and the device probe fails: i2c_hid i2c-BLTP7853:00: failed to reset device. i2c_hid i2c-BLTP7853:00: can't add hid device: -61 i2c_hid: probe of i2c-BLTP7853:00 failed with error -61 Add I2C_HID_QUIRK_NO_IRQ_AFTER_RESET for the device so i2c-hid does not wait for a reset interrupt that may never arrive. Signed-off-by: Xu Rao Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 3 +++ drivers/hid/i2c-hid/i2c-hid-core.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 8cfec7dced6645..4657d96fb0836f 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -277,6 +277,9 @@ #define USB_VENDOR_ID_BIGBEN 0x146b #define USB_DEVICE_ID_BIGBEN_PS3OFMINIPAD 0x0902 +#define I2C_VENDOR_ID_BLTP 0x36b6 +#define I2C_PRODUCT_ID_BLTP7853 0xc001 + #define USB_VENDOR_ID_BTC 0x046e #define USB_DEVICE_ID_BTC_EMPREX_REMOTE 0x5578 #define USB_DEVICE_ID_BTC_EMPREX_REMOTE_2 0x5577 diff --git a/drivers/hid/i2c-hid/i2c-hid-core.c b/drivers/hid/i2c-hid/i2c-hid-core.c index e0a302544cef4b..3adb16366e9394 100644 --- a/drivers/hid/i2c-hid/i2c-hid-core.c +++ b/drivers/hid/i2c-hid/i2c-hid-core.c @@ -149,6 +149,8 @@ static const struct i2c_hid_quirks { I2C_HID_QUIRK_BOGUS_IRQ }, { I2C_VENDOR_ID_GOODIX, I2C_DEVICE_ID_GOODIX_0D42, I2C_HID_QUIRK_DELAY_WAKEUP_AFTER_RESUME }, + { I2C_VENDOR_ID_BLTP, I2C_PRODUCT_ID_BLTP7853, + I2C_HID_QUIRK_NO_IRQ_AFTER_RESET }, { 0, 0 } }; From 48d1677779ad6816978ad4a4f7588aec5ec960fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Paku=C5=82a?= Date: Sun, 10 May 2026 14:23:52 +0200 Subject: [PATCH 800/972] HID: pidff: Fix integer overflow in pidff_rescale MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rescaling values close to the max (U16_MAX) temporarily creates values that exceed the s32 range. This caused value overflow in case when, for example, a periodic effect phase was higer than 180 degrees. In turn, rescale function could return values outised of the logical range of the HID field. Fix by using 64 bit signed integer to store the value during calculation but still return only 32 bit integer. Closes: https://github.com/JacKeTUs/universal-pidff/issues/116 Fixes: 224ee88fe395 ("Input: add force feedback driver for PID devices") Cc: stable@vger.kernel.org Signed-off-by: Tomasz Pakuła Signed-off-by: Jiri Kosina --- drivers/hid/usbhid/hid-pidff.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/hid/usbhid/hid-pidff.c b/drivers/hid/usbhid/hid-pidff.c index aee8a44433059e..c45f182d044800 100644 --- a/drivers/hid/usbhid/hid-pidff.c +++ b/drivers/hid/usbhid/hid-pidff.c @@ -11,6 +11,7 @@ #include "hid-pidff.h" #include #include +#include #include #include #include @@ -326,8 +327,10 @@ static s32 pidff_clamp(s32 i, struct hid_field *field) */ static int pidff_rescale(int i, int max, struct hid_field *field) { - return i * (field->logical_maximum - field->logical_minimum) / max + - field->logical_minimum; + /* 64 bits needed for big values during rescale */ + s64 result = field->logical_maximum - field->logical_minimum; + + return div_s64(result * i, max) + field->logical_minimum; } /* From 64ffa2e5e02ff54b23221d0282155f37283fabea Mon Sep 17 00:00:00 2001 From: Alain Michaud Date: Tue, 12 May 2026 13:22:44 +0000 Subject: [PATCH 801/972] HID: logitech-hidpp: Add support for newer Bluetooth keyboards Add product IDs (PIDs) for several newer Logitech Bluetooth keyboards to the hidpp_devices matching table, enabling full HID++ support for them. The added keyboards are: - Logitech Signature K650 & B2B - Logitech Pebble Keys 2 K380S - Logitech Casa Pop-Up Desk & B2B - Logitech Wave Keys & B2B - Logitech Signature Slim K950 & B2B - Logitech MX Keys S & B2B - Logitech Keys-To-Go 2 - Logitech Pop Icon Keys - Logitech MX Keys Mini & B2B - Logitech Signature Slim Solar+ K980 B2B - Logitech Bluetooth Keyboard K250/K251 - Logitech Signature Comfort K880 & B2B Signed-off-by: Alain Michaud Reviewed-by: Olivier Gay Signed-off-by: Jiri Kosina --- drivers/hid/hid-logitech-hidpp.c | 38 ++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c index b3ff9265377b95..ccbf28869a968d 100644 --- a/drivers/hid/hid-logitech-hidpp.c +++ b/drivers/hid/hid-logitech-hidpp.c @@ -4685,6 +4685,44 @@ static const struct hid_device_id hidpp_devices[] = { HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb391) }, { /* MX Master 4 mouse over Bluetooth */ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb042) }, + { /* Logitech Signature K650 over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb36f) }, + { /* Logitech Signature K650 B2B over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb370) }, + { /* Logitech Pebble Keys 2 K380S over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb377) }, + { /* Logitech Casa Pop-Up Desk over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb371) }, + { /* Logitech Casa Pop-Up Desk B2B over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb374) }, + { /* Logitech Wave Keys over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb383) }, + { /* Logitech Wave Keys B2B over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb384) }, + { /* Logitech Signature Slim K950 over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb386) }, + { /* Logitech Signature Slim K950 B2B over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb388) }, + { /* Logitech MX Keys S over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb378) }, + { /* Logitech MX Keys S B2B over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb380) }, + { /* Logitech Keys-To-Go 2 over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb38c) }, + { /* Logitech Pop Icon Keys over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb38f) }, + { /* Logitech MX Keys Mini over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb369) }, + { /* Logitech MX Keys Mini B2B over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb36e) }, + { /* Logitech Signature Slim Solar+ K980 B2B over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb394) }, + { /* Logitech Bluetooth Keyboard K250/K251 over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb397) }, + { /* Logitech Signature Comfort K880 over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb39c) }, + { /* Logitech Signature Comfort K880 B2B over Bluetooth */ + HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_LOGITECH, 0xb39d) }, {} }; From aa16b2bc0f02709919e2435f531406531e5bcc69 Mon Sep 17 00:00:00 2001 From: Zack McKevitt Date: Thu, 30 Apr 2026 12:39:01 -0700 Subject: [PATCH 802/972] accel/qaic: Add overflow check to remap_pfn_range during mmap The call to remap_pfn_range in qaic_gem_object_mmap is susceptible to (re)mapping beyond the VMA if the BO is too large. This can cause use after free issues when munmap() unmaps only the VMA region and not the additional mappings. To prevent this, check the remaining size of the VMA before remapping and truncate the remapped length if sg->length is too large. Reported-by: Lukas Maar Fixes: ff13be830333 ("accel/qaic: Add datapath") Reviewed-by: Karol Wachowski Signed-off-by: Zack McKevitt Reviewed-by: Jeff Hugo [jhugo: fix braces from checkpatch --strict] Signed-off-by: Jeff Hugo Link: https://patch.msgid.link/20260430193858.1178641-1-zachary.mckevitt@oss.qualcomm.com --- drivers/accel/qaic/qaic_data.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/accel/qaic/qaic_data.c b/drivers/accel/qaic/qaic_data.c index 95300c2f7d8af0..1e4c579d272562 100644 --- a/drivers/accel/qaic/qaic_data.c +++ b/drivers/accel/qaic/qaic_data.c @@ -606,8 +606,11 @@ static const struct vm_operations_struct drm_vm_ops = { static int qaic_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma) { struct qaic_bo *bo = to_qaic_bo(obj); + unsigned long remap_start; unsigned long offset = 0; + unsigned long remap_end; struct scatterlist *sg; + unsigned long length; int ret = 0; if (drm_gem_is_imported(obj)) @@ -615,11 +618,27 @@ static int qaic_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_struc for (sg = bo->sgt->sgl; sg; sg = sg_next(sg)) { if (sg_page(sg)) { + /* if sg is too large for the VMA, so truncate it to fit */ + if (check_add_overflow(vma->vm_start, offset, &remap_start)) + return -EINVAL; + if (check_add_overflow(remap_start, sg->length, &remap_end)) + return -EINVAL; + + if (remap_end > vma->vm_end) { + if (check_sub_overflow(vma->vm_end, remap_start, &length)) + return -EINVAL; + } else { + length = sg->length; + } + + if (length == 0) + goto out; + ret = remap_pfn_range(vma, vma->vm_start + offset, page_to_pfn(sg_page(sg)), - sg->length, vma->vm_page_prot); + length, vma->vm_page_prot); if (ret) goto out; - offset += sg->length; + offset += length; } } From b7cdd59de5ae8062d2cb0121c429a271eb70daec Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 12 May 2026 18:25:17 +0200 Subject: [PATCH 803/972] ACPI: PAD: xen: Check ACPI_COMPANION() against NULL Every platform driver can be forced to match a device that doesn't match its list of device IDs because of device_match_driver_override(), so platform drivers that rely on the existence of a device's ACPI companion object need to verify its presence. Accordingly, add a requisite ACPI_COMPANION() check against NULL to the Xen variant of the ACPI processor aggregator device (PAD) driver. Fixes: 112b2f978afe ("ACPI: PAD: xen: Convert to a platform driver") Signed-off-by: Rafael J. Wysocki Acked-by: Juergen Gross Link: https://patch.msgid.link/3427762.aeNJFYEL58@rafael.j.wysocki --- drivers/xen/xen-acpi-pad.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/xen/xen-acpi-pad.c b/drivers/xen/xen-acpi-pad.c index 75a39862c1df3e..5b98e0e9380768 100644 --- a/drivers/xen/xen-acpi-pad.c +++ b/drivers/xen/xen-acpi-pad.c @@ -110,9 +110,13 @@ static void acpi_pad_notify(acpi_handle handle, u32 event, static int acpi_pad_probe(struct platform_device *pdev) { - struct acpi_device *device = ACPI_COMPANION(&pdev->dev); + struct acpi_device *device; acpi_status status; + device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + strcpy(acpi_device_name(device), ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME); strcpy(acpi_device_class(device), ACPI_PROCESSOR_AGGREGATOR_CLASS); From aed3c3346765e4317bb2ec6ff872e1c952e128ab Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 9 May 2026 11:47:53 +0200 Subject: [PATCH 804/972] Documentation: security-bugs: do not systematically Cc the security team With the increase of automated reports, the security team is dealing with way more messages than really needed. The reporting process works well with most teams so there is no need to systematically involve the security team in reports. Let's suggest to keep it for small lists of recipients and new reporters only. This should continue to cover the risk of lost messages while reducing the volume from prolific reporters. Cc: Greg KH Cc: Leon Romanovsky Reviewed-by: Leon Romanovsky Reviewed-by: Greg Kroah-Hartman Signed-off-by: Willy Tarreau Signed-off-by: Jonathan Corbet Message-ID: <20260509094755.2838-2-w@1wt.eu> --- Documentation/process/security-bugs.rst | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Documentation/process/security-bugs.rst b/Documentation/process/security-bugs.rst index 27b028e8586104..6dc525858125eb 100644 --- a/Documentation/process/security-bugs.rst +++ b/Documentation/process/security-bugs.rst @@ -148,7 +148,15 @@ run additional tests. Reports where the reporter does not respond promptly or cannot effectively discuss their findings may be abandoned if the communication does not quickly improve. -The report must be sent to maintainers, with the security team in ``Cc:``. +The report must be sent to maintainers. If there are two or fewer +recipients in your message, you must also always Cc: the Linux kernel +security team who will ensure the message is delivered to the proper +people, and will be able to assist small maintainer teams with processes +they may not be familiar with. For larger teams, Cc: the Linux kernel +security team for your first few reports or when seeking specific help, +such as when resending a message which got no response within a week. +Once you have become comfortable with the process for a few reports, it is +no longer necessary to Cc: the security list when sending to large teams. The Linux kernel security team can be contacted by email at . This is a private list of security officers who will help verify the bug report and assist developers working on a fix. From a03ef333fbd6cd861c8457c3d055ee3643a9baad Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 9 May 2026 11:47:54 +0200 Subject: [PATCH 805/972] Documentation: security-bugs: explain what is and is not a security bug The use of automated tools to find bugs in random locations of the kernel induces a raise of security reports even if most of them should just be reported as regular bugs. This patch is an attempt at drawing a line between what qualifies as a security bug and what does not, hoping to improve the situation and ease decision on the reporter's side. It defers the enumeration to a new file, threat-model.rst, that tries to enumerate various classes of issues that are and are not security bugs. This should permit to more easily update this file for various subsystem-specific rules without having to revisit the security bug reporting guide. Cc: Greg KH Cc: Leon Romanovsky Suggested-by: Leon Romanovsky Suggested-by: Greg KH Reviewed-by: Leon Romanovsky Reviewed-by: Shuah Khan Signed-off-by: Willy Tarreau Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jonathan Corbet Message-ID: <20260509094755.2838-3-w@1wt.eu> --- Documentation/process/index.rst | 1 + Documentation/process/security-bugs.rst | 38 +++- Documentation/process/threat-model.rst | 236 ++++++++++++++++++++++++ 3 files changed, 274 insertions(+), 1 deletion(-) create mode 100644 Documentation/process/threat-model.rst diff --git a/Documentation/process/index.rst b/Documentation/process/index.rst index dbd6ea16aca702..aa7c959a52b875 100644 --- a/Documentation/process/index.rst +++ b/Documentation/process/index.rst @@ -86,6 +86,7 @@ regressions and security problems. debugging/index handling-regressions security-bugs + threat-model cve embargoed-hardware-issues diff --git a/Documentation/process/security-bugs.rst b/Documentation/process/security-bugs.rst index 6dc525858125eb..54260dbfc64d5a 100644 --- a/Documentation/process/security-bugs.rst +++ b/Documentation/process/security-bugs.rst @@ -66,6 +66,42 @@ In addition, the following information are highly desirable: the issue appear. It is useful to share them, as they can be helpful to keep end users protected during the time it takes them to apply the fix. +What qualifies as a security bug +-------------------------------- + +It is important that most bugs are handled publicly so as to involve the widest +possible audience and find the best solution. By nature, bugs that are handled +in closed discussions between a small set of participants are less likely to +produce the best possible fix (e.g., risk of missing valid use cases, limited +testing abilities). + +It turns out that the majority of the bugs reported via the security team are +just regular bugs that have been improperly qualified as security bugs due to +a lack of awareness of the Linux kernel's threat model, as described in +Documentation/process/threat-model.rst, and ought to have been sent through +the normal channels described in Documentation/admin-guide/reporting-issues.rst +instead. + +The security list exists for urgent bugs that grant an attacker a capability +they are not supposed to have on a correctly configured production system, and +can be easily exploited, representing an imminent threat to many users. Before +reporting, consider whether the issue actually crosses a trust boundary on such +a system. + +**If you resorted to AI assistance to identify a bug, you must treat it as +public**. While you may have valid reasons to believe it is not, the security +team's experience shows that bugs discovered this way systematically surface +simultaneously across multiple researchers, often on the same day. In this +case, do not publicly share a reproducer, as this could cause unintended harm; +just mention that one is available and maintainers might ask for it privately +if they need it. + +If you are unsure whether an issue qualifies, err on the side of reporting +privately: the security team would rather triage a borderline report than miss +a real vulnerability. Reporting ordinary bugs to the security list, however, +does not make them move faster and consumes triage capacity that other reports +need. + Identifying contacts -------------------- @@ -74,7 +110,7 @@ affected subsystem's maintainers and Cc: the Linux kernel security team. Do not send it to a public list at this stage, unless you have good reasons to consider the issue as being public or trivial to discover (e.g. result of a widely available automated vulnerability scanning tool that can be repeated by -anyone). +anyone, or use of AI-based tools). If you're sending a report for issues affecting multiple parts in the kernel, even if they're fairly similar issues, please send individual messages (think diff --git a/Documentation/process/threat-model.rst b/Documentation/process/threat-model.rst new file mode 100644 index 00000000000000..ecb432390e7924 --- /dev/null +++ b/Documentation/process/threat-model.rst @@ -0,0 +1,236 @@ +.. _threatmodel: + +The Linux Kernel threat model +============================= + +There are a lot of assumptions regarding what the kernel does and does not +protect against. These assumptions tend to cause confusion for bug reports +(:doc:`security-related ones ` vs :doc:`non-security ones +<../admin-guide/reporting-issues>`), and can complicate security enforcement +when the responsibilities for some boundaries is not clear between the kernel, +distros, administrators and users. + +This document tries to clarify the responsibilities of the kernel in this +domain. + +The kernel's responsibilities +----------------------------- + +The kernel abstracts access to local hardware resources and to remote systems +in a way that allows multiple local users to get a fair share of the available +resources granted to them, and, when the underlying hardware permits, to assign +a level of confidentiality to their communications and to the data they are +processing or storing. + +The kernel assumes that the underlying hardware behaves according to its +specifications. This includes the integrity of the CPU's instruction set, the +transparency of the branch prediction unit and the cache units, the consistency +of the Memory Management Unit (MMU), the isolation of DMA-capable peripherals +(e.g., via IOMMU), state transitions in controllers, ranges of values read from +registers, the respect of documented hardware limitations, etc. + +When hardware fails to maintain its specified isolation (e.g., CPU bugs, +side-channels, hardware response to unexpected inputs), the kernel will usually +attempt to implement reasonable mitigations. These are best-effort measures +intended to reduce the attack surface or elevate the cost of an attack within +the limits of the hardware's facilities; they do not constitute a +kernel-provided safety guarantee. + +Users always perform their activities under the authority of an administrator +who is able to grant or deny various types of permissions that may affect how +users benefit from available resources, or the level of confidentiality of +their activities. Administrators may also delegate all or part of their own +permissions to some users, particularly via capabilities but not only. All this +is performed via configuration (sysctl, file-system permissions etc). + +The Linux Kernel applies a certain collection of default settings that match +its threat model. Distros have their own threat model and will come with their +own configuration presets, that the administrator may have to adjust to better +suit their expectations (relax or restrict). + +By default, the Linux Kernel guarantees the following protections when running +on common processors featuring privilege levels and memory management units: + +* **User-based isolation**: an unprivileged user may restrict access to their + own data from other unprivileged users running on the same system. This + includes: + + * stored data, via file system permissions + * in-memory data (pages are not accessible by default to other users) + * process activity (ptrace is not permitted to other users) + * inter-process communication (other users may not observe data exchanged via + UNIX domain sockets or other IPC mechanisms). + * network communications within the same or with other systems + +* **Capability-based protection**: + + * users not having the ``CAP_SYS_ADMIN`` capability may not alter the + kernel's configuration, memory nor state, change other users' view of the + file system layout, grant any user capabilities they do not have, nor + affect the system's availability (shutdown, reboot, panic, hang, or making + the system unresponsive via unbounded resource exhaustion). + * users not having the ``CAP_NET_ADMIN`` capability may not alter the network + configuration, intercept nor spoof network communications from other users + nor systems. + * users not having ``CAP_SYS_PTRACE`` may not observe other users' processes + activities. + +When ``CONFIG_USER_NS`` is set, the kernel also permits unprivileged users to +create their own user namespace in which they have all capabilities, but with a +number of restrictions (they may not perform actions that have impacts on the +initial user namespace, such as changing time, loading modules or mounting +block devices). Please refer to ``user_namespaces(7)`` for more details, the +possibilities of user namespaces are not covered in this document. + +The kernel also offers a lot of troubleshooting and debugging facilities, which +can constitute attack vectors when placed in wrong hands. While some of them +are designed to be accessible to regular local users with a low risk (e.g. +kernel logs via ``/proc/kmsg``), some would expose enough information to +represent a risk in most places and the decision to expose them is under the +administrator's responsibility (perf events, traces), and others are not +designed to be accessed by non-privileged users (e.g. debugfs). Access to these +facilities by a user who has been explicitly granted permission by an +administrator does not constitute a security breach. + +Bugs that permit to violate the principles above constitute security breaches. +However, bugs that permit one violation only once another one was already +achieved are only weaknesses. The kernel applies a number of self-protection +measures whose purpose is to avoid crossing a security boundary when certain +classes of bugs are found, but a failure of these extra protections do not +constitute a vulnerability alone. + +What does not constitute a security bug +--------------------------------------- + +In the Linux kernel's threat model, the following classes of problems are +**NOT** considered as Linux Kernel security bugs. However, when it is believed +that the kernel could do better, they should be reported, so that they can be +reviewed and fixed where reasonably possible, but they will be handled as any +regular bug: + +* **Configuration**: + + * outdated kernels and particularly end-of-life branches are out of the scope + of the kernel's threat model: administrators are responsible for keeping + their system up to date. For a bug to qualify as a security bug, it must be + demonstrated that it affects actively maintained versions. + + * build-level: changes to the kernel configuration that are explicitly + documented as lowering the security level (e.g. ``CONFIG_NOMMU``), or + targeted at developers only. + + * OS-level: changes to command line parameters, sysctls, filesystem + permissions, user capabilities, exposure of privileged interfaces, that + explicitly increase exposure by either offering non-default access to + unprivileged users, or reduce the kernel's ability to enforce some + protections or mitigations. Example: write access to procfs or debugfs. + + * issues triggered only when using features intended for development or + debugging (e.g., LOCKDEP, KASAN, FAULT_INJECTION): these features are known + to introduce overhead and potential instability and are not intended for + production use. + + * issues affecting drivers exposed under CONFIG_STAGING, as well as features + marked EXPERIMENTAL in the configuration. + + * loading of explicitly insecure/broken/staging modules, and generally any + using any subsystem marked as experimental or not intended for production + use. + + * running out-of-tree modules or unofficial kernel forks; these should be + reported to the relevant vendor. + +* **Excess of initial privileges**: + + * actions performed by a user already possessing the privileges required to + perform that action or modify that state (e.g. ``CAP_SYS_ADMIN``, + ``CAP_NET_ADMIN``, ``CAP_SYS_RAWIO``, ``CAP_SYS_MODULE`` with no further + boundary being crossed). + + * actions performed in user namespace that do not bypass the restrictions + imposed to the initial user (e.g. ptrace usage, signal delivery, resource + usage, access to FS/device/sysctl/memory, network binding, system/network + configuration etc). + + * anything performed by the root user in the initial namespace (e.g. kernel + oops when writing to a privileged device). + +* **Out of production use**: + + This covers theoretical/probabilistic attacks that rely on laboratory + conditions with zero system noise, or those requiring an unrealistic number + of attempts (e.g., billions of trials) that would be detected by standard + system monitoring long before success, such as: + + * prediction of random numbers that only works in a totally silent + environment (such as IP ID, TCP ports or sequence numbers that can only be + guessed in a lab). + + * activity observation and information leaks based on probabilistic + approaches that are prone to measurement noise and not realistically + reproducible on a production system. + + * issues that can only be triggered by heavy attacks (e.g. brute force) whose + impact on the system makes it unlikely or impossible to remain undetected + before they succeed (e.g. consuming all memory before succeeding). + + * problems seen only under development simulators, emulators, or combinations + that do not exist on real systems at the time of reporting (issues + involving tens of millions of threads, tens of thousands of CPUs, + unrealistic CPU frequencies, RAM sizes or disk capacities, network speeds. + + * issues whose reproduction requires hardware modification or emulation, + including fake USB devices that pretend to be another one. + + * as well as issues that can be triggered at a cost that is orders of + magnitude higher than the expected benefits (e.g. fully functional keyboard + emulator only to retrieve 7 uninitialized bytes in a structure, or + brute-force method involving millions of connection attempts to guess a + port number). + +* **Hardening failures**: + + * ability to bypass some of the kernel's hardening measures with no + demonstrable exploit path (e.g. ASLR bypass, events timing or probing with + no demonstrable consequence). These are just weaknesses, not + vulnerabilities. + + * missing argument checks and failure to report certain errors with no + immediate consequence. + +* **Random information leaks**: + + This concerns information leaks of small data parts that happen to be there + and that cannot be chosen by the attacker, or face access restrictions: + + * structure padding reported by syscalls or other interfaces. + + * identifiers, partial data, non-terminated strings reported in error + messages. + + * Leaks of kernel memory addresses/pointers do not constitute an immediately + exploitable vector and are not security bugs, though they must be reported + and fixed. + +* **Crafted file system images**: + + * bugs triggered by mounting a corrupted or maliciously crafted file system + image are generally not security bugs, as the kernel assumes the underlying + storage media is under the administrator's control, unless the filesystem + driver is specifically documented as being hardened against untrusted media. + + * issues that are resolved, mitigated, or detected by running a filesystem + consistency check (fsck) on the image prior to mounting. + +* **Physical access**: + + Issues that require physical access to the machine, hardware modification, or + the use of specialized hardware (e.g., logic analyzers, DMA-attack tools over + PCI-E/Thunderbolt) are out of scope unless the system is explicitly + configured with technologies meant to defend against such attacks + (e.g. IOMMU). + +* **Functional and performance regressions**: + + Any issue that can be mitigated by setting proper permissions and limits + doesn't qualify as a security bug. From 4bf85afb9f3ecd7c3b5d15a85b0902f8e725cd06 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 9 May 2026 11:47:55 +0200 Subject: [PATCH 806/972] Documentation: security-bugs: clarify requirements for AI-assisted reports AI tools are increasingly used to assist in bug discovery. While these tools can identify valid issues, reports that are submitted without manual verification often lack context, contain speculative impact assessments, or include unnecessary formatting. Such reports increase triage effort, waste maintainers' time and may be ignored. Reports where the reporter has verified the issue and the proposed fix typically meet quality standards. This documentation outlines specific requirements for length, formatting, and impact evaluation to reduce the effort needed to deal with these reports. Cc: Greg KH Acked-by: Greg Kroah-Hartman Reviewed-by: Leon Romanovsky Signed-off-by: Willy Tarreau Signed-off-by: Jonathan Corbet Message-ID: <20260509094755.2838-4-w@1wt.eu> --- Documentation/process/security-bugs.rst | 57 +++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/Documentation/process/security-bugs.rst b/Documentation/process/security-bugs.rst index 54260dbfc64d5a..f85c65f31f12f7 100644 --- a/Documentation/process/security-bugs.rst +++ b/Documentation/process/security-bugs.rst @@ -167,6 +167,63 @@ the Linux kernel security team only. Your message will be triaged, and you will receive instructions about whom to contact, if needed. Your message may equally be forwarded as-is to the relevant maintainers. +Responsible use of AI to find bugs +---------------------------------- + +A significant fraction of bug reports submitted to the security team are +actually the result of code reviews assisted by AI tools. While this can be an +efficient means to find bugs in rarely explored areas, it causes an overload on +maintainers, who are sometimes forced to ignore such reports due to their poor +quality or accuracy. As such, reporters must be particularly cautious about a +number of points which tend to make these reports needlessly difficult to +handle: + + * **Length**: AI-generated reports tend to be excessively long, containing + multiple sections and excessive detail. This makes it difficult to spot + important information such as affected files, versions, and impact. Please + ensure that a clear summary of the problem and all critical details are + presented first. Do not require triage engineers to scan multiple pages of + text. Configure your tools to produce concise, human-style reports. + + * **Formatting**: Most AI-generated reports are littered with Markdown tags. + These decorations complicate the search for important information and do + not survive the quoting processes involved in forwarding or replying. + Please **always convert your report to plain text** without any formatting + decorations before sending it. + + * **Impact Evaluation**: Many AI-generated reports lack an understanding of + the kernel's threat model and go to great lengths inventing theoretical + consequences. This adds noise and complicates triage. Please stick to + verifiable facts (e.g., "this bug permits any user to gain CAP_NET_ADMIN") + without enumerating speculative implications. Have your tool read this + documentation as part of the evaluation process. + + * **Reproducer**: AI-based tools are often capable of generating reproducers. + Please always ensure your tool provides one and **test it thoroughly**. If + the reproducer does not work, or if the tool cannot produce one, the + validity of the report should be seriously questioned. Note that since the + report will be posted to a public list, the reproducer should only be + shared upon maintainers' request. + + * **Propose a Fix**: Many AI tools are actually better at writing code than + evaluating it. Please ask your tool to propose a fix and **test it** before + reporting the problem. If the fix cannot be tested because it relies on + rare hardware or almost extinct network protocols, the issue is likely not + a security bug. In any case, if a fix is proposed, it must adhere to + Documentation/process/submitting-patches.rst and include a 'Fixes:' tag + designating the commit that introduced the bug. + +Failure to consider these points exposes your report to the risk of being +ignored. + +Use common sense when evaluating the report. If the affected file has not been +touched for more than one year and is maintained by a single individual, it is +likely that usage has declined and exposed users are virtually non-existent +(e.g., drivers for very old hardware, obsolete filesystems). In such cases, +there is no need to consume a maintainer's time with an unimportant report. If +the issue is clearly trivial and publicly discoverable, you should report it +directly to the public mailing lists. + Sending the report ------------------ From 793d2a057f7e7bc647c5401413f7bf7d4f08b969 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 11 May 2026 21:54:51 +0200 Subject: [PATCH 807/972] hwmon: (acpi_power_meter) Check ACPI_COMPANION() against NULL Every platform driver can be forced to match a device that doesn't match its list of device IDs because of device_match_driver_override(), so platform drivers that rely on the existence of a device's ACPI companion object need to verify its presence. Accordingly, add a requisite ACPI_COMPANION() check against NULL to the acpi_power_meter hwmon driver. Fixes: afc6c4aedea5 ("hwmon: (acpi_power_meter) Convert ACPI driver to a platform one") Signed-off-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/5068745.GXAFRqVoOG@rafael.j.wysocki Signed-off-by: Guenter Roeck --- drivers/hwmon/acpi_power_meter.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/acpi_power_meter.c b/drivers/hwmon/acpi_power_meter.c index be7f702dcde9c0..0c9b9f4180fb77 100644 --- a/drivers/hwmon/acpi_power_meter.c +++ b/drivers/hwmon/acpi_power_meter.c @@ -884,10 +884,14 @@ static void acpi_power_meter_notify(acpi_handle handle, u32 event, void *data) static int acpi_power_meter_probe(struct platform_device *pdev) { - struct acpi_device *device = ACPI_COMPANION(&pdev->dev); struct acpi_power_meter_resource *resource; + struct acpi_device *device; int res; + device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + resource = kzalloc_obj(*resource); if (!resource) return -ENOMEM; From f06035ab324e52a845079c2e5f2380fa3cebde9b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 11 May 2026 21:56:14 +0200 Subject: [PATCH 808/972] hwmon: (asus_atk0110) Check ACPI_COMPANION() against NULL Every platform driver can be forced to match a device that doesn't match its list of device IDs because of device_match_driver_override(), so platform drivers that rely on the existence of a device's ACPI companion object need to verify its presence. Accordingly, add a requisite ACPI_HANDLE() check against NULL to the asus_atk0110 hwmon driver. Fixes: ee1752590733 ("hwmon: (asus_atk0110) Convert ACPI driver to a platform one") Signed-off-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/2261594.irdbgypaU6@rafael.j.wysocki Signed-off-by: Guenter Roeck --- drivers/hwmon/asus_atk0110.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/asus_atk0110.c b/drivers/hwmon/asus_atk0110.c index 5688ff5f7c28d8..109318b0434d99 100644 --- a/drivers/hwmon/asus_atk0110.c +++ b/drivers/hwmon/asus_atk0110.c @@ -1273,15 +1273,20 @@ static int atk_probe(struct platform_device *pdev) struct acpi_buffer buf; union acpi_object *obj; struct atk_data *data; + acpi_handle handle; dev_dbg(&pdev->dev, "adding...\n"); + handle = ACPI_HANDLE(&pdev->dev); + if (!handle) + return -ENODEV; + data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; data->dev = &pdev->dev; - data->atk_handle = ACPI_HANDLE(&pdev->dev); + data->atk_handle = handle; INIT_LIST_HEAD(&data->sensor_list); data->disable_ec = false; From d289478cfc0bcf81c7914200d6abdcb78bd04ded Mon Sep 17 00:00:00 2001 From: Raphael Zimmer Date: Tue, 12 May 2026 09:29:30 +0200 Subject: [PATCH 809/972] libceph: handle rbtree insertion error in decode_choose_args() A message of type CEPH_MSG_OSD_MAP contains an OSD map that itself contains a CRUSH map. The received CRUSH map may optionally contain choose_args that get decoded in decode_choose_args(). In this function, num_choose_arg_maps is read from the message, and a corresponding number of crush_choose_arg_maps gets decoded afterwards. Each crush_choose_arg_map has a choose_args_index, which serves as the key when inserting it into the choose_args rbtree of the decoded crush_map. If a (potentially corrupted) message contains two crush_choose_arg_maps with the same index, the assertion in insert_choose_arg_map() triggers a kernel BUG when trying to insert the second crush_choose_arg_map. This patch fixes the issue by switching to the non-asserting rbtree insertion function and rejecting the message if the insertion fails. [ idryomov: changelog ] Cc: stable@vger.kernel.org Signed-off-by: Raphael Zimmer Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/osdmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 2095e73ccf6c93..0752a3808b91aa 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -392,7 +392,10 @@ static int decode_choose_args(void **p, void *end, struct crush_map *c) goto e_inval; } - insert_choose_arg_map(&c->choose_args, arg_map); + if (!__insert_choose_arg_map(&c->choose_args, arg_map)) { + ret = -EEXIST; + goto fail; + } } return 0; From 28b0a2ab8c82d0bbdeb8013029c67c978ce6e4bf Mon Sep 17 00:00:00 2001 From: Raphael Zimmer Date: Tue, 12 May 2026 18:16:40 +0200 Subject: [PATCH 810/972] libceph: Fix potential null-ptr-deref in decode_choose_args() A message of type CEPH_MSG_OSD_MAP contains an OSD map that itself contains a CRUSH map. When decoding this CRUSH map in crush_decode(), an array of max_buckets CRUSH buckets is decoded, where some indices may not refer to actual buckets and are therefore set to NULL. The received CRUSH map may optionally contain choose_args that get decoded in decode_choose_args(). When decoding a crush_choose_arg_map, a series of choose_args for different buckets is decoded, with the bucket_index being read from the incoming message. It is only checked that the bucket index does not exceed max_buckets, but not that it doesn't point to an index with a NULL bucket. If a (potentially corrupted) message contains a crush_choose_arg_map including such a bucket_index, a null pointer dereference may occur in the subsequent processing when attempting to access the bucket with the given index. This patch fixes the issue by extending the affected check. Now, it is only attempted to access the bucket if it is not NULL. Cc: stable@vger.kernel.org Signed-off-by: Raphael Zimmer Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/osdmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 0752a3808b91aa..8b5b0587a0cfa2 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -388,7 +388,8 @@ static int decode_choose_args(void **p, void *end, struct crush_map *c) goto fail; if (arg->ids_size && - arg->ids_size != c->buckets[bucket_index]->size) + (!c->buckets[bucket_index] || + arg->ids_size != c->buckets[bucket_index]->size)) goto e_inval; } From e4a640475e43f406fdfd56d370b1f34b0cbbc18d Mon Sep 17 00:00:00 2001 From: Sergio Correia Date: Tue, 12 May 2026 14:28:33 +0100 Subject: [PATCH 811/972] audit: fix incorrect inheritable capability in CAPSET records __audit_log_capset() records the effective capability set into the inheritable field due to a copy-paste error. Every CAPSET audit record therefore reports cap_pi (process inheritable) with the value of cap_effective instead of cap_inheritable. This silently corrupts audit data used for compliance and forensic analysis: an attacker who modifies inheritable capabilities to prepare for a privilege-escalating exec would have the change masked in the audit trail. The bug has been present since the original introduction of CAPSET audit records in 2008. Cc: stable@vger.kernel.org Fixes: e68b75a027bb ("When the capset syscall is used it is not possible for audit to record the actual capbilities being added/removed. This patch adds a new record type which emits the target pid and the eff, inh, and perm cap sets.") Reviewed-by: Ricardo Robaina Assisted-by: Claude:claude-opus-4-6 Signed-off-by: Sergio Correia Signed-off-by: Paul Moore --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ab54fccba215ca..abdf8da3be9340 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2786,7 +2786,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old) context->capset.pid = task_tgid_nr(current); context->capset.cap.effective = new->cap_effective; - context->capset.cap.inheritable = new->cap_effective; + context->capset.cap.inheritable = new->cap_inheritable; context->capset.cap.permitted = new->cap_permitted; context->capset.cap.ambient = new->cap_ambient; context->type = AUDIT_CAPSET; From f9e1c1324b4d98d591a6f7568fdebf5cf456dfc2 Mon Sep 17 00:00:00 2001 From: Sergio Correia Date: Tue, 12 May 2026 14:28:59 +0100 Subject: [PATCH 812/972] audit: enforce AUDIT_LOCKED for AUDIT_TRIM and AUDIT_MAKE_EQUIV AUDIT_ADD_RULE and AUDIT_DEL_RULE correctly check for AUDIT_LOCKED and return -EPERM, but AUDIT_TRIM and AUDIT_MAKE_EQUIV do not. This allows a process with CAP_AUDIT_CONTROL to modify directory tree watches and equivalence mappings even when the audit configuration has been locked, undermining the purpose of the lock. Add AUDIT_LOCKED checks to both commands. Cc: stable@vger.kernel.org Reviewed-by: Ricardo Robaina Assisted-by: Claude:claude-opus-4-6 Signed-off-by: Sergio Correia Signed-off-by: Paul Moore --- kernel/audit.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/audit.c b/kernel/audit.c index e1d489bc2dff99..34dc7cb246ff27 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1468,6 +1468,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, err = audit_list_rules_send(skb, seq); break; case AUDIT_TRIM: + if (audit_enabled == AUDIT_LOCKED) + return -EPERM; audit_trim_trees(); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); @@ -1480,6 +1482,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, size_t msglen = data_len; char *old, *new; + if (audit_enabled == AUDIT_LOCKED) + return -EPERM; err = -EINVAL; if (msglen < 2 * sizeof(u32)) break; From 577a8d3bae0531f0e5ccfac919cd8192f920a804 Mon Sep 17 00:00:00 2001 From: Aaron Sacks Date: Tue, 12 May 2026 02:07:42 -0400 Subject: [PATCH 813/972] KVM: Reject wrapped offset in kvm_reset_dirty_gfn() kvm_reset_dirty_gfn() guards the gfn range with if (!memslot || (offset + __fls(mask)) >= memslot->npages) return; but offset is u64 and the addition is unchecked. The check can be silently bypassed by a u64 wrap. The dirty ring backing those entries is MAP_SHARED at KVM_DIRTY_LOG_PAGE_OFFSET of the vcpu fd, so the VMM can rewrite the slot and offset fields of any entry between when the kernel pushes them and when KVM_RESET_DIRTY_RINGS consumes them. On reset, kvm_dirty_ring_reset() re-reads the values via READ_ONCE() and feeds them straight back into this check; only the flags handshake is treated as the handover, the slot/offset payload is taken on trust. Crafting two entries entry[i].offset = 0xffffffffffffffc1 entry[i+1].offset = 0 makes the coalescing loop in kvm_dirty_ring_reset() compute delta = (s64)(0 - 0xffffffffffffffc1) = 63 which falls in [0, BITS_PER_LONG), so it folds entry[i+1] into the existing mask by setting bit 63. The trailing kvm_reset_dirty_gfn() call then sees offset = 0xffffffffffffffc1 and __fls(mask) = 63; the sum is 0 in u64 and the bounds check passes. That offset propagates into kvm_arch_mmu_enable_log_dirty_pt_masked() unchanged. On the legacy MMU path -- kvm_memslots_have_rmaps() == true, i.e. shadow paging, any VM that has allocated shadow roots, or a write-tracked slot -- it reaches gfn_to_rmap(), which indexes slot->arch.rmap[0][] with a near-U64_MAX gfn. That is an out-of-bounds load of a kvm_rmap_head, followed by a conditional clear of PT_WRITABLE_MASK in whatever the loaded pointer points at. The path is reachable from any process holding /dev/kvm. Range-check offset on its own first, so the addition cannot wrap. memslot->npages is bounded well below U64_MAX, so once offset < npages holds, offset + __fls(mask) (with __fls(mask) < BITS_PER_LONG) stays in range. Fixes: fb04a1eddb1a ("KVM: X86: Implement ring-based dirty memory tracking") Cc: stable@vger.kernel.org Signed-off-by: Aaron Sacks Link: https://patch.msgid.link/20260512060742.1628959-1-contact@xchglabs.com/ Signed-off-by: Paolo Bonzini --- virt/kvm/dirty_ring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index 02bc6b00d76cbd..572b854edf740d 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -63,7 +63,8 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id); - if (!memslot || (offset + __fls(mask)) >= memslot->npages) + if (!memslot || offset >= memslot->npages || + offset + __fls(mask) >= memslot->npages) return; KVM_MMU_LOCK(kvm); From 2b72f1674e427c56e3772c5ccf785fdda2138820 Mon Sep 17 00:00:00 2001 From: Qiang Ma Date: Tue, 12 May 2026 09:53:13 +0800 Subject: [PATCH 814/972] KVM: x86: Fix Xen hypercall tracepoint argument assignment TRACE_EVENT(kvm_xen_hypercall) stores a5 in __entry->a4 instead of __entry->a5. That overwrites the recorded a4 argument and leaves a5 unset in the trace entry. Fix the typo so both arguments are captured correctly. Signed-off-by: Qiang Ma Link: https://patch.msgid.link/20260512015313.1685784-1-maqianga@uniontech.com/ Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index e7fdbe9efc904c..0db25bba17f6e8 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -154,7 +154,7 @@ TRACE_EVENT(kvm_xen_hypercall, __entry->a2 = a2; __entry->a3 = a3; __entry->a4 = a4; - __entry->a4 = a5; + __entry->a5 = a5; ), TP_printk("cpl %d nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx", From 5bd1ddb7911ba7e94b61cf429970963f1b22dd76 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 May 2026 14:33:21 -0700 Subject: [PATCH 815/972] KVM: nSVM: Never use L0's PAUSE loop exiting while L2 is running Never use L0's (KVM's) PAUSE loop exiting controls while L2 is running, and instead always configure vmcb02 according to L1's exact capabilities and desires. The purpose of intercepting PAUSE after N attempts is to detect when the vCPU may be stuck waiting on a lock, so that KVM can schedule in a different vCPU that may be holding said lock. Barring a very interesting setup, L1 and L2 do not share locks, and it's extremely unlikely that an L1 vCPU would hold a spinlock while running L2. I.e. having a vCPU executing in L1 yield to a vCPU running in L2 will not allow the L1 vCPU to make forward progress, and vice versa. While teaching KVM's "on spin" logic to only yield to other vCPUs in L2 is doable, in all likelihood it would do more harm than good for most setups. KVM has limited visibility into which L2 "vCPUs" belong to the same VM, and thus share a locking domain. And even if L2 vCPUs are in the same VM, KVM has no visilibity into L2 vCPU's that are scheduled out by the L1 hypervisor. Furthermore, KVM doesn't actually steal PAUSE exits from L1. If L1 is intercepting PAUSE, KVM will route PAUSE exits to L1, not L0, as nested_svm_intercept() gives priority to the vmcb12 intercept. As such, overriding the count/threshold fields in vmcb02 with vmcb01's values is nonsensical, as doing so clobbers all the training/learning that has been done in L1. Even worse, if L1 is not intercepting PAUSE, i.e. KVM is handling PAUSE exits, then KVM will adjust the PLE knobs based on L2 behavior, which could very well be detrimental to L1, e.g. due to essentially poisoning L1 PLE training with bad data. And copying the count from vmcb02 to vmcb01 on a nested VM-Exit makes even less sense, because again, the purpose of PLE is to detect spinning vCPUs. Whether or not a vCPU is spinning in L2 at the time of a nested VM-Exit has no relevance as to the behavior of the vCPU when it executes in L1. The only scenarios where any of this actually works is if at least one of KVM or L1 is NOT intercepting PAUSE for the guest. Per the original changelog, those were the only scenarios considered to be supported. Disabling KVM's use of PLE makes it so the VM is always in a "supported" mode. Last, but certainly not least, using KVM's count/threshold instead of the values provided by L1 is a blatant violation of the SVM architecture. Fixes: 74fd41ed16fd ("KVM: x86: nSVM: support PAUSE filtering when L0 doesn't intercept PAUSE") Cc: Maxim Levitsky Tested-by: David Kaplan Signed-off-by: Sean Christopherson Link: https://patch.msgid.link/20260508213321.373309-1-seanjc@google.com/ Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 43 +++++++++++++-------------------------- arch/x86/kvm/svm/svm.c | 15 ++++++++++++-- 2 files changed, 27 insertions(+), 31 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 961804df5f451c..b340dc9991adb4 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -160,6 +160,16 @@ void nested_vmcb02_recalc_intercepts(struct vcpu_svm *svm) if (!intercept_smi) vmcb_clr_intercept(&vmcb02->control, INTERCEPT_SMI); + /* + * Intercept PAUSE if and only if L1 wants to. KVM intercepts PAUSE so + * that a vCPU that may be spinning waiting for a lock can be scheduled + * out in favor of the vCPU that holds said lock. KVM doesn't support + * yielding across L2 vCPUs, as KVM has limited visilibity into which + * L2 vCPUs are in the same L2 VM, i.e. may be contending for locks. + */ + if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) + vmcb_clr_intercept(&vmcb02->control, INTERCEPT_PAUSE); + if (nested_vmcb_needs_vls_intercept(svm)) { /* * If the virtual VMLOAD/VMSAVE is not enabled for the L2, @@ -819,7 +829,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; struct vmcb *vmcb01 = svm->vmcb01.ptr; struct kvm_vcpu *vcpu = &svm->vcpu; - u32 pause_count12, pause_thresh12; nested_svm_transition_tlb_flush(vcpu); @@ -947,31 +956,13 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) vmcb02->control.misc_ctl2 |= SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE; if (guest_cpu_cap_has(vcpu, X86_FEATURE_PAUSEFILTER)) - pause_count12 = vmcb12_ctrl->pause_filter_count; + vmcb02->control.pause_filter_count = vmcb12_ctrl->pause_filter_count; else - pause_count12 = 0; + vmcb02->control.pause_filter_count = 0; if (guest_cpu_cap_has(vcpu, X86_FEATURE_PFTHRESHOLD)) - pause_thresh12 = vmcb12_ctrl->pause_filter_thresh; + vmcb02->control.pause_filter_thresh = vmcb12_ctrl->pause_filter_thresh; else - pause_thresh12 = 0; - if (kvm_pause_in_guest(svm->vcpu.kvm)) { - /* use guest values since host doesn't intercept PAUSE */ - vmcb02->control.pause_filter_count = pause_count12; - vmcb02->control.pause_filter_thresh = pause_thresh12; - - } else { - /* start from host values otherwise */ - vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count; - vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh; - - /* ... but ensure filtering is disabled if so requested. */ - if (vmcb12_is_intercept(vmcb12_ctrl, INTERCEPT_PAUSE)) { - if (!pause_count12) - vmcb02->control.pause_filter_count = 0; - if (!pause_thresh12) - vmcb02->control.pause_filter_thresh = 0; - } - } + vmcb02->control.pause_filter_thresh = 0; /* * Take ALLOW_LARGER_RAP from vmcb12 even though it should be safe to @@ -1298,12 +1289,6 @@ void nested_svm_vmexit(struct vcpu_svm *svm) /* in case we halted in L2 */ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); - if (!kvm_pause_in_guest(vcpu->kvm)) { - vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count; - vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); - - } - /* * Invalidate last_bus_lock_rip unless KVM is still waiting for the * guest to make forward progress before re-enabling bus lock detection. diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e7fdd7a9c280d7..e02a38da5296e3 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -913,7 +913,15 @@ static void grow_ple_window(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; int old = control->pause_filter_count; - if (kvm_pause_in_guest(vcpu->kvm)) + /* Adjusting pause_filter_count makes no sense if PLE is disabled. */ + WARN_ON_ONCE(kvm_pause_in_guest(vcpu->kvm)); + + /* + * While running L2, KVM should intercept PAUSE if and only if L1 wants + * to intercept PAUSE, and L1's intercept should take priority, i.e. + * KVM should never handle a PAUSE intercept from L2. + */ + if (WARN_ON_ONCE(is_guest_mode(vcpu))) return; control->pause_filter_count = __grow_ple_window(old, @@ -934,7 +942,10 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; int old = control->pause_filter_count; - if (kvm_pause_in_guest(vcpu->kvm)) + /* Adjusting pause_filter_count makes no sense if PLE is disabled. */ + WARN_ON_ONCE(kvm_pause_in_guest(vcpu->kvm)); + + if (is_guest_mode(vcpu)) return; control->pause_filter_count = From 80f4a7b8ce7513c203562191426e4d4cc635b095 Mon Sep 17 00:00:00 2001 From: Ninad Naik Date: Mon, 11 May 2026 23:13:02 +0530 Subject: [PATCH 816/972] Documentation: kvm: update links in the references section of AMD Memory Encryption Replace non-working links in the reference section with the working ones. Signed-off-by: Ninad Naik Link: https://patch.msgid.link/20260511174302.811918-1-ninadnaik07@gmail.com/ Reviewed-by: Liam Merwick Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/x86/amd-memory-encryption.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst index b2395dd4769de6..bd04a908a8dbdd 100644 --- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst @@ -656,8 +656,8 @@ References See [white-paper]_, [api-spec]_, [amd-apm]_, [kvm-forum]_, and [snp-fw-abi]_ for more info. -.. [white-paper] https://developer.amd.com/wordpress/media/2013/12/AMD_Memory_Encryption_Whitepaper_v7-Public.pdf -.. [api-spec] https://support.amd.com/TechDocs/55766_SEV-KM_API_Specification.pdf -.. [amd-apm] https://support.amd.com/TechDocs/24593.pdf (section 15.34) +.. [white-paper] https://docs.amd.com/v/u/en-US/memory-encryption-white-paper +.. [api-spec] https://docs.amd.com/v/u/en-US/55766_PUB_3.24_SEV_API +.. [amd-apm] https://docs.amd.com/v/u/en-US/24593_3.44_APM_Vol2 (section 15.34) .. [kvm-forum] https://www.linux-kvm.org/images/7/74/02x08A-Thomas_Lendacky-AMDs_Virtualizatoin_Memory_Encryption_Technology.pdf -.. [snp-fw-abi] https://www.amd.com/system/files/TechDocs/56860.pdf +.. [snp-fw-abi] https://www.amd.com/content/dam/amd/en/documents/developer/56860.pdf From 87c810160ed738cd983e4a65ebe9709927c702c9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 12 May 2026 08:56:34 -0700 Subject: [PATCH 817/972] KVM: selftests: Ensure gmem file sizes are multiple of host page size When creating a guest_memfd file and associated memslot to validate shared guest memory, size the file+memslot to the maximum of the host or guest page size. Attempting to allocate a single guest page will fail if the host page size is greater than the guest page size, as KVM requires that the size of memslots and guest_memfd files are a multiple of the host page size. For simplicity, verify the entire file can be shared between guest and host, e.g. instead of trying to validate "partial" mappings. Fixes: 42188667be38 ("KVM: selftests: Add guest_memfd testcase to fault-in on !mmap()'d memory") Reported-by: Zenghui Yu Closes: https://lore.kernel.org/all/0064952b-048c-455d-ad89-e27e5cb82591@linux.dev Signed-off-by: Sean Christopherson Message-ID: <20260512155634.772602-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/guest_memfd_test.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index d6528c6f5e031d..253e748c1d4aa8 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -510,7 +510,12 @@ static void test_guest_memfd_guest(void) "Default VM type should support INIT_SHARED, supported flags = 0x%x", vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS)); - size = vm->page_size; + /* + * Use the max of the host or guest page size for all operations, as + * KVM requires guest_memfd files and memslots to be sized to multiples + * of the host page size. + */ + size = max_t(size_t, vm->page_size, page_size); fd = vm_create_guest_memfd(vm, size, GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED); vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, size, NULL, fd, 0); @@ -519,7 +524,7 @@ static void test_guest_memfd_guest(void) memset(mem, 0xaa, size); kvm_munmap(mem, size); - virt_pg_map(vm, gpa, gpa); + virt_map(vm, gpa, gpa, size / vm->page_size); vcpu_args_set(vcpu, 2, gpa, size); vcpu_run(vcpu); From 6b72d0578ca6f77b835d773d7c77c2f872d3e924 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 3 May 2026 23:09:17 +0200 Subject: [PATCH 818/972] KVM: x86: use again the flush argument of __link_shadow_page() Except in the case of parentless nested-TDP pages, mmu_page_zap_pte() clears the SPTE but leaves the invalid_list empty. In this case, using kvm_flush_remote_tlbs() as kvm_mmu_remote_flush_or_zap() does is overkill. Avoid flushing the entirety of the remote TLBs unless the invalid_list was populated: instead, use a more efficient gfn-targeting flush (if available) and skip it altogether if the caller guarantees that a TLB flush is not necessary. Based-on: <20260503201029.106481-1-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini Message-ID: <20260503210917.121840-1-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 892246204435c5..f0144ae8d891d3 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2526,6 +2526,23 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) __shadow_walk_next(iterator, *iterator->sptep); } +/* + * Note: while normally KVM uses a "bool flush" return value to let + * the caller batch flushes, __link_shadow_page() flushes immediately + * before populating the parent PTE with the new shadow page. The + * typical callers, direct_map() and FNAME(fetch)(), are not going + * to zap more than one huge SPTE anyway. + * + * The only exception, where @flush can be false, is when a huge SPTE + * is replaced with a shadow page SPTE with a fully populated page table, + * which can happen from shadow_mmu_split_huge_page(). In this case, + * no memory is unmapped across the change to the page tables and no + * immediate flush is needed for correctness. + * + * Even in that case, calls to kvm_mmu_commit_zap_page() are not + * batched. Doing so would require adding an invalid_list argument + * all the way down to __walk_slot_rmaps(). + */ static void __link_shadow_page(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, u64 *sptep, struct kvm_mmu_page *sp, bool flush) @@ -2541,8 +2558,10 @@ static void __link_shadow_page(struct kvm *kvm, parent_sp = sptep_to_sp(sptep); WARN_ON_ONCE(parent_sp->role.level == PG_LEVEL_4K); - mmu_page_zap_pte(kvm, parent_sp, sptep, &invalid_list); - kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, true); + if (mmu_page_zap_pte(kvm, parent_sp, sptep, &invalid_list)) + kvm_mmu_commit_zap_page(kvm, &invalid_list); + else if (flush) + kvm_flush_remote_tlbs_sptep(kvm, sptep); } spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); From 3098c076c83ea2913245cb915cdcba98eb24214c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 May 2026 14:35:14 -0700 Subject: [PATCH 819/972] KVM: x86: Swap the dst and src operand for MOVNTDQA Swap the MOVNTDQA operands, as MOVNTDQA does NOT in fact have "the same characteristics as 0F E7 (MOVNTDQ)"; MOVNTDQA loads from memory and stores to registers, while MOVNTDQ loads from registers and stores to memory. Per the SDM: MOVNTDQ - Move packed integer values in xmm1 to m128 using non-temporal hint. MOVNTDQA - Move double quadword from m128 to xmm1 using non-temporal hint if WC memory type. Reported-by: Josh Eads Fixes: c57d9bafbd0b ("KVM: x86: Add support for emulating MOVNTDQA") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20260506213514.2781948-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c8c6cc0406d6dd..8013dccb31102a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4481,7 +4481,7 @@ static const struct opcode opcode_map_0f_38[256] = { X16(N), X16(N), /* 0x20 - 0x2f */ X8(N), - X2(N), GP(SrcReg | DstMem | ModRM | Mov | Aligned, &pfx_0f_e7_0f_38_2a), N, N, N, N, N, + X2(N), GP(SrcMem | DstReg | ModRM | Mov | Aligned, &pfx_0f_e7_0f_38_2a), N, N, N, N, N, /* 0x30 - 0x7f */ X16(N), X16(N), X16(N), X16(N), X16(N), /* 0x80 - 0xef */ From 39e25a2100604320e8d9df54c6c31258f7a3df29 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 May 2026 10:30:00 -1000 Subject: [PATCH 820/972] sched_ext: Drop NONE early return in scx_disable_and_exit_task() d3e73a0808dd ("sched_ext: Handle SCX_TASK_NONE in disable/switched_from paths") skipped the trailing scx_set_task_sched(p, NULL) on NONE tasks. After scx_fail_parent() parks a task at NONE/sched=parent and the parent is later freed via queue_rcu_work() during root_disable, the preserved p->scx.sched dangles - print_scx_info() from sched_show_task() reads sch->ops.name from freed memory. Drop the early return. __scx_disable_and_exit_task() already short- circuits on NONE and the SUB_INIT block was cleared by scx_fail_parent()'s earlier call, so clearing p->scx.sched is the only work left - and the one thing the path actually needs. v2: Extend the SUB_INIT block comment to note that the flag is only set on the sub-enable path, so it's always clear on the NONE re-entry (Andrea). Fixes: d3e73a0808dd ("sched_ext: Handle SCX_TASK_NONE in disable/switched_from paths") Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- kernel/sched/ext.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 9354da79e162a7..68120f679178da 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3703,22 +3703,14 @@ static void scx_sub_init_cancel_task(struct scx_sched *sch, struct task_struct * static void scx_disable_and_exit_task(struct scx_sched *sch, struct task_struct *p) { - /* - * %NONE means @p is already detached at the SCX level (e.g. handed - * back to the parent by scx_fail_parent() with no init to undo). - * Skip to avoid clobbering scx_task_sched() and writing %NONE again - * on a state that's already %NONE. - */ - if (scx_get_task_state(p) == SCX_TASK_NONE) - return; - __scx_disable_and_exit_task(sch, p); /* * If set, @p exited between __scx_init_task() and scx_enable_task() in * scx_sub_enable() and is initialized for both the associated sched and * its parent. Exit for the child too - scx_enable_task() never ran for - * it, so undo only init_task. + * it, so undo only init_task. The flag is only set on the sub-enable + * path, so it's always clear when @p arrives here in %SCX_TASK_NONE. */ if (p->scx.flags & SCX_TASK_SUB_INIT) { if (!WARN_ON_ONCE(!scx_enabling_sub_sched)) From b273b75b8d677aea06dd06d80b61b3bb06e94680 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 11 May 2026 13:18:19 -1000 Subject: [PATCH 821/972] sched_ext: INIT_LIST_HEAD() &sch->all in scx_alloc_and_add_sched() On scx_link_sched() error paths (parent disabled, hash insert failure), &sch->all is never added to scx_sched_all. The cleanup path runs scx_unlink_sched() unconditionally, which calls list_del_rcu(&sch->all) on a list_head that was never initialized triggering a corruption warning. Initialize &sch->all. Fixes: 54be8de4236a ("sched_ext: Factor out scx_link_sched() and scx_unlink_sched()") Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 68120f679178da..6d69ba29cfd77c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6635,6 +6635,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, rcu_assign_pointer(ops->priv, sch); sch->kobj.kset = scx_kset; + INIT_LIST_HEAD(&sch->all); #ifdef CONFIG_EXT_SUB_SCHED char *buf = kzalloc(PATH_MAX, GFP_KERNEL); From cceb874eee46fe4b3d3c6c496f19125d9a3a9a8f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 11 May 2026 13:18:23 -1000 Subject: [PATCH 822/972] sched_ext: Defer sub_kset base put to scx_sched_free_rcu_work scx_sub_enable_workfn() pins parent->kobj before dropping scx_sched_lock, but that does not pin parent->sub_kset. Concurrent disable can kset_unregister and free sub_kset before scx_alloc_and_add_sched() dereferences it. Split sub_kset teardown: kobject_del() at disable keeps sysfs removal; defer kobject_put() to scx_sched_free_rcu_work so the memory survives. A racing child sees state_in_sysfs=0 with valid memory, sysfs_create_dir() fails, and the existing exit_kind gate in scx_link_sched() turns it away with -ENOENT. Fixes: 411d3ef1a705 ("sched_ext: Unregister sub_kset on scheduler disable") Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 6d69ba29cfd77c..23f7b3f63b09b2 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4821,6 +4821,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work) kfree(sch->cgrp_path); if (sch_cgroup(sch)) cgroup_put(sch_cgroup(sch)); + if (sch->sub_kset) + kobject_put(&sch->sub_kset->kobj); #endif /* CONFIG_EXT_SUB_SCHED */ for_each_possible_cpu(cpu) { @@ -5861,7 +5863,7 @@ static void scx_sub_disable(struct scx_sched *sch) if (sch->ops.exit) SCX_CALL_OP(sch, exit, NULL, sch->exit_info); if (sch->sub_kset) - kset_unregister(sch->sub_kset); + kobject_del(&sch->sub_kset->kobj); kobject_del(&sch->kobj); } #else /* CONFIG_EXT_SUB_SCHED */ @@ -5995,7 +5997,7 @@ static void scx_root_disable(struct scx_sched *sch) */ #ifdef CONFIG_EXT_SUB_SCHED if (sch->sub_kset) - kset_unregister(sch->sub_kset); + kobject_del(&sch->sub_kset->kobj); #endif kobject_del(&sch->kobj); From 2c308cf34284420963607d677d576a2b4124d8bd Mon Sep 17 00:00:00 2001 From: Zoran Ilievski Date: Mon, 11 May 2026 08:40:02 +0200 Subject: [PATCH 823/972] net: atlantic: preserve PCI wake-from-D3 on shutdown when WOL enabled The shutdown handler aq_pci_shutdown() unconditionally calls pci_wake_from_d3(pdev, false), clearing the PCI PME_En bit even when wake-on-LAN has been configured. While aq_nic_shutdown() correctly programs the NIC firmware via aq_nic_set_power() to listen for magic packets, the PCI subsystem will not propagate the resulting PME wake event from D3, so the system never wakes after poweroff. WOL from suspend (S3) is unaffected because aq_suspend_common() does not touch pci_wake_from_d3() and relies on the PM core's wake configuration via device_may_wakeup(). This affects all atlantic-supported NICs (AQC107/108/111/112/113); users have reported that WOL works if the atlantic driver is never loaded, but breaks once it has run its shutdown path. Pass the configured WOL state to pci_wake_from_d3() instead of a literal false, so the PCI PME_En bit is preserved when the user has armed WOL via ethtool. Fixes: 90869ddfefeb ("net: aquantia: Implement pci shutdown callback") Cc: stable@vger.kernel.org Signed-off-by: Zoran Ilievski Reviewed-by: Sukhdeep Singh Link: https://patch.msgid.link/20260511064002.1857-1-goodboy@rexbytes.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c index e9e38af680c34a..39e1b606a75a9d 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c @@ -371,7 +371,7 @@ static void aq_pci_shutdown(struct pci_dev *pdev) pci_disable_device(pdev); if (system_state == SYSTEM_POWER_OFF) { - pci_wake_from_d3(pdev, false); + pci_wake_from_d3(pdev, self->aq_hw->aq_nic_cfg->wol); pci_set_power_state(pdev, PCI_D3hot); } } From e3adf69f8eb121a9128c2b0029efd050d3649153 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Sat, 9 May 2026 22:50:46 +0100 Subject: [PATCH 824/972] net: ethtool: phy: avoid NULL deref when PHY driver is unbound phydev->drv can become NULL while the phy_device is still attached to its net_device, namely after the PHY driver is unbound via sysfs: echo > /sys/bus/mdio_bus/drivers//unbind phy_remove() clears phydev->drv but doesn't call phy_detach(), so the phy_device stays in the link topology xarray and ethnl_req_get_phydev() still hands it back. ETHTOOL_MSG_PHY_GET then oopses on: rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL); drvname is already treated as optional by phy_reply_size(), phy_fill_reply() and phy_cleanup_data(), so just skip the allocation when there is no driver bound. Fixes: 9dd2ad5e92b9 ("net: ethtool: phy: Convert the PHY_GET command to generic phy dump") Cc: stable@vger.kernel.org # 6.13.x Signed-off-by: David Carlier Reviewed-by: Maxime Chevallier Tested-by: Maxime Chevallier Link: https://patch.msgid.link/20260509215046.107157-1-devnexen@gmail.com Signed-off-by: Jakub Kicinski --- net/ethtool/phy.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/ethtool/phy.c b/net/ethtool/phy.c index f76d94d848d6da..ddc6eab701ed65 100644 --- a/net/ethtool/phy.c +++ b/net/ethtool/phy.c @@ -94,10 +94,12 @@ static int phy_prepare_data(const struct ethnl_req_info *req_info, if (!rep_data->name) return -ENOMEM; - rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL); - if (!rep_data->drvname) { - ret = -ENOMEM; - goto err_free_name; + if (phydev->drv) { + rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL); + if (!rep_data->drvname) { + ret = -ENOMEM; + goto err_free_name; + } } rep_data->upstream_type = pdn->upstream_type; From f9e2342046ef1560d35bcd4a4b1197648ffd151d Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 9 May 2026 20:23:58 +0800 Subject: [PATCH 825/972] net: atm: fix skb leak in sigd_send() default branch The default branch in sigd_send() calls sock_put() and returns -EINVAL without freeing the skb, while all other exit paths do so. Add the missing dev_kfree_skb() before sock_put() to fix the leak. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Wei Yang Link: https://patch.msgid.link/20260509122358.1102997-1-albin_yang@163.com Signed-off-by: Jakub Kicinski --- net/atm/signaling.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/atm/signaling.c b/net/atm/signaling.c index 358fbe5e4d1d06..b991d937205af3 100644 --- a/net/atm/signaling.c +++ b/net/atm/signaling.c @@ -179,6 +179,7 @@ static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb) break; default: pr_alert("bad message type %d\n", (int)msg->type); + dev_kfree_skb(skb); /* Paired with find_get_vcc(msg->vcc) above */ sock_put(sk); return -EINVAL; From a3fdd924d88c30b9f488636ce0e4696012cf5511 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Coccia?= Date: Sun, 10 May 2026 12:34:13 -0400 Subject: [PATCH 826/972] net/smc: fix sleep-inside-lock in __smc_setsockopt() causing local DoS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A logic flaw in __smc_setsockopt() allows a local unprivileged user to cause a Denial of Service (DoS) by holding the socket lock indefinitely. The function __smc_setsockopt() calls copy_from_sockptr() while holding lock_sock(sk). By passing a userfaultfd-monitored memory page (or FUSE-backed memory on systems where unprivileged userfaultfd is disabled) as the optval, an attacker can halt execution during the copy operation, keeping the lock held. Combined with asynchronous tear-down operations like shutdown(), this exhausts the kernel wq (kworkers) and triggers the hung task watchdog. [ 240.123456] INFO: task kworker/u8:2 blocked for more than 120 seconds. [ 240.123489] Call Trace: [ 240.123501] smc_shutdown+... [ 240.123512] lock_sock_nested+... This patch moves the user-space copy outside the lock_sock() critical section to prevent the issue. Fixes: a6a6fe27bab4 ("net/smc: Dynamic control handshake limitation by socket options") Signed-off-by: Nicolò Coccia Reviewed-by: Dust Li Tested-by: Dust Li Signed-off-by: Jakub Kicinski --- net/smc/af_smc.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 185dbed7de5d6c..da28652f681088 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3054,18 +3054,17 @@ static int __smc_setsockopt(struct socket *sock, int level, int optname, smc = smc_sk(sk); + /* pre-fetch user data outside the lock */ + if (optname == SMC_LIMIT_HS) { + if (optlen < sizeof(int)) + return -EINVAL; + if (copy_from_sockptr(&val, optval, sizeof(int))) + return -EFAULT; + } + lock_sock(sk); switch (optname) { case SMC_LIMIT_HS: - if (optlen < sizeof(int)) { - rc = -EINVAL; - break; - } - if (copy_from_sockptr(&val, optval, sizeof(int))) { - rc = -EFAULT; - break; - } - smc->limit_smc_hs = !!val; rc = 0; break; From 7bf563badd37cb796df5477d2b78bb64148a1268 Mon Sep 17 00:00:00 2001 From: Xiang Mei Date: Sun, 10 May 2026 15:26:40 -0700 Subject: [PATCH 827/972] net/smc: avoid NULL deref of conn->lnk in smc_msg_event tracepoint The smc_msg_event tracepoint class, shared by smc_tx_sendmsg and smc_rx_recvmsg, unconditionally dereferences smc->conn.lnk: __string(name, smc->conn.lnk->ibname) conn->lnk is only set for SMC-R; for SMC-D it is NULL. Other code on these paths already handles this (e.g. !conn->lnk in SMC_STAT_RMB_TX_SIZE_SMALL()). With the tracepoint enabled, the first sendmsg()/recvmsg() on an SMC-D socket crashes: Oops: general protection fault, probably for non-canonical address KASAN: null-ptr-deref in range [...] RIP: 0010:strlen+0x1e/0xa0 Call Trace: trace_event_raw_event_smc_msg_event (net/smc/smc_tracepoint.h:44) smc_rx_recvmsg (net/smc/smc_rx.c:515) smc_recvmsg (net/smc/af_smc.c:2859) __sys_recvfrom (net/socket.c:2315) __x64_sys_recvfrom (net/socket.c:2326) do_syscall_64 The faulting address 0x3e0 is offsetof(struct smc_link, ibname), confirming the NULL ->lnk deref. Enabling the tracepoint requires root, but the trigger itself is unprivileged: socket(AF_SMC, ...) has no capability check, and SMC-D negotiation needs no admin step on s390 or on x86 with the loopback ISM device loaded. Log an empty device name for SMC-D instead of dereferencing NULL. Fixes: aff3083f10bf ("net/smc: Introduce tracepoints for tx and rx msg") Reported-by: Weiming Shi Signed-off-by: Xiang Mei Reviewed-by: Dust Li Reviewed-by: Sidraya Jayagond Signed-off-by: Jakub Kicinski --- net/smc/smc_tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h index a9a6e3c1113aaa..53da84f57fd6f3 100644 --- a/net/smc/smc_tracepoint.h +++ b/net/smc/smc_tracepoint.h @@ -51,7 +51,7 @@ DECLARE_EVENT_CLASS(smc_msg_event, __field(const void *, smc) __field(u64, net_cookie) __field(size_t, len) - __string(name, smc->conn.lnk->ibname) + __string(name, smc->conn.lnk ? smc->conn.lnk->ibname : "") ), TP_fast_assign( From 3d042592ebd4c7e44974d556de0b727cb7db4dab Mon Sep 17 00:00:00 2001 From: Chenguang Zhao Date: Mon, 11 May 2026 09:43:43 +0800 Subject: [PATCH 828/972] ethtool: fix ethnl_bitmap32_not_zero() bit interval semantics ethnl_bitmap32_not_zero() should return true if some bit in [start, end) is set: - Fix inverted memchr_inv() sense: return true when the scan finds a non-zero byte, not when the middle words are all zero. - Return false for an empty interval (end <= start). - When end is 32-bit aligned, indices in [start, end) do not include any bits from map[end_word]; return false after earlier checks found no non-zero data. Fixes: 10b518d4e6dd ("ethtool: netlink bitset handling") Signed-off-by: Chenguang Zhao Signed-off-by: Jakub Kicinski --- net/ethtool/bitset.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c index 8bb98d3ea3db21..a3a2cc6480a0e8 100644 --- a/net/ethtool/bitset.c +++ b/net/ethtool/bitset.c @@ -92,7 +92,7 @@ static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start, u32 mask; if (end <= start) - return true; + return false; if (start % 32) { mask = ethnl_upper_bits(start); @@ -105,11 +105,11 @@ static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start, start_word++; } - if (!memchr_inv(map + start_word, '\0', - (end_word - start_word) * sizeof(u32))) + if (memchr_inv(map + start_word, '\0', + (end_word - start_word) * sizeof(u32))) return true; if (end % 32 == 0) - return true; + return false; return map[end_word] & ethnl_lower_bits(end); } From f5b2772d14884f4be9e718644f1203d4d0e6f0d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Sun, 10 May 2026 12:30:17 +0200 Subject: [PATCH 829/972] net: ethernet: ravb: Do not check URAM suspension when WoL is active MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When updating the driver to match latest datasheet to suspend access to URAM when suspending DMA transfers a corner-case was missed, URAM access will not be suspended if WoL is enabled. This lead to the error message (correctly) being triggered as URAM access is not suspended even tho it's requested as part of stopping DMA. Avoid checking if URAM access is suspended and printing the error message if WoL is enabled when we suspend the system, as we know it will not be. Reported-by: Geert Uytterhoeven Closes: https://lore.kernel.org/all/CAMuHMdWnjV%3DHGE1o08zLhUfTgOSene5fYx1J5GG10mB%2BToq8qg@mail.gmail.com/ Fixes: 353d8e7989b6 ("net: ethernet: ravb: Suspend and resume the transmission flow") Signed-off-by: Niklas Söderlund Reviewed-by: Sai Krishna Tested-by: Geert Uytterhoeven Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/ravb_main.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 1dbfadb2a88145..5f88733094d0fc 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1108,9 +1108,12 @@ static int ravb_stop_dma(struct net_device *ndev) /* Request for transmission suspension */ ravb_modify(ndev, CCC, CCC_DTSR, CCC_DTSR); - error = ravb_wait(ndev, CSR, CSR_DTS, CSR_DTS); - if (error) - netdev_err(ndev, "failed to stop AXI BUS\n"); + /* Access to URAM will not be suspended if WoL is enabled. */ + if (!priv->wol_enabled) { + error = ravb_wait(ndev, CSR, CSR_DTS, CSR_DTS); + if (error) + netdev_err(ndev, "failed to stop AXI BUS\n"); + } /* Stop AVB-DMAC process */ return ravb_set_opmode(ndev, CCC_OPC_CONFIG); From 5f7c7c63ffb1a187eb90c80864469db45f3bd2a8 Mon Sep 17 00:00:00 2001 From: Yang Xiuwei Date: Wed, 13 May 2026 17:43:03 +0800 Subject: [PATCH 830/972] io_uring/rw: drop unused attr_type_mask from io_prep_rw_pi() io_prep_rw_pi() never used the attr_type_mask argument. Callers already validate sqe->attr_type_mask before invoking the helper (only IORING_RW_ATTR_FLAG_PI is supported today). Remove the dead parameter to avoid implying further interpretation happens here. Signed-off-by: Yang Xiuwei Link: https://patch.msgid.link/20260513094303.866533-1-yangxiuwei@kylinos.cn Signed-off-by: Jens Axboe --- io_uring/rw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index e729e0e7657ecc..0c48346452797a 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -230,7 +230,7 @@ static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb) } static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir, - u64 attr_ptr, u64 attr_type_mask) + u64 attr_ptr) { struct io_uring_attr_pi pi_attr; struct io_async_rw *io; @@ -305,7 +305,7 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, return -EINVAL; attr_ptr = READ_ONCE(sqe->attr_ptr); - return io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); + return io_prep_rw_pi(req, rw, ddir, attr_ptr); } return 0; } From 2d5d3fc593c9b7e41bee86175d7b9e11f470072e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 12 May 2026 16:58:48 +0200 Subject: [PATCH 831/972] KVM: VMX: introduce module parameter to disable CET There have been reports of host hangs caused by CET virtualization. Until these are analyzed further, introduce a module parameter that makes it possible to easily disable it. Link: https://lore.kernel.org/all/85548beb-1486-40f9-beb4-632c78e3360b@proxmox.com/ Cc: David Riley Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/capabilities.h | 1 + arch/x86/kvm/vmx/vmx.c | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 56cacc06225ecc..31568274d8bb02 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -14,6 +14,7 @@ extern bool __read_mostly flexpriority_enabled; extern bool __read_mostly enable_ept; extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; +extern bool __read_mostly enable_cet; extern bool __read_mostly enable_pml; extern int __read_mostly pt_mode; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5c2c33a5f7dc9a..49feecb286b23c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -108,6 +108,9 @@ module_param_named(unrestricted_guest, bool __read_mostly enable_ept_ad_bits = 1; module_param_named(eptad, enable_ept_ad_bits, bool, 0444); +bool __read_mostly enable_cet = 1; +module_param_named(cet, enable_cet, bool, 0444); + static bool __read_mostly emulate_invalid_guest_state = true; module_param(emulate_invalid_guest_state, bool, 0444); @@ -4476,7 +4479,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) * SSP is reloaded from IA32_PL3_SSP. Check SDM Vol.2A/B Chapter * 3 and 4 for details. */ - if (cpu_has_load_cet_ctrl()) { + if (enable_cet) { vmcs_writel(HOST_S_CET, kvm_host.s_cet); vmcs_writel(HOST_SSP, 0); vmcs_writel(HOST_INTR_SSP_TABLE, 0); @@ -4532,6 +4535,10 @@ static u32 vmx_get_initial_vmentry_ctrl(void) if (vmx_pt_mode_is_system()) vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL); + + if (!enable_cet) + vmentry_ctrl &= ~VM_ENTRY_LOAD_CET_STATE; + /* * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically. */ @@ -4546,6 +4553,9 @@ static u32 vmx_get_initial_vmexit_ctrl(void) { u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; + if (!enable_cet) + vmexit_ctrl &= ~VM_EXIT_LOAD_CET_STATE; + /* * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for * nested virtualization and thus allowed to be set in vmcs12. @@ -8155,7 +8165,7 @@ static __init void vmx_set_cpu_caps(void) * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code * fails, so disable CET in this case too. */ - if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest || + if (!enable_cet || !enable_unrestricted_guest || !cpu_has_vmx_basic_no_hw_errcode_cc()) { kvm_cpu_cap_clear(X86_FEATURE_SHSTK); kvm_cpu_cap_clear(X86_FEATURE_IBT); @@ -8630,6 +8640,9 @@ __init int vmx_hardware_setup(void) !cpu_has_vmx_invept_global()) enable_ept = 0; + if (!cpu_has_load_cet_ctrl()) + enable_cet = 0; + /* NX support is required for shadow paging. */ if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { pr_err_ratelimited("NX (Execute Disable) not supported\n"); From 836efd35c472d89c838d7b17ef339ddb3286ffc5 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 13 May 2026 20:11:29 +0900 Subject: [PATCH 832/972] block: fix handling of dead zone write plugs Shin'ichiro reported hard to reproduce unaligned write errors with zoned block devices. Under normal operation conditions (e.g. running XFS on an SMR disk), these errors are nearly impossible to trigger. But using a "slow" kernel with many debug options enables and some specific use cases (e.g. fio zbd test case 46), the errors can be reproduced fairly easily. The unaligned write errors come from mishandling a valid reference counting pattern of zone write plugs. Such pattern triggers for instance if a process A writes a zone (not necessarilly to the full state), another process B immediately resets the zone and immediately following the completion of the zone reset, starts issuing writes to the zone. With such pattern, in some cases, the zone write plugs worker thread of the device may still be holding a reference to the zone write plug of the zone taken when process A was writing to the zone. The following zone reset from process B marks the zone as dead but does not remove the zone write plug from the device hash table as a reference to the plug still exist. Once process B starts issuing new writes, the zone write plug is seen as dead and the writes from process B are immediately failed, despite this write pattern being perfectly legal. Fix this by allowing restoring a dead zone write plug to a live state if a write is issued to the zone when the zone is: marked as dead, empty and the write sector corresponds to the first sector of the zone (that is, the write is aligned to the zone write pointer). This is done with the new helper function disk_check_zone_wplug_dead(), which restores a dead zone write plug to a live state by clearing the BLK_ZONE_WPLUG_DEAD flag and restoring the initial reference to the zone write plug taken when the plug was added to the device hash table. Reported-by: Shin'ichiro Kawasaki Fixes: b7d4ffb51037 ("block: fix zone write plug removal") Signed-off-by: Damien Le Moal Tested-by: Shin'ichiro Kawasaki Link: https://patch.msgid.link/20260513111129.108809-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 30cad2bb9291b6..42ef830054dc8b 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -623,6 +623,28 @@ static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug) } } +static inline bool disk_check_zone_wplug_dead(struct blk_zone_wplug *zwplug) +{ + if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)) + return false; + + /* + * If a new write is received right after a zone reset completes and + * while the disk_zone_wplugs_worker() thread has not yet released the + * reference on the zone write plug after processing the last write to + * the zone, then the new write BIO will see the zone write plug marked + * as dead. This case is however a false positive and a perfectly valid + * pattern. In such case, restore the zone write plug to a live one. + */ + if (!zwplug->wp_offset && bio_list_empty(&zwplug->bio_list)) { + zwplug->flags &= ~BLK_ZONE_WPLUG_DEAD; + refcount_inc(&zwplug->ref); + return false; + } + + return true; +} + static bool disk_zone_wplug_submit_bio(struct gendisk *disk, struct blk_zone_wplug *zwplug); @@ -1444,12 +1466,12 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) spin_lock_irqsave(&zwplug->lock, flags); /* - * If we got a zone write plug marked as dead, then the user is issuing - * writes to a full zone, or without synchronizing with zone reset or - * zone finish operations. In such case, fail the BIO to signal this - * invalid usage. + * Check if we got a zone write plug marked as dead. If yes, then the + * user is likely issuing writes to a full zone, or without + * synchronizing with zone reset or zone finish operations. In such + * case, fail the BIO to signal this invalid usage. */ - if (zwplug->flags & BLK_ZONE_WPLUG_DEAD) { + if (disk_check_zone_wplug_dead(zwplug)) { spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); bio_io_error(bio); From 87d0740b7c4cc847be1b6f307ab6d8547cb1a726 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 13 May 2026 18:19:40 +0800 Subject: [PATCH 833/972] selftests: ublk: cap nthreads to kernel's actual nr_hw_queues dev->nthreads is derived from the user-requested queue count before the ADD command, but the kernel may reduce nr_hw_queues (capped to nr_cpu_ids). When the VM has fewer CPUs than requested queues, the daemon creates more handler threads than there are kernel queues. In non-batch mode, the extra threads access uninitialized queues (q_depth=0), submit zero io_uring SQEs, and block forever in io_cqring_wait. In batch mode, the extra threads cause similar hangs during device removal. In both cases, the stuck threads prevent the daemon from closing the char device, holding the last ublk_device reference and causing ublk_ctrl_del_dev() to hang in wait_event_interruptible(). Fix by capping dev->nthreads to the kernel-returned nr_hw_queues after the ADD command completes. per_io_tasks mode is excluded because threads interleave across all queues, so nthreads > nr_hw_queues is valid. Fixes: abe54c160346 ("selftests: ublk: kublk: decouple ublk_queues from ublk server threads") Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260513101941.1373998-1-tom.leiming@gmail.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index fbd9b1e7342a35..0b23c09daea5a2 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1735,6 +1735,17 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) goto fail; } + /* + * The kernel may reduce nr_hw_queues (e.g. capped to nr_cpu_ids). + * Cap nthreads to the actual queue count to avoid creating extra + * handler threads that will hang during device removal. + * + * per_io_tasks mode is excluded: threads interleave across all + * queues so nthreads > nr_hw_queues is valid and intentional. + */ + if (!ctx->per_io_tasks && dev->nthreads > info->nr_hw_queues) + dev->nthreads = info->nr_hw_queues; + ret = ublk_start_daemon(ctx, dev); ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\n", __func__, ret); if (ret < 0) From 8fb70afe671cc8c1f6237a39aabd50714fcd1189 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Sun, 10 May 2026 22:56:05 +0200 Subject: [PATCH 834/972] drm/xe: Drop unused ggtt_balloon field During recent GGTT refactoring we missed to drop now unused field from the xe_tile. Drop it now. Fixes: e904c56ba6e0 ("drm/xe: Rewrite GGTT VF initialization") Signed-off-by: Michal Wajdeczko Reviewed-by: Maarten Lankhorst Link: https://patch.msgid.link/20260510205605.642-1-michal.wajdeczko@intel.com (cherry picked from commit 21d5a871f57909dc4d8e4f5d3bf92f9ccf2597b2) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_tile_types.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_tile_types.h b/drivers/gpu/drm/xe/xe_tile_types.h index 33932fd547d71a..0048100ccb723d 100644 --- a/drivers/gpu/drm/xe/xe_tile_types.h +++ b/drivers/gpu/drm/xe/xe_tile_types.h @@ -106,8 +106,6 @@ struct xe_tile { struct xe_lmtt lmtt; } pf; struct { - /** @sriov.vf.ggtt_balloon: GGTT regions excluded from use. */ - struct xe_ggtt_node *ggtt_balloon[2]; /** @sriov.vf.self_config: VF configuration data */ struct xe_tile_sriov_vf_selfconfig self_config; } vf; From ea324444ece9f301b5c4ff71b258cc68990c4d61 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Mon, 16 Mar 2026 16:12:00 +0100 Subject: [PATCH 835/972] x86/mce: Restore MCA polling interval halving RongQing reported that the MCA polling interval doesn't halve when an error gets logged. It was traced down to the commit in Fixes:, because: mce_timer_fn() |-> mce_poll_banks() |-> machine_check_poll() |-> mce_log() which will queue the work and return. Now, back in mce_timer_fn(): /* * Alert userspace if needed. If we logged an MCE, reduce the polling * interval, otherwise increase the polling interval. */ if (mce_notify_irq()) <--- here we haven't ran the notifier chain yet so mce_need_notify is not set yet so this won't hit and we won't halve the interval iv. Now the notifier chain runs. mce_early_notifier() sets the bit, does mce_notify_irq(), that clears the bit and then the notifier chain a little later logs the error. So this is a silly timing issue. But, that's all unnecessary. All it needs to happen here is, the "should we notify of a logged MCE" mce_notify_irq() asks, should be simply a question to the mce gen pool: "Are you empty?" And that then turns into a simple yes or no answer and it all JustWorks(tm). So do that and also distribute the functionality where it belongs: - Print that MCE events have been logged in mce_log() - Trigger the mcelog tool specific work in the first notifier As a result, mce_notify_irq() can go now. Fixes: 011d82611172 ("RAS: Add a Corrected Errors Collector") Reported-by: Li RongQing Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Qiuxu Zhuo Tested-by: Qiuxu Zhuo Link: https://lore.kernel.org/r/20260112082747.2842-1-lirongqing@baidu.com --- arch/x86/kernel/cpu/mce/core.c | 33 +++++---------------------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 8dd424ac5de8a8..f3a793e3a6c8fa 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -90,7 +90,6 @@ struct mca_config mca_cfg __read_mostly = { }; static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen); -static unsigned long mce_need_notify; /* * MCA banks polled by the period polling timer for corrected events. @@ -152,8 +151,10 @@ EXPORT_PER_CPU_SYMBOL_GPL(injectm); void mce_log(struct mce_hw_err *err) { - if (mce_gen_pool_add(err)) + if (mce_gen_pool_add(err)) { + pr_info(HW_ERR "Machine check events logged\n"); irq_work_queue(&mce_irq_work); + } } EXPORT_SYMBOL_GPL(mce_log); @@ -585,28 +586,6 @@ bool mce_is_correctable(struct mce *m) } EXPORT_SYMBOL_GPL(mce_is_correctable); -/* - * Notify the user(s) about new machine check events. - * Can be called from interrupt context, but not from machine check/NMI - * context. - */ -static bool mce_notify_irq(void) -{ - /* Not more than two messages every minute */ - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - - if (test_and_clear_bit(0, &mce_need_notify)) { - mce_work_trigger(); - - if (__ratelimit(&ratelimit)) - pr_info(HW_ERR "Machine check events logged\n"); - - return true; - } - - return false; -} - static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -618,9 +597,7 @@ static int mce_early_notifier(struct notifier_block *nb, unsigned long val, /* Emit the trace record: */ trace_mce_record(err); - set_bit(0, &mce_need_notify); - - mce_notify_irq(); + mce_work_trigger(); return NOTIFY_DONE; } @@ -1804,7 +1781,7 @@ static void mce_timer_fn(struct timer_list *t) * Alert userspace if needed. If we logged an MCE, reduce the polling * interval, otherwise increase the polling interval. */ - if (mce_notify_irq()) + if (!mce_gen_pool_empty()) iv = max(iv / 2, (unsigned long) HZ/100); else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); From 950953f774b3f69da6f413e045ef075e1f3da2df Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 8 May 2026 16:44:44 +0200 Subject: [PATCH 836/972] drm/gma500/oaktrail_hdmi: fix i2c adapter leak on setup Make sure to drop the reference taken to the I2C adapter (and its module) when setting up HDMI to allow the adapter to be deregistered. Fixes: 1b082ccf5901 ("gma500: Add Oaktrail support") Cc: stable@vger.kernel.org # 3.3 Signed-off-by: Johan Hovold Signed-off-by: Patrik Jakobsson Link: https://patch.msgid.link/20260508144446.59722-2-johan@kernel.org --- drivers/gpu/drm/gma500/oaktrail_hdmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/gma500/oaktrail_hdmi.c b/drivers/gpu/drm/gma500/oaktrail_hdmi.c index 58d7e191fd56f8..403d21cbb3a230 100644 --- a/drivers/gpu/drm/gma500/oaktrail_hdmi.c +++ b/drivers/gpu/drm/gma500/oaktrail_hdmi.c @@ -580,6 +580,7 @@ static int oaktrail_hdmi_get_modes(struct drm_connector *connector) } else { edid = (struct edid *)raw_edid; /* FIXME ? edid = drm_get_edid(connector, i2c_adap); */ + i2c_put_adapter(i2c_adap); } if (edid) { From 657a091ab6d01d0091b77660c75cfed573c9a53e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 8 May 2026 16:44:45 +0200 Subject: [PATCH 837/972] drm/gma500/oaktrail_lvds: fix hang on init failure The LVDS init code looks up an I2C adapter using i2c_get_adapter() and tries to read the EDID before falling back to allocating and registering its own adapter. The error handling does not separate these cases so on a late init failure it will try to deregister and free also an adapter that had previously been registered. Since i2c_get_adapter() takes another reference to the adapter, deregistration hangs indefinitely while waiting for the reference to be released. Fix this by only destroying adapters allocated during LVDS init on errors. Fixes: a57ebfc0b4da ("drm/gma500: Make oaktrail lvds use ddc adapter from drm_connector") Cc: stable@vger.kernel.org # 6.0 Cc: Patrik Jakobsson Signed-off-by: Johan Hovold Signed-off-by: Patrik Jakobsson Link: https://patch.msgid.link/20260508144446.59722-3-johan@kernel.org --- drivers/gpu/drm/gma500/oaktrail_lvds.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/gma500/oaktrail_lvds.c b/drivers/gpu/drm/gma500/oaktrail_lvds.c index 884d324f004408..983cc60a1e698d 100644 --- a/drivers/gpu/drm/gma500/oaktrail_lvds.c +++ b/drivers/gpu/drm/gma500/oaktrail_lvds.c @@ -293,7 +293,7 @@ void oaktrail_lvds_init(struct drm_device *dev, { struct gma_encoder *gma_encoder; struct gma_connector *gma_connector; - struct gma_i2c_chan *ddc_bus; + struct gma_i2c_chan *ddc_bus = NULL; struct drm_connector *connector; struct drm_encoder *encoder; struct drm_psb_private *dev_priv = to_drm_psb_private(dev); @@ -421,7 +421,8 @@ void oaktrail_lvds_init(struct drm_device *dev, err_unlock: mutex_unlock(&dev->mode_config.mutex); - gma_i2c_destroy(to_gma_i2c_chan(connector->ddc)); + if (!IS_ERR_OR_NULL(ddc_bus)) + gma_i2c_destroy(ddc_bus); drm_encoder_cleanup(encoder); err_connector_cleanup: drm_connector_cleanup(connector); From 84d1c9b416d54afe760ca4c378bd95c89261254c Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 8 May 2026 16:44:46 +0200 Subject: [PATCH 838/972] drm/gma500/oaktrail_lvds: fix i2c adapter leaks on init The LVDS init code looks up an I2C adapter using i2c_get_adapter() and tries to read the EDID before falling back to allocating and registering its own adapter. Make sure to drop the references taken by i2c_get_adapter() when falling back to allocating an adapter as well as on late errors to allow the looked up adapter to be deregistered. Fixes: 1b082ccf5901 ("gma500: Add Oaktrail support") Cc: stable@vger.kernel.org # 3.3 Signed-off-by: Johan Hovold Signed-off-by: Patrik Jakobsson Link: https://patch.msgid.link/20260508144446.59722-4-johan@kernel.org --- drivers/gpu/drm/gma500/oaktrail_lvds.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/gma500/oaktrail_lvds.c b/drivers/gpu/drm/gma500/oaktrail_lvds.c index 983cc60a1e698d..e194d0cce0671a 100644 --- a/drivers/gpu/drm/gma500/oaktrail_lvds.c +++ b/drivers/gpu/drm/gma500/oaktrail_lvds.c @@ -367,6 +367,8 @@ void oaktrail_lvds_init(struct drm_device *dev, if (edid == NULL && dev_priv->lpc_gpio_base) { ddc_bus = oaktrail_lvds_i2c_init(dev); if (!IS_ERR(ddc_bus)) { + if (i2c_adap) + i2c_put_adapter(i2c_adap); i2c_adap = &ddc_bus->base; edid = drm_get_edid(connector, i2c_adap); } @@ -423,6 +425,8 @@ void oaktrail_lvds_init(struct drm_device *dev, mutex_unlock(&dev->mode_config.mutex); if (!IS_ERR_OR_NULL(ddc_bus)) gma_i2c_destroy(ddc_bus); + else if (i2c_adap) + i2c_put_adapter(i2c_adap); drm_encoder_cleanup(encoder); err_connector_cleanup: drm_connector_cleanup(connector); From 7d8f3158a51cb40fc710d2a781549141a139b796 Mon Sep 17 00:00:00 2001 From: Yu Miao Date: Wed, 13 May 2026 10:39:07 +0800 Subject: [PATCH 839/972] selftests/cgroup: Fix error path leaks in test_percpu_basic When cg_name_indexed() returns NULL partway through the child creation loop, the code returned -1 without running cleanup_children and cleanup. That left the `parent` pathname allocation unreleased and did not remove child cgroup directories already created under the parent. Fix by jumping to cleanup_children instead of returning. When cg_create() fails, `child` (the pathname from cg_name_indexed()) was not freed before cleanup_children. Fix by freeing `child` before branching to cleanup_children. Fixes: 90631e1dea55 ("kselftests: cgroup: add perpcu memory accounting test") Signed-off-by: Yu Miao Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_kmem.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index eeabd34bf08370..12f59925500bd4 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -368,11 +368,15 @@ static int test_percpu_basic(const char *root) for (i = 0; i < 1000; i++) { child = cg_name_indexed(parent, "child", i); - if (!child) - return -1; + if (!child) { + ret = -1; + goto cleanup_children; + } - if (cg_create(child)) + if (cg_create(child)) { + free(child); goto cleanup_children; + } free(child); } From 345f40166694e60db6d5cf02233814bb27ac5dec Mon Sep 17 00:00:00 2001 From: sunshaojie Date: Wed, 13 May 2026 18:37:38 +0800 Subject: [PATCH 840/972] cgroup/cpuset: Return only actually allocated CPUs during partition invalidation In update_parent_effective_cpumask() with partcmd_invalidate, the CPUs to return to the parent are computed as: adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); where xcpus = user_xcpus(cs) which returns cs->exclusive_cpus (if set) or cs->cpus_allowed. When exclusive_cpus is not set, user_xcpus(cs) can contain CPUs that were never actually granted to the partition due to sibling exclusion in compute_excpus(). Consequently, the invalidation may return CPUs to the parent that remain in use by sibling partitions, causing overlapping effective_cpus and triggering the WARN_ON_ONCE(1) in generate_sched_domains(). Use cs->effective_xcpus instead, which reflects the CPUs actually granted to this partition. Reproducer (on a 4-CPU machine): cd /sys/fs/cgroup mkdir a1 b1 # a1 becomes partition root with CPUs 0-1 echo "0-1" > a1/cpuset.cpus echo "root" > a1/cpuset.cpus.partition # b1 becomes partition root with CPUs 1-2, but sibling exclusion # reduces its effective_xcpus to CPU 2 only echo "1-2" > b1/cpuset.cpus echo "root" > b1/cpuset.cpus.partition # b1 changes cpus_allowed to 0-1 -> partition invalidation echo "0-1" > b1/cpuset.cpus # Expected: CPUs 2-3 (only CPU 2 returned from b1) # Actual: CPUs 1-3 (CPU 0-1 returned, overlapping with a1) cat cpuset.cpus.effective dmesg will also show a WARNING from generate_sched_domains() reporting overlapping partition root effective_cpus. Fixes: 2a3602030d80 ("cgroup/cpuset: Don't invalidate sibling partitions on cpuset.cpus conflict") Cc: stable@vger.kernel.org # v7.0+ Signed-off-by: sunshaojie Tested-by: Chen Ridong Reviewed-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index e84e801e22cf42..5c33ab20cc208b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1718,7 +1718,8 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, */ if (is_partition_valid(parent)) adding = cpumask_and(tmp->addmask, - xcpus, parent->effective_xcpus); + cs->effective_xcpus, + parent->effective_xcpus); if (old_prs > 0) new_prs = -old_prs; From d6a2d7b04b5a093021a7a0e2e69e9d5237dfa8cc Mon Sep 17 00:00:00 2001 From: Nicholas Carlini Date: Mon, 11 May 2026 18:02:16 +0000 Subject: [PATCH 841/972] io-wq: check that the predecessor is hashed in io_wq_remove_pending() io_wq_remove_pending() needs to fix up wq->hash_tail[] if the cancelled work was the tail of its hash bucket. When doing this, it checks whether the preceding entry in acct->work_list has the same hash value, but never checks that the predecessor is hashed at all. io_get_work_hash() is simply atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT, and the hash bits are never set for non-hashed work, so it returns 0. Thus, when a hashed bucket-0 work is cancelled while a non-hashed work is its list predecessor, the check spuriously passes and a pointer to the non-hashed io_kiocb is stored in wq->hash_tail[0]. Because non-hashed work is dequeued via the fast path in io_get_next_work(), which never touches hash_tail[], the stale pointer is never cleared. Therefore, after the non-hashed io_kiocb completes and is freed back to req_cachep, wq->hash_tail[0] is a dangling pointer. The io_wq is per-task (tctx->io_wq) and survives ring open/close, so the dangling pointer persists for the lifetime of the task; the next hashed bucket-0 enqueue dereferences it in io_wq_insert_work() and wq_list_add_after() writes through freed memory. Add the missing io_wq_is_hashed() check so a non-hashed predecessor never inherits a hash_tail[] slot. Cc: stable@vger.kernel.org Fixes: 204361a77f40 ("io-wq: fix hang after cancelling pending hashed work") Signed-off-by: Nicholas Carlini Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 7a9f94a0ce6f2f..8cc7b47d30894a 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -1124,7 +1124,8 @@ static inline void io_wq_remove_pending(struct io_wq *wq, if (io_wq_is_hashed(work) && work == wq->hash_tail[hash]) { if (prev) prev_work = container_of(prev, struct io_wq_work, list); - if (prev_work && io_get_work_hash(prev_work) == hash) + if (prev_work && io_wq_is_hashed(prev_work) && + io_get_work_hash(prev_work) == hash) wq->hash_tail[hash] = prev_work; else wq->hash_tail[hash] = NULL; From 32d5019ed3b6ff4439cb075fb275f655c8a2059c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 May 2026 07:01:47 +0200 Subject: [PATCH 842/972] block: pass a minsize argument to bio_iov_iter_bounce When bouncing for block size > PAGE_SIZE file systems that require file system block size alignment (e.g. zoned XFS), the bio needs to be big enough to fit an entire block. Fixes: 8dd5e7c75d7b ("block: add helpers to bounce buffer an iov_iter into bios") Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20260507050153.1298375-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 23 +++++++++++++---------- fs/iomap/direct-io.c | 2 +- include/linux/bio.h | 3 ++- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/block/bio.c b/block/bio.c index b8972dba68a090..f3e5d8bea08cd6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1279,11 +1279,12 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, return bio_iov_iter_align_down(bio, iter, len_align_mask); } -static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size) +static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size, + size_t minsize) { struct folio *folio; - while (*size > PAGE_SIZE) { + while (*size > minsize) { folio = folio_alloc(gfp | __GFP_NORETRY, get_order(*size)); if (folio) return folio; @@ -1307,7 +1308,7 @@ static void bio_free_folios(struct bio *bio) } static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter, - size_t maxlen) + size_t maxlen, size_t minsize) { size_t total_len = min(maxlen, iov_iter_count(iter)); @@ -1322,13 +1323,13 @@ static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter, size_t this_len = min(total_len, SZ_1M); struct folio *folio; - if (this_len > PAGE_SIZE * 2) + if (this_len > minsize * 2) this_len = rounddown_pow_of_two(this_len); if (bio->bi_iter.bi_size > BIO_MAX_SIZE - this_len) break; - folio = folio_alloc_greedy(GFP_KERNEL, &this_len); + folio = folio_alloc_greedy(GFP_KERNEL, &this_len, minsize); if (!folio) break; bio_add_folio_nofail(bio, folio, this_len, 0); @@ -1348,12 +1349,12 @@ static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter, } static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter, - size_t maxlen) + size_t maxlen, size_t minsize) { size_t len = min3(iov_iter_count(iter), maxlen, SZ_1M); struct folio *folio; - folio = folio_alloc_greedy(GFP_KERNEL, &len); + folio = folio_alloc_greedy(GFP_KERNEL, &len, minsize); if (!folio) return -ENOMEM; @@ -1390,6 +1391,7 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter, * @bio: bio to send * @iter: iter to read from / write into * @maxlen: maximum size to bounce + * @minsize: minimum folio allocation size * * Helper for direct I/O implementations that need to bounce buffer because * we need to checksum the data or perform other operations that require @@ -1397,11 +1399,12 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter, * copies the data into it. Needs to be paired with bio_iov_iter_unbounce() * called on completion. */ -int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen) +int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen, + size_t minsize) { if (op_is_write(bio_op(bio))) - return bio_iov_iter_bounce_write(bio, iter, maxlen); - return bio_iov_iter_bounce_read(bio, iter, maxlen); + return bio_iov_iter_bounce_write(bio, iter, maxlen, minsize); + return bio_iov_iter_bounce_read(bio, iter, maxlen, minsize); } static void bvec_unpin(struct bio_vec *bv, bool mark_dirty) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b0a6549b38487b..b36ee619cdcdda 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -355,7 +355,7 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, if (dio->flags & IOMAP_DIO_BOUNCE) ret = bio_iov_iter_bounce(bio, dio->submit.iter, - iomap_max_bio_size(&iter->iomap)); + iomap_max_bio_size(&iter->iomap), alignment); else ret = bio_iov_iter_get_pages(bio, dio->submit.iter, alignment - 1); diff --git a/include/linux/bio.h b/include/linux/bio.h index 97d747320b35bc..dc17780d6c1e36 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -475,7 +475,8 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen); +int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen, + size_t minsize); void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty); extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, From e7b8b3c5b2a65595d506ffedafac66f0a11fbdc2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 May 2026 07:01:48 +0200 Subject: [PATCH 843/972] block: align down bounces bios Just like for the extract user pages path, we need to align down the size to the supported boundary. Fixes: 8dd5e7c75d7b ("block: add helpers to bounce buffer an iov_iter into bios") Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20260507050153.1298375-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bio.c b/block/bio.c index f3e5d8bea08cd6..5f10900b3f42a1 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1345,7 +1345,7 @@ static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter, if (!bio->bi_iter.bi_size) return -ENOMEM; - return 0; + return bio_iov_iter_align_down(bio, iter, minsize - 1); } static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter, @@ -1383,7 +1383,7 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter, bvec_set_folio(&bio->bi_io_vec[0], folio, bio->bi_iter.bi_size, 0); if (iov_iter_extract_will_pin(iter)) bio_set_flag(bio, BIO_PAGE_PINNED); - return 0; + return bio_iov_iter_align_down(bio, iter, minsize - 1); } /** From c64a647c84f30be368404f50f9052ed6c75c0f17 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 7 May 2026 08:35:46 -0600 Subject: [PATCH 844/972] vfio/pci: fix dma-buf kref underflow after revoke MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vfio_pci_dma_buf_move(revoked=true) and vfio_pci_dma_buf_cleanup() ran the same drain sequence: set priv->revoked, invalidate mappings, wait for fences, drop the registered kref, wait for completion. When the VFIO device fd was closed after PCI_COMMAND_MEMORY had been cleared, both ran in turn -- the second kref_put underflowed and the subsequent wait_for_completion() blocked on a completion that the first run had already consumed: refcount_t: underflow; use-after-free. WARNING: lib/refcount.c:28 at refcount_warn_saturate+0x59/0x90 Call Trace: vfio_pci_dma_buf_cleanup+0x163/0x168 [vfio_pci_core] vfio_pci_core_close_device+0x67/0xe0 [vfio_pci_core] vfio_df_close+0x4c/0x80 [vfio] vfio_df_group_close+0x36/0x80 [vfio] vfio_device_fops_release+0x21/0x40 [vfio] __fput+0xe6/0x2b0 __x64_sys_close+0x3d/0x80 Collapse the duplication: vfio_pci_dma_buf_cleanup() now delegates the drain to vfio_pci_dma_buf_move(true), which is idempotent for already-revoked dma-bufs. cleanup retains only list removal and the device registration drop; the dma_resv_lock that bracketed those is dropped along with the in-line drain that required it, memory_lock continues to protect them. Re-arm the kref and the completion at the end of move()'s revoke branch so post-revoke state matches post-creation (kref == 1, completion ready). This keeps cleanup's call into move() a no-op when revoke already ran, and replaces the explicit kref_init() that the un-revoke branch used to perform for the un-revoke -> remap path. Fixes: 1a8a5227f229 ("vfio: Wait for dma-buf invalidation to complete") Reported-by: Joonas Kylmälä Closes: https://lore.kernel.org/all/GVXPR02MB12019AA6014F27EF5D773E89BFB372@GVXPR02MB12019.eurprd02.prod.outlook.com/ Cc: stable@vger.kernel.org Assisted-by: Claude:claude-opus-4-7 Reviewed-by: Leon Romanovsky Signed-off-by: Alex Williamson Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20260507143548.1018405-1-alex.williamson@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_dmabuf.c | 36 +++++++++++++++--------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index f87fd32e4a0176..fdc22e8b465680 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -354,19 +354,18 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) if (revoked) { kref_put(&priv->kref, vfio_pci_dma_buf_done); wait_for_completion(&priv->comp); - } else { /* - * Kref is initialize again, because when revoke - * was performed the reference counter was decreased - * to zero to trigger completion. + * Re-arm the registered kref reference and the + * completion so the post-revoke state matches the + * post-creation state. An un-revoke followed by a + * new mapping needs the kref to be non-zero before + * kref_get(), and vfio_pci_dma_buf_cleanup() + * delegates its drain back through this revoke + * path on a possibly-already-revoked dma-buf. */ kref_init(&priv->kref); - /* - * There is no need to wait as no mapping was - * performed when the previous status was - * priv->revoked == true. - */ reinit_completion(&priv->comp); + } else { dma_resv_lock(priv->dmabuf->resv, NULL); priv->revoked = false; dma_resv_unlock(priv->dmabuf->resv); @@ -382,21 +381,22 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) struct vfio_pci_dma_buf *tmp; down_write(&vdev->memory_lock); + + /* + * Drain any active mappings via the revoke path. The move is + * idempotent for dma-bufs already in the revoked state and + * leaves every priv with the kref re-armed and the completion + * ready, so cleanup itself does not need to participate in kref + * bookkeeping. + */ + vfio_pci_dma_buf_move(vdev, true); + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { if (!get_file_active(&priv->dmabuf->file)) continue; - dma_resv_lock(priv->dmabuf->resv, NULL); list_del_init(&priv->dmabufs_elm); priv->vdev = NULL; - priv->revoked = true; - dma_buf_invalidate_mappings(priv->dmabuf); - dma_resv_wait_timeout(priv->dmabuf->resv, - DMA_RESV_USAGE_BOOKKEEP, false, - MAX_SCHEDULE_TIMEOUT); - dma_resv_unlock(priv->dmabuf->resv); - kref_put(&priv->kref, vfio_pci_dma_buf_done); - wait_for_completion(&priv->comp); vfio_device_put_registration(&vdev->vdev); fput(priv->dmabuf->file); } From 6ae315d37924435516d697ea7dde0b799a5928e0 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 13 May 2026 13:24:38 +0200 Subject: [PATCH 845/972] sched_ext: Use HK_TYPE_DOMAIN_BOOT to detect isolcpus= domain isolation scx_enable() refuses to attach a BPF scheduler when isolcpus=domain is in effect by comparing housekeeping_cpumask(HK_TYPE_DOMAIN) against cpu_possible_mask. Since commit 27c3a5967f05 ("sched/isolation: Convert housekeeping cpumasks to rcu pointers"), HK_TYPE_DOMAIN's cpumask is RCU protected and dereferencing it requires either RCU read lock, the cpu_hotplug write lock, or the cpuset lock; scx_enable() holds none of these, so booting with isolcpus=domain and attaching any BPF scheduler triggers the following lockdep splat: ============================= WARNING: suspicious RCU usage ----------------------------- kernel/sched/isolation.c:60 suspicious rcu_dereference_check() usage! 1 lock held by scx_flash/281: #0: ffffffff8379fce0 (update_mutex){+.+.}-{4:4}, at: bpf_struct_ops_link_create+0x134/0x1c0 Call Trace: dump_stack_lvl+0x6f/0xb0 lockdep_rcu_suspicious.cold+0x37/0x70 housekeeping_cpumask+0xcd/0xe0 scx_enable.isra.0+0x17/0x120 bpf_scx_reg+0x5e/0x80 bpf_struct_ops_link_create+0x151/0x1c0 __sys_bpf+0x1e4b/0x33c0 __x64_sys_bpf+0x21/0x30 do_syscall_64+0x117/0xf80 entry_SYSCALL_64_after_hwframe+0x77/0x7f In addition, commit 03ff73510169 ("cpuset: Update HK_TYPE_DOMAIN cpumask from cpuset") made HK_TYPE_DOMAIN include cpuset isolated partitions as well, which means the current check also rejects BPF schedulers when a cpuset partition is active. That contradicts the original intent of commit 9f391f94a173 ("sched_ext: Disallow loading BPF scheduler if isolcpus= domain isolation is in effect"), which explicitly noted that cpuset partitions are honored through per-task cpumasks and should not be rejected. Switch to housekeeping_enabled(HK_TYPE_DOMAIN_BOOT), which reads only the housekeeping flag bit (no RCU dereference) and reflects exactly the boot-time isolcpus= configuration that the error message refers to. Fixes: 27c3a5967f05 ("sched/isolation: Convert housekeeping cpumasks to rcu pointers") Cc: stable@vger.kernel.org # v7.0+ Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo Acked-by: Frederic Weisbecker --- kernel/sched/ext.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 23f7b3f63b09b2..a6d0a93d81748b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -7415,8 +7415,7 @@ static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) static DEFINE_MUTEX(helper_mutex); struct scx_enable_cmd cmd; - if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), - cpu_possible_mask)) { + if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) { pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); return -EINVAL; } From df733ddc263dbe5f471e7c80c8b669532f56bf76 Mon Sep 17 00:00:00 2001 From: Matt Evans Date: Mon, 11 May 2026 07:46:42 -0700 Subject: [PATCH 846/972] vfio/pci: Make VFIO_PCI_OFFSET_TO_INDEX() return unsigned VFIO_PCI_OFFSET_TO_INDEX() is used in several places with a signed parameter (e.g. loff_t). Because it makes no sense for a BAR/resource index to be negative, enforce this in the macro. This fixes at least one current issue, where vfio_pci_ioeventfd() uses this macro with an unvalidated signed loff_t returned into a signed type, leading to a possible negative array access. This instance does test against an out-of-bounds positive value, so treating the index as unsigned fixes this issue. Fixes: 89e1f7d4c66d8 ("vfio: Add PCI device driver") Signed-off-by: Matt Evans Link: https://lore.kernel.org/r/20260511144642.2926799-1-mattev@meta.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 2ebba746c18f7b..89165b769e5c16 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -21,7 +21,7 @@ #define VFIO_PCI_CORE_H #define VFIO_PCI_OFFSET_SHIFT 40 -#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_OFFSET_TO_INDEX(off) ((u64)(off) >> VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) From 4694efc4164123580f19467141cdcfb73f7a740a Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Sat, 9 May 2026 22:04:50 +0100 Subject: [PATCH 847/972] FDDI: defza: Sanitise the reset safety timer The reset actions of the DEFZA adapters are exceedingly slow, taking up to 30 seconds to complete by the device spec and typically in the range of 10 seconds in reality, as required for the device RTOS to boot, still quite a lot. Therefore a state machine is used that's interrupt driven, however a safety mechanism is required in case of adapter malfunction, so that if no state change interrupt has arrived in time, then the situation is taken care of. The safety mechanism depends on the origin of the reset. For regular adapter initialisation at the device probe time a sleep is requested. However a reset is also required by the device spec when the adapter has transitioned into the halted state, such as in response to a PC Trace event in the course of ring fault recovery, possibly a common network event. In that case no sleep is possible as a device halt is reported at the hardirq level. A timer is therefore set up to ensure progress in case no adapter state change interrupt has arrived in time, but as from commit 168f6b6ffbee ("timers: Use del_timer_sync() even on UP") a warning is issued as the timer is deleted in the hardirq handler upon an expected state change: defza: v.1.1.4 Oct 6 2018 Maciej W. Rozycki tc2: DEC FDDIcontroller 700 or 700-C at 0x18000000, irq 4 tc2: resetting the board... ------------[ cut here ]------------ WARNING: kernel/time/timer.c:1611 at __timer_delete_sync+0x104/0x120, CPU#0: swapper/0/0 Modules linked in: CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 7.0.0-dirty #2 VOLUNTARY Stack : 9800000002027d08 00000000140120e0 0000000000000000 ffffffff8089d468 0000000000000000 0000000000000000 ffffffff807ed6b8 ffffffff80897458 ffffffff80897400 9800000002027b88 0000000000000000 7070617773203a6d 0000000000000000 9800000002027ba4 0000000000001000 6465746e69617420 0000000000000000 ffffffff807ed6b8 00000000140120e0 0000000000000009 000000000000064b ffffffff800dd14c 0000000000000036 9800000002184000 0000000000000000 0000000000000020 0000000000000000 ffffffff80910000 ffffffff8085c000 9800000002027c70 0000000000000001 ffffffff80045fa0 0000000000000000 0000000000000000 0000000000000000 0000000000000009 000000000000064b ffffffff800502b8 ffffffff807ed6b8 ffffffff80045fa0 ... Call Trace: [] show_stack+0x28/0xf0 [] dump_stack_lvl+0x48/0x7c [] __warn+0xa0/0x128 [] warn_slowpath_fmt+0x64/0xa4 [] __timer_delete_sync+0x104/0x120 [] fza_interrupt+0xc74/0xeb8 [] __handle_irq_event_percpu+0x70/0x228 [] handle_irq_event_percpu+0x18/0x78 [] handle_percpu_irq+0x50/0x80 [] generic_handle_irq+0x90/0xd0 [] do_IRQ+0x1c/0x30 [] handle_int+0x148/0x154 [] do_idle+0x40/0x108 [] cpu_startup_entry+0x2c/0x38 [] kernel_init+0x0/0x108 ---[ end trace 0000000000000000 ]--- tc2: OK tc2: model 700 (DEFZA-AA), MMF PMD, address 08-00-2b-xx-xx-xx tc2: ROM rev. 1.0, firmware rev. 1.2, RMC rev. A, SMT ver. 1 tc2: link unavailable ------------[ cut here ]------------ WARNING: kernel/time/timer.c:1611 at __timer_delete_sync+0x104/0x120, CPU#0: swapper/0/0 Modules linked in: CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Tainted: G W 7.0.0-dirty #2 VOLUNTARY Tainted: [W]=WARN Stack : 9800000002027d08 00000000140120e0 0000000000000000 ffffffff8089d468 0000000000000000 0000000000000000 ffffffff807ed6b8 ffffffff80897458 ffffffff80897400 9800000002027b88 0000000000000000 0000000000000000 0000000000000000 9800000002027ba4 0000000000001000 0000000000000000 0000000000000000 ffffffff807ed6b8 00000000140120e0 0000000000000009 000000000000064b ffffffff800dd14c 0000000000000036 9800000002184000 0000000000000000 0000000000000020 0000000000000000 ffffffff80910000 ffffffff8085c000 9800000002027c70 0000000000000001 ffffffff80045fa0 0000000000000000 0000000000000000 0000000000000000 0000000000000009 000000000000064b ffffffff800502b8 ffffffff807ed6b8 ffffffff80045fa0 ... Call Trace: [] show_stack+0x28/0xf0 [] dump_stack_lvl+0x48/0x7c [] __warn+0xa0/0x128 [] warn_slowpath_fmt+0x64/0xa4 [] __timer_delete_sync+0x104/0x120 [] fza_interrupt+0xc74/0xeb8 [] __handle_irq_event_percpu+0x70/0x228 [] handle_irq_event_percpu+0x18/0x78 [] handle_percpu_irq+0x50/0x80 [] generic_handle_irq+0x90/0xd0 [] do_IRQ+0x1c/0x30 [] handle_int+0x148/0x154 [] arch_local_irq_disable+0x4/0x28 [] do_idle+0x50/0x108 [] cpu_startup_entry+0x2c/0x38 [] kernel_init+0x0/0x108 ---[ end trace 0000000000000000 ]--- tc2: registered as fddi0 The immediate origin of the new warning is the switch away from aliasing del_timer_sync() to del_timer() (timer_delete_sync() to timer_delete() in terms of current function names) for UP configurations, which however is the only choice for this driver anyway as no SMP hardware supports the TURBOchannel bus this device interfaces to. Therefore there is a very remote issue only this is a sign of. Specifically if an adapter reset issued upon a transition to the halted state times out and first triggers fza_reset_timer() for another reset assertion, which then schedules fza_reset_timer() for reset deassertion and then that second call is pre-empted after poking at the hardware, but before the timer has been rearmed and owing to high system load causing exceedingly high scheduling latency control is not handed back before a transition to the uninitialised state has caused the timer to be deleted even before it has been started, then fza_reset_timer() will be called yet again and issue another reset even though by then the adapter has already recovered. Prevent this situation from happening by switching to timer_delete() for the transition to the halted state and protect the code region affected with a spinlock, also to make sure add_timer() has not been called twice in a row due to an execution race between the interrupt handler and the timer handler (though it could only happen on SMP, but let's keep the driver clean). It's a very unlikely sequence of events to happen and therefore there's no point in trying to be overly clever about it, such as by placing printk() calls outside the protection. For the transition to the uninitialised state switch to timer_delete_sync_try() instead, so that a timer isn't deleted that's just been rearmed by the timer handler and needs to watch for the device to come out of reset again (again, an SMP scenario only). Retain timer_delete_sync() invocations outside the hardirq context for a stray timer not to fire once device structures have been released. Fixes: 61414f5ec9834 ("FDDI: defza: Add support for DEC FDDIcontroller 700 TURBOchannel adapter") Signed-off-by: Maciej W. Rozycki Signed-off-by: Jakub Kicinski --- drivers/net/fddi/defza.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/fddi/defza.c b/drivers/net/fddi/defza.c index 064fa484f797af..9bfecc87d6b252 100644 --- a/drivers/net/fddi/defza.c +++ b/drivers/net/fddi/defza.c @@ -984,7 +984,7 @@ static irqreturn_t fza_interrupt(int irq, void *dev_id) case FZA_STATE_UNINITIALIZED: netif_carrier_off(dev); - timer_delete_sync(&fp->reset_timer); + timer_delete_sync_try(&fp->reset_timer); fp->ring_cmd_index = 0; fp->ring_uns_index = 0; fp->ring_rmc_tx_index = 0; @@ -1018,7 +1018,9 @@ static irqreturn_t fza_interrupt(int irq, void *dev_id) fp->queue_active = 0; netif_stop_queue(dev); pr_debug("%s: queue stopped\n", fp->name); - timer_delete_sync(&fp->reset_timer); + + spin_lock(&fp->lock); + timer_delete(&fp->reset_timer); pr_warn("%s: halted, reason: %x\n", fp->name, FZA_STATUS_GET_HALT(status)); fza_regs_dump(fp); @@ -1027,6 +1029,8 @@ static irqreturn_t fza_interrupt(int irq, void *dev_id) fp->timer_state = 0; fp->reset_timer.expires = jiffies + 45 * HZ; add_timer(&fp->reset_timer); + spin_unlock(&fp->lock); + break; default: @@ -1046,7 +1050,9 @@ static irqreturn_t fza_interrupt(int irq, void *dev_id) static void fza_reset_timer(struct timer_list *t) { struct fza_private *fp = timer_container_of(fp, t, reset_timer); + unsigned long flags; + spin_lock_irqsave(&fp->lock, flags); if (!fp->timer_state) { pr_err("%s: RESET timed out!\n", fp->name); pr_info("%s: trying harder...\n", fp->name); @@ -1069,6 +1075,7 @@ static void fza_reset_timer(struct timer_list *t) fp->reset_timer.expires = jiffies + 45 * HZ; } add_timer(&fp->reset_timer); + spin_unlock_irqrestore(&fp->lock, flags); } static int fza_set_mac_address(struct net_device *dev, void *addr) From 5a30862dec5a70da0a9d259de3f87a7542cc95b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Tue, 12 May 2026 11:03:53 -0300 Subject: [PATCH 848/972] ASoC: sdw_utils: Check speaker component string allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit devm_kasprintf() can fail while building the temporary speaker component string. If that happens, spk_components is set to NULL, but the current code can still pass it to strlen() on a later loop iteration or after the loop when appending the speaker component list to card->components. Use NULL to represent the initial "no speaker components" state, and return -ENOMEM immediately if building spk_components fails. Fixes: 0f60ecffbfe3 ("ASoC: sdw_utils: generate combined spk components string") Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260512-asoc-sdw-utils-spk-components-alloc-v1-1-c9bbd6d2e123@gmail.com Signed-off-by: Mark Brown --- sound/soc/sdw_utils/soc_sdw_utils.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sound/soc/sdw_utils/soc_sdw_utils.c b/sound/soc/sdw_utils/soc_sdw_utils.c index be45a2e62d3ea6..e440c23271001c 100644 --- a/sound/soc/sdw_utils/soc_sdw_utils.c +++ b/sound/soc/sdw_utils/soc_sdw_utils.c @@ -1114,7 +1114,7 @@ int asoc_sdw_rtd_init(struct snd_soc_pcm_runtime *rtd) struct asoc_sdw_codec_info *codec_info; struct snd_soc_dai *dai; struct sdw_slave *sdw_peripheral; - const char *spk_components=""; + const char *spk_components = NULL; int dai_index; int ret; int i; @@ -1197,7 +1197,7 @@ int asoc_sdw_rtd_init(struct snd_soc_pcm_runtime *rtd) else component = codec_info->dais[dai_index].component_name; - if (strlen (spk_components) == 0) + if (!spk_components) spk_components = devm_kasprintf(card->dev, GFP_KERNEL, "%s", component); else @@ -1205,13 +1205,15 @@ int asoc_sdw_rtd_init(struct snd_soc_pcm_runtime *rtd) spk_components = devm_kasprintf(card->dev, GFP_KERNEL, "%s+%s", spk_components, component); + + if (!spk_components) + return -ENOMEM; } codec_info->dais[dai_index].rtd_init_done = true; - } - if (strlen (spk_components) > 0) { + if (spk_components) { /* Update card components for speaker components */ card->components = devm_kasprintf(card->dev, GFP_KERNEL, "%s spk:%s", card->components, spk_components); From c157f03b48644cc4befd48484e06408cdab2d8e4 Mon Sep 17 00:00:00 2001 From: Niranjan H Y Date: Wed, 13 May 2026 07:25:42 +0530 Subject: [PATCH 849/972] ASoC: sdw_utils: Remove dead code in asoc_sdw_ti_add_tac5xx2_routes() Remove unnecessary checks for scnprintf() return values in asoc_sdw_ti_add_tac5xx2_routes(). The function scnprintf() never returns negative values and cannot return zero given the format strings used ("%s SPK_L" and "%s SPK_R"). The existing length validation at line 110 already ensures that name_prefix won't cause buffer overflow, and scnprintf() guarantees null-termination even in case of truncation. Fixes: e812de61e9a0 ("ASoC: sdw_utils: TI amp utility for tac5xx2 family") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-sound/agF8GBcHYUaGJbXY@stanley.mountain/ Signed-off-by: Niranjan H Y Link: https://patch.msgid.link/20260513015542.2420-1-niranjan.hy@ti.com Signed-off-by: Mark Brown --- sound/soc/sdw_utils/soc_sdw_ti_amp.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/sound/soc/sdw_utils/soc_sdw_ti_amp.c b/sound/soc/sdw_utils/soc_sdw_ti_amp.c index f156116fbeb652..d0ae5d7efe8f71 100644 --- a/sound/soc/sdw_utils/soc_sdw_ti_amp.c +++ b/sound/soc/sdw_utils/soc_sdw_ti_amp.c @@ -105,18 +105,12 @@ static int asoc_sdw_ti_add_tac5xx2_routes(struct snd_soc_dapm_context *dapm, struct snd_soc_dapm_route routes[2]; char left_widget[TAC5XX2_WIDGET_NAME_MAX]; char right_widget[TAC5XX2_WIDGET_NAME_MAX]; - int ret; if (strlen(name_prefix) > (TAC5XX2_WIDGET_NAME_MAX - 7)) return -ENAMETOOLONG; - ret = scnprintf(left_widget, sizeof(left_widget), "%s SPK_L", name_prefix); - if (ret <= 0) - return -EINVAL; - - ret = scnprintf(right_widget, sizeof(right_widget), "%s SPK_R", name_prefix); - if (ret <= 0) - return -EINVAL; + scnprintf(left_widget, sizeof(left_widget), "%s SPK_L", name_prefix); + scnprintf(right_widget, sizeof(right_widget), "%s SPK_R", name_prefix); routes[0] = (struct snd_soc_dapm_route){"Left Spk", NULL, left_widget}; routes[1] = (struct snd_soc_dapm_route){"Right Spk", NULL, right_widget}; From 5d03a4f357100c6ba369ec8c2432d755d006d0dc Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 13 May 2026 11:15:49 +0800 Subject: [PATCH 850/972] ASoC: sdw_utils: add soc_sdw_es9356 Add a utility program for handling ES9356 in the universal machine driver Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20260513031554.5422-2-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- include/sound/soc_sdw_utils.h | 14 ++ sound/soc/sdw_utils/Makefile | 1 + sound/soc/sdw_utils/soc_sdw_es9356.c | 230 +++++++++++++++++++++++++++ 3 files changed, 245 insertions(+) create mode 100644 sound/soc/sdw_utils/soc_sdw_es9356.c diff --git a/include/sound/soc_sdw_utils.h b/include/sound/soc_sdw_utils.h index d713ab2f66203f..79c21966220b24 100644 --- a/include/sound/soc_sdw_utils.h +++ b/include/sound/soc_sdw_utils.h @@ -223,6 +223,17 @@ int asoc_sdw_cs42l43_spk_init(struct snd_soc_card *card, struct asoc_sdw_codec_info *info, bool playback); +/* es9356 codec support */ +int asoc_sdw_es9356_init(struct snd_soc_card *card, + struct snd_soc_dai_link *dai_links, + struct asoc_sdw_codec_info *info, + bool playback); +int asoc_sdw_es9356_amp_init(struct snd_soc_card *card, + struct snd_soc_dai_link *dai_links, + struct asoc_sdw_codec_info *info, + bool playback); +int asoc_sdw_es9356_exit(struct snd_soc_card *card, struct snd_soc_dai_link *dai_link); + /* CS AMP support */ int asoc_sdw_bridge_cs35l56_count_sidecar(struct snd_soc_card *card, int *num_dais, int *num_devs); @@ -278,5 +289,8 @@ int asoc_sdw_ti_amp_initial_settings(struct snd_soc_card *card, const char *name_prefix); int asoc_sdw_ti_dmic_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); int asoc_sdw_ti_sdca_jack_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); +int asoc_sdw_es9356_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); +int asoc_sdw_es9356_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); +int asoc_sdw_es9356_dmic_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); #endif diff --git a/sound/soc/sdw_utils/Makefile b/sound/soc/sdw_utils/Makefile index a8d091fd374b6e..5ae8c69b8b98b7 100644 --- a/sound/soc/sdw_utils/Makefile +++ b/sound/soc/sdw_utils/Makefile @@ -8,6 +8,7 @@ snd-soc-sdw-utils-y := soc_sdw_utils.o soc_sdw_dmic.o soc_sdw_rt_dmic.o \ soc_sdw_cs42l45.o \ soc_sdw_cs47l47.o \ soc_sdw_cs_amp.o \ + soc_sdw_es9356.o \ soc_sdw_maxim.o \ soc_sdw_ti_amp.o obj-$(CONFIG_SND_SOC_SDW_UTILS) += snd-soc-sdw-utils.o diff --git a/sound/soc/sdw_utils/soc_sdw_es9356.c b/sound/soc/sdw_utils/soc_sdw_es9356.c new file mode 100644 index 00000000000000..aa405e7dad9942 --- /dev/null +++ b/sound/soc/sdw_utils/soc_sdw_es9356.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Based on sof_sdw_rt5682.c +// This file incorporates work covered by the following copyright notice: +// Copyright (c) 2023 Intel Corporation +// Copyright (c) 2024 Advanced Micro Devices, Inc. +// Copyright (c) 2025 Everest Semiconductor Co., Ltd + +/* + * soc_sdw_es9356 - Helpers to handle ES9356 from generic machine driver + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Note this MUST be called before snd_soc_register_card(), so that the props + * are in place before the codec component driver's probe function parses them. + */ +static int es9356_add_codec_device_props(struct device *sdw_dev, unsigned long quirk) +{ + struct property_entry props[SOC_SDW_MAX_NO_PROPS] = {}; + struct fwnode_handle *fwnode; + int ret; + + if (!SOC_SDW_JACK_JDSRC(quirk)) + return 0; + props[0] = PROPERTY_ENTRY_U32("everest,jd-src", SOC_SDW_JACK_JDSRC(quirk)); + + fwnode = fwnode_create_software_node(props, NULL); + if (IS_ERR(fwnode)) + return PTR_ERR(fwnode); + + ret = device_add_software_node(sdw_dev, to_software_node(fwnode)); + + fwnode_handle_put(fwnode); + + return ret; +} + +static const struct snd_soc_dapm_route es9356_map[] = { + /* Headphones */ + { "Headphone", NULL, "es9356 HP" }, + { "es9356 MIC1", NULL, "Headset Mic" }, +}; + +static const struct snd_soc_dapm_route es9356_spk_map[] = { + /* Speaker */ + { "Speaker", NULL, "es9356 SPK" }, +}; + +static const struct snd_soc_dapm_route es9356_dmic_map[] = { + /* DMIC */ + { "es9356 PDM_DIN", NULL, "DMIC" }, +}; + +static struct snd_soc_jack_pin es9356_jack_pins[] = { + { + .pin = "Headphone", + .mask = SND_JACK_HEADPHONE, + }, + { + .pin = "Headset Mic", + .mask = SND_JACK_MICROPHONE, + }, +}; + +int asoc_sdw_es9356_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai) +{ + struct snd_soc_card *card = rtd->card; + struct snd_soc_dapm_context *dapm = snd_soc_card_to_dapm(card); + int ret; + + card->components = devm_kasprintf(card->dev, GFP_KERNEL, + "%s spk:es9356-spk", + card->components); + if (!card->components) + return -ENOMEM; + + ret = snd_soc_dapm_add_routes(dapm, es9356_spk_map, + ARRAY_SIZE(es9356_spk_map)); + if (ret) + dev_err(card->dev, "es9356 map addition failed: %d\n", ret); + + return ret; +} +EXPORT_SYMBOL_NS(asoc_sdw_es9356_spk_rtd_init, "SND_SOC_SDW_UTILS"); + +int asoc_sdw_es9356_dmic_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai) +{ + struct snd_soc_card *card = rtd->card; + struct snd_soc_dapm_context *dapm = snd_soc_card_to_dapm(card); + int ret; + + card->components = devm_kasprintf(card->dev, GFP_KERNEL, + "%s mic:es9356-dmic", + card->components); + if (!card->components) + return -ENOMEM; + + ret = snd_soc_dapm_add_routes(dapm, es9356_dmic_map, + ARRAY_SIZE(es9356_dmic_map)); + if (ret) + dev_err(card->dev, "es9356 map addition failed: %d\n", ret); + + return ret; +} +EXPORT_SYMBOL_NS(asoc_sdw_es9356_dmic_rtd_init, "SND_SOC_SDW_UTILS"); + +int asoc_sdw_es9356_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai) +{ + struct snd_soc_card *card = rtd->card; + struct snd_soc_dapm_context *dapm = snd_soc_card_to_dapm(card); + struct asoc_sdw_mc_private *ctx = snd_soc_card_get_drvdata(card); + struct snd_soc_component *component; + struct snd_soc_jack *jack; + int ret; + + component = dai->component; + card->components = devm_kasprintf(card->dev, GFP_KERNEL, + "%s hs:es9356", + card->components); + if (!card->components) + return -ENOMEM; + + ret = snd_soc_dapm_add_routes(dapm, es9356_map, + ARRAY_SIZE(es9356_map)); + + if (ret) { + dev_err(card->dev, "es9356 map addition failed: %d\n", ret); + return ret; + } + + ret = snd_soc_card_jack_new_pins(rtd->card, "Headset Jack", + SND_JACK_HEADSET | SND_JACK_BTN_0 | + SND_JACK_BTN_1 | SND_JACK_BTN_2 | + SND_JACK_BTN_3 | SND_JACK_BTN_4, + &ctx->sdw_headset, + es9356_jack_pins, + ARRAY_SIZE(es9356_jack_pins)); + if (ret) { + dev_err(rtd->card->dev, "Headset Jack creation failed: %d\n", + ret); + return ret; + } + + jack = &ctx->sdw_headset; + + snd_jack_set_key(jack->jack, SND_JACK_BTN_0, KEY_PLAYPAUSE); + snd_jack_set_key(jack->jack, SND_JACK_BTN_1, KEY_VOLUMEUP); + snd_jack_set_key(jack->jack, SND_JACK_BTN_2, KEY_VOLUMEDOWN); + snd_jack_set_key(jack->jack, SND_JACK_BTN_3, KEY_NEXTSONG); + snd_jack_set_key(jack->jack, SND_JACK_BTN_4, KEY_PREVIOUSSONG); + + ret = snd_soc_component_set_jack(component, jack, NULL); + + if (ret) + dev_err(rtd->card->dev, "Headset Jack call-back failed: %d\n", + ret); + + return ret; +} +EXPORT_SYMBOL_NS(asoc_sdw_es9356_rtd_init, "SND_SOC_SDW_UTILS"); + +int asoc_sdw_es9356_exit(struct snd_soc_card *card, struct snd_soc_dai_link *dai_link) +{ + struct asoc_sdw_mc_private *ctx = snd_soc_card_get_drvdata(card); + + if (!ctx->headset_codec_dev) + return 0; + + device_remove_software_node(ctx->headset_codec_dev); + put_device(ctx->headset_codec_dev); + + return 0; +} +EXPORT_SYMBOL_NS(asoc_sdw_es9356_exit, "SND_SOC_SDW_UTILS"); + +int asoc_sdw_es9356_init(struct snd_soc_card *card, + struct snd_soc_dai_link *dai_links, + struct asoc_sdw_codec_info *info, + bool playback) +{ + struct asoc_sdw_mc_private *ctx = snd_soc_card_get_drvdata(card); + struct device *sdw_dev; + int ret; + + /* + * headset should be initialized once. + * Do it with dai link for playback. + */ + if (!playback) + return 0; + + sdw_dev = bus_find_device_by_name(&sdw_bus_type, NULL, dai_links->codecs[0].name); + if (!sdw_dev) + return -EPROBE_DEFER; + + ret = es9356_add_codec_device_props(sdw_dev, ctx->mc_quirk); + if (ret < 0) { + put_device(sdw_dev); + return ret; + } + ctx->headset_codec_dev = sdw_dev; + + return 0; +} +EXPORT_SYMBOL_NS(asoc_sdw_es9356_init, "SND_SOC_SDW_UTILS"); + +int asoc_sdw_es9356_amp_init(struct snd_soc_card *card, + struct snd_soc_dai_link *dai_links, + struct asoc_sdw_codec_info *info, + bool playback) +{ + if (!playback) + return 0; + + info->amp_num++; + + return 0; +} +EXPORT_SYMBOL_NS(asoc_sdw_es9356_amp_init, "SND_SOC_SDW_UTILS"); From 4ff4fc6fea57c0fcf301bf8591ed337cd27dce64 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 13 May 2026 11:15:50 +0800 Subject: [PATCH 851/972] ASoC: sdw_utils: add ES9356 in codec_info_list Add ES9356 in codec_info_list Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20260513031554.5422-3-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- sound/soc/sdw_utils/soc_sdw_utils.c | 50 +++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/sound/soc/sdw_utils/soc_sdw_utils.c b/sound/soc/sdw_utils/soc_sdw_utils.c index 4506fe9756098e..fe70ce7082133e 100644 --- a/sound/soc/sdw_utils/soc_sdw_utils.c +++ b/sound/soc/sdw_utils/soc_sdw_utils.c @@ -1090,6 +1090,56 @@ struct asoc_sdw_codec_info codec_info_list[] = { }, .aux_num = 1, }, + { + .vendor_id = 0x04b3, + .part_id = 0x9356, + .name_prefix = "es9356", + .version_id = 3, + .dais = { + { + .direction = {true, false}, + .dai_name = "es9356-sdp-aif1", + .dai_type = SOC_SDW_DAI_TYPE_JACK, + .dailink = {SOC_SDW_JACK_OUT_DAI_ID, SOC_SDW_UNUSED_DAI_ID}, + .init = asoc_sdw_es9356_init, + .exit = asoc_sdw_es9356_exit, + .rtd_init = asoc_sdw_es9356_rtd_init, + .controls = generic_jack_controls, + .num_controls = ARRAY_SIZE(generic_jack_controls), + .widgets = generic_jack_widgets, + .num_widgets = ARRAY_SIZE(generic_jack_widgets), + }, + { + .direction = {false, true}, + .dai_name = "es9356-sdp-aif4", + .dai_type = SOC_SDW_DAI_TYPE_MIC, + .dailink = {SOC_SDW_UNUSED_DAI_ID, SOC_SDW_DMIC_DAI_ID}, + .rtd_init = asoc_sdw_es9356_dmic_rtd_init, + .widgets = generic_dmic_widgets, + .num_widgets = ARRAY_SIZE(generic_dmic_widgets), + }, + { + .direction = {false, true}, + .dai_name = "es9356-sdp-aif2", + .dai_type = SOC_SDW_DAI_TYPE_JACK, + .dailink = {SOC_SDW_UNUSED_DAI_ID, SOC_SDW_JACK_IN_DAI_ID}, + }, + { + .direction = {true, false}, + .dai_name = "es9356-sdp-aif3", + .component_name = "es9356", + .dai_type = SOC_SDW_DAI_TYPE_AMP, + .dailink = {SOC_SDW_AMP_OUT_DAI_ID, SOC_SDW_UNUSED_DAI_ID}, + .init = asoc_sdw_es9356_amp_init, + .rtd_init = asoc_sdw_es9356_spk_rtd_init, + .controls = generic_spk_controls, + .num_controls = ARRAY_SIZE(generic_spk_controls), + .widgets = generic_spk_widgets, + .num_widgets = ARRAY_SIZE(generic_spk_widgets), + }, + }, + .dai_num = 4, + }, { .vendor_id = 0x0105, .part_id = 0xaaaa, /* generic codec mockup */ From 320fb29ea23cfa1aeef32563da8748247db896ea Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 11 May 2026 14:30:57 -0400 Subject: [PATCH 852/972] net/sched: sch_cbs: Call qdisc_reset for child qdisc During a reset, CBS is not calling reset on its child qdisc, which might cause qlen/backlog accounting issues. For example, if we have CBS with a QFQ parent and a netem child with delay, we can create a scenario where the parent's qlen underflows. QFQ, specifically, uses qlen to check whether it should deference a pointer, so this scenario may cause a null-ptr deref in QFQ: [ 43.875639][ T319] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000009: 0000 [#1] SMP KASAN NOPTI [ 43.876124][ T319] KASAN: null-ptr-deref in range [0x0000000000000048-0x000000000000004f] [ 43.876417][ T319] CPU: 10 UID: 0 PID: 319 Comm: ping Not tainted 7.0.0-13039-ge728258debd5 #773 PREEMPT(full) [ 43.876751][ T319] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 43.876949][ T319] RIP: 0010:qfq_dequeue+0x35c/0x1650 [ 43.877123][ T319] Code: 00 fc ff df 80 3c 02 00 0f 85 17 0e 00 00 4c 8d 73 48 48 89 9d b8 02 00 00 48 b8 00 00 00 00 00 fc ff df 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 76 0c 00 00 48 b8 00 00 00 00 00 fc ff df 4c 8b [ 43.877648][ T319] RSP: 0018:ffff8881017ef4f0 EFLAGS: 00010216 [ 43.877845][ T319] RAX: dffffc0000000000 RBX: 0000000000000000 RCX: dffffc0000000000 [ 43.878073][ T319] RDX: 0000000000000009 RSI: 0000000c40000000 RDI: ffff88810eef02b0 [ 43.878306][ T319] RBP: ffff88810eef0000 R08: ffff88810eef0280 R09: 1ffff1102120fd63 [ 43.878523][ T319] R10: 1ffff1102120fd66 R11: 1ffff1102120fd67 R12: 0000000c40000000 [ 43.878742][ T319] R13: ffff88810eef02b8 R14: 0000000000000048 R15: 0000000020000000 [ 43.878959][ T319] FS: 00007f9c51c47c40(0000) GS:ffff88817a0be000(0000) knlGS:0000000000000000 [ 43.879214][ T319] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 43.879403][ T319] CR2: 000055e69a2230a8 CR3: 000000010c07a000 CR4: 0000000000750ef0 [ 43.879621][ T319] PKRU: 55555554 [ 43.879735][ T319] Call Trace: [ 43.879844][ T319] [ 43.879924][ T319] __qdisc_run+0x169/0x1900 [ 43.880075][ T319] ? dev_qdisc_enqueue+0x8b/0x210 [ 43.880222][ T319] __dev_queue_xmit+0x2346/0x37a0 [ 43.880376][ T319] ? register_lock_class+0x3f/0x800 [ 43.880531][ T319] ? srso_alias_return_thunk+0x5/0xfbef5 [ 43.880684][ T319] ? __pfx___dev_queue_xmit+0x10/0x10 [ 43.880834][ T319] ? srso_alias_return_thunk+0x5/0xfbef5 [ 43.880977][ T319] ? __lock_acquire+0x819/0x1df0 [ 43.881124][ T319] ? srso_alias_return_thunk+0x5/0xfbef5 [ 43.881275][ T319] ? srso_alias_return_thunk+0x5/0xfbef5 [ 43.881418][ T319] ? __asan_memcpy+0x3c/0x60 [ 43.881563][ T319] ? srso_alias_return_thunk+0x5/0xfbef5 [ 43.881708][ T319] ? eth_header+0x165/0x1a0 [ 43.881853][ T319] ? lockdep_hardirqs_on_prepare+0xdb/0x1a0 [ 43.882031][ T319] ? srso_alias_return_thunk+0x5/0xfbef5 [ 43.882174][ T319] ? neigh_resolve_output+0x3cc/0x7e0 [ 43.882325][ T319] ? srso_alias_return_thunk+0x5/0xfbef5 [ 43.882471][ T319] ip_finish_output2+0x6b6/0x1e10 Fix this by calling qdisc_reset for CBS' child qdisc. Sashiko caught an issue which could result in a null ptr deref if qdisc_create_dflt() is invoked on an unitialised cbs qdisc which is exposed by this patch. We add an early return if the qdisc is null to address this. This is a similar approach used by two other fixes[1][2]. The proper fix for this specific issue elucidated by sashiko is to remove the call to qdisc_reset when qdisc_create_dflt fails. Since the dflt qdisc isn't attached anywhere yet at that point, calling the reset callback doesn't make much sense (and as stated has been a source of two other bugs). We plan on submitting this fix in a later patch. [1] https://lore.kernel.org/netdev/20221018063201.306474-2-shaozhengchao@huawei.com/ [2] https://lore.kernel.org/netdev/20221018063201.306474-4-shaozhengchao@huawei.com/ Fixes: 585d763af09c ("net/sched: Introduce Credit Based Shaper (CBS) qdisc") Reported-by: Junyoung Jang Tested-by: Junyoung Jang Tested-by: Victor Nogueira Acked-by: Vinicius Costa Gomes Signed-off-by: Jamal Hadi Salim Signed-off-by: Jakub Kicinski --- net/sched/sch_cbs.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 8c9a0400c8622c..0f953bd46b5814 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -243,6 +243,20 @@ static struct sk_buff *cbs_dequeue(struct Qdisc *sch) return q->dequeue(sch); } +static void cbs_reset(struct Qdisc *sch) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + /* Nothing to do if we couldn't create the underlying qdisc */ + if (!q->qdisc) + return; + + qdisc_reset(q->qdisc); + qdisc_watchdog_cancel(&q->watchdog); + q->credits = 0; + q->last = 0; +} + static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = { [TCA_CBS_PARMS] = { .len = sizeof(struct tc_cbs_qopt) }, }; @@ -540,7 +554,7 @@ static struct Qdisc_ops cbs_qdisc_ops __read_mostly = { .dequeue = cbs_dequeue, .peek = qdisc_peek_dequeued, .init = cbs_init, - .reset = qdisc_reset_queue, + .reset = cbs_reset, .destroy = cbs_destroy, .change = cbs_change, .dump = cbs_dump, From 59afae20080a9681014bdc87897cbfd30bedd261 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Mon, 11 May 2026 14:30:58 -0400 Subject: [PATCH 853/972] selftests/tc-testing: Add QFQ/CBS qlen underflow test Since CBS was not calling reset for its child qdisc, there are scenarios where it could cause an underflow on its parent's qlen/backlog. When the parent is QFQ, a null-ptr deref could occur. Add a test case that reproduces the underflow followed by a null-ptr deref scenario. Acked-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Signed-off-by: Jakub Kicinski --- .../tc-testing/tc-tests/infra/qdiscs.json | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index b1f856cf62c126..848696c373fca1 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -1284,5 +1284,46 @@ "teardown": [ "$TC qdisc del dev $DUMMY handle 1: root" ] + }, + { + "id": "3a62", + "name": "Try to create a qlen underflow with QFQ/CBS", + "category": [ + "qdisc", + "qfq", + "cbs" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: qfq", + "$TC class add dev $DUMMY classid 1:1 parent 1: qfq", + "$TC class add dev $DUMMY classid 1:2 parent 1: qfq", + "$TC qdisc add dev $DUMMY handle 2: parent 1:1 cbs", + "$TC qdisc add dev $DUMMY handle 3: parent 2: netem delay 5000000000", + "$TC filter add dev $DUMMY parent 1: prio 1 u32 match ip dst 10.10.10.1 classid 1:1 action ok", + "$TC filter add dev $DUMMY parent 1: prio 2 u32 match ip dst 10.10.10.2 classid 1:2 action ok", + "ping -c 1 10.10.10.1 -W0.01 -I$DUMMY || true", + "$IP l set $DUMMY down", + "$IP l set $DUMMY up", + "$TC qdisc replace dev $DUMMY handle 4: parent 2: pfifo" + ], + "cmdUnderTest": "ping -c 1 10.10.10.2 -W0.01 -I$DUMMY", + "expExitCode": "1", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY parent 1:1", + "matchJSON": [ + { + "kind": "cbs", + "handle": "2:", + "bytes": 0, + "packets": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] } ] From 6c7674b5b7ae513cecae22aa9dcdcf533862cf5c Mon Sep 17 00:00:00 2001 From: Zong Li Date: Mon, 27 Apr 2026 19:41:05 -0700 Subject: [PATCH 854/972] riscv: cfi: reduce shadow stack size limit from 4GB to 2GB Follow the ARM64 GCS (Guarded Control Stack) implementation approach by reducing the shadow stack size allocation from min(RLIMIT_STACK, 4GB) to min(RLIMIT_STACK/2, 2GB). See commit 506496bcbb42 ("arm64/gcs: Ensure that new threads have a GCS") Rationale: 1. Shadow stacks only store return addresses (8 bytes per entry), not local variables, function parameters, or saved registers. A 2GB shadow stack is far more than sufficient for any practical application, even with extremely deep recursion. Using half the size maintains adequate margin while being more resource-efficient. 2. On memory-constrained systems (e.g., platforms with only 4GB of physical memory, which is a common configuration), allocating 4GB of virtual address space for shadow stack per process/thread can lead to virtual memory allocation failures when the overcommit mode is set to OVERCOMMIT_GUESS or OVERCOMMIT_NEVER: Error: "__vm_enough_memory: not enough memory for the allocation" This reduces virtual address space consumption by 50% while maintaining more than adequate space for return address storage. Signed-off-by: Zong Li Link: https://patch.msgid.link/20260428024105.645162-1-zong.li@sifive.com [pjw@kernel.org: clean up patch description] Signed-off-by: Paul Walmsley --- arch/riscv/kernel/usercfi.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c index 6eaa0d94fdfe50..cbfb4e495e9f9f 100644 --- a/arch/riscv/kernel/usercfi.c +++ b/arch/riscv/kernel/usercfi.c @@ -109,15 +109,16 @@ void set_indir_lp_lock(struct task_struct *task, bool lock) task->thread_info.user_cfi_state.ufcfi_locked = lock; } /* - * If size is 0, then to be compatible with regular stack we want it to be as big as - * regular stack. Else PAGE_ALIGN it and return back + * The shadow stack only stores the return address and not any variables + * this should be more than sufficient for most applications. + * Else PAGE_ALIGN it and return back */ static unsigned long calc_shstk_size(unsigned long size) { if (size) return PAGE_ALIGN(size); - return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G)); + return PAGE_ALIGN(min(rlimit(RLIMIT_STACK) / 2, SZ_2G)); } /* From 9a390d34d55cb4ecbca4981c660dd95440827c70 Mon Sep 17 00:00:00 2001 From: Sukhdeep Singh Date: Tue, 12 May 2026 18:27:11 +0530 Subject: [PATCH 855/972] MAINTAINERS: update atlantic driver maintainer Igor Russkikh and Egor Pomozov have left Marvell. Take over maintenance of the atlantic driver and its PTP subsystem. Signed-off-by: Sukhdeep Singh Signed-off-by: Jakub Kicinski --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 8a134cf6e9bb1d..7a4f2aab78ff89 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2021,7 +2021,7 @@ F: Documentation/hwmon/aquacomputer_d5next.rst F: drivers/hwmon/aquacomputer_d5next.c AQUANTIA ETHERNET DRIVER (atlantic) -M: Igor Russkikh +M: Sukhdeep Singh L: netdev@vger.kernel.org S: Maintained W: https://www.marvell.com/ @@ -2030,7 +2030,7 @@ F: Documentation/networking/device_drivers/ethernet/aquantia/atlantic.rst F: drivers/net/ethernet/aquantia/atlantic/ AQUANTIA ETHERNET DRIVER PTP SUBSYSTEM -M: Egor Pomozov +M: Sukhdeep Singh L: netdev@vger.kernel.org S: Maintained W: http://www.aquantia.com From b84c5632c7b31f8910167075a8128cfb9e50fcfe Mon Sep 17 00:00:00 2001 From: Faicker Mo Date: Mon, 11 May 2026 22:05:51 +0800 Subject: [PATCH 856/972] net: net_failover: Fix the deadlock in slave register There is netdev_lock_ops() before the NETDEV_REGISTER notifier in register_netdevice(), so use the non-locking functions in net_failover_slave_register(). failover_slave_register() in failover_existing_slave_register() adds lock and unlock ops too. Call Trace: __schedule+0x30d/0x7a0 schedule+0x27/0x90 schedule_preempt_disabled+0x15/0x30 __mutex_lock.constprop.0+0x538/0x9e0 __mutex_lock_slowpath+0x13/0x20 mutex_lock+0x3b/0x50 dev_set_mtu+0x40/0xe0 net_failover_slave_register+0x24/0x280 failover_slave_register+0x103/0x1b0 failover_event+0x15e/0x210 ? dropmon_net_event+0xac/0xe0 notifier_call_chain+0x5e/0xe0 raw_notifier_call_chain+0x16/0x30 call_netdevice_notifiers_info+0x52/0xa0 register_netdevice+0x5f4/0x7c0 register_netdev+0x1e/0x40 _mlx5e_probe+0xe2/0x370 [mlx5_core] mlx5e_probe+0x59/0x70 [mlx5_core] ? __pfx_mlx5e_probe+0x10/0x10 [mlx5_core] Fixes: 4c975fd70002 ("net: hold instance lock during NETDEV_REGISTER/UP") Signed-off-by: Faicker Mo Signed-off-by: Jakub Kicinski --- drivers/net/net_failover.c | 12 ++++++------ net/core/failover.c | 6 +++++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c index d0361aaf25efb5..3f7d31033bae8b 100644 --- a/drivers/net/net_failover.c +++ b/drivers/net/net_failover.c @@ -502,7 +502,7 @@ static int net_failover_slave_register(struct net_device *slave_dev, /* Align MTU of slave with failover dev */ orig_mtu = slave_dev->mtu; - err = dev_set_mtu(slave_dev, failover_dev->mtu); + err = netif_set_mtu(slave_dev, failover_dev->mtu); if (err) { netdev_err(failover_dev, "unable to change mtu of %s to %u register failed\n", slave_dev->name, failover_dev->mtu); @@ -512,11 +512,11 @@ static int net_failover_slave_register(struct net_device *slave_dev, dev_hold(slave_dev); if (netif_running(failover_dev)) { - err = dev_open(slave_dev, NULL); + err = netif_open(slave_dev, NULL); if (err && (err != -EBUSY)) { netdev_err(failover_dev, "Opening slave %s failed err:%d\n", slave_dev->name, err); - goto err_dev_open; + goto err_netif_open; } } @@ -562,10 +562,10 @@ static int net_failover_slave_register(struct net_device *slave_dev, err_vlan_add: dev_uc_unsync(slave_dev, failover_dev); dev_mc_unsync(slave_dev, failover_dev); - dev_close(slave_dev); -err_dev_open: + netif_close(slave_dev); +err_netif_open: dev_put(slave_dev); - dev_set_mtu(slave_dev, orig_mtu); + netif_set_mtu(slave_dev, orig_mtu); done: return err; } diff --git a/net/core/failover.c b/net/core/failover.c index 11bb183c7a1bac..e43c59cd686853 100644 --- a/net/core/failover.c +++ b/net/core/failover.c @@ -12,6 +12,7 @@ #include #include #include +#include #include static LIST_HEAD(failover_list); @@ -221,8 +222,11 @@ failover_existing_slave_register(struct net_device *failover_dev) for_each_netdev(net, dev) { if (netif_is_failover(dev)) continue; - if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) + if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) { + netdev_lock_ops(dev); failover_slave_register(dev); + netdev_unlock_ops(dev); + } } rtnl_unlock(); } From c6690a9030d784d3f099850800b6d5323771ca37 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Mon, 11 May 2026 23:30:58 +0800 Subject: [PATCH 857/972] macsec: introduce dedicated workqueue for SA crypto cleanup Introduce a dedicated ordered workqueue, macsec_wq, which will be used by subsequent patches to defer SA crypto cleanup (crypto_free_aead and related teardown) out of softirq context. Using a dedicated workqueue instead of system_wq allows macsec_exit() to drain exactly the work items belonging to this module via destroy_workqueue(), without interfering with unrelated work items on system_wq or causing unexpected delays elsewhere. rcu_barrier() in macsec_exit() ensures all in-flight rcu_work callbacks have enqueued their work items before destroy_workqueue() drains and destroys the queue, making the two-step teardown correct and complete. The same sequence is kept in the error path of macsec_init() as a precaution, to mirror macsec_exit() and stay safe if work ever becomes queueable before this point in the future. While at it, rename the error labels in macsec_init() from the resource-named style (rtnl:, notifier:, wq:) to the err_xxx: style (err_rtnl:, err_notifier:, err_destroy_wq:) to align with the broader kernel convention. Signed-off-by: Jinliang Zheng Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20260511153102.2640368-2-alexjlzheng@tencent.com Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 6147ee8b1d78b1..ef5ac634f9163d 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -26,6 +26,8 @@ #include +static struct workqueue_struct *macsec_wq; + /* SecTAG length = macsec_eth_header without the optional SCI */ #define MACSEC_TAG_LEN 6 @@ -4505,25 +4507,35 @@ static int __init macsec_init(void) { int err; + macsec_wq = alloc_workqueue("macsec", WQ_UNBOUND, 0); + if (!macsec_wq) + return -ENOMEM; + pr_info("MACsec IEEE 802.1AE\n"); err = register_netdevice_notifier(&macsec_notifier); if (err) - return err; + goto err_destroy_wq; err = rtnl_link_register(&macsec_link_ops); if (err) - goto notifier; + goto err_notifier; err = genl_register_family(&macsec_fam); if (err) - goto rtnl; + goto err_rtnl; return 0; -rtnl: +err_rtnl: rtnl_link_unregister(&macsec_link_ops); -notifier: +err_notifier: unregister_netdevice_notifier(&macsec_notifier); +err_destroy_wq: + /* Precautionary, mirrors macsec_exit() to stay safe if work + * ever becomes queueable before this point in the future. + */ + rcu_barrier(); + destroy_workqueue(macsec_wq); return err; } @@ -4533,6 +4545,7 @@ static void __exit macsec_exit(void) rtnl_link_unregister(&macsec_link_ops); unregister_netdevice_notifier(&macsec_notifier); rcu_barrier(); + destroy_workqueue(macsec_wq); } module_init(macsec_init); From 6624bba469a325ecd699feae400b77cd11c76b98 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Mon, 11 May 2026 23:30:59 +0800 Subject: [PATCH 858/972] macsec: use rcu_work to defer RX SA crypto cleanup out of softirq crypto_free_aead() can internally invoke vunmap() (e.g. via dma_free_attrs() in hardware crypto drivers such as hisi_sec2). vunmap() must not be called from softirq context, but free_rxsa() is an RCU callback that runs in softirq, leading to a kernel crash: vunmap+0x4c/0x70 __iommu_dma_free+0xd0/0x138 dma_free_attrs+0xf4/0x100 sec_aead_exit+0x64/0xb8 [hisi_sec2] crypto_destroy_tfm+0x98/0x110 free_rxsa+0x28/0x50 [macsec] rcu_do_batch+0x184/0x460 rcu_core+0xf4/0x1f8 handle_softirqs+0x118/0x330 Use rcu_work to defer the cleanup to a workqueue. rcu_work dispatches the worker asynchronously after the RCU grace period, so no thread blocks waiting, and concurrent releases of multiple SAs naturally share the same grace period. Fixes: c09440f7dcb3 ("macsec: introduce IEEE 802.1AE driver") Signed-off-by: Jinliang Zheng Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20260511153102.2640368-3-alexjlzheng@tencent.com Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 8 +++++--- include/net/macsec.h | 4 +++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index ef5ac634f9163d..e7ad24f1ea5b90 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -176,9 +176,10 @@ static void macsec_rxsc_put(struct macsec_rx_sc *sc) call_rcu(&sc->rcu_head, free_rx_sc_rcu); } -static void free_rxsa(struct rcu_head *head) +static void free_rxsa_work(struct work_struct *work) { - struct macsec_rx_sa *sa = container_of(head, struct macsec_rx_sa, rcu); + struct macsec_rx_sa *sa = + container_of(to_rcu_work(work), struct macsec_rx_sa, destroy_work); crypto_free_aead(sa->key.tfm); free_percpu(sa->stats); @@ -188,7 +189,7 @@ static void free_rxsa(struct rcu_head *head) static void macsec_rxsa_put(struct macsec_rx_sa *sa) { if (refcount_dec_and_test(&sa->refcnt)) - call_rcu(&sa->rcu, free_rxsa); + queue_rcu_work(macsec_wq, &sa->destroy_work); } static struct macsec_tx_sa *macsec_txsa_get(struct macsec_tx_sa __rcu *ptr) @@ -1409,6 +1410,7 @@ static int init_rx_sa(struct macsec_rx_sa *rx_sa, char *sak, int key_len, rx_sa->next_pn = 1; refcount_set(&rx_sa->refcnt, 1); spin_lock_init(&rx_sa->lock); + INIT_RCU_WORK(&rx_sa->destroy_work, free_rxsa_work); return 0; } diff --git a/include/net/macsec.h b/include/net/macsec.h index bc7de5b53e5431..0980ef36fbf0a4 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -123,6 +124,7 @@ struct macsec_dev_stats { * @key: key structure * @ssci: short secure channel identifier * @stats: per-SA stats + * @destroy_work: deferred work to free the SA in process context after RCU grace period */ struct macsec_rx_sa { struct macsec_key key; @@ -136,7 +138,7 @@ struct macsec_rx_sa { bool active; struct macsec_rx_sa_stats __percpu *stats; struct macsec_rx_sc *sc; - struct rcu_head rcu; + struct rcu_work destroy_work; }; struct pcpu_rx_sc_stats { From 552cc2306c3d87632f44a655737d1d367c2a3295 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Mon, 11 May 2026 23:31:00 +0800 Subject: [PATCH 859/972] macsec: use rcu_work to defer TX SA crypto cleanup out of softirq free_txsa() is an RCU callback running in softirq context, but calls crypto_free_aead() which can invoke vunmap() internally on hardware crypto drivers (e.g. hisi_sec2), triggering a kernel crash. Use rcu_work to defer the cleanup to a workqueue, for the same reasons as the analogous fix to free_rxsa() in the previous patch. Fixes: c09440f7dcb3 ("macsec: introduce IEEE 802.1AE driver") Signed-off-by: Jinliang Zheng Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20260511153102.2640368-4-alexjlzheng@tencent.com Signed-off-by: Jakub Kicinski --- drivers/net/macsec.c | 8 +++++--- include/net/macsec.h | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index e7ad24f1ea5b90..f904f4d16b45fd 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -205,9 +205,10 @@ static struct macsec_tx_sa *macsec_txsa_get(struct macsec_tx_sa __rcu *ptr) return sa; } -static void free_txsa(struct rcu_head *head) +static void free_txsa_work(struct work_struct *work) { - struct macsec_tx_sa *sa = container_of(head, struct macsec_tx_sa, rcu); + struct macsec_tx_sa *sa = + container_of(to_rcu_work(work), struct macsec_tx_sa, destroy_work); crypto_free_aead(sa->key.tfm); free_percpu(sa->stats); @@ -217,7 +218,7 @@ static void free_txsa(struct rcu_head *head) static void macsec_txsa_put(struct macsec_tx_sa *sa) { if (refcount_dec_and_test(&sa->refcnt)) - call_rcu(&sa->rcu, free_txsa); + queue_rcu_work(macsec_wq, &sa->destroy_work); } static struct macsec_cb *macsec_skb_cb(struct sk_buff *skb) @@ -1510,6 +1511,7 @@ static int init_tx_sa(struct macsec_tx_sa *tx_sa, char *sak, int key_len, tx_sa->active = false; refcount_set(&tx_sa->refcnt, 1); spin_lock_init(&tx_sa->lock); + INIT_RCU_WORK(&tx_sa->destroy_work, free_txsa_work); return 0; } diff --git a/include/net/macsec.h b/include/net/macsec.h index 0980ef36fbf0a4..d962093ee92375 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -176,6 +176,7 @@ struct macsec_rx_sc { * @key: key structure * @ssci: short secure channel identifier * @stats: per-SA stats + * @destroy_work: deferred work to free the SA in process context after RCU grace period */ struct macsec_tx_sa { struct macsec_key key; @@ -188,7 +189,7 @@ struct macsec_tx_sa { refcount_t refcnt; bool active; struct macsec_tx_sa_stats __percpu *stats; - struct rcu_head rcu; + struct rcu_work destroy_work; }; /** From f44d38a31f1802b7222adaea9ee69f9d280f698a Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Thu, 14 May 2026 10:18:47 +0800 Subject: [PATCH 860/972] io_uring: validate user-controlled cq.head in io_cqe_cache_refill() A fuzzing run reproduced an unkillable io_uring task stuck at ~100% CPU: [root@fedora io_uring_stress]# ps -ef | grep io_uring root 1240 1 99 13:36 ? 00:01:35 [io_uring_stress] The task loops inside io_cqring_wait() and never returns to userspace, and SIGKILL has no effect. This is caused by the CQ ring exposing rings->cq.head to userspace as writable, while the authoritative tail lives in kernel-private ctx->cached_cq_tail. io_cqe_cache_refill() computes free space as an unsigned subtraction: free = ctx->cq_entries - min(tail - head, ctx->cq_entries); If userspace keeps head within [0, tail], the subtraction is well defined and min() just acts as a defensive clamp. But if userspace advances head past tail, (tail - head) wraps to a huge value, free becomes 0, and io_cqe_cache_refill() fails. The CQE is pushed onto the overflow list and IO_CHECK_CQ_OVERFLOW_BIT is set. The wait loop in io_cqring_wait() relies on an invariant: refill() only fails when the CQ is *physically* full, in which case rings->cq.tail has been advanced to iowq->cq_tail and io_should_wake() returns true. The tampered head breaks this: refill() fails while the ring is not full, no OCQE is copied in, rings->cq.tail never catches up, io_should_wake() stays false, and io_cqring_wait_schedule() keeps returning early because IO_CHECK_CQ_OVERFLOW_BIT is still set. The result is a tight retry loop that never returns to userspace. Introduce io_cqring_queued() as the single point that converts the (tail, head) pair into a trustworthy queued count. Since the real head/tail distance is bounded by cq_entries (far below 2^31), a signed comparison reliably detects userspace moving head past tail; in that case treat the queue as empty so callers see the full cache as free and forward progress is preserved. Suggested-by: Jens Axboe Signed-off-by: Zizhi Wo Link: https://patch.msgid.link/20260514021847.4062782-1-wozizhi@huaweicloud.com [axboe: fixup commit message, kill 'queued' var, and keep it all in io_uring.c] Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2ebb0ba37c4f01..036145ee466cd2 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -686,13 +686,27 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, return ocqe; } +/* + * Compute queued CQEs for free-space calculation, clamped to cq_entries. + */ +static unsigned int io_cqring_queued(struct io_ring_ctx *ctx) +{ + struct io_rings *rings = io_get_rings(ctx); + int diff; + + diff = (int)(ctx->cached_cq_tail - READ_ONCE(rings->cq.head)); + if (diff >= 0) + return min((unsigned int)diff, ctx->cq_entries); + return 0; +} + /* * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE * because the ring is a single 16b entry away from wrapping. */ static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off) { - if (__io_cqring_events(ctx) < ctx->cq_entries) { + if (io_cqring_queued(ctx) < ctx->cq_entries) { struct io_uring_cqe *cqe = &ctx->rings->cqes[off]; cqe->user_data = 0; @@ -713,7 +727,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); - unsigned int free, queued, len; + unsigned int free, len; /* * Posting into the CQ when there are pending overflowed CQEs may break @@ -733,9 +747,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32) off = 0; } - /* userspace may cheat modifying the tail, be safe and do min */ - queued = min(__io_cqring_events(ctx), ctx->cq_entries); - free = ctx->cq_entries - queued; + free = ctx->cq_entries - io_cqring_queued(ctx); /* we need a contiguous range, limit based on the current array offset */ len = min(free, ctx->cq_entries - off); if (len < (cqe32 + 1)) From 50da1c9ccb70fc5250c37ac474b54ee072732ea3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 6 Apr 2026 16:23:04 -0700 Subject: [PATCH 861/972] riscv: Docs: fix unmatched quote warning 'make htmldocs' complains about ``prctrl` -- so add a second '`' to avoid the warning. Documentation/arch/riscv/zicfilp.rst:79: WARNING: Inline literal start-string without end-string. [docutils] Fixes: 08ee1559052b ("prctl: cfi: change the branch landing pad prctl()s to be more descriptive") Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260406232304.1892528-1-rdunlap@infradead.org Signed-off-by: Paul Walmsley --- Documentation/arch/riscv/zicfilp.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/arch/riscv/zicfilp.rst b/Documentation/arch/riscv/zicfilp.rst index ab7d8e62ddaffb..12b35969d17ad2 100644 --- a/Documentation/arch/riscv/zicfilp.rst +++ b/Documentation/arch/riscv/zicfilp.rst @@ -78,7 +78,7 @@ the program. Per-task indirect branch tracking state can be monitored and controlled via the :c:macro:`PR_GET_CFI` and :c:macro:`PR_SET_CFI` -``prctl()` arguments (respectively), by supplying +``prctl()`` arguments (respectively), by supplying :c:macro:`PR_CFI_BRANCH_LANDING_PADS` as the second argument. These are architecture-agnostic, and will return -EINVAL if the underlying functionality is not supported. From b69bcb13ed7024a84d6cd8ad330f1e32782fcf28 Mon Sep 17 00:00:00 2001 From: Vivian Wang Date: Wed, 1 Apr 2026 09:53:17 +0800 Subject: [PATCH 862/972] riscv: misaligned: Make enabling delegation depend on NONPORTABLE The unaligned access emulation code in Linux has various deficiencies. For example, it doesn't emulate vector instructions [1] [2], and doesn't emulate KVM guest accesses. Therefore, requesting misaligned exception delegation with SBI FWFT actually regresses vector instructions' and KVM guests' behavior. Until Linux can handle it properly, guard these sbi_fwft_set() calls behind RISCV_SBI_FWFT_DELEGATE_MISALIGNED, which in turn depends on NONPORTABLE. Those who are sure that this wouldn't be a problem can enable this option, perhaps getting better performance. The rest of the existing code proceeds as before, except as if SBI_FWFT_MISALIGNED_EXC_DELEG is not available, to handle any remaining address misaligned exceptions on a best-effort basis. The KVM SBI FWFT implementation is also not touched, but it is disabled if the firmware emulates unaligned accesses. Cc: stable@vger.kernel.org Fixes: cf5a8abc6560 ("riscv: misaligned: request misaligned exception from SBI") Reported-by: Songsong Zhang # KVM Link: https://lore.kernel.org/linux-riscv/38ce44c1-08cf-4e3f-8ade-20da224f529c@iscas.ac.cn/ [1] Link: https://lore.kernel.org/linux-riscv/b3cfcdac-0337-4db0-a611-258f2868855f@iscas.ac.cn/ [2] Signed-off-by: Vivian Wang Acked-by: Conor Dooley Link: https://patch.msgid.link/20260401-riscv-misaligned-dont-delegate-v2-1-5014a288c097@iscas.ac.cn Signed-off-by: Paul Walmsley --- arch/riscv/Kconfig | 22 ++++++++++++++++++++++ arch/riscv/kernel/traps_misaligned.c | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index d235396c451413..c5754942cf85a4 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -937,6 +937,28 @@ config RISCV_VECTOR_MISALIGNED help Enable detecting support for vector misaligned loads and stores. +config RISCV_SBI_FWFT_DELEGATE_MISALIGNED + bool "Request firmware delegation of unaligned access exceptions" + depends on RISCV_SBI + depends on NONPORTABLE + help + Use SBI FWFT to request delegation of load address misaligned and + store address misaligned exceptions, if possible, and prefer Linux + kernel emulation of these accesses to firmware emulation. + + Unfortunately, Linux's emulation is still incomplete. Namely, it + currently does not handle vector instructions and KVM guest accesses. + On platforms where these accesses would have been handled by firmware, + enabling this causes unexpected kernel oopses, userspaces crashes and + KVM guest crashes. If you are sure that these are not a problem for + your platform, you can say Y here, which may improve performance. + + Saying N here will not worsen emulation support for unaligned accesses + even in the case where the firmware also has incomplete support. It + simply keeps the firmware's emulation enabled. + + If you don't know what to do here, say N. + choice prompt "Unaligned Accesses Support" default RISCV_PROBE_UNALIGNED_ACCESS diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 2a27d3ff4ac66d..81b7682e6c6dbc 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -584,7 +584,7 @@ static int cpu_online_check_unaligned_access_emulated(unsigned int cpu) static bool misaligned_traps_delegated; -#ifdef CONFIG_RISCV_SBI +#if defined(CONFIG_RISCV_SBI_FWFT_DELEGATE_MISALIGNED) static int cpu_online_sbi_unaligned_setup(unsigned int cpu) { From 31467b23823ffec1f6fff407f8e3ca9af8b7491a Mon Sep 17 00:00:00 2001 From: Sayali Patil Date: Wed, 13 May 2026 13:44:13 +0530 Subject: [PATCH 863/972] powerpc/time: Remove redundant preempt_disable|enable() calls from arch_irq_work_raise() A kernel panic is observed when handling machine check exceptions from real mode. BUG: Unable to handle kernel data access on read at 0xc00000006be21300 Oops: Kernel access of bad area, sig: 11 [#1] MSR: 8000000000001003 CR: 88222248 XER: 00000005 CFAR: c00000000003ffc4 DAR: c00000006be21300 DSISR: 40000000 IRQMASK: 0 NIP [c000000000029e40] arch_irq_work_raise+0x10/0x70 LR [c00000000003ffc8] machine_check_queue_event+0xa8/0x150 Call Trace: [c0000000179d3c70] [c00000000003ff64] machine_check_queue_event+0x44/0x150 [c0000000179d3d30] [c0000000000084e0] machine_check_early_common+0x1f0/0x2c0 The crash occurs because arch_irq_work_raise() calls preempt_disable() from machine check exception (MCE) handlers running in real mode. In this context, accessing the preempt_count can fault, leading to the panic. The preempt_disable()/preempt_enable() pair in arch_irq_work_raise() was originally added by commit 0fe1ac48bef0 ("powerpc/perf_event: Fix oops due to perf_event_do_pending call") to avoid races while raising irq work from exception context. Later, commit 471ba0e686cb ("irq_work: Do not raise an IPI when queueing work on the local CPU") added preemption protection in irq_work_queue() path, while commit 20b876918c06 ("irq_work: Use per cpu atomics instead of regular atomics") added equivalent protection in irq_work_queue_on() before reaching arch_irq_work_raise(): irq_work_queue() / irq_work_queue_on() -> preempt_disable() -> __irq_work_queue_local() -> irq_work_raise() -> arch_irq_work_raise() As a result, callers other than mce_irq_work_raise() already execute with preemption disabled, making the additional preempt_disable()/preempt_enable() pair in arch_irq_work_raise() redundant. The arch_irq_work_raise() function executes in NMI context when called from MCE handler. Hence we will not be preempted or scheduled out since we are in NMI context with MSR[EE]=0. Therefore, it is safe to remove the preempt_disable()/preempt_enable() calls from here. Remove it to avoid accessing preempt_count from real mode context. Fixes: cc15ff327569 ("powerpc/mce: Avoid using irq_work_queue() in realmode") Suggested-by: Mahesh Salgaonkar Acked-by: Shrikanth Hegde Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Sayali Patil [Maddy: Fixed the commit title] Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20260513081413.222490-1-sayalip@linux.ibm.com --- arch/powerpc/kernel/time.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 4bbeb8644d3da4..b4472288e0d434 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -458,6 +458,10 @@ DEFINE_PER_CPU(u8, irq_work_pending); #endif /* 32 vs 64 bit */ +/* + * Must be called with preemption disabled since it updates + * per-CPU irq_work state and programs the local CPU decrementer. + */ void arch_irq_work_raise(void) { /* @@ -471,10 +475,8 @@ void arch_irq_work_raise(void) * which could get tangled up if we're messing with the same state * here. */ - preempt_disable(); set_irq_work_pending_flag(); set_dec(1); - preempt_enable(); } static void set_dec_or_work(u64 val) From 1ef2a89584b7b788b2603590d886db076b2f24cc Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Thu, 7 May 2026 16:28:12 +0100 Subject: [PATCH 864/972] arm_mpam: Fix monitor instance selection when checking for hardware NRDY In _mpam_ris_hw_probe_hw_nrdy() a new register value to select the first monitor and relevant RIS is prepared in mon_sel. However, it is written to the monitor value register, e.g. MSMON_CSU, rather than MSMON_CFG_MON_SEL. As MSMON_CFG_MON_SEL is a 32 bit register update the type of mon_sel to u32. Write mon_sel to the intended register, MSMON_CFG_MON_SEL. Fixes: 8c90dc68a5de ("arm_mpam: Probe the hardware features resctrl supports") Cc: Signed-off-by: Ben Horgan Reviewed-by: James Morse Signed-off-by: Catalin Marinas --- drivers/resctrl/mpam_devices.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 41b14344b16f2d..817cb10a8e795f 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -731,7 +731,7 @@ static void mpam_enable_quirks(struct mpam_msc *msc) static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) { u32 now; - u64 mon_sel; + u32 mon_sel; bool can_set, can_clear; struct mpam_msc *msc = ris->vmsc->msc; @@ -740,7 +740,7 @@ static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, 0) | FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); - _mpam_write_monsel_reg(msc, mon_reg, mon_sel); + mpam_write_monsel_reg(msc, CFG_MON_SEL, mon_sel); _mpam_write_monsel_reg(msc, mon_reg, MSMON___NRDY); now = _mpam_read_monsel_reg(msc, mon_reg); From 4387970bbd84fd14e0c49c3089c5061ccd86b98a Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Thu, 7 May 2026 16:28:13 +0100 Subject: [PATCH 865/972] arm_mpam: Pretend that NRDY is always hardware managed Rule ZTXDS of the MPAM specification, IHI009 version B.b, states: "If a monitor does not support automatic updates of NRDY, software can use that bit for any purpose." As software is not reliably informed whether or not the monitor supports automatic updates of NRDY always assume that hardware may manage NRDY but don't rely on it. When NRDY is truly untouched by hardware then, as it is written to 0 on configuration, it will always read 0. At probe it's checked if MSMON_CSU.NRDY and MSMON_MBWU.NRDY are hardware managed but not MSMON_MBWU_L.NDRY. Specialize the checking for hardware managed NRDY to CSU counters as this is the only case where hardware management makes sense. Continue to inform the user if MSMON_CSU.NRDY appears to be hardware managed but the firmware doesn't provide the associated time limit for the automatic clearing of NRDY. Remove the NRDY feature flags as they are now unused. Fixes: 8c90dc68a5de ("arm_mpam: Probe the hardware features resctrl supports") Cc: Signed-off-by: Ben Horgan Reviewed-by: James Morse Signed-off-by: Catalin Marinas --- drivers/resctrl/mpam_devices.c | 53 +++++++++++---------------------- drivers/resctrl/mpam_internal.h | 2 -- 2 files changed, 17 insertions(+), 38 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 817cb10a8e795f..58e0c8970e8cf4 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -728,7 +728,7 @@ static void mpam_enable_quirks(struct mpam_msc *msc) * Try and see what values stick in this bit. If we can write either value, * its probably not implemented by hardware. */ -static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) +static bool mpam_ris_hw_probe_csu_nrdy(struct mpam_msc_ris *ris) { u32 now; u32 mon_sel; @@ -742,21 +742,18 @@ static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); mpam_write_monsel_reg(msc, CFG_MON_SEL, mon_sel); - _mpam_write_monsel_reg(msc, mon_reg, MSMON___NRDY); - now = _mpam_read_monsel_reg(msc, mon_reg); + _mpam_write_monsel_reg(msc, MSMON_CSU, MSMON___NRDY); + now = _mpam_read_monsel_reg(msc, MSMON_CSU); can_set = now & MSMON___NRDY; - _mpam_write_monsel_reg(msc, mon_reg, 0); - now = _mpam_read_monsel_reg(msc, mon_reg); + _mpam_write_monsel_reg(msc, MSMON_CSU, 0); + now = _mpam_read_monsel_reg(msc, MSMON_CSU); can_clear = !(now & MSMON___NRDY); mpam_mon_sel_unlock(msc); return (!can_set || !can_clear); } -#define mpam_ris_hw_probe_hw_nrdy(_ris, _mon_reg) \ - _mpam_ris_hw_probe_hw_nrdy(_ris, MSMON_##_mon_reg) - static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) { int err; @@ -873,20 +870,18 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) mpam_set_feature(mpam_feat_msmon_csu_xcl, props); /* Is NRDY hardware managed? */ - hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, CSU); - if (hw_managed) - mpam_set_feature(mpam_feat_msmon_csu_hw_nrdy, props); - } + hw_managed = mpam_ris_hw_probe_csu_nrdy(ris); - /* - * Accept the missing firmware property if NRDY appears - * un-implemented. - */ - if (err && mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, props)) - dev_err_once(dev, "Counters are not usable because not-ready timeout was not provided by firmware."); + /* + * Accept the missing firmware property if NRDY appears + * un-implemented. + */ + if (err && hw_managed) + dev_err_once(dev, "Counters are not usable because not-ready timeout was not provided by firmware."); + } } if (FIELD_GET(MPAMF_MSMON_IDR_MSMON_MBWU, msmon_features)) { - bool has_long, hw_managed; + bool has_long; u32 mbwumon_idr = mpam_read_partsel_reg(msc, MBWUMON_IDR); props->num_mbwu_mon = FIELD_GET(MPAMF_MBWUMON_IDR_NUM_MON, mbwumon_idr); @@ -905,16 +900,6 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) } else { mpam_set_feature(mpam_feat_msmon_mbwu_31counter, props); } - - /* Is NRDY hardware managed? */ - hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, MBWU); - if (hw_managed) - mpam_set_feature(mpam_feat_msmon_mbwu_hw_nrdy, props); - - /* - * Don't warn about any missing firmware property for - * MBWU NRDY - it doesn't make any sense! - */ } } } @@ -1197,7 +1182,6 @@ static void __ris_msmon_read(void *arg) bool reset_on_next_read = false; struct mpam_msc_ris *ris = m->ris; struct msmon_mbwu_state *mbwu_state; - struct mpam_props *rprops = &ris->props; struct mpam_msc *msc = m->ris->vmsc->msc; u32 mon_sel, ctl_val, flt_val, cur_ctl, cur_flt; @@ -1253,8 +1237,7 @@ static void __ris_msmon_read(void *arg) switch (m->type) { case mpam_feat_msmon_csu: now = mpam_read_monsel_reg(msc, CSU); - if (mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, rprops)) - nrdy = now & MSMON___NRDY; + nrdy = now & MSMON___NRDY; now = FIELD_GET(MSMON___VALUE, now); if (mpam_has_quirk(IGNORE_CSU_NRDY, msc) && m->waited_timeout) @@ -1266,8 +1249,7 @@ static void __ris_msmon_read(void *arg) case mpam_feat_msmon_mbwu_63counter: if (m->type != mpam_feat_msmon_mbwu_31counter) { now = mpam_msc_read_mbwu_l(msc); - if (mpam_has_feature(mpam_feat_msmon_mbwu_hw_nrdy, rprops)) - nrdy = now & MSMON___L_NRDY; + nrdy = now & MSMON___L_NRDY; if (m->type == mpam_feat_msmon_mbwu_63counter) now = FIELD_GET(MSMON___LWD_VALUE, now); @@ -1275,8 +1257,7 @@ static void __ris_msmon_read(void *arg) now = FIELD_GET(MSMON___L_VALUE, now); } else { now = mpam_read_monsel_reg(msc, MBWU); - if (mpam_has_feature(mpam_feat_msmon_mbwu_hw_nrdy, rprops)) - nrdy = now & MSMON___NRDY; + nrdy = now & MSMON___NRDY; now = FIELD_GET(MSMON___VALUE, now); } diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h index 1914aefdcba9ef..04d1a59f02afb4 100644 --- a/drivers/resctrl/mpam_internal.h +++ b/drivers/resctrl/mpam_internal.h @@ -181,14 +181,12 @@ enum mpam_device_features { mpam_feat_msmon_csu, mpam_feat_msmon_csu_capture, mpam_feat_msmon_csu_xcl, - mpam_feat_msmon_csu_hw_nrdy, mpam_feat_msmon_mbwu, mpam_feat_msmon_mbwu_31counter, mpam_feat_msmon_mbwu_44counter, mpam_feat_msmon_mbwu_63counter, mpam_feat_msmon_mbwu_capture, mpam_feat_msmon_mbwu_rwbw, - mpam_feat_msmon_mbwu_hw_nrdy, mpam_feat_partid_nrw, MPAM_FEATURE_LAST }; From ccad6001be5c38426ccf45790c411467ad3c03c6 Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Thu, 7 May 2026 16:28:14 +0100 Subject: [PATCH 866/972] arm_mpam: Improve check for whether or not NRDY is hardware managed mpam_ris_hw_probe_csu_nrdy() sets and clears MSMON_CSU.NRDY and checks whether it's configuration sticks. However, hardware isn't given a chance to disagree. Based on rule LRTGP, in MPAM specification IHI0099 version B.b, the hardware will set NRDY if it needs time to establish a count after a configuration change. Enable the monitor so that NRDY becomes relevant and change the configuration after clearing NRDY to try and coax the hardware into setting it. Fixes: 8c90dc68a5de ("arm_mpam: Probe the hardware features resctrl supports") Cc: Signed-off-by: Ben Horgan Reviewed-by: James Morse Signed-off-by: Catalin Marinas --- drivers/resctrl/mpam_devices.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 58e0c8970e8cf4..e145828f3f736b 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -730,8 +730,7 @@ static void mpam_enable_quirks(struct mpam_msc *msc) */ static bool mpam_ris_hw_probe_csu_nrdy(struct mpam_msc_ris *ris) { - u32 now; - u32 mon_sel; + u32 now, mon_sel, ctl_val; bool can_set, can_clear; struct mpam_msc *msc = ris->vmsc->msc; @@ -742,11 +741,21 @@ static bool mpam_ris_hw_probe_csu_nrdy(struct mpam_msc_ris *ris) FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); mpam_write_monsel_reg(msc, CFG_MON_SEL, mon_sel); + /* Hardware might ignore nrdy if it's not enabled */ + ctl_val = MSMON_CFG_CSU_CTL_TYPE_CSU; + ctl_val |= MSMON_CFG_x_CTL_MATCH_PARTID; + ctl_val |= MSMON_CFG_x_CTL_MATCH_PMG; + ctl_val |= MSMON_CFG_x_CTL_EN; + mpam_write_monsel_reg(msc, CFG_CSU_FLT, 0); + mpam_write_monsel_reg(msc, CFG_CSU_CTL, ctl_val); + _mpam_write_monsel_reg(msc, MSMON_CSU, MSMON___NRDY); now = _mpam_read_monsel_reg(msc, MSMON_CSU); can_set = now & MSMON___NRDY; _mpam_write_monsel_reg(msc, MSMON_CSU, 0); + /* Configuration change to try and coax hardware into setting nrdy */ + mpam_write_monsel_reg(msc, CFG_CSU_FLT, 0x1); now = _mpam_read_monsel_reg(msc, MSMON_CSU); can_clear = !(now & MSMON___NRDY); mpam_mon_sel_unlock(msc); From f1caff3335ea6eab88cdc84ec8f2e3c45ca05486 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 8 May 2026 17:23:38 +0100 Subject: [PATCH 867/972] arm_mpam: Fix false positive assert failure during mpam_disable() mpam_assert_partid_sizes_fixed() is used to document that the caller doesn't expect the discovered PARTID size to change while it is walking a list sized by PARTID. Typically the MSC state is not written to until all the MSC have been discovered and this value is set. However, if discovering the MSC fails and schedules mpam_disable(), then the MSC state is written to reset it. In this case the discovered PARTID size may be become smaller - but only PARTID 0 will be used once resctrl_exit() has been called. Skip the WARN_ON_ONCE() if mpam_disable_reason has been set. Fixes: 3bd04fe7d807 ("arm_mpam: Extend reset logic to allow devices to be reset any time") Cc: Signed-off-by: James Morse Reviewed-by: Ben Horgan Signed-off-by: Catalin Marinas --- drivers/resctrl/mpam_devices.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index e145828f3f736b..5c48812d8573b5 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -164,11 +164,17 @@ static void mpam_free_garbage(void) /* * Once mpam is enabled, new requestors cannot further reduce the available * partid. Assert that the size is fixed, and new requestors will be turned - * away. + * away. This is needed when walking over structures sized by PARTID. + * + * During mpam_disable() these structures are not fixed, but the MSC state + * is still reset using whatever sizes have been discovered so far. As only + * PARTID 0 will be used after mpam_disable(), any race would be benign. + * Skip the check if a mpam_disable_reason has been set. */ static void mpam_assert_partid_sizes_fixed(void) { - WARN_ON_ONCE(!partid_max_published); + if (!mpam_disable_reason) + WARN_ON_ONCE(!partid_max_published); } static u32 __mpam_read_reg(struct mpam_msc *msc, u16 reg) From 6ccbb613b42a1f1ba7bfd547a148f644a902a25c Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 8 May 2026 17:23:39 +0100 Subject: [PATCH 868/972] arm_mpam: Check whether the config array is allocated before destroying it __destroy_component_cfg() is called to free the configuration array. It uses the embedded 'garbage' structure, which means the array has to be allocated. If __destroy_component_cfg() is called from mpam_disable() before the configuration was ever allocated, then a NULL pointer is dereferenced. Check for this case and return early if the configuration is not allocated. __destroy_component_cfg() also frees the mbwu_state as this is allocated by __allocate_component_cfg(). As the mbwu_state is allocated after comp->cfg is set, and is also under mpam_list_lock, only the first pointer needs checking. Fixes: 3bd04fe7d807 ("arm_mpam: Extend reset logic to allow devices to be reset any time") Cc: Signed-off-by: James Morse Reviewed-by: Ben Horgan Signed-off-by: Catalin Marinas --- drivers/resctrl/mpam_devices.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c index 5c48812d8573b5..988fc291241d23 100644 --- a/drivers/resctrl/mpam_devices.c +++ b/drivers/resctrl/mpam_devices.c @@ -2581,6 +2581,9 @@ static void __destroy_component_cfg(struct mpam_component *comp) lockdep_assert_held(&mpam_list_lock); + if (!comp->cfg) + return; + add_to_garbage(comp->cfg); list_for_each_entry(vmsc, &comp->vmsc, comp_list) { msc = vmsc->msc; From 277740023def559a4a2ddc3e8e784ee37a0f16a9 Mon Sep 17 00:00:00 2001 From: Xiang Mei Date: Sun, 10 May 2026 23:21:38 -0700 Subject: [PATCH 869/972] net/smc: reject CHID-0 ACCEPT that matches an empty ism_dev slot On the SMC-D client, slot 0 of ini->ism_dev[]/ini->ism_chid[] is reserved for an SMC-Dv1 device. smc_find_ism_v2_device_clnt() populates V2 entries starting at index 1, so when no V1 device is selected slot 0 is left in its kzalloc()'ed state with ism_dev[0] == NULL and ism_chid[0] == 0. smc_v2_determine_accepted_chid() then matches the peer's CHID against the array starting from index 0 using the CHID alone. A malicious peer replying to a SMC-Dv2-only proposal with d1.chid == 0 matches the empty slot, ini->ism_selected becomes 0, and the subsequent ism_dev[0]->lgr_lock dereference in smc_conn_create() faults at offsetof(struct smcd_dev, lgr_lock) == 0x68: BUG: KASAN: null-ptr-deref in _raw_spin_lock_bh+0x79/0xe0 Write of size 4 at addr 0000000000000068 by task exploit/144 Call Trace: _raw_spin_lock_bh smc_conn_create (net/smc/smc_core.c:1997) __smc_connect (net/smc/af_smc.c:1447) smc_connect (net/smc/af_smc.c:1720) __sys_connect __x64_sys_connect do_syscall_64 Require ism_dev[i] to be non-NULL before accepting a CHID match. Fixes: a7c9c5f4af7f ("net/smc: CLC accept / confirm V2") Reported-by: Weiming Shi Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Xiang Mei Link: https://patch.msgid.link/20260511062138.2839584-1-xmei5@asu.edu Signed-off-by: Paolo Abeni --- net/smc/af_smc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index da28652f681088..dffbd529762d6b 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1400,7 +1400,8 @@ smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm *aclc, int i; for (i = 0; i < ini->ism_offered_cnt + 1; i++) { - if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) { + if (ini->ism_dev[i] && + ini->ism_chid[i] == ntohs(aclc->d1.chid)) { ini->ism_selected = i; return 0; } From 602d60ebae0f10bfbc7ba90eee026fdbd0203df3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 22 Apr 2026 11:42:32 +0200 Subject: [PATCH 870/972] vdso/gettimeofday: Reload sequence counter after switch to time page in do_aux() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After switching to the real data pages, the sequence counter needs to be reloaded from there. The code using vdso_read_begin_timens() assumed this worked by 'continue' jumping to the *beginning* of the do-while retry loop. However the 'continue' jumps to the *end* of said loop, evaluating the exit condition. If the data page has a sequence counter of '1' it will match the one from the time namespace page and prematurely exit the retry loop. This would result in garbage returned to the caller. Reload the sequence counter after switching the pages by using an inner while loop again, which will loop at most once. The loop generates slightly better code than an explicit reload through 'seq = vdso_read_begin()'. Fixes: ed78b7b2c5ae ("vdso/gettimeofday: Add a helper to read the sequence lock of a time namespace aware clock") Reported-by: Ricardo Ribalda Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Tested-by: Ricardo Ribalda Reviewed-by: Christophe Leroy (CS GROUP) Link: https://patch.msgid.link/20260422-vdso-aux-timens-loop-v1-1-e2dd8c7164cc@linutronix.de Closes: https://lore.kernel.org/lkml/CANiDSCsOy0P1if-gJZqOM5pTJ0RDcwVfru1B7KFbTOEMqjPKJw@mail.gmail.com/ --- lib/vdso/gettimeofday.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c index a5798bd26d20b1..da224011fafd93 100644 --- a/lib/vdso/gettimeofday.c +++ b/lib/vdso/gettimeofday.c @@ -248,11 +248,10 @@ bool do_aux(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_ti vc = &vd->aux_clock_data[idx]; do { - if (vdso_read_begin_timens(vc, &seq)) { + while (vdso_read_begin_timens(vc, &seq)) { + /* Re-read from the real time data page, reload seq by looping */ vd = __arch_get_vdso_u_timens_data(vd); vc = &vd->aux_clock_data[idx]; - /* Re-read from the real time data page */ - continue; } /* Auxclock disabled? */ From 591711b32681a04b57d00c2a404658f8419a081c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Fri, 8 May 2026 18:09:20 +0200 Subject: [PATCH 871/972] drm/ttm: Convert -EAGAIN from dmem_cgroup_try_charge to -ENOSPC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dmem_cgroup_try_charge() returns -EAGAIN when the cgroup limit is hit and the charge fails. TTM has no concept of -EAGAIN from resource allocation; -ENOSPC is the canonical error meaning "no space, try eviction". Convert at the source in ttm_resource_alloc() so no caller needs to handle an unexpected error code, and clean up the now-redundant -EAGAIN check in ttm_bo_alloc_resource(). Without this, -EAGAIN escaping ttm_resource_alloc() during an eviction walk causes the walk to terminate early instead of continuing to the next candidate. Cc: Friedrich Vock Cc: Maarten Lankhorst Cc: Tejun Heo Cc: Maxime Ripard Cc: Christian Koenig Cc: dri-devel@lists.freedesktop.org Cc: # v6.14+ Fixes: 2b624a2c1865 ("drm/ttm: Handle cgroup based eviction in TTM") Assisted-by: GitHub_Copilot:claude-sonnet-4.6 Signed-off-by: Thomas Hellström Reviewed-by: Maarten Lankhorst Link: https://patch.msgid.link/20260508160920.230339-1-thomas.hellstrom@linux.intel.com --- drivers/gpu/drm/ttm/ttm_bo.c | 2 +- drivers/gpu/drm/ttm/ttm_resource.c | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 2934017055429c..bcd76f6bb7f02d 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -739,7 +739,7 @@ static int ttm_bo_alloc_resource(struct ttm_buffer_object *bo, may_evict = (force_space && place->mem_type != TTM_PL_SYSTEM); ret = ttm_resource_alloc(bo, place, res, force_space ? &limit_pool : NULL); if (ret) { - if (ret != -ENOSPC && ret != -EAGAIN) { + if (ret != -ENOSPC) { dmem_cgroup_pool_state_put(limit_pool); return ret; } diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c index 0e5f1582f13d84..154d6739256f86 100644 --- a/drivers/gpu/drm/ttm/ttm_resource.c +++ b/drivers/gpu/drm/ttm/ttm_resource.c @@ -398,8 +398,11 @@ int ttm_resource_alloc(struct ttm_buffer_object *bo, if (man->cg) { ret = dmem_cgroup_try_charge(man->cg, bo->base.size, &pool, ret_limit_pool); - if (ret) + if (ret) { + if (ret == -EAGAIN) + ret = -ENOSPC; return ret; + } } ret = man->func->alloc(man, bo, place, res_ptr); From 285943c6e7ca309bbea84b253745154241d9788a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 11 May 2026 10:49:17 -0700 Subject: [PATCH 872/972] net: tls: fix off-by-one in sg_chain entry count for wrapped sk_msg ring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an sk_msg scatterlist ring wraps (sg.end < sg.start), tls_push_record() chains the tail portion of the ring to the head using sg_chain(). An extra entry in the sg array is reserved for this: struct sk_msg_sg { [...] /* The extra two elements: * 1) used for chaining the front and sections when the list becomes * partitioned (e.g. end < start). The crypto APIs require the * chaining; * 2) to chain tailer SG entries after the message. */ struct scatterlist data[MAX_MSG_FRAGS + 2]; The current code uses MAX_SKB_FRAGS + 1 as the ring size: sg_chain(&msg_pl->sg.data[msg_pl->sg.start], MAX_SKB_FRAGS - msg_pl->sg.start + 1, msg_pl->sg.data); This places the chain pointer at sg_chain(data[start], (MAX_SKB_FRAGS - msg_start + 1) .. = &data[start] + (MAX_SKB_FRAGS - msg_start + 1) - 1 = data[start + (MAX_SKB_FRAGS - start + 1) - 1] = data[MAX_SKB_FRAGS] instead of the true last entry. This is likely due to a "race" of the commit under Fixes landing close to commit 031097d9e079 ("bpf: sk_msg, zap ingress queue on psock down") Convert to ARRAY_SIZE and drop the data[start] / - start (as suggested by Sabrina). Reported-by: 钱一铭 Fixes: 9aaaa56845a0 ("bpf: Sockmap/tls, skmsg can have wrapped skmsg that needs extra chaining") Signed-off-by: Jakub Kicinski Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20260511174920.433155-2-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/tls/tls_sw.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 2590e855f6a5a9..2608b0c01849f0 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -800,11 +800,9 @@ static int tls_push_record(struct sock *sk, int flags, sg_mark_end(sk_msg_elem(msg_pl, i)); } - if (msg_pl->sg.end < msg_pl->sg.start) { - sg_chain(&msg_pl->sg.data[msg_pl->sg.start], - MAX_SKB_FRAGS - msg_pl->sg.start + 1, + if (msg_pl->sg.end < msg_pl->sg.start) + sg_chain(msg_pl->sg.data, ARRAY_SIZE(msg_pl->sg.data), msg_pl->sg.data); - } i = msg_pl->sg.start; sg_chain(rec->sg_aead_in, 2, &msg_pl->sg.data[i]); From ff26a0e8377dec07e4a7230db7675bed1b9a6d03 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 11 May 2026 10:49:18 -0700 Subject: [PATCH 873/972] net: tls: prevent chain-after-chain in plain text SG Sashiko points out that if end = 0 (start != 0) the current code will create a chain link to content type right after the wrap link: This would create a chain where the wrap link points directly to another chain link. The scatterlist API sg_next iterator does not recursively resolve consecutive chain links. meaning this is illegal input to crypto. The wrapping link is unnecessary if end = 0. end is the entry after the last one used so end = 0 means there's nothing pushed after the wrap: end start i v v v [ ]...[ ][ d ][ d ][ d ][ d ][rsv for wrap] Skip the wrapping in this case. TLS 1.3 can use the "wrapping slot" for it's chaining if end = 0. This avoids the chain-after-chain. Move the wrap chaining before marking END and chaining off content type, that feels like more logical ordering to me, but should not matter from functional perspective. Reported-by: Sashiko Fixes: 9aaaa56845a0 ("bpf: Sockmap/tls, skmsg can have wrapped skmsg that needs extra chaining") Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20260511174920.433155-3-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/tls/tls_sw.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 2608b0c01849f0..3bfdaf5e64f501 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -789,21 +789,33 @@ static int tls_push_record(struct sock *sk, int flags, i = msg_pl->sg.end; sk_msg_iter_var_prev(i); + /* msg_pl->sg.data is a ring; data[MAX+1] is reserved for the wrap + * link (frags won't use it). 'i' is now the last filled entry: + * + * i end start + * v v v [ rsv ] + * [ d ][ d ][ ][ ]...[ ][ d ][ d ][ d ][chain] + * ^ END v + * `-----------------------------------------' + * + * Note that SGL does not allow chain-after-chain, so for TLS 1.3, + * we must make sure we don't create the wrap entry and then chain + * link to content_type immediately at index 0. + */ + if (i < msg_pl->sg.start) + sg_chain(msg_pl->sg.data, ARRAY_SIZE(msg_pl->sg.data), + msg_pl->sg.data); + rec->content_type = record_type; if (prot->version == TLS_1_3_VERSION) { /* Add content type to end of message. No padding added */ sg_set_buf(&rec->sg_content_type, &rec->content_type, 1); sg_mark_end(&rec->sg_content_type); - sg_chain(msg_pl->sg.data, msg_pl->sg.end + 1, - &rec->sg_content_type); + sg_chain(msg_pl->sg.data, i + 2, &rec->sg_content_type); } else { sg_mark_end(sk_msg_elem(msg_pl, i)); } - if (msg_pl->sg.end < msg_pl->sg.start) - sg_chain(msg_pl->sg.data, ARRAY_SIZE(msg_pl->sg.data), - msg_pl->sg.data); - i = msg_pl->sg.start; sg_chain(rec->sg_aead_in, 2, &msg_pl->sg.data[i]); From 561458db0d6b08b4e4956c6e4456d7781b18676f Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Wed, 13 May 2026 14:51:29 -0600 Subject: [PATCH 874/972] docs: security-bugs: add a link to the threat-model documentation Rather than make readers search for this document, just a link to it where it is referenced. (While I was at it, I removed the unused and unneeded _threatmodel label from the top of threat-model.rst). Acked-by: Willy Tarreau Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jonathan Corbet --- Documentation/process/security-bugs.rst | 13 +++++++------ Documentation/process/threat-model.rst | 2 -- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/Documentation/process/security-bugs.rst b/Documentation/process/security-bugs.rst index f85c65f31f12f7..3c51ddde31dd91 100644 --- a/Documentation/process/security-bugs.rst +++ b/Documentation/process/security-bugs.rst @@ -191,12 +191,13 @@ handle: Please **always convert your report to plain text** without any formatting decorations before sending it. - * **Impact Evaluation**: Many AI-generated reports lack an understanding of - the kernel's threat model and go to great lengths inventing theoretical - consequences. This adds noise and complicates triage. Please stick to - verifiable facts (e.g., "this bug permits any user to gain CAP_NET_ADMIN") - without enumerating speculative implications. Have your tool read this - documentation as part of the evaluation process. + * **Impact Evaluation**: Many AI-generated reports lack an understanding + of the kernel's threat model (see Documentation/process/threat-model.rst) + and go to great lengths inventing theoretical consequences. This adds + noise and complicates triage. Please stick to verifiable facts (e.g., + "this bug permits any user to gain CAP_NET_ADMIN") without enumerating + speculative implications. Have your tool read this documentation as + part of the evaluation process. * **Reproducer**: AI-based tools are often capable of generating reproducers. Please always ensure your tool provides one and **test it thoroughly**. If diff --git a/Documentation/process/threat-model.rst b/Documentation/process/threat-model.rst index ecb432390e7924..91da52f7114fdb 100644 --- a/Documentation/process/threat-model.rst +++ b/Documentation/process/threat-model.rst @@ -1,5 +1,3 @@ -.. _threatmodel: - The Linux Kernel threat model ============================= From f2e65e4e5b4b4b9ecf43f03c3fdbe8c9a8a43a9e Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Wed, 13 May 2026 14:58:53 -0600 Subject: [PATCH 875/972] docs: threat-model: don't limit root capabilities to CAP_SYS_ADMIN The threat-model document says that only users with CAP_SYS_ADMIN can carry out a number of admin-level tasks, but there are numerous capabilities that can confer that sort of power. Generalize the text slightly to make it clear that CAP_SYS_ADMIN is not the only all-powerful capability. Acked-by: Willy Tarreau Signed-off-by: Jonathan Corbet --- Documentation/process/threat-model.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/process/threat-model.rst b/Documentation/process/threat-model.rst index 91da52f7114fdb..f177b8d3c1cafd 100644 --- a/Documentation/process/threat-model.rst +++ b/Documentation/process/threat-model.rst @@ -62,7 +62,8 @@ on common processors featuring privilege levels and memory management units: * **Capability-based protection**: - * users not having the ``CAP_SYS_ADMIN`` capability may not alter the + * users not having elevated capabilities (including but not limited to + CAP_SYS_ADMIN) may not alter the kernel's configuration, memory nor state, change other users' view of the file system layout, grant any user capabilities they do not have, nor affect the system's availability (shutdown, reboot, panic, hang, or making From c78bdba7b9666020c0832150a4fc4c0aebc7c6ac Mon Sep 17 00:00:00 2001 From: Sven Schuchmann Date: Tue, 12 May 2026 09:19:47 +0200 Subject: [PATCH 876/972] net: phy: DP83TC811: add reading of abilities At this time the driver is not listing any speeds it supports. This should be ETHTOOL_LINK_MODE_100baseT1_Full_BIT for DP83TC811. Add the missing call for phylib to read the abilities. Fixes: b753a9faaf9a ("net: phy: DP83TC811: Introduce support for the DP83TC811 phy") Suggested-by: Andrew Lunn Signed-off-by: Sven Schuchmann Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20260512071949.6218-1-schuchmann@schleissheimer.de [pabeni@redhat.com: dropped revision history] Signed-off-by: Paolo Abeni --- drivers/net/phy/dp83tc811.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/phy/dp83tc811.c b/drivers/net/phy/dp83tc811.c index e480c2a0745057..252fb12b3e68eb 100644 --- a/drivers/net/phy/dp83tc811.c +++ b/drivers/net/phy/dp83tc811.c @@ -393,6 +393,7 @@ static struct phy_driver dp83811_driver[] = { .config_init = dp83811_config_init, .config_aneg = dp83811_config_aneg, .soft_reset = dp83811_phy_reset, + .get_features = genphy_c45_pma_read_ext_abilities, .get_wol = dp83811_get_wol, .set_wol = dp83811_set_wol, .config_intr = dp83811_config_intr, From 1d59f36e95f7f7134db0e313c9d787cb0adb2153 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Mon, 11 May 2026 18:24:43 +0200 Subject: [PATCH 877/972] drm/ttm: Fix ttm_bo_shrink() infinite LRU walk on backup failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply the same fix as b2ed01e7ad ("drm/ttm: Fix ttm_bo_swapout() infinite LRU walk on swapout failure") to the ttm_bo_shrink() path. Move del_bulk_move from before the backup to after success only, using ttm_resource_del_bulk_move_unevictable() since the resource is now unevictable once fully backed up. Fixes: 70d645deac98 ("drm/ttm: Add helpers for shrinking") Cc: Christian König Cc: Huang Rui Cc: Matthew Auld Cc: Matthew Brost Cc: Dave Airlie Cc: dri-devel@lists.freedesktop.org Cc: stable@vger.kernel.org # v6.15+ Assisted-by: GitHub_Copilot:claude-opus-4.6 Reviewed-by: Matthew Auld Link: https://patch.msgid.link/20260511162443.24352-1-thomas.hellstrom@linux.intel.com Signed-off-by: Thomas Hellström --- drivers/gpu/drm/ttm/ttm_bo_util.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index f83b7d5ec6c6d0..3e3c201a022267 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -1112,19 +1112,14 @@ long ttm_bo_shrink(struct ttm_operation_ctx *ctx, struct ttm_buffer_object *bo, if (lret < 0) return lret; - if (bo->bulk_move) { - spin_lock(&bdev->lru_lock); - ttm_resource_del_bulk_move(bo->resource, bo); - spin_unlock(&bdev->lru_lock); - } - lret = ttm_tt_backup(bdev, bo->ttm, (struct ttm_backup_flags) {.purge = flags.purge, .writeback = flags.writeback}); - if (lret <= 0 && bo->bulk_move) { + if (lret > 0) { spin_lock(&bdev->lru_lock); - ttm_resource_add_bulk_move(bo->resource, bo); + ttm_resource_del_bulk_move_unevictable(bo->resource, bo); + ttm_resource_move_to_lru_tail(bo->resource); spin_unlock(&bdev->lru_lock); } From 7d9a7f1f96cd617ee9e75bb22217c709038e26b8 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 14 May 2026 21:14:18 +0800 Subject: [PATCH 878/972] smb/client: fix possible infinite loop and oob read in symlink_data() On 32-bit architectures, the infinite loop is as follows: len = p->ErrorDataLength == 0xfffffff8 u8 *next = p->ErrorContextData + len next == p On 32-bit architectures, the out-of-bounds read is as follows: len = p->ErrorDataLength == 0xfffffff0 u8 *next = p->ErrorContextData + len next == (u8 *)p - 8 Reported-by: ChenXiaoSong Fixes: 76894f3e2f71 ("cifs: improve symlink handling for smb2+") Cc: stable@vger.kernel.org Signed-off-by: Ye Bin Reviewed-by: ChenXiaoSong Signed-off-by: Steve French --- fs/smb/client/smb2file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c index b292aa94a5932a..6860eff3169329 100644 --- a/fs/smb/client/smb2file.c +++ b/fs/smb/client/smb2file.c @@ -49,6 +49,9 @@ static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov) __func__, le32_to_cpu(p->ErrorId)); len = ALIGN(le32_to_cpu(p->ErrorDataLength), 8); + if (len > end - ((u8 *)p + sizeof(*p))) + return ERR_PTR(-EINVAL); + p = (struct smb2_error_context_rsp *)(p->ErrorContextData + len); } } else if (le32_to_cpu(err->ByteCount) >= sizeof(*sym) && From a6ab75639e23169a741b0b2e12191fd8acb32c73 Mon Sep 17 00:00:00 2001 From: Nick Chan Date: Thu, 14 May 2026 21:16:01 +0800 Subject: [PATCH 879/972] nvme-apple: Reset q->sq_tail during queue init Fixes a "duplicate tag error for tag 0" firmware crash during controller reset while setting up a queue on Apple A11 / T8015 caused by stale entries in the submission queue due to an invalid sq_tail offset after reset. Fixes: 04d8ecf37b5e ("nvme: apple: Add Apple A11 support") Cc: stable@vger.kernel.org Suggested-by: Yuriy Havrylyuk Reviewed-by: Sven Peter Signed-off-by: Nick Chan Signed-off-by: Keith Busch --- drivers/nvme/host/apple.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 423c9c628e7bfa..c692fc73babfe5 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1009,6 +1009,7 @@ static void apple_nvme_init_queue(struct apple_nvme_queue *q) unsigned int depth = apple_nvme_queue_depth(q); struct apple_nvme *anv = queue_to_apple_nvme(q); + q->sq_tail = 0; q->cq_head = 0; q->cq_phase = 1; if (anv->hw->has_lsq_nvmmu) From ab26dfeba278b0efbcea012f1698cf524d9b5695 Mon Sep 17 00:00:00 2001 From: DaeMyung Kang Date: Wed, 13 May 2026 22:26:22 +0900 Subject: [PATCH 880/972] cifs: client: stage smb3_reconfigure() updates and restore ctx on failure smb3_reconfigure() moves strings out of cifs_sb->ctx before the multichannel update, so a later failure can leave the live context with NULL strings or options that do not match the session. Stage the new ctx separately, commit it only on success, and restore the snapshot on failure. Also make smb3_sync_session_ctx_passwords() all-or-nothing. Commit session passwords before channel updates so newly added channels authenticate with the staged credentials. Fixes: ef529f655a2c ("cifs: client: allow changing multichannel mount options on remount") Reported-by: RAJASI MANDAL Closes: https://lore.kernel.org/lkml/CAEY6_V1+dzW3OD5zqXhsWyXwrDTrg5tAMGZ1AJ7_GAuRE+aevA@mail.gmail.com/ Link: https://lore.kernel.org/lkml/xkr2dlvgibq5j6gkcxd3yhhnj4atgxw2uy4eug2pxm7wy7nbms@iq6cf5taa65v/ Reviewed-by: Henrique Carvalho Signed-off-by: DaeMyung Kang Signed-off-by: Steve French --- fs/smb/client/fs_context.c | 161 +++++++++++++++++++++++++------------ 1 file changed, 108 insertions(+), 53 deletions(-) diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index b63ec7ab6e5131..4c8b1b9ade8b83 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -736,7 +736,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, static int smb3_fs_context_parse_monolithic(struct fs_context *fc, void *data); static int smb3_get_tree(struct fs_context *fc); -static void smb3_sync_ses_chan_max(struct cifs_ses *ses, unsigned int max_channels); +static void smb3_sync_ses_chan_max(struct cifs_ses *ses, size_t max_channels); static int smb3_reconfigure(struct fs_context *fc); static const struct fs_context_operations smb3_fs_context_ops = { @@ -1010,25 +1010,34 @@ do { \ int smb3_sync_session_ctx_passwords(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) { + char *password = NULL, *password2 = NULL; + if (ses->password && cifs_sb->ctx->password && strcmp(ses->password, cifs_sb->ctx->password)) { - kfree_sensitive(cifs_sb->ctx->password); - cifs_sb->ctx->password = kstrdup(ses->password, GFP_KERNEL); - if (!cifs_sb->ctx->password) + password = kstrdup(ses->password, GFP_KERNEL); + if (!password) return -ENOMEM; } if (ses->password2 && cifs_sb->ctx->password2 && strcmp(ses->password2, cifs_sb->ctx->password2)) { - kfree_sensitive(cifs_sb->ctx->password2); - cifs_sb->ctx->password2 = kstrdup(ses->password2, GFP_KERNEL); - if (!cifs_sb->ctx->password2) { - kfree_sensitive(cifs_sb->ctx->password); - cifs_sb->ctx->password = NULL; + password2 = kstrdup(ses->password2, GFP_KERNEL); + if (!password2) { + kfree_sensitive(password); return -ENOMEM; } } + + if (password) { + kfree_sensitive(cifs_sb->ctx->password); + cifs_sb->ctx->password = password; + } + if (password2) { + kfree_sensitive(cifs_sb->ctx->password2); + cifs_sb->ctx->password2 = password2; + } + return 0; } @@ -1041,7 +1050,7 @@ int smb3_sync_session_ctx_passwords(struct cifs_sb_info *cifs_sb, struct cifs_se * with the session's channel lock. This should be called whenever the maximum * allowed channels for a session changes (e.g., after a remount or reconfigure). */ -static void smb3_sync_ses_chan_max(struct cifs_ses *ses, unsigned int max_channels) +static void smb3_sync_ses_chan_max(struct cifs_ses *ses, size_t max_channels) { spin_lock(&ses->chan_lock); ses->chan_max = max_channels; @@ -1051,12 +1060,15 @@ static void smb3_sync_ses_chan_max(struct cifs_ses *ses, unsigned int max_channe static int smb3_reconfigure(struct fs_context *fc) { struct smb3_fs_context *ctx = smb3_fc2context(fc); + struct smb3_fs_context *new_ctx = NULL; + struct smb3_fs_context *old_ctx = NULL; struct dentry *root = fc->root; struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb); struct cifs_ses *ses = cifs_sb_master_tcon(cifs_sb)->ses; unsigned int rsize = ctx->rsize, wsize = ctx->wsize; char *new_password = NULL, *new_password2 = NULL; bool need_recon = false; + bool need_mchan_update; int rc; if (ses->expired_pwd) @@ -1066,6 +1078,16 @@ static int smb3_reconfigure(struct fs_context *fc) if (rc) return rc; + old_ctx = kzalloc_obj(*old_ctx); + if (!old_ctx) + return -ENOMEM; + + rc = smb3_fs_context_dup(old_ctx, cifs_sb->ctx); + if (rc) { + kfree(old_ctx); + return rc; + } + /* * We can not change UNC/username/password/domainname/ * workstation_name/nodename/iocharset @@ -1075,16 +1097,22 @@ static int smb3_reconfigure(struct fs_context *fc) STEAL_STRING(cifs_sb, ctx, UNC); STEAL_STRING(cifs_sb, ctx, source); STEAL_STRING(cifs_sb, ctx, username); + STEAL_STRING(cifs_sb, ctx, domainname); + STEAL_STRING(cifs_sb, ctx, nodename); + STEAL_STRING(cifs_sb, ctx, iocharset); - if (need_recon == false) + if (!need_recon) { STEAL_STRING_SENSITIVE(cifs_sb, ctx, password); - else { + } else { if (ctx->password) { new_password = kstrdup(ctx->password, GFP_KERNEL); - if (!new_password) - return -ENOMEM; - } else + if (!new_password) { + rc = -ENOMEM; + goto restore_ctx; + } + } else { STEAL_STRING_SENSITIVE(cifs_sb, ctx, password); + } } /* @@ -1094,11 +1122,29 @@ static int smb3_reconfigure(struct fs_context *fc) if (ctx->password2) { new_password2 = kstrdup(ctx->password2, GFP_KERNEL); if (!new_password2) { - kfree_sensitive(new_password); - return -ENOMEM; + rc = -ENOMEM; + goto restore_ctx; } - } else + } else { STEAL_STRING_SENSITIVE(cifs_sb, ctx, password2); + } + + /* if rsize or wsize not passed in on remount, use previous values */ + ctx->rsize = rsize ? CIFS_ALIGN_RSIZE(fc, rsize) : cifs_sb->ctx->rsize; + ctx->wsize = wsize ? CIFS_ALIGN_WSIZE(fc, wsize) : cifs_sb->ctx->wsize; + + new_ctx = kzalloc_obj(*new_ctx); + if (!new_ctx) { + rc = -ENOMEM; + goto restore_ctx; + } + + rc = smb3_fs_context_dup(new_ctx, ctx); + if (rc) + goto restore_ctx; + + need_mchan_update = ctx->multichannel != cifs_sb->ctx->multichannel || + ctx->max_channels != cifs_sb->ctx->max_channels; /* * we may update the passwords in the ses struct below. Make sure we do @@ -1109,54 +1155,55 @@ static int smb3_reconfigure(struct fs_context *fc) /* * smb2_reconnect may swap password and password2 in case session setup * failed. First get ctx passwords in sync with ses passwords. It should - * be okay to do this even if this function were to return an error at a - * later stage + * be done before committing new passwords. */ rc = smb3_sync_session_ctx_passwords(cifs_sb, ses); if (rc) { mutex_unlock(&ses->session_mutex); - kfree_sensitive(new_password); - kfree_sensitive(new_password2); - return rc; + goto cleanup_new_ctx; + } + + /* + * If multichannel or max_channels has changed, update the session's channels accordingly. + * This may add or remove channels to match the new configuration. + */ + if (need_mchan_update) { + /* Prevent concurrent scaling operations */ + spin_lock(&ses->ses_lock); + if (ses->flags & CIFS_SES_FLAG_SCALE_CHANNELS) { + spin_unlock(&ses->ses_lock); + mutex_unlock(&ses->session_mutex); + rc = -EINVAL; + goto cleanup_new_ctx; + } + ses->flags |= CIFS_SES_FLAG_SCALE_CHANNELS; + spin_unlock(&ses->ses_lock); } /* - * now that allocations for passwords are done, commit them + * Commit session passwords before any channel work so newly added + * channels authenticate with the new credentials. */ if (new_password) { kfree_sensitive(ses->password); ses->password = new_password; + new_password = NULL; } if (new_password2) { kfree_sensitive(ses->password2); ses->password2 = new_password2; + new_password2 = NULL; } - /* - * If multichannel or max_channels has changed, update the session's channels accordingly. - * This may add or remove channels to match the new configuration. - */ - if ((ctx->multichannel != cifs_sb->ctx->multichannel) || - (ctx->max_channels != cifs_sb->ctx->max_channels)) { - + if (need_mchan_update) { /* Synchronize ses->chan_max with the new mount context */ smb3_sync_ses_chan_max(ses, ctx->max_channels); - /* Now update the session's channels to match the new configuration */ - /* Prevent concurrent scaling operations */ - spin_lock(&ses->ses_lock); - if (ses->flags & CIFS_SES_FLAG_SCALE_CHANNELS) { - spin_unlock(&ses->ses_lock); - mutex_unlock(&ses->session_mutex); - return -EINVAL; - } - ses->flags |= CIFS_SES_FLAG_SCALE_CHANNELS; - spin_unlock(&ses->ses_lock); mutex_unlock(&ses->session_mutex); - rc = smb3_update_ses_channels(ses, ses->server, - false /* from_reconnect */, - false /* disable_mchan */); + smb3_update_ses_channels(ses, ses->server, + false /* from_reconnect */, + false /* disable_mchan */); /* Clear scaling flag after operation */ spin_lock(&ses->ses_lock); @@ -1166,22 +1213,30 @@ static int smb3_reconfigure(struct fs_context *fc) mutex_unlock(&ses->session_mutex); } - STEAL_STRING(cifs_sb, ctx, domainname); - STEAL_STRING(cifs_sb, ctx, nodename); - STEAL_STRING(cifs_sb, ctx, iocharset); - - /* if rsize or wsize not passed in on remount, use previous values */ - ctx->rsize = rsize ? CIFS_ALIGN_RSIZE(fc, rsize) : cifs_sb->ctx->rsize; - ctx->wsize = wsize ? CIFS_ALIGN_WSIZE(fc, wsize) : cifs_sb->ctx->wsize; - smb3_cleanup_fs_context_contents(cifs_sb->ctx); - rc = smb3_fs_context_dup(cifs_sb->ctx, ctx); + memcpy(cifs_sb->ctx, new_ctx, sizeof(*new_ctx)); + kfree(new_ctx); + new_ctx = NULL; + smb3_cleanup_fs_context(old_ctx); + old_ctx = NULL; smb3_update_mnt_flags(cifs_sb); #ifdef CONFIG_CIFS_DFS_UPCALL if (!rc) rc = dfs_cache_remount_fs(cifs_sb); #endif + return rc; + +cleanup_new_ctx: + smb3_cleanup_fs_context_contents(new_ctx); +restore_ctx: + kfree(new_ctx); + kfree_sensitive(new_password); + kfree_sensitive(new_password2); + smb3_cleanup_fs_context_contents(cifs_sb->ctx); + memcpy(cifs_sb->ctx, old_ctx, sizeof(*old_ctx)); + kfree(old_ctx); + return rc; } From 4e90368e8680d9ddcb06f82f2e63cbbcf21cef2c Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 14 May 2026 15:52:06 +0800 Subject: [PATCH 881/972] soundwire: intel_auxdevice: Add es9356 to wake_capable_list Add es9356 to the wake_capable_list because it can generate jack events whilst the bus is stopped Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20260514075206.3483-7-zhangyi@everest-semi.com Signed-off-by: Vinod Koul --- drivers/soundwire/intel_auxdevice.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/soundwire/intel_auxdevice.c b/drivers/soundwire/intel_auxdevice.c index ee951be1546577..0b8107bec9abe1 100644 --- a/drivers/soundwire/intel_auxdevice.c +++ b/drivers/soundwire/intel_auxdevice.c @@ -72,6 +72,7 @@ static struct wake_capable_part wake_capable_list[] = { {0x025d, 0x717}, {0x025d, 0x721}, {0x025d, 0x722}, + {0x04b3, 0x9356}, }; static bool is_wake_capable(struct sdw_slave *slave) From acf676b9de0c86bc735a7f04962d3d688e156ffc Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Fri, 8 May 2026 18:17:55 +0800 Subject: [PATCH 882/972] soundwire: intel: Move suspend tracking from trigger to pm suspend Mark all open DAI runtimes as suspended in the component .suspend callback instead of relying on SNDRV_PCM_TRIGGER_SUSPEND, which is not delivered during PAUSE or xrun states. If during system suspend a dai is open it means that it is in either in SUSPENDED, PAUSED or STOPPED (due to xrun) state and they will need to be re-initialized during resume (which is done in .prepare callback). Signed-off-by: Peter Ujfalusi Signed-off-by: Bard Liao Link: https://patch.msgid.link/20260508101755.1247039-1-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- drivers/soundwire/intel.c | 31 ++++++-------------------- drivers/soundwire/intel_ace2x.c | 39 ++++++++++++++++++++++----------- 2 files changed, 33 insertions(+), 37 deletions(-) diff --git a/drivers/soundwire/intel.c b/drivers/soundwire/intel.c index dcd7440e78fa64..af65214836b4d0 100644 --- a/drivers/soundwire/intel.c +++ b/drivers/soundwire/intel.c @@ -906,19 +906,6 @@ static int intel_trigger(struct snd_pcm_substream *substream, int cmd, struct sn } switch (cmd) { - case SNDRV_PCM_TRIGGER_SUSPEND: - - /* - * The .prepare callback is used to deal with xruns and resume operations. - * In the case of xruns, the DMAs and SHIM registers cannot be touched, - * but for resume operations the DMAs and SHIM registers need to be initialized. - * the .trigger callback is used to track the suspend case only. - */ - - dai_runtime->suspended = true; - - break; - case SNDRV_PCM_TRIGGER_PAUSE_PUSH: dai_runtime->paused = true; break; @@ -955,10 +942,12 @@ static int intel_component_dais_suspend(struct snd_soc_component *component) struct snd_soc_dai *dai; /* - * In the corner case where a SUSPEND happens during a PAUSE, the ALSA core - * does not throw the TRIGGER_SUSPEND. This leaves the DAIs in an unbalanced state. - * Since the component suspend is called last, we can trap this corner case - * and force the DAIs to release their resources. + * Mark all open streams as suspended. + * Open streams at this point can be in SUSPENDED, PAUSED or STOPPED + * state and during prepare the DMAs and SHIM registers need to be + * initialized for them. + * The STOPPED state is a special corner case which can happen if audio + * experiences xrun at suspend time. */ for_each_component_dais(component, dai) { struct sdw_cdns *cdns = snd_soc_dai_get_drvdata(dai); @@ -966,13 +955,7 @@ static int intel_component_dais_suspend(struct snd_soc_component *component) dai_runtime = cdns->dai_runtime_array[dai->id]; - if (!dai_runtime) - continue; - - if (dai_runtime->suspended) - continue; - - if (dai_runtime->paused) + if (dai_runtime) dai_runtime->suspended = true; } diff --git a/drivers/soundwire/intel_ace2x.c b/drivers/soundwire/intel_ace2x.c index 20422534baf19f..0a97253fe0c278 100644 --- a/drivers/soundwire/intel_ace2x.c +++ b/drivers/soundwire/intel_ace2x.c @@ -894,19 +894,6 @@ static int intel_trigger(struct snd_pcm_substream *substream, int cmd, struct sn } switch (cmd) { - case SNDRV_PCM_TRIGGER_SUSPEND: - - /* - * The .prepare callback is used to deal with xruns and resume operations. - * In the case of xruns, the DMAs and SHIM registers cannot be touched, - * but for resume operations the DMAs and SHIM registers need to be initialized. - * the .trigger callback is used to track the suspend case only. - */ - - dai_runtime->suspended = true; - - break; - case SNDRV_PCM_TRIGGER_PAUSE_PUSH: dai_runtime->paused = true; break; @@ -930,8 +917,34 @@ static const struct snd_soc_dai_ops intel_pcm_dai_ops = { .get_stream = intel_get_sdw_stream, }; +static int intel_component_dais_suspend(struct snd_soc_component *component) +{ + struct snd_soc_dai *dai; + + /* + * Mark all open streams as suspended. + * Open streams at this point can be in SUSPENDED, PAUSED or STOPPED + * state and during prepare the DMAs and SHIM registers need to be + * initialized for them. + * The STOPPED state is a special corner case which can happen if audio + * experiences xrun at suspend time. + */ + for_each_component_dais(component, dai) { + struct sdw_cdns *cdns = snd_soc_dai_get_drvdata(dai); + struct sdw_cdns_dai_runtime *dai_runtime; + + dai_runtime = cdns->dai_runtime_array[dai->id]; + + if (dai_runtime) + dai_runtime->suspended = true; + } + + return 0; +} + static const struct snd_soc_component_driver dai_component = { .name = "soundwire", + .suspend = intel_component_dais_suspend, }; /* From 31e62c2ebbfdc3fe3dbdf5e02c92a9dc67087a3a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 13 May 2026 11:37:18 -0700 Subject: [PATCH 883/972] ptrace: slightly saner 'get_dumpable()' logic The 'dumpability' of a task is fundamentally about the memory image of the task - the concept comes from whether it can core dump or not - and makes no sense when you don't have an associated mm. And almost all users do in fact use it only for the case where the task has a mm pointer. But we have one odd special case: ptrace_may_access() uses 'dumpable' to check various other things entirely independently of the MM (typically explicitly using flags like PTRACE_MODE_READ_FSCREDS). Including for threads that no longer have a VM (and maybe never did, like most kernel threads). It's not what this flag was designed for, but it is what it is. The ptrace code does check that the uid/gid matches, so you do have to be uid-0 to see kernel thread details, but this means that the traditional "drop capabilities" model doesn't make any difference for this all. Make it all make a *bit* more sense by saying that if you don't have a MM pointer, we'll use a cached "last dumpability" flag if the thread ever had a MM (it will be zero for kernel threads since it is never set), and require a proper CAP_SYS_PTRACE capability to override. Reported-by: Qualys Security Advisory Cc: Oleg Nesterov Cc: Kees Cook Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 +++ kernel/exit.c | 1 + kernel/ptrace.c | 22 ++++++++++++++++------ 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 368c7b4d7cb510..ee06cba5c6f538 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1002,6 +1002,9 @@ struct task_struct { unsigned sched_rt_mutex:1; #endif + /* Save user-dumpable when mm goes away */ + unsigned user_dumpable:1; + /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; diff --git a/kernel/exit.c b/kernel/exit.c index 9a909993ab1d8b..f50d73c272d6ee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -571,6 +571,7 @@ static void exit_mm(void) */ smp_mb__after_spinlock(); local_irq_disable(); + current->user_dumpable = (get_dumpable(mm) == SUID_DUMP_USER); current->mm = NULL; membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 68c17daef8d40b..130043bfc2091c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -272,11 +272,24 @@ static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode) return ns_capable(ns, CAP_SYS_PTRACE); } +static bool task_still_dumpable(struct task_struct *task, unsigned int mode) +{ + struct mm_struct *mm = task->mm; + if (mm) { + if (get_dumpable(mm) == SUID_DUMP_USER) + return true; + return ptrace_has_cap(mm->user_ns, mode); + } + + if (task->user_dumpable) + return true; + return ptrace_has_cap(&init_user_ns, mode); +} + /* Returns 0 on success, -errno on denial. */ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; - struct mm_struct *mm; kuid_t caller_uid; kgid_t caller_gid; @@ -337,11 +350,8 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) * Pairs with a write barrier in commit_creds(). */ smp_rmb(); - mm = task->mm; - if (mm && - ((get_dumpable(mm) != SUID_DUMP_USER) && - !ptrace_has_cap(mm->user_ns, mode))) - return -EPERM; + if (!task_still_dumpable(task, mode)) + return -EPERM; return security_ptrace_access_check(task, mode); } From 81a874233c305d29e37fdb70b691ff4254294c0b Mon Sep 17 00:00:00 2001 From: Jeremy Erazo Date: Thu, 14 May 2026 12:03:34 +0000 Subject: [PATCH 884/972] smb: client: avoid integer overflow in SMB2 READ length check SMB2 READ response validation in cifs_readv_receive() and handle_read_data() checks data_offset + data_len against the received buffer length. Both values are attacker-controlled fields from the server response and are stored as unsigned int, so the addition can wrap before the bounds check: fs/smb/client/transport.c:1259 if (!use_rdma_mr && (data_offset + data_len > buflen)) fs/smb/client/smb2ops.c:4839 else if (buf_len >= data_offset + data_len) A malicious SMB server can use this to bypass validation. In the non-encrypted receive path the client attempts an oversized socket read and stalls for the SMB response timeout (180 seconds) before reconnecting. In the SMB3 encrypted path, runtime testing shows the malformed length can reach copy_to_iter() in handle_read_data() with attacker-controlled size, where usercopy hardening stops the oversized copy before bytes reach userspace. Guard both call sites with check_add_overflow(), which is already used elsewhere in this subsystem (smb2pdu.c). On overflow, treat the response as malformed and reject with -EIO. Signed-off-by: Jeremy Erazo Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 4 +++- fs/smb/client/transport.c | 15 +++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index e6cb9b144530d2..3738204984f5a0 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4721,6 +4721,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, { unsigned int data_offset; unsigned int data_len; + unsigned int end_off; unsigned int cur_off; unsigned int cur_page_idx; unsigned int pad_len; @@ -4836,7 +4837,8 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, } rdata->got_bytes = buffer_len; - } else if (buf_len >= data_offset + data_len) { + } else if (!check_add_overflow(data_offset, data_len, &end_off) && + buf_len >= end_off) { /* read response payload is in buf */ WARN_ONCE(buffer, "read data can be either in buf or in buffer"); copied = copy_to_iter(buf + data_offset, data_len, &rdata->subreq.io_iter); diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 05f8099047e1a4..fdf4e50c27ceb4 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -1158,7 +1158,7 @@ int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) { int length, len; - unsigned int data_offset, data_len; + unsigned int data_offset, data_len, end_off; struct cifs_io_subrequest *rdata = mid->callback_data; char *buf = server->smallbuf; unsigned int buflen = server->pdu_size; @@ -1256,11 +1256,14 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) use_rdma_mr = rdata->mr; #endif data_len = server->ops->read_data_length(buf, use_rdma_mr); - if (!use_rdma_mr && (data_offset + data_len > buflen)) { - /* data_len is corrupt -- discard frame */ - rdata->result = smb_EIO2(smb_eio_trace_read_rsp_malformed, - data_offset + data_len, buflen); - return cifs_readv_discard(server, mid); + if (!use_rdma_mr) { + if (check_add_overflow(data_offset, data_len, &end_off) || + end_off > buflen) { + /* data_len is corrupt -- discard frame */ + rdata->result = smb_EIO2(smb_eio_trace_read_rsp_malformed, + end_off, buflen); + return cifs_readv_discard(server, mid); + } } #ifdef CONFIG_CIFS_SMB_DIRECT From 905c559e51497b8bfdbb68df8be56d2f70f0de8e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 14 Mar 2026 14:24:56 +0100 Subject: [PATCH 885/972] gcc-plugins: Always define CONST_CAST_GIMPLE and CONST_CAST_TREE For gcc-16, the CONST_CAST macro family was removed. Add back what we were using in gcc-common.h, as they are simple wrappers. See GCC commits: c3d96ff9e916c02584aa081f03ab999292efbb50 458c7926d48959abcb2c1adaa22458e27459a551 Suggested-by: Ingo Saitz Link: https://lore.kernel.org/lkml/ab6OKoay0OWkywjK@spatz.zoo Fixes: 6b90bd4ba40b ("GCC plugin infrastructure") Tested-by: Ivan Bulatovic Tested-by: Christopher Cradock Signed-off-by: Kees Cook --- scripts/gcc-plugins/gcc-common.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/gcc-plugins/gcc-common.h b/scripts/gcc-plugins/gcc-common.h index 8f1b3500f8e2dc..abb1964c44d4ee 100644 --- a/scripts/gcc-plugins/gcc-common.h +++ b/scripts/gcc-plugins/gcc-common.h @@ -309,7 +309,9 @@ typedef const gimple *const_gimple_ptr; #define gimple gimple_ptr #define const_gimple const_gimple_ptr #undef CONST_CAST_GIMPLE -#define CONST_CAST_GIMPLE(X) CONST_CAST(gimple, (X)) +#define CONST_CAST_GIMPLE(X) const_cast((X)) +#undef CONST_CAST_TREE +#define CONST_CAST_TREE(X) const_cast((X)) /* gimple related */ static inline gimple gimple_build_assign_with_ops(enum tree_code subcode, tree lhs, tree op1, tree op2 MEM_STAT_DECL) From 28e03f78e69cf6628b81f24777799778528a84c1 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 5 May 2026 12:24:17 +0200 Subject: [PATCH 886/972] x86/xen: Fix xen_e820_swap_entry_with_ram() When swapping a not page-aligned E820 map entry with RAM, the start address of the modified entry is calculated wrong (the offset into the page is subtracted instead of being added to the page address). Fixes: be35d91c8880 ("xen: tolerate ACPI NVS memory overlapping with Xen allocated memory") Reported-by: Jan Beulich Reviewed-by: Jan Beulich Signed-off-by: Juergen Gross Message-ID: <20260505102417.208138-1-jgross@suse.com> --- arch/x86/xen/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index bb95a05259b895..41251d4cf953f2 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -655,7 +655,7 @@ static void __init xen_e820_swap_entry_with_ram(struct e820_entry *swap_entry) /* Fill new entry (keep size and page offset). */ entry->type = swap_entry->type; entry->addr = entry_end - swap_size + - swap_addr - swap_entry->addr; + swap_entry->addr - swap_addr; entry->size = swap_entry->size; /* Convert old entry to RAM, align to pages. */ From 4594437880ce347ac8438758fd91543f70da1aa9 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 8 May 2026 16:39:33 +0200 Subject: [PATCH 887/972] x86/xen: Tolerate nested XEN_LAZY_MMU entering/leaving MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the support of nested lazy mmu sections it can happen that arch_enter_lazy_mmu_mode() is being called twice without a call of arch_leave_lazy_mmu_mode() in between, as the lazy_mmu_*() helpers are not disabling preemption when checking for nested lazy mmu sections. This is a problem when running as a Xen PV guest, as xen_enter_lazy_mmu() and xen_leave_lazy_mmu() don't tolerate this case. Fix that in xen_enter_lazy_mmu() and xen_leave_lazy_mmu() in order not to hurt all other lazy mmu mode users. Fixes: 291b3abed657 ("x86/xen: use lazy_mmu_state when context-switching") Tested-by: Marek Marczykowski-Górecki Signed-off-by: Juergen Gross Message-ID: <20260508143933.493013-1-jgross@suse.com> --- arch/x86/xen/mmu_pv.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index c80d0058efd18f..3eee5f84f8a70d 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2145,7 +2145,10 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) static void xen_enter_lazy_mmu(void) { - enter_lazy(XEN_LAZY_MMU); + preempt_disable(); + if (xen_get_lazy_mode() != XEN_LAZY_MMU) + enter_lazy(XEN_LAZY_MMU); + preempt_enable(); } static void xen_flush_lazy_mmu(void) @@ -2182,7 +2185,8 @@ static void xen_leave_lazy_mmu(void) { preempt_disable(); xen_mc_flush(); - leave_lazy(XEN_LAZY_MMU); + if (xen_get_lazy_mode() != XEN_LAZY_NONE) + leave_lazy(XEN_LAZY_MMU); preempt_enable(); } From 05f2a68b407a6817fe141dd64972c6ab8725312d Mon Sep 17 00:00:00 2001 From: Matt Evans Date: Mon, 11 May 2026 07:58:23 -0700 Subject: [PATCH 888/972] vfio/pci: Set up BAR resources and maps in vfio_pci_core_enable() Previously BAR resource requests and the corresponding pci_iomap() were performed on-demand and without synchronisation, which was racy. Rather than add synchronisation, it's simplest to address this by doing both activities from vfio_pci_core_enable(). The resource allocation and/or pci_iomap() can still fail; their status is tracked and existing calls to vfio_pci_core_setup_barmap() will fail in a similar way to before. This keeps the point of failure as observed by userspace the same, i.e. failures to request/map unused BARs are benign. Fixes: 89e1f7d4c66d ("vfio: Add PCI device driver") Signed-off-by: Matt Evans Link: https://lore.kernel.org/r/20260511145829.2993601-2-mattev@meta.com [ERR_PTR -> IOMEM_ERR_PTR per lkp report] Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 37 +++++++++++++++++++++++++++++++- drivers/vfio/pci/vfio_pci_rdwr.c | 26 ++++++---------------- 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 3f8d093aacf8a0..050e7542952ed1 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -482,6 +482,40 @@ static int vfio_pci_core_runtime_resume(struct device *dev) } #endif /* CONFIG_PM */ +/* + * Eager-request BAR resources, and iomap them. Soft failures are + * allowed, and consumers must check the barmap before use in order to + * give compatible user-visible behaviour with the previous on-demand + * allocation method. + */ +static void vfio_pci_core_map_bars(struct vfio_pci_core_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + int i; + + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + int bar = i + PCI_STD_RESOURCES; + + vdev->barmap[bar] = IOMEM_ERR_PTR(-ENODEV); + + if (!pci_resource_len(pdev, i)) + continue; + + if (pci_request_selected_regions(pdev, 1 << bar, "vfio")) { + pci_dbg(pdev, "Failed to reserve region %d\n", bar); + vdev->barmap[bar] = IOMEM_ERR_PTR(-EBUSY); + continue; + } + + vdev->barmap[bar] = pci_iomap(pdev, bar, 0); + if (!vdev->barmap[bar]) { + pci_dbg(pdev, "Failed to iomap region %d\n", bar); + pci_release_selected_regions(pdev, 1 << bar); + vdev->barmap[bar] = IOMEM_ERR_PTR(-ENOMEM); + } + } +} + /* * The pci-driver core runtime PM routines always save the device state * before going into suspended state. If the device is going into low power @@ -568,6 +602,7 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) vdev->has_vga = true; + vfio_pci_core_map_bars(vdev); return 0; @@ -648,7 +683,7 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) for (i = 0; i < PCI_STD_NUM_BARS; i++) { bar = i + PCI_STD_RESOURCES; - if (!vdev->barmap[bar]) + if (IS_ERR_OR_NULL(vdev->barmap[bar])) continue; pci_iounmap(pdev, vdev->barmap[bar]); pci_release_selected_regions(pdev, 1 << bar); diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 4251ee03e1463b..3bfbb879a00513 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -198,27 +198,15 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, } EXPORT_SYMBOL_GPL(vfio_pci_core_do_io_rw); +/* + * The barmap is set up in vfio_pci_core_enable(). Callers use this + * function to check that the BAR resources are requested or that the + * pci_iomap() was done. + */ int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar) { - struct pci_dev *pdev = vdev->pdev; - int ret; - void __iomem *io; - - if (vdev->barmap[bar]) - return 0; - - ret = pci_request_selected_regions(pdev, 1 << bar, "vfio"); - if (ret) - return ret; - - io = pci_iomap(pdev, bar, 0); - if (!io) { - pci_release_selected_regions(pdev, 1 << bar); - return -ENOMEM; - } - - vdev->barmap[bar] = io; - + if (IS_ERR(vdev->barmap[bar])) + return PTR_ERR(vdev->barmap[bar]); return 0; } EXPORT_SYMBOL_GPL(vfio_pci_core_setup_barmap); From 702809dabdecca807bdd50cfdcc1c980feb2ba62 Mon Sep 17 00:00:00 2001 From: Matt Evans Date: Mon, 11 May 2026 07:58:24 -0700 Subject: [PATCH 889/972] vfio/pci: Check BAR resources before exporting a DMABUF A DMABUF exports access to BAR resources and, although they are requested at startup time, we need to ensure they really were reserved before exporting. Otherwise, it's possible to access unreserved resources through the export. Add a check to the DMABUF-creation path. Fixes: 5d74781ebc86c ("vfio/pci: Add dma-buf export support for MMIO regions") Signed-off-by: Matt Evans Link: https://lore.kernel.org/r/20260511145829.2993601-3-mattev@meta.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_dmabuf.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index fdc22e8b465680..1a177ce7de546a 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -244,9 +244,11 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, return -EINVAL; /* - * For PCI the region_index is the BAR number like everything else. + * For PCI the region_index is the BAR number like everything + * else. Check that PCI resources have been claimed for it. */ - if (get_dma_buf.region_index >= VFIO_PCI_ROM_REGION_INDEX) + if (get_dma_buf.region_index >= VFIO_PCI_ROM_REGION_INDEX || + vfio_pci_core_setup_barmap(vdev, get_dma_buf.region_index)) return -ENODEV; dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges, From c207f1d785044667f87cc8c72355e33f3981f2d6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 13 May 2026 19:50:02 +0100 Subject: [PATCH 890/972] smbdirect: Fix error cleanup in smbdirect_map_sges_from_iter() Fix smbdirect_map_sges_from_iter() to use pre-decrement, not post-decrement so that it cleans up the correct slots. Fixes: e5fbdde43017 ("cifs: Add a function to build an RDMA SGE list from an iterator") Closes: https://sashiko.dev/#/patchset/20260326104544.509518-1-dhowells%40redhat.com Signed-off-by: David Howells Reviewed-by: Stefan Metzmacher cc: Paulo Alcantara cc: Tom Talpey cc: linux-cifs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/smb/smbdirect/connection.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/smbdirect/connection.c b/fs/smb/smbdirect/connection.c index fe9912e53da69b..8adf580975344b 100644 --- a/fs/smb/smbdirect/connection.c +++ b/fs/smb/smbdirect/connection.c @@ -2168,7 +2168,7 @@ static ssize_t smbdirect_map_sges_from_iter(struct iov_iter *iter, size_t len, if (ret < 0) { while (state->num_sge > before) { - struct ib_sge *sge = &state->sge[state->num_sge--]; + struct ib_sge *sge = &state->sge[--state->num_sge]; ib_dma_unmap_page(state->device, sge->addr, From 51f57607e30bee282a1d40845f89a311cbb26481 Mon Sep 17 00:00:00 2001 From: Chen-Shi-Hong Date: Thu, 14 May 2026 23:39:13 +0800 Subject: [PATCH 891/972] docs: hwmon: sy7636a: fix temperature sysfs attribute name The hwmon sysfs naming convention uses temp[1-*]_input for temperature channels. Documentation/hwmon/sy7636a-hwmon.rst currently documents temp0_input, while the driver uses the standard hwmon temperature channel interface. Update the documentation to use temp1_input. Signed-off-by: Chen-Shi-Hong Link: https://lore.kernel.org/r/20260514154108.1937-1-eric039eric@gmail.com Signed-off-by: Guenter Roeck --- Documentation/hwmon/sy7636a-hwmon.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/hwmon/sy7636a-hwmon.rst b/Documentation/hwmon/sy7636a-hwmon.rst index 0143ce0e5db765..03d866aba6e81c 100644 --- a/Documentation/hwmon/sy7636a-hwmon.rst +++ b/Documentation/hwmon/sy7636a-hwmon.rst @@ -22,5 +22,5 @@ The following sensors are supported sysfs-Interface --------------- -temp0_input +temp1_input - Temperature of external NTC (milli-degree C) From ab50528ab5b5dd282c43384d3e246068048e0840 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 14 May 2026 09:33:58 +0100 Subject: [PATCH 892/972] ASoC: tac5xx2-sdw: Use new SoundWire enumeration helper Update the driver to use the new core helper that waits for the device to enumerate on SoundWire and be initialised by the SoundWire core. Suggested-by: Niranjan H Y Signed-off-by: Charles Keepax Tested-by: Niranjan H Y Link: https://patch.msgid.link/20260514083358.2559733-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/tac5xx2-sdw.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/sound/soc/codecs/tac5xx2-sdw.c b/sound/soc/codecs/tac5xx2-sdw.c index 917b36ac1cd3b4..bb12cfb6da12bc 100644 --- a/sound/soc/codecs/tac5xx2-sdw.c +++ b/sound/soc/codecs/tac5xx2-sdw.c @@ -1437,7 +1437,6 @@ static s32 tac5xx2_sdca_dev_resume(struct device *dev) { struct tac5xx2_prv *tac_dev = dev_get_drvdata(dev); struct sdw_slave *slave = dev_to_sdw_dev(dev); - unsigned long t; int ret; if (!tac_dev->first_hw_init_done) { @@ -1445,19 +1444,10 @@ static s32 tac5xx2_sdca_dev_resume(struct device *dev) return 0; } - if (!slave->unattach_request) - goto regmap_sync; - - t = wait_for_completion_timeout(&slave->initialization_complete, - msecs_to_jiffies(TAC5XX2_PROBE_TIMEOUT_MS)); - if (!t) { - dev_err(&slave->dev, "resume: initialization timed out\n"); - sdw_show_ping_status(slave->bus, true); - return -ETIMEDOUT; - } - slave->unattach_request = 0; + ret = sdw_slave_wait_for_init(slave, TAC5XX2_PROBE_TIMEOUT_MS); + if (ret) + return ret; -regmap_sync: regcache_cache_only(tac_dev->regmap, false); regcache_mark_dirty(tac_dev->regmap); ret = regcache_sync(tac_dev->regmap); From b96fe527935b0671194bc436d7d78d3b0f87b2e1 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 13 May 2026 18:26:12 +0200 Subject: [PATCH 893/972] ASoC: cs35l56: Drop malformed default N from Kconfig First of all, it has to be 'default n' (small letter n), otherwise it looks for CONFIG_N which is absent and in case of appearance will enable something unrelated. Second and most important is that 'n' *is* the default 'default' already. Hence just drop malformed line. Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20260513162612.365729-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/codecs/Kconfig | 3 --- 1 file changed, 3 deletions(-) diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig index cf94a1c756e09e..269c31ce081431 100644 --- a/sound/soc/codecs/Kconfig +++ b/sound/soc/codecs/Kconfig @@ -898,7 +898,6 @@ menu "CS35L56 driver options" config SND_SOC_CS35L56_CAL_DEBUGFS bool "CS35L56 create debugfs for factory calibration" - default N depends on DEBUG_FS select SND_SOC_CS35L56_CAL_DEBUGFS_COMMON help @@ -909,7 +908,6 @@ config SND_SOC_CS35L56_CAL_DEBUGFS config SND_SOC_CS35L56_CAL_SET_CTRL bool "CS35L56 ALSA control to restore factory calibration" - default N select SND_SOC_CS35L56_CAL_DEBUGFS_COMMON help Allow restoring factory calibration data through an ALSA @@ -923,7 +921,6 @@ config SND_SOC_CS35L56_CAL_SET_CTRL config SND_SOC_CS35L56_CAL_PERFORM_CTRL bool "CS35L56 ALSA control to perform factory calibration" - default N select SND_SOC_CS35L56_CAL_DEBUGFS_COMMON help Allow performing factory calibration data through an ALSA From 7c0acb8f766a5c861595d6de45b6751444c2680d Mon Sep 17 00:00:00 2001 From: bui duc phuc Date: Thu, 14 May 2026 18:06:02 +0700 Subject: [PATCH 894/972] ASoC: ti: omap-mcbsp: Remove mixed goto/scoped cleanup handling After converting to guard()/scoped_guard() helpers, omap_mcbsp_request() still mixes scoped cleanup with goto based error handling. Remove the remaining goto based cleanup paths for a more consistent cleanup flow. Suggested-by: Mark Brown Signed-off-by: bui duc phuc Acked-by: Sen Wang Link: https://patch.msgid.link/20260514110602.30480-1-phucduc.bui@gmail.com Signed-off-by: Mark Brown --- sound/soc/ti/omap-mcbsp.c | 46 +++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/sound/soc/ti/omap-mcbsp.c b/sound/soc/ti/omap-mcbsp.c index d82fef629867aa..26af616c33f510 100644 --- a/sound/soc/ti/omap-mcbsp.c +++ b/sound/soc/ti/omap-mcbsp.c @@ -307,7 +307,7 @@ static int omap_mcbsp_request(struct omap_mcbsp *mcbsp) reg_cache = NULL; } - if(mcbsp->pdata->ops && mcbsp->pdata->ops->request) + if (mcbsp->pdata->ops && mcbsp->pdata->ops->request) mcbsp->pdata->ops->request(mcbsp->id - 1); /* @@ -322,42 +322,40 @@ static int omap_mcbsp_request(struct omap_mcbsp *mcbsp) "McBSP", (void *)mcbsp); if (err != 0) { dev_err(mcbsp->dev, "Unable to request IRQ\n"); - goto err_clk_disable; } } else { err = request_irq(mcbsp->tx_irq, omap_mcbsp_tx_irq_handler, 0, "McBSP TX", (void *)mcbsp); if (err != 0) { dev_err(mcbsp->dev, "Unable to request TX IRQ\n"); - goto err_clk_disable; - } - - err = request_irq(mcbsp->rx_irq, omap_mcbsp_rx_irq_handler, 0, - "McBSP RX", (void *)mcbsp); - if (err != 0) { - dev_err(mcbsp->dev, "Unable to request RX IRQ\n"); - goto err_free_irq; + } else { + err = request_irq(mcbsp->rx_irq, omap_mcbsp_rx_irq_handler, 0, + "McBSP RX", (void *)mcbsp); + if (err != 0) { + dev_err(mcbsp->dev, "Unable to request RX IRQ\n"); + free_irq(mcbsp->tx_irq, (void *)mcbsp); + } } } - return 0; -err_free_irq: - free_irq(mcbsp->tx_irq, (void *)mcbsp); -err_clk_disable: - if(mcbsp->pdata->ops && mcbsp->pdata->ops->free) - mcbsp->pdata->ops->free(mcbsp->id - 1); + if (err != 0) { + if (mcbsp->pdata->ops && mcbsp->pdata->ops->free) + mcbsp->pdata->ops->free(mcbsp->id - 1); - /* Disable wakeup behavior */ - if (mcbsp->pdata->has_wakeup) - MCBSP_WRITE(mcbsp, WAKEUPEN, 0); + /* Disable wakeup behavior */ + if (mcbsp->pdata->has_wakeup) + MCBSP_WRITE(mcbsp, WAKEUPEN, 0); - scoped_guard(spinlock, &mcbsp->lock) { - reg_cache = mcbsp->reg_cache; - mcbsp->free = true; - mcbsp->reg_cache = NULL; + scoped_guard(spinlock, &mcbsp->lock) { + reg_cache = mcbsp->reg_cache; + mcbsp->free = true; + mcbsp->reg_cache = NULL; + } + + return err; } - return err; + return 0; } static void omap_mcbsp_free(struct omap_mcbsp *mcbsp) From c996a4418dd4ee45cd086586c04a1103e8160308 Mon Sep 17 00:00:00 2001 From: Ingyu Jang Date: Fri, 15 May 2026 03:52:15 +0900 Subject: [PATCH 895/972] ASoC: ti: omap-dmic: Fix IS_ERR() vs NULL check bug in omap_dmic_select_fclk() clk_get_parent() returns NULL when the clock has no parent (or when the input clk is NULL); it never returns an ERR_PTR. The current IS_ERR(mux) check therefore never triggers - a NULL return falls through silently to clk_set_parent(NULL, parent_clk), which simply fails with -EINVAL. Use a NULL check so the dedicated error path runs and the prior clk_get() reference is released via clk_put(). Signed-off-by: Ingyu Jang Acked-by: Sen Wang Link: https://patch.msgid.link/20260514185215.3753998-1-ingyujang25@korea.ac.kr Signed-off-by: Mark Brown --- sound/soc/ti/omap-dmic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/ti/omap-dmic.c b/sound/soc/ti/omap-dmic.c index fb92bb88eb5c2b..f6c393c9489d62 100644 --- a/sound/soc/ti/omap-dmic.c +++ b/sound/soc/ti/omap-dmic.c @@ -328,7 +328,7 @@ static int omap_dmic_select_fclk(struct omap_dmic *dmic, int clk_id, } mux = clk_get_parent(dmic->fclk); - if (IS_ERR(mux)) { + if (!mux) { dev_err(dmic->dev, "can't get fck mux parent\n"); clk_put(parent_clk); return -ENODEV; From 9c0f5bbff146f09f449dd528addeabfb68aef997 Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Thu, 14 May 2026 16:18:54 +0100 Subject: [PATCH 896/972] ASoC: cs35l56: Log SoundWire status updates only on changes The SoundWire slave update_status() callback can be invoked when the status has not changed. To prevent large amounts of log noise with debug enabled, log them only when the status changes. This also helps with understanding them, because they now log an actual change in state. Signed-off-by: Simon Trimmer Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20260514151854.695145-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l56-sdw.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/cs35l56-sdw.c b/sound/soc/codecs/cs35l56-sdw.c index 9dc47fec1ea048..d344217de7aa62 100644 --- a/sound/soc/codecs/cs35l56-sdw.c +++ b/sound/soc/codecs/cs35l56-sdw.c @@ -385,18 +385,19 @@ static int cs35l56_sdw_update_status(struct sdw_slave *peripheral, switch (status) { case SDW_SLAVE_ATTACHED: - dev_dbg(cs35l56->base.dev, "%s: ATTACHED\n", __func__); cs35l56->sdw_in_clock_stop_1 = false; if (cs35l56->sdw_attached) break; + dev_dbg(cs35l56->base.dev, "%s: ATTACHED\n", __func__); if (!cs35l56->base.init_done || cs35l56->soft_resetting) cs35l56_sdw_init(peripheral); cs35l56->sdw_attached = true; break; case SDW_SLAVE_UNATTACHED: - dev_dbg(cs35l56->base.dev, "%s: UNATTACHED\n", __func__); + if (cs35l56->sdw_attached) + dev_dbg(cs35l56->base.dev, "%s: UNATTACHED\n", __func__); cs35l56->sdw_attached = false; break; default: From 0d435a7ebcd4e97e47673c1ab6fb27f973a053ec Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Wed, 13 May 2026 21:08:52 +0200 Subject: [PATCH 897/972] ASoC: codecs: fs210x: fix possible buffer overflow In fs210x_effect_scene_info(), a string was copied like this: strscpy(DST, SRC, strlen(SRC) + 1); A buffer overflow would happen if strlen(SRC) >= sizeof(DST). Actually, strscpy() must be used this way: strscpy(DST, SRC, sizeof(DST)); strscpy(DST, SRC); // defaults to sizeof(DST) Fixes: 756117701779 ("ASoC: codecs: Add FourSemi FS2104/5S audio amplifier driver") Signed-off-by: Alexander A. Klimov Link: https://patch.msgid.link/20260513190852.196723-2-grandmaster@al2klimov.de Signed-off-by: Mark Brown --- sound/soc/codecs/fs210x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/fs210x.c b/sound/soc/codecs/fs210x.c index e6195b71adadcc..eda716f817b58c 100644 --- a/sound/soc/codecs/fs210x.c +++ b/sound/soc/codecs/fs210x.c @@ -968,7 +968,7 @@ static int fs210x_effect_scene_info(struct snd_kcontrol *kcontrol, if (scene->name) name = scene->name; - strscpy(uinfo->value.enumerated.name, name, strlen(name) + 1); + strscpy(uinfo->value.enumerated.name, name); return 0; } From 5d9cb740cd38fa17247b4a62af135901fa5c4d9c Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 14 May 2026 15:52:03 +0800 Subject: [PATCH 898/972] ASoC: es9356-sdca: Add ES9356 SDCA driver This is the codec driver for es9356-sdca. Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20260514075206.3483-4-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- sound/soc/codecs/Kconfig | 7 + sound/soc/codecs/Makefile | 2 + sound/soc/codecs/es9356.c | 1151 +++++++++++++++++++++++++++++++++++++ sound/soc/codecs/es9356.h | 208 +++++++ 4 files changed, 1368 insertions(+) create mode 100644 sound/soc/codecs/es9356.c create mode 100644 sound/soc/codecs/es9356.h diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig index 069ec05e4a6152..f2e62c2b75b9c6 100644 --- a/sound/soc/codecs/Kconfig +++ b/sound/soc/codecs/Kconfig @@ -122,6 +122,7 @@ config SND_SOC_ALL_CODECS imply SND_SOC_ES8328_I2C imply SND_SOC_ES8375 imply SND_SOC_ES8389 + imply SND_SOC_ES9356 imply SND_SOC_ES7134 imply SND_SOC_ES7241 imply SND_SOC_FRAMER @@ -1302,6 +1303,12 @@ config SND_SOC_ES8389 tristate "Everest Semi ES8389 CODEC" depends on I2C +config SND_SOC_ES9356 + tristate "Everest Semi ES9356 CODEC SDW" + depends on SOUNDWIRE + select REGMAP_SOUNDWIRE + select REGMAP_SOUNDWIRE_MBQ + config SND_SOC_FRAMER tristate "Framer codec" depends on GENERIC_FRAMER diff --git a/sound/soc/codecs/Makefile b/sound/soc/codecs/Makefile index 2c2d0553f35fbb..73315d017c57b4 100644 --- a/sound/soc/codecs/Makefile +++ b/sound/soc/codecs/Makefile @@ -138,6 +138,7 @@ snd-soc-es8328-i2c-y := es8328-i2c.o snd-soc-es8328-spi-y := es8328-spi.o snd-soc-es8375-y := es8375.o snd-soc-es8389-y := es8389.o +snd-soc-es9356-y := es9356.o snd-soc-framer-y := framer-codec.o snd-soc-fs-amp-lib-y := fs-amp-lib.o snd-soc-fs210x-y := fs210x.o @@ -577,6 +578,7 @@ obj-$(CONFIG_SND_SOC_ES8328_I2C)+= snd-soc-es8328-i2c.o obj-$(CONFIG_SND_SOC_ES8328_SPI)+= snd-soc-es8328-spi.o obj-$(CONFIG_SND_SOC_ES8375) += snd-soc-es8375.o obj-$(CONFIG_SND_SOC_ES8389) += snd-soc-es8389.o +obj-$(CONFIG_SND_SOC_ES9356) += snd-soc-es9356.o obj-$(CONFIG_SND_SOC_FRAMER) += snd-soc-framer.o obj-$(CONFIG_SND_SOC_FS_AMP_LIB)+= snd-soc-fs-amp-lib.o obj-$(CONFIG_SND_SOC_FS210X) += snd-soc-fs210x.o diff --git a/sound/soc/codecs/es9356.c b/sound/soc/codecs/es9356.c new file mode 100644 index 00000000000000..78fddd9d01711d --- /dev/null +++ b/sound/soc/codecs/es9356.c @@ -0,0 +1,1151 @@ +// SPDX-License-Identifier: GPL-2.0-only +// +// es9356.c -- SoundWire codec driver +// +// Copyright(c) 2025 Everest Semiconductor Co., Ltd +// +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "es9356.h" + +struct es9356_sdw_priv { + struct sdw_slave *slave; + struct device *dev; + struct regmap *regmap; + struct snd_soc_component *component; + struct snd_soc_jack *hs_jack; + + /* lock for irq*/ + struct mutex disable_irq_lock; + + /* lock for pde*/ + struct mutex pde_lock; + + bool hw_init; + bool first_hw_init; + int jack_type; + bool disable_irq; + + struct delayed_work interrupt_handle_work; + struct delayed_work button_detect_work; + unsigned int sdca_status; +}; + +static int es9356_sdw_component_probe(struct snd_soc_component *component) +{ + struct es9356_sdw_priv *es9356 = snd_soc_component_get_drvdata(component); + + es9356->component = component; + + return 0; +} + +static const DECLARE_TLV_DB_SCALE(out_vol_tlv, -9600, 12, 0); +static const DECLARE_TLV_DB_SCALE(amic_gain_tlv, 0, 3, 0); +static const DECLARE_TLV_DB_SCALE(dmic_gain_tlv, 0, 6, 0); + +static const struct snd_kcontrol_new es9356_sdca_controls[] = { + SDCA_SINGLE_Q78_TLV("FU41 Left Playback Volume", + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_FU41, ES9356_SDCA_CTL_FU_VOLUME, CH_L), + ES9356_VOLUME_MIN, ES9356_VOLUME_MAX, ES9356_VOLUME_STEP, out_vol_tlv), + SDCA_SINGLE_Q78_TLV("FU41 Right Playback Volume", + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_FU41, ES9356_SDCA_CTL_FU_VOLUME, CH_R), + ES9356_VOLUME_MIN, ES9356_VOLUME_MAX, ES9356_VOLUME_STEP, out_vol_tlv), + SDCA_SINGLE_Q78_TLV("FU36 Left Capture Volume", + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_FU36, ES9356_SDCA_CTL_FU_VOLUME, CH_L), + ES9356_VOLUME_MIN, ES9356_VOLUME_MAX, ES9356_VOLUME_STEP, out_vol_tlv), + SDCA_SINGLE_Q78_TLV("FU36 Right Capture Volume", + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_FU36, ES9356_SDCA_CTL_FU_VOLUME, CH_R), + ES9356_VOLUME_MIN, ES9356_VOLUME_MAX, ES9356_VOLUME_STEP, out_vol_tlv), + SDCA_SINGLE_Q78_TLV("FU33 Capture Volume", + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_FU33, ES9356_SDCA_CTL_FU_CH_GAIN, 0), + ES9356_GAIN_MIN, ES9356_AMIC_GAIN_MAX, ES9356_AMIC_GAIN_STEP, amic_gain_tlv), + SDCA_SINGLE_Q78_TLV("FU21 Left Playback Volume", + SDW_SDCA_CTL(FUNC_NUM_AMP, ES9356_SDCA_ENT_FU21, ES9356_SDCA_CTL_FU_VOLUME, CH_L), + ES9356_VOLUME_MIN, ES9356_VOLUME_MAX, ES9356_VOLUME_STEP, out_vol_tlv), + SDCA_SINGLE_Q78_TLV("FU21 Right Playback Volume", + SDW_SDCA_CTL(FUNC_NUM_AMP, ES9356_SDCA_ENT_FU21, ES9356_SDCA_CTL_FU_VOLUME, CH_R), + ES9356_VOLUME_MIN, ES9356_VOLUME_MAX, ES9356_VOLUME_STEP, out_vol_tlv), + SDCA_SINGLE_Q78_TLV("FU113 Left Capture Volume", + SDW_SDCA_CTL(FUNC_NUM_MIC, ES9356_SDCA_ENT_FU113, ES9356_SDCA_CTL_FU_VOLUME, CH_L), + ES9356_VOLUME_MIN, ES9356_VOLUME_MAX, ES9356_VOLUME_STEP, out_vol_tlv), + SDCA_SINGLE_Q78_TLV("FU113 Right Capture Volume", + SDW_SDCA_CTL(FUNC_NUM_MIC, ES9356_SDCA_ENT_FU113, ES9356_SDCA_CTL_FU_VOLUME, CH_R), + ES9356_VOLUME_MIN, ES9356_VOLUME_MAX, ES9356_VOLUME_STEP, out_vol_tlv), + SDCA_SINGLE_Q78_TLV("FU11 Capture Volume", + SDW_SDCA_CTL(FUNC_NUM_MIC, ES9356_SDCA_ENT_FU11, ES9356_SDCA_CTL_FU_CH_GAIN, 0), + ES9356_GAIN_MIN, ES9356_DMIC_GAIN_MAX, ES9356_DMIC_GAIN_STEP, dmic_gain_tlv), +}; + +static const char *const es9356_left_mux_txt[] = { + "Left", + "Right", +}; + +static const char *const es9356_right_mux_txt[] = { + "Right", + "Left", +}; + +static const struct soc_enum es9356_left_mux_enum = + SOC_ENUM_SINGLE(ES9356_DAC_SWAP, 1, + ARRAY_SIZE(es9356_left_mux_txt), es9356_left_mux_txt); +static const struct soc_enum es9356_right_mux_enum = + SOC_ENUM_SINGLE(ES9356_DAC_SWAP, 0, + ARRAY_SIZE(es9356_right_mux_txt), es9356_right_mux_txt); + +static const struct snd_kcontrol_new es9356_left_mux_controls = + SOC_DAPM_ENUM("Channel MUX", es9356_left_mux_enum); +static const struct snd_kcontrol_new es9356_right_mux_controls = + SOC_DAPM_ENUM("Channel MUX", es9356_right_mux_enum); + +static const struct snd_soc_dapm_widget es9356_dapm_widgets[] = { + SND_SOC_DAPM_OUTPUT("HP"), + SND_SOC_DAPM_OUTPUT("SPK"), + SND_SOC_DAPM_INPUT("MIC1"), + SND_SOC_DAPM_INPUT("PDM_DIN"), + + SND_SOC_DAPM_SUPPLY("DMIC Clock", ES9356_DMIC_GPIO, 1, 1, NULL, 0), + + SND_SOC_DAPM_AIF_IN("DP4RX", "DP4 Playback", 0, SND_SOC_NOPM, 0, 0), + SND_SOC_DAPM_AIF_IN("DP3RX", "DP3 Playback", 0, SND_SOC_NOPM, 0, 0), + SND_SOC_DAPM_AIF_OUT("DP1TX", "DP1 Capture", 0, SND_SOC_NOPM, 0, 0), + SND_SOC_DAPM_AIF_OUT("DP2TX", "DP2 Capture", 0, SND_SOC_NOPM, 0, 0), + + SND_SOC_DAPM_PGA("IF DP3RXL", SND_SOC_NOPM, 0, 0, NULL, 0), + SND_SOC_DAPM_PGA("IF DP3RXR", SND_SOC_NOPM, 0, 0, NULL, 0), + + SND_SOC_DAPM_MUX("Left Channel MUX", SND_SOC_NOPM, 0, 0, &es9356_left_mux_controls), + SND_SOC_DAPM_MUX("Right Channel MUX", SND_SOC_NOPM, 0, 0, &es9356_right_mux_controls), + + SND_SOC_DAPM_DAC("FU 21 Left", NULL, + SDW_SDCA_CTL(FUNC_NUM_AMP, ES9356_SDCA_ENT_FU21, ES9356_SDCA_CTL_FU_MUTE, CH_L), 0, 1), + SND_SOC_DAPM_DAC("FU 21 Right", NULL, + SDW_SDCA_CTL(FUNC_NUM_AMP, ES9356_SDCA_ENT_FU21, ES9356_SDCA_CTL_FU_MUTE, CH_R), 0, 1), + SND_SOC_DAPM_DAC("FU 41 Left", NULL, + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_FU41, ES9356_SDCA_CTL_FU_MUTE, CH_L), 0, 1), + SND_SOC_DAPM_DAC("FU 41 Right", NULL, + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_FU41, ES9356_SDCA_CTL_FU_MUTE, CH_R), 0, 1), + SND_SOC_DAPM_DAC("FU 113 Left", NULL, + SDW_SDCA_CTL(FUNC_NUM_MIC, ES9356_SDCA_ENT_FU113, ES9356_SDCA_CTL_FU_MUTE, CH_L), 0, 1), + SND_SOC_DAPM_DAC("FU 113 Right", NULL, + SDW_SDCA_CTL(FUNC_NUM_MIC, ES9356_SDCA_ENT_FU113, ES9356_SDCA_CTL_FU_MUTE, CH_R), 0, 1), + SND_SOC_DAPM_DAC("FU 36 Left", NULL, + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_FU36, ES9356_SDCA_CTL_FU_MUTE, CH_L), 0, 1), + SND_SOC_DAPM_DAC("FU 36 Right", NULL, + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_FU36, ES9356_SDCA_CTL_FU_MUTE, CH_R), 0, 1), +}; + +static const struct snd_soc_dapm_route es9356_audio_map[] = { + {"FU 36 Left", NULL, "MIC1"}, + {"FU 36 Right", NULL, "MIC1"}, + {"DP2TX", NULL, "FU 36 Left"}, + {"DP2TX", NULL, "FU 36 Right"}, + + {"PDM_DIN", NULL, "DMIC Clock"}, + {"FU 113 Left", NULL, "PDM_DIN"}, + {"FU 113 Right", NULL, "PDM_DIN"}, + {"DP1TX", NULL, "FU 113 Left"}, + {"DP1TX", NULL, "FU 113 Right"}, + + {"FU 41 Left", NULL, "DP4RX"}, + {"FU 41 Right", NULL, "DP4RX"}, + + {"IF DP3RXL", NULL, "DP3RX"}, + {"IF DP3RXR", NULL, "DP3RX"}, + + {"Left Channel MUX", "Left", "IF DP3RXL"}, + {"Left Channel MUX", "Right", "IF DP3RXR"}, + {"Right Channel MUX", "Left", "IF DP3RXL"}, + {"Right Channel MUX", "Right", "IF DP3RXR"}, + + {"FU 21 Left", NULL, "Left Channel MUX"}, + {"FU 21 Right", NULL, "Right Channel MUX"}, + + {"SPK", NULL, "FU 21 Left"}, + {"SPK", NULL, "FU 21 Right"}, + + {"HP", NULL, "FU 41 Left"}, + {"HP", NULL, "FU 41 Right"}, +}; + +static int es9356_set_jack_detect(struct snd_soc_component *component, + struct snd_soc_jack *hs_jack, void *data) +{ + struct es9356_sdw_priv *es9356 = snd_soc_component_get_drvdata(component); + int ret; + + es9356->hs_jack = hs_jack; + + /* we can only resume if the device was initialized at least once */ + if (!es9356->first_hw_init) + return 0; + + ret = pm_runtime_resume_and_get(component->dev); + if (ret < 0) { + if (ret != -EACCES) { + dev_err(component->dev, "%s: failed to resume %d\n", __func__, ret); + return ret; + } + /* pm_runtime not enabled yet */ + dev_info(component->dev, "%s: skipping jack init for now\n", __func__); + return 0; + } + + if (es9356->hs_jack) + sdw_write_no_pm(es9356->slave, SDW_SCP_SDCA_INTMASK1, + (SDW_SCP_SDCA_INTMASK_SDCA_7 | SDW_SCP_SDCA_INTMASK_SDCA_5 | SDW_SCP_SDCA_INTMASK_SDCA_1)); + + pm_runtime_mark_last_busy(component->dev); + pm_runtime_put_autosuspend(component->dev); + + return 0; +} +static const struct snd_soc_component_driver snd_soc_es9356_sdw_component = { + .probe = es9356_sdw_component_probe, + .controls = es9356_sdca_controls, + .num_controls = ARRAY_SIZE(es9356_sdca_controls), + .dapm_widgets = es9356_dapm_widgets, + .num_dapm_widgets = ARRAY_SIZE(es9356_dapm_widgets), + .dapm_routes = es9356_audio_map, + .num_dapm_routes = ARRAY_SIZE(es9356_audio_map), + .set_jack = es9356_set_jack_detect, + .endianness = 1, +}; + +static int es9356_sdw_set_sdw_stream(struct snd_soc_dai *dai, void *sdw_stream, + int direction) +{ + snd_soc_dai_dma_data_set(dai, direction, sdw_stream); + + return 0; +} + +static void es9356_sdw_shutdown(struct snd_pcm_substream *substream, + struct snd_soc_dai *dai) +{ + snd_soc_dai_set_dma_data(dai, substream, NULL); +} + +static int es9356_sdca_button(unsigned int *buffer) +{ + int cur_button = -1; + + if (*(buffer + 1) | *(buffer + 2)) + return -EINVAL; + switch (*buffer) { + case 0x00: + cur_button = 0; + break; + case 0x20: + cur_button = SND_JACK_BTN_4; + break; + case 0x10: + cur_button = SND_JACK_BTN_2; + break; + case 0x08: + cur_button = SND_JACK_BTN_1; + break; + case 0x02: + cur_button = SND_JACK_BTN_3; + break; + case 0x01: + cur_button = SND_JACK_BTN_0; + break; + default: + break; + } + + return cur_button; +} + +static int es9356_sdca_button_detect(struct es9356_sdw_priv *es9356) +{ + unsigned int btn_type = 0, offset, idx, val, owner; + unsigned int button[3]; + int ret; + + ret = regmap_read(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_HID, ES9356_SDCA_ENT_HID01, ES9356_SDCA_CTL_HIDTX_CURRENT_OWNER, 0), &owner); + if (ret < 0 || owner == 0x01) + return 0; + + ret = regmap_read(es9356->regmap, ES9356_BUF_ADDR_HID, &offset); + if (ret < 0) + goto button_det_end; + + for (idx = 0; idx < ARRAY_SIZE(button); idx++) { + ret = regmap_read(es9356->regmap, ES9356_BUF_ADDR_HID + offset + idx, &val); + if (ret < 0) + goto button_det_end; + button[idx] = val; + } + + btn_type = es9356_sdca_button(&button[0]); + +button_det_end: + if (owner == 0x00) + regmap_write(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_HID, ES9356_SDCA_ENT_HID01, ES9356_SDCA_CTL_HIDTX_CURRENT_OWNER, 0), 0x01); + + return btn_type; +} + +static int es9356_sdca_headset_detect(struct es9356_sdw_priv *es9356) +{ + unsigned int reg; + int ret; + + ret = regmap_read(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_GE35, ES9356_SDCA_CTL_DETECTED_MODE, 0), ®); + + if (ret < 0) + goto io_error; + + switch (reg) { + case 0x00: + es9356->jack_type = 0; + break; + case 0x03: + es9356->jack_type = SND_JACK_HEADPHONE; + break; + case 0x04: + es9356->jack_type = SND_JACK_HEADSET; + break; + default: + es9356->jack_type = 0; + return -1; + } + + if (reg) { + regmap_write(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_GE35, ES9356_SDCA_CTL_SELECTED_MODE, 0), reg); + regmap_write(es9356->regmap, ES9356_HP_DETECTTIME, 0x75); + } else { + regmap_write(es9356->regmap, ES9356_HP_DETECTTIME, 0xa4); + } + + return 0; + +io_error: + pr_err_ratelimited("IO error in %s, ret %d\n", __func__, ret); + return ret; +} + +static void es9356_interrupt_handler(struct work_struct *work) +{ + struct es9356_sdw_priv *es9356 = + container_of(work, struct es9356_sdw_priv, interrupt_handle_work.work); + int ret, btn_type = 0; + + if (!es9356->hs_jack) + return; + + if (!es9356->component->card || !es9356->component->card->instantiated) + return; + + /* Handling different types of interrupts based on the mask bit */ + if (es9356->sdca_status & SDW_SCP_SDCA_INT_SDCA_7) { + btn_type = es9356_sdca_button_detect(es9356); + if (btn_type < 0) + return; + } else { + ret = es9356_sdca_headset_detect(es9356); + if (ret < 0) + return; + } + + if (es9356->jack_type != SND_JACK_HEADSET) + btn_type = 0; + + snd_soc_jack_report(es9356->hs_jack, es9356->jack_type | btn_type, + SND_JACK_HEADSET | + SND_JACK_BTN_0 | SND_JACK_BTN_1 | + SND_JACK_BTN_2 | SND_JACK_BTN_3 | + SND_JACK_BTN_4); + + if (btn_type) { + snd_soc_jack_report(es9356->hs_jack, es9356->jack_type, + SND_JACK_HEADSET | + SND_JACK_BTN_0 | SND_JACK_BTN_1 | + SND_JACK_BTN_2 | SND_JACK_BTN_3 | + SND_JACK_BTN_4); + mod_delayed_work(system_power_efficient_wq, + &es9356->button_detect_work, msecs_to_jiffies(280)); + } +} + +static void es9356_button_detect_handler(struct work_struct *work) +{ + struct es9356_sdw_priv *es9356 = + container_of(work, struct es9356_sdw_priv, button_detect_work.work); + int ret, idx, btn_type = 0; + unsigned int reg, offset; + unsigned int button[3]; + + /* Check headset */ + ret = regmap_read(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_GE35, ES9356_SDCA_CTL_DETECTED_MODE, 0), ®); + + if (ret < 0) + goto io_error; + + if (reg == 0x04) { + ret = regmap_read(es9356->regmap, ES9356_BUF_ADDR_HID, &offset); + if (ret < 0) + goto io_error; + for (idx = 0; idx < ARRAY_SIZE(button); idx++) { + ret = regmap_read(es9356->regmap, ES9356_BUF_ADDR_HID + offset + idx, ®); + if (ret < 0) + goto io_error; + button[idx] = reg; + } + btn_type = es9356_sdca_button(&button[0]); + if (btn_type < 0) + return; + } + + snd_soc_jack_report(es9356->hs_jack, es9356->jack_type | btn_type, + SND_JACK_HEADSET | + SND_JACK_BTN_0 | SND_JACK_BTN_1 | + SND_JACK_BTN_2 | SND_JACK_BTN_3 | + SND_JACK_BTN_4); + + if (btn_type) { + snd_soc_jack_report(es9356->hs_jack, es9356->jack_type, + SND_JACK_HEADSET | + SND_JACK_BTN_0 | SND_JACK_BTN_1 | + SND_JACK_BTN_2 | SND_JACK_BTN_3 | + SND_JACK_BTN_4); + mod_delayed_work(system_power_efficient_wq, + &es9356->button_detect_work, msecs_to_jiffies(280)); + } + + return; +io_error: + pr_err_ratelimited("IO error in %s, ret %d\n", __func__, ret); +} + +static int es9356_pde_transition_delay(struct es9356_sdw_priv *es9356, unsigned char func, + unsigned char entity, unsigned char ps) +{ + unsigned int retries = 10, val; + + /* waiting for Actual PDE becomes to PS0/PS3 */ + while (retries) { + regmap_read(es9356->regmap, + SDW_SDCA_CTL(func, entity, ES9356_SDCA_CTL_ACTUAL_POWER_STATE, 0), &val); + if (val == ps) + return 1; + + usleep_range(1000, 1500); + retries--; + } + if (!retries) { + dev_dbg(&es9356->slave->dev, "%s PDE is NOT %s", __func__, ps?"PS3":"PS0"); + } + return 0; +} + +static int es9356_power_state(struct snd_soc_dai *dai, unsigned char ps, unsigned int *rate) +{ + struct snd_soc_component *component = dai->component; + struct es9356_sdw_priv *es9356 = snd_soc_component_get_drvdata(component); + unsigned char ps0 = 0x0, ps3 = 0x3; + unsigned char func, cs_entity, pde_entity; + int ret; + + switch (dai->id) { + case ES9356_DMIC: + func = FUNC_NUM_MIC; + cs_entity = ES9356_SDCA_ENT_CS113; + pde_entity = ES9356_SDCA_ENT_PDE11; + break; + case ES9356_AMP: + func = FUNC_NUM_AMP; + cs_entity = ES9356_SDCA_ENT_CS21; + pde_entity = ES9356_SDCA_ENT_PDE23; + break; + case ES9356_JACK_IN: + func = FUNC_NUM_UAJ; + cs_entity = ES9356_SDCA_ENT_CS36; + pde_entity = ES9356_SDCA_ENT_PDE34; + break; + case ES9356_JACK_OUT: + func = FUNC_NUM_UAJ; + cs_entity = ES9356_SDCA_ENT_CS41; + pde_entity = ES9356_SDCA_ENT_PDE47; + break; + default: + return -EINVAL; + } + + /* power state changes are not independent across functions */ + mutex_lock(&es9356->pde_lock); + ret = es9356_pde_transition_delay(es9356, func, pde_entity, ps?ps0:ps3); + if (ret) { + regmap_write(es9356->regmap, + SDW_SDCA_CTL(func, pde_entity, ES9356_SDCA_CTL_REQ_POWER_STATE, 0), ps?ps3:ps0); + es9356_pde_transition_delay(es9356, func, pde_entity, ps?ps3:ps0); + } else + dev_dbg(component->dev, "%s PDE is already %d\n", __func__, ps?ps0:ps3); + + mutex_unlock(&es9356->pde_lock); + + if (rate) + regmap_write(es9356->regmap, + SDW_SDCA_CTL(func, cs_entity, ES9356_SDCA_CTL_SAMPLE_FREQ_INDEX, 0), *rate); + + return 0; +} + +static int es9356_sdw_pcm_hw_params(struct snd_pcm_substream *substream, + struct snd_pcm_hw_params *params, + struct snd_soc_dai *dai) +{ + struct snd_soc_component *component = dai->component; + struct es9356_sdw_priv *es9356 = snd_soc_component_get_drvdata(component); + struct sdw_stream_config stream_config = {0}; + struct sdw_port_config port_config = {0}; + struct sdw_stream_runtime *sdw_stream = snd_soc_dai_get_dma_data(dai, substream); + unsigned char ps0 = 0x0; + unsigned int rate; + int ret; + + if (!sdw_stream) + return -EINVAL; + + if (!es9356->slave) + return -EINVAL; + + /* SoundWire specific configuration */ + snd_sdw_params_to_config(substream, params, &stream_config, &port_config); + + port_config.num = dai->id; + + ret = sdw_stream_add_slave(es9356->slave, &stream_config, + &port_config, 1, sdw_stream); + if (ret) { + dev_err(dai->dev, "Unable to configure port\n"); + return -EINVAL; + } + + switch (params_rate(params)) { + case 16000: + rate = ES9356_SDCA_RATE_16000HZ; + break; + case 44100: + rate = ES9356_SDCA_RATE_44100HZ; + break; + case 48000: + rate = ES9356_SDCA_RATE_48000HZ; + break; + case 96000: + rate = ES9356_SDCA_RATE_96000HZ; + break; + default: + dev_err(component->dev, "%s: Rate %d is not supported\n", + __func__, params_rate(params)); + return -EINVAL; + } + + ret = es9356_power_state(dai, ps0, &rate); + if (ret) { + dev_err(component->dev, "%s: Invalid dai id: %d\n", + __func__, dai->id); + return -EINVAL; + } + + return 0; +} + +static int es9356_sdw_pcm_hw_free(struct snd_pcm_substream *substream, + struct snd_soc_dai *dai) +{ + struct snd_soc_component *component = dai->component; + struct es9356_sdw_priv *es9356 = snd_soc_component_get_drvdata(component); + struct sdw_stream_runtime *sdw_stream = snd_soc_dai_get_dma_data(dai, substream); + unsigned char ps3 = 0x3; + int ret; + + if (!es9356->slave) + return -EINVAL; + + sdw_stream_remove_slave(es9356->slave, sdw_stream); + + ret = es9356_power_state(dai, ps3, NULL); + if (ret) { + dev_err(component->dev, "%s: Invalid dai id: %d\n", + __func__, dai->id); + return -EINVAL; + } + + return 0; +} + +static const struct snd_soc_dai_ops es9356_sdw_ops = { + .hw_params = es9356_sdw_pcm_hw_params, + .hw_free = es9356_sdw_pcm_hw_free, + .set_stream = es9356_sdw_set_sdw_stream, + .shutdown = es9356_sdw_shutdown, +}; + +static struct snd_soc_dai_driver es9356_sdw_dai[] = { + { + .name = "es9356-sdp-aif4", + .id = ES9356_DMIC, + .capture = { + .stream_name = "DP1 Capture", + .channels_min = 1, + .channels_max = 2, + }, + .ops = &es9356_sdw_ops, + }, + { + .name = "es9356-sdp-aif2", + .id = ES9356_JACK_IN, + .capture = { + .stream_name = "DP2 Capture", + .channels_min = 1, + .channels_max = 2, + }, + .ops = &es9356_sdw_ops, + }, + { + .name = "es9356-sdp-aif3", + .id = ES9356_AMP, + .playback = { + .stream_name = "DP3 Playback", + .channels_min = 1, + .channels_max = 2, + }, + .ops = &es9356_sdw_ops, + }, + { + .name = "es9356-sdp-aif1", + .id = ES9356_JACK_OUT, + .playback = { + .stream_name = "DP4 Playback", + .channels_min = 1, + .channels_max = 2, + }, + .ops = &es9356_sdw_ops, + }, +}; + +static int es9356_sdca_mbq_size(struct device *dev, unsigned int reg) +{ + switch (reg) { + case SDW_SDCA_CTL(FUNC_NUM_MIC, ES9356_SDCA_ENT_FU113, ES9356_SDCA_CTL_FU_VOLUME, CH_L): + case SDW_SDCA_CTL(FUNC_NUM_MIC, ES9356_SDCA_ENT_FU113, ES9356_SDCA_CTL_FU_VOLUME, CH_R): + case SDW_SDCA_CTL(FUNC_NUM_AMP, ES9356_SDCA_ENT_FU21, ES9356_SDCA_CTL_FU_VOLUME, CH_L): + case SDW_SDCA_CTL(FUNC_NUM_AMP, ES9356_SDCA_ENT_FU21, ES9356_SDCA_CTL_FU_VOLUME, CH_R): + case SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_FU41, ES9356_SDCA_CTL_FU_VOLUME, CH_L): + case SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_FU41, ES9356_SDCA_CTL_FU_VOLUME, CH_R): + case SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_FU36, ES9356_SDCA_CTL_FU_VOLUME, CH_L): + case SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_FU36, ES9356_SDCA_CTL_FU_VOLUME, CH_R): + case SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_FU33, ES9356_SDCA_CTL_FU_CH_GAIN, 0): + case SDW_SDCA_CTL(FUNC_NUM_MIC, ES9356_SDCA_ENT_FU11, ES9356_SDCA_CTL_FU_CH_GAIN, 0): + return 2; + default: + return 1; + } +} + +static struct regmap_sdw_mbq_cfg es9356_mbq_config = { + .mbq_size = es9356_sdca_mbq_size, +}; + +static bool es9356_sdca_volatile_register(struct device *dev, unsigned int reg) +{ + switch (reg) { + case ES9356_BUF_ADDR_HID: + case ES9356_HID_BYTE2: + case ES9356_HID_BYTE3: + case ES9356_HID_BYTE4: + case SDW_SDCA_CTL(FUNC_NUM_HID, ES9356_SDCA_ENT_HID01, ES9356_SDCA_CTL_HIDTX_CURRENT_OWNER, 0): + case SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_GE35, ES9356_SDCA_CTL_DETECTED_MODE, 0): + case SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_GE35, ES9356_SDCA_CTL_SELECTED_MODE, 0): + case SDW_SDCA_CTL(FUNC_NUM_AMP, ES9356_SDCA_ENT_PDE23, ES9356_SDCA_CTL_REQ_POWER_STATE, 0): + case SDW_SDCA_CTL(FUNC_NUM_AMP, ES9356_SDCA_ENT_PDE23, ES9356_SDCA_CTL_ACTUAL_POWER_STATE, 0): + case SDW_SDCA_CTL(FUNC_NUM_MIC, ES9356_SDCA_ENT_PDE11, ES9356_SDCA_CTL_REQ_POWER_STATE, 0): + case SDW_SDCA_CTL(FUNC_NUM_MIC, ES9356_SDCA_ENT_PDE11, ES9356_SDCA_CTL_ACTUAL_POWER_STATE, 0): + case SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_PDE47, ES9356_SDCA_CTL_REQ_POWER_STATE, 0): + case SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_PDE47, ES9356_SDCA_CTL_ACTUAL_POWER_STATE, 0): + case SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_PDE34, ES9356_SDCA_CTL_REQ_POWER_STATE, 0): + case SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_PDE34, ES9356_SDCA_CTL_ACTUAL_POWER_STATE, 0): + case ES9356_FLAGS_HP: + return true; + default: + return false; + } +} + +static const struct regmap_config es9356_sdca_regmap = { + .reg_bits = 32, + .val_bits = 16, + .volatile_reg = es9356_sdca_volatile_register, + .max_register = 0x45ffffff, + .cache_type = REGCACHE_MAPLE, + .use_single_read = true, + .use_single_write = true, +}; + +static void es9356_register_init(struct es9356_sdw_priv *es9356) +{ + regmap_write(es9356->regmap, ES9356_STATE, 0x02); + regmap_write(es9356->regmap, ES9356_ENDPOINT_MODE, 0x24); + regmap_write(es9356->regmap, ES9356_PRE_DIV_CTL, 0x00); + regmap_write(es9356->regmap, ES9356_ADC_OSR, 0x18); + regmap_write(es9356->regmap, ES9356_ADC_OSRGAIN, 0x13); + regmap_write(es9356->regmap, ES9356_DAC_OSR, 0x16); + regmap_write(es9356->regmap, ES9356_CLK_CTL, 0x0f); + regmap_write(es9356->regmap, ES9356_CSM_RESET, 0x01); + regmap_write(es9356->regmap, ES9356_CLK_SEL, 0x30); + + regmap_write(es9356->regmap, ES9356_DETCLK_CTL, 0x51); + regmap_write(es9356->regmap, ES9356_HP_TYPE, 0x10); + regmap_write(es9356->regmap, ES9356_MICBIAS_CTL, 0x10); + regmap_write(es9356->regmap, ES9356_HPDETECT_CTL, 0x07); + regmap_write(es9356->regmap, ES9356_ADC_ANA, 0x30); + regmap_write(es9356->regmap, ES9356_PGA_CTL, 0xa8); + regmap_write(es9356->regmap, ES9356_ADC_INT, 0xaa); + regmap_write(es9356->regmap, ES9356_ADC_LP, 0x19); + regmap_write(es9356->regmap, ES9356_VMID1SEL, 0xbc); + regmap_write(es9356->regmap, ES9356_VMID_TIME, 0x0b); + regmap_write(es9356->regmap, ES9356_STATE_TIME, 0xbb); + regmap_write(es9356->regmap, ES9356_HP_SPK_TIME, 0x77); + regmap_write(es9356->regmap, ES9356_HP_DETECTTIME, 0xa4); + regmap_write(es9356->regmap, ES9356_MICBIAS_SEL, 0x15); + regmap_write(es9356->regmap, ES9356_KEY_PRESS_TIME, 0xff); + regmap_write(es9356->regmap, ES9356_KEY_RELEASE_TIME, 0xff); + regmap_write(es9356->regmap, ES9356_KEY_HOLD_TIME, 0x0f); + regmap_write(es9356->regmap, ES9356_BTSEL_REF, 0x00); + regmap_write(es9356->regmap, ES9356_KEYD_DETECT, 0x18); + regmap_write(es9356->regmap, ES9356_MICBIAS_RES, 0x03); + regmap_write(es9356->regmap, ES9356_BUTTON_CHARGE, 0x00); + regmap_write(es9356->regmap, ES9356_CALIBRATION_TIME, 0x13); + regmap_write(es9356->regmap, ES9356_CALIBRATION_SETTING, 0xf4); + + regmap_write(es9356->regmap, ES9356_SPK_VOLUME, 0x33); + regmap_write(es9356->regmap, ES9356_DAC_VROI, 0x01); + regmap_write(es9356->regmap, ES9356_DAC_LP, 0x00); + regmap_write(es9356->regmap, ES9356_HP_IBIAS, 0x04); + regmap_write(es9356->regmap, ES9356_HP_LP, 0x03); + regmap_write(es9356->regmap, ES9356_SPKLDO_CTL, 0x65); + regmap_write(es9356->regmap, ES9356_SPKBIAS_COMP, 0x09); + regmap_write(es9356->regmap, ES9356_VMID1STL, 0x00); + regmap_write(es9356->regmap, ES9356_VMID2STL, 0x00); + regmap_write(es9356->regmap, ES9356_VSEL, 0xfc); + + regmap_write(es9356->regmap, ES9356_IBIASGEN, 0x10); + regmap_write(es9356->regmap, ES9356_ADC_AMIC_CTL, 0x0d); + regmap_write(es9356->regmap, ES9356_STATE, 0x0e); + regmap_write(es9356->regmap, ES9356_CSM_RESET, 0x00); + regmap_write(es9356->regmap, ES9356_HP_TYPE, 0x08); + + regmap_write(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_MIC, ES9356_SDCA_ENT_FU113, ES9356_SDCA_CTL_FU_VOLUME, CH_L), ES9356_DEFAULT_VOLUME); + regmap_write(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_MIC, ES9356_SDCA_ENT_FU113, ES9356_SDCA_CTL_FU_VOLUME, CH_R), ES9356_DEFAULT_VOLUME); + + regmap_write(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_AMP, ES9356_SDCA_ENT_FU21, ES9356_SDCA_CTL_FU_VOLUME, CH_L), ES9356_DEFAULT_VOLUME); + regmap_write(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_AMP, ES9356_SDCA_ENT_FU21, ES9356_SDCA_CTL_FU_VOLUME, CH_R), ES9356_DEFAULT_VOLUME); + + regmap_write(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_FU41, ES9356_SDCA_CTL_FU_VOLUME, CH_L), ES9356_DEFAULT_VOLUME); + regmap_write(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_FU41, ES9356_SDCA_CTL_FU_VOLUME, CH_R), ES9356_DEFAULT_VOLUME); + + regmap_write(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_FU36, ES9356_SDCA_CTL_FU_VOLUME, CH_L), ES9356_DEFAULT_VOLUME); + regmap_write(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_FU36, ES9356_SDCA_CTL_FU_VOLUME, CH_R), ES9356_DEFAULT_VOLUME); +} + +static int es9356_sdca_io_init(struct device *dev, struct sdw_slave *slave) +{ + struct es9356_sdw_priv *es9356 = dev_get_drvdata(&slave->dev); + + if (es9356->hw_init) + return 0; + + es9356->disable_irq = false; + + regcache_cache_only(es9356->regmap, false); + + if (es9356->first_hw_init) { + regcache_cache_bypass(es9356->regmap, true); + } else { + /* update count of parent 'active' children */ + pm_runtime_set_active(&slave->dev); + + es9356_register_init(es9356); + } + pm_runtime_get_noresume(&slave->dev); + + regmap_write(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_MIC, ES9356_SDCA_ENT_XU12, ES9356_SDCA_CTL_SELECTED_MODE, 0), 0x01); + regmap_write(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_MIC, ES9356_SDCA_ENT0, ES9356_SDCA_CTL_FUNC_STATUS, 0), 0x40); + regmap_write(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_AMP, ES9356_SDCA_ENT_XU22, ES9356_SDCA_CTL_SELECTED_MODE, 0), 0x01); + regmap_write(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_AMP, ES9356_SDCA_ENT0, ES9356_SDCA_CTL_FUNC_STATUS, 0), 0x40); + regmap_write(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_XU42, ES9356_SDCA_CTL_SELECTED_MODE, 0), 0x01); + regmap_write(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT_XU36, ES9356_SDCA_CTL_SELECTED_MODE, 0), 0x01); + regmap_write(es9356->regmap, + SDW_SDCA_CTL(FUNC_NUM_UAJ, ES9356_SDCA_ENT0, ES9356_SDCA_CTL_FUNC_STATUS, 0), 0x40); + + if (es9356->first_hw_init) { + regcache_cache_bypass(es9356->regmap, false); + regcache_mark_dirty(es9356->regmap); + } else + es9356->first_hw_init = true; + + es9356->hw_init = true; + + pm_runtime_mark_last_busy(&slave->dev); + pm_runtime_put_autosuspend(&slave->dev); + + return 0; +} + +static int es9356_sdw_update_status(struct sdw_slave *slave, + enum sdw_slave_status status) +{ + struct es9356_sdw_priv *es9356 = dev_get_drvdata(&slave->dev); + + if (status == SDW_SLAVE_UNATTACHED) { + es9356->hw_init = false; + cancel_delayed_work_sync(&es9356->interrupt_handle_work); + cancel_delayed_work_sync(&es9356->button_detect_work); + regcache_cache_only(es9356->regmap, true); + } + + if (status == SDW_SLAVE_ATTACHED) { + if (es9356->hs_jack) + sdw_write_no_pm(es9356->slave, SDW_SCP_SDCA_INTMASK1, + (SDW_SCP_SDCA_INTMASK_SDCA_7 | SDW_SCP_SDCA_INTMASK_SDCA_5 | SDW_SCP_SDCA_INTMASK_SDCA_1)); + } + + if (es9356->hw_init || status != SDW_SLAVE_ATTACHED) + return 0; + + return es9356_sdca_io_init(&slave->dev, slave); +} + +static int es9356_sdw_read_prop(struct sdw_slave *slave) +{ + struct sdw_slave_prop *prop = &slave->prop; + int nval; + int i, j; + u32 bit; + unsigned long addr; + struct sdw_dpn_prop *dpn; + + prop->paging_support = true; + + /* + * first we need to allocate memory for set bits in port lists + * the port allocation is completely arbitrary: + * DP0 is not supported + * DP3 and DP4 is sink + * DP1 and DP2 is source + */ + prop->source_ports = BIT(1) | BIT(2); + prop->sink_ports = BIT(3) | BIT(4); + + nval = hweight32(prop->source_ports); + prop->src_dpn_prop = devm_kcalloc(&slave->dev, nval, + sizeof(*prop->src_dpn_prop), + GFP_KERNEL); + if (!prop->src_dpn_prop) + return -ENOMEM; + + i = 0; + dpn = prop->src_dpn_prop; + addr = prop->source_ports; + for_each_set_bit(bit, &addr, 32) { + dpn[i].num = bit; + dpn[i].type = SDW_DPN_FULL; + dpn[i].simple_ch_prep_sm = true; + i++; + } + + /* do this again for sink now */ + nval = hweight32(prop->sink_ports); + prop->sink_dpn_prop = devm_kcalloc(&slave->dev, nval, + sizeof(*prop->sink_dpn_prop), + GFP_KERNEL); + if (!prop->sink_dpn_prop) + return -ENOMEM; + + j = 0; + dpn = prop->sink_dpn_prop; + addr = prop->sink_ports; + for_each_set_bit(bit, &addr, 32) { + dpn[j].num = bit; + dpn[j].type = SDW_DPN_FULL; + dpn[j].simple_ch_prep_sm = true; + j++; + } + + /* wake-up event */ + prop->wake_capable = 1; + + return 0; +} + +static int es9356_sdw_interrupt_callback(struct sdw_slave *slave, + struct sdw_slave_intr_status *status) +{ + struct es9356_sdw_priv *es9356 = dev_get_drvdata(&slave->dev); + unsigned int sdca_cascade, scp_sdca_stat1 = 0; + int count = 0, retry = 3; + int ret, stat, reg; + + mutex_lock(&es9356->disable_irq_lock); + + ret = sdw_read_no_pm(es9356->slave, SDW_SCP_SDCA_INT1); + if (ret < 0) + goto io_error; + es9356->sdca_status = ret; + + do { + reg = sdw_read_no_pm(es9356->slave, SDW_SCP_SDCA_INT1); + if (reg < 0) + goto io_error; + if (reg & SDW_SCP_SDCA_INTMASK_SDCA_1) { + ret = sdw_update_no_pm(es9356->slave, SDW_SCP_SDCA_INT1, + SDW_SCP_SDCA_INT_SDCA_1, SDW_SCP_SDCA_INT_SDCA_1); + if (ret < 0) + goto io_error; + } + + if (reg & SDW_SCP_SDCA_INTMASK_SDCA_5) { + ret = sdw_update_no_pm(es9356->slave, SDW_SCP_SDCA_INT1, + SDW_SCP_SDCA_INT_SDCA_5, SDW_SCP_SDCA_INT_SDCA_5); + if (ret < 0) + goto io_error; + } + + if (reg & SDW_SCP_SDCA_INTMASK_SDCA_7) { + ret = sdw_update_no_pm(es9356->slave, SDW_SCP_SDCA_INT1, + SDW_SCP_SDCA_INT_SDCA_7, SDW_SCP_SDCA_INT_SDCA_7); + if (ret < 0) + goto io_error; + } + + ret = sdw_read_no_pm(es9356->slave, SDW_DP0_INT); + if (ret < 0) + goto io_error; + sdca_cascade = ret & SDW_DP0_SDCA_CASCADE; + + ret = sdw_read_no_pm(es9356->slave, SDW_SCP_SDCA_INT1); + if (ret < 0) + goto io_error; + scp_sdca_stat1 = ret & + (SDW_SCP_SDCA_INTMASK_SDCA_1 | SDW_SCP_SDCA_INTMASK_SDCA_5 | SDW_SCP_SDCA_INTMASK_SDCA_7); + + stat = scp_sdca_stat1 || sdca_cascade; + + count++; + } while (stat != 0 && count < retry); + + /* The 280 ms figure was determined through testing */ + if (status->sdca_cascade && !es9356->disable_irq) + mod_delayed_work(system_power_efficient_wq, + &es9356->interrupt_handle_work, msecs_to_jiffies(280)); + + mutex_unlock(&es9356->disable_irq_lock); + return 0; + +io_error: + mutex_unlock(&es9356->disable_irq_lock); + pr_err_ratelimited("IO error in %s, ret %d\n", __func__, ret); + return ret; +} + +static const struct sdw_slave_ops es9356_sdw_slave_ops = { + .read_prop = es9356_sdw_read_prop, + .interrupt_callback = es9356_sdw_interrupt_callback, + .update_status = es9356_sdw_update_status, +}; + +static int es9356_sdca_init(struct device *dev, struct regmap *regmap, struct sdw_slave *slave) +{ + struct es9356_sdw_priv *es9356; + int ret; + + es9356 = devm_kzalloc(dev, sizeof(*es9356), GFP_KERNEL); + if (!es9356) + return -ENOMEM; + + dev_set_drvdata(dev, es9356); + + es9356->slave = slave; + es9356->regmap = regmap; + mutex_init(&es9356->disable_irq_lock); + mutex_init(&es9356->pde_lock); + + regcache_cache_only(es9356->regmap, true); + + es9356->hw_init = false; + es9356->first_hw_init = false; + + INIT_DELAYED_WORK(&es9356->interrupt_handle_work, + es9356_interrupt_handler); + INIT_DELAYED_WORK(&es9356->button_detect_work, + es9356_button_detect_handler); + + ret = devm_snd_soc_register_component(dev, + &snd_soc_es9356_sdw_component, + es9356_sdw_dai, + ARRAY_SIZE(es9356_sdw_dai)); + if (ret) { + dev_err_probe(dev, ret, "Failed to register component\n"); + return ret; + } + /* set autosuspend parameters */ + pm_runtime_set_autosuspend_delay(dev, 3000); + pm_runtime_use_autosuspend(dev); + /* make sure the device does not suspend immediately */ + pm_runtime_mark_last_busy(dev); + pm_runtime_enable(dev); + + return 0; +} + +static int es9356_sdw_probe(struct sdw_slave *slave, + const struct sdw_device_id *id) +{ + struct regmap *regmap; + + regmap = devm_regmap_init_sdw_mbq_cfg(&slave->dev, slave, &es9356_sdca_regmap, &es9356_mbq_config); + if (IS_ERR(regmap)) + return PTR_ERR(regmap); + + return es9356_sdca_init(&slave->dev, regmap, slave); +} + +static void es9356_sdw_remove(struct sdw_slave *slave) +{ + struct es9356_sdw_priv *es9356 = dev_get_drvdata(&slave->dev); + + if (es9356->hw_init) { + cancel_delayed_work_sync(&es9356->interrupt_handle_work); + cancel_delayed_work_sync(&es9356->button_detect_work); + } + + if (es9356->first_hw_init) + pm_runtime_disable(&slave->dev); + + mutex_destroy(&es9356->disable_irq_lock); + mutex_destroy(&es9356->pde_lock); +} + +static const struct sdw_device_id es9356_sdw_id[] = { + SDW_SLAVE_ENTRY_EXT(0x04b3, 0x9356, 0x02, 0, 0), + SDW_SLAVE_ENTRY_EXT(0x04b3, 0x9356, 0x03, 0, 0), + {}, +}; +MODULE_DEVICE_TABLE(sdw, es9356_sdw_id); + +static int es9356_sdca_dev_suspend(struct device *dev) +{ + struct es9356_sdw_priv *es9356 = dev_get_drvdata(dev); + + cancel_delayed_work_sync(&es9356->interrupt_handle_work); + cancel_delayed_work_sync(&es9356->button_detect_work); + + regcache_cache_only(es9356->regmap, true); + + return 0; +} + +static int es9356_sdca_dev_system_suspend(struct device *dev) +{ + struct es9356_sdw_priv *es9356 = dev_get_drvdata(dev); + + mutex_lock(&es9356->disable_irq_lock); + es9356->disable_irq = true; + mutex_unlock(&es9356->disable_irq_lock); + + return es9356_sdca_dev_suspend(dev); +} + +#define es9356_PROBE_TIMEOUT 2000 + +static int es9356_sdca_dev_resume(struct device *dev) +{ + struct sdw_slave *slave = dev_to_sdw_dev(dev); + struct es9356_sdw_priv *es9356 = dev_get_drvdata(dev); + unsigned long time; + + if (!slave->unattach_request) { + es9356->disable_irq = false; + goto regmap_sync; + } + + time = wait_for_completion_timeout(&slave->initialization_complete, + msecs_to_jiffies(es9356_PROBE_TIMEOUT)); + if (!time) { + dev_err(&slave->dev, "Initialization not complete, timed out\n"); + sdw_show_ping_status(slave->bus, true); + + return -ETIMEDOUT; + } + +regmap_sync: + slave->unattach_request = 0; + regcache_cache_only(es9356->regmap, false); + regcache_sync(es9356->regmap); + return 0; +} + +static const struct dev_pm_ops es9356_sdca_pm = { + SYSTEM_SLEEP_PM_OPS(es9356_sdca_dev_system_suspend, es9356_sdca_dev_resume) + RUNTIME_PM_OPS(es9356_sdca_dev_suspend, es9356_sdca_dev_resume, NULL) +}; + +static struct sdw_driver es9356_sdw_driver = { + .driver = { + .name = "es9356", + .pm = pm_ptr(&es9356_sdca_pm), + }, + .probe = es9356_sdw_probe, + .remove = es9356_sdw_remove, + .ops = &es9356_sdw_slave_ops, + .id_table = es9356_sdw_id, +}; +module_sdw_driver(es9356_sdw_driver); + +MODULE_IMPORT_NS("SND_SOC_SDCA"); +MODULE_DESCRIPTION("ASoC ES9356 SDCA SDW codec driver"); +MODULE_AUTHOR("Michael Zhang "); +MODULE_LICENSE("GPL"); diff --git a/sound/soc/codecs/es9356.h b/sound/soc/codecs/es9356.h new file mode 100644 index 00000000000000..2a676f36590f6d --- /dev/null +++ b/sound/soc/codecs/es9356.h @@ -0,0 +1,208 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ES9356_H__ +#define __ES9356_H__ + +/*ES9356 Implementation-define*/ +#define ES9356_FLAGS_HP 0x2003 +#define ES9356_CSM_RESET 0x2020 +#define ES9356_FUC_RESET 0x2021 +#define ES9356_STATE 0x2022 +#define ES9356_VMID_TIME 0x2023 +#define ES9356_STATE_TIME 0x2024 +#define ES9356_HP_SPK_TIME 0x2025 +#define ES9356_WP_ENABLE 0x2026 +#define ES9356_DMIC_GPIO 0x2027 +#define ES9356_ENDPOINT_MODE 0x2028 + +/*HP DETECT*/ +#define ES9356_HP_TYPE 0x2029 +#define ES9356_HP_DETECTTIME 0x202A +#define ES9356_MICBIAS_SEL 0x202B +#define ES9356_KEY_PRESS_TIME 0x202C +#define ES9356_KEY_RELEASE_TIME 0x202D +#define ES9356_KEY_HOLD_TIME 0x202E +#define ES9356_BTSEL_REF 0x202F +#define ES9356_BUTTON_CHARGE 0x2030 + +#define ES9356_KEYD_DETECT 0x2031 +#define ES9356_DPEN_TIME 0x2032 +#define ES9356_TIMER_CHECK 0x2033 +#define ES9356_IBIASGEN 0x2041 +#define ES9356_VMID1SEL 0x2042 +#define ES9356_VMID1STL 0x2043 +#define ES9356_VMID2SEL 0x2044 +#define ES9356_VMID2STL 0x2045 +#define ES9356_VSEL 0x2046 +#define ES9356_MICBIAS_CTL 0x2047 +#define ES9356_HPDETECT_CTL 0x2048 +#define ES9356_MICBIAS_RES 0x2049 + +/*CLK*/ +#define ES9356_CLK_SEL 0x2050 +#define ES9356_CLK_CTL 0x2051 +#define ES9356_DETCLK_CTL 0x2052 +#define ES9356_CPCLK_CTL 0x2053 +#define ES9356_SPKCLK_CTL 0x2054 +#define ES9356_PRE_DIV_CTL 0x2055 +#define ES9356_DLL_MODE 0x2056 +#define ES9356_ANACLK_SEL 0x2057 +#define ES9356_OSRCLK_SEL 0x2058 +#define ES9356_DSPCLK_SEL 0x2059 +#define ES9356_SPK9M_MODE 0x205a + +/*ADC DIG CTL*/ +#define ES9356_DMIC_POL 0x2061 +#define ES9356_ADC_SWAP 0x2062 +#define ES9356_ADC_OSR 0x2063 +#define ES9356_ADC_OSRGAIN 0x2064 +#define ES9356_ADC_CLEARRAM 0x2065 +#define ES9356_ADC_RAMP 0x2066 +#define ES9356_ADC_HPF1 0x2067 +#define ES9356_ADC_HPF2 0x2068 +#define ES9356_ADC_ALC 0x206C +#define ES9356_ALC_LEVEL 0x206D +#define ES9356_ALC_RAMP_WINSIZE 0x206E + +/*ADC ANA CTL*/ +#define ES9356_ADC_REF_EN 0x2080 +#define ES9356_ADC_AMIC_CTL 0x2081 +#define ES9356_ADC_ANA 0x2082 +#define ES9356_PGA_CTL 0x2083 +#define ES9356_ADC_INT 0x2084 +#define ES9356_ADC_VCM 0x2085 +#define ES9356_ADC_VRPBIAS 0x2086 +#define ES9356_ADC_LP 0x2087 + +/*DAC DIG CTL*/ +#define ES9356_DAC_FSMODE 0x2090 +#define ES9356_DAC_OSR 0x2091 +#define ES9356_DAC_INV 0x2092 +#define ES9356_DAC_RAMP 0x2093 +#define ES9356_DAC_VPPSCALE 0x2094 +#define ES9356_DAC_SWAP 0x2097 +#define ES9356_SPKCMP_VPPSC 0x20A0 +#define ES9356_CALIBRATION_TIME 0x20A1 +#define ES9356_CALIBRATION_SETTING 0x20A2 +#define ES9356_DAC_OFFSET_LH 0x20A3 +#define ES9356_DAC_OFFSET_LL 0x20A4 +#define ES9356_DAC_OFFSET_RH 0x20A5 +#define ES9356_DAC_OFFSET_RL 0x20A6 + +/*DAC ANA CTL*/ +#define ES9356_DAC_REF_EN 0x20B0 +#define ES9356_DAC_ENABLE 0x20B1 +#define ES9356_DAC_VROI 0x20B2 +#define ES9356_DAC_LP 0x20B3 + +/*HP CTL*/ +#define ES9356_CHARGEPUMP_CTL 0x20C0 +#define ES9356_CPLDO_CTL 0x20C1 +#define ES9356_HP_REF_CTL 0x20C2 +#define ES9356_HP_IBIAS 0x20C3 +#define ES9356_HP_EN 0x20C4 +#define ES9356_HP_VOLUME 0x20C5 +#define ES9356_HP_LP 0x20C6 + +/*SPK CTL*/ +#define ES9356_SPKLDO_CTL 0x20D0 +#define ES9356_CLASSD_CTL 0x20D1 +#define ES9356_SPK_HBDG 0x20D5 +#define ES9356_SPK_VOLUME 0x20D7 +#define ES9356_SPK_SCP 0x20D8 +#define ES9356_SPK_DT 0x20D9 +#define ES9356_SPK_OTP 0x20DA +#define ES9356_SPKBIAS_COMP 0x20DB + +/* ES9356 SDCA Control - function number */ +#define FUNC_NUM_UAJ 0x01 +#define FUNC_NUM_MIC 0x02 +#define FUNC_NUM_AMP 0x03 +#define FUNC_NUM_HID 0x04 + +/* ES9356 SDCA entity */ +#define ES9356_SDCA_ENT0 0x00 +#define ES9356_SDCA_ENT_PDE11 0x03 +#define ES9356_SDCA_ENT_FU11 0x04 +#define ES9356_SDCA_ENT_XU12 0x05 +#define ES9356_SDCA_ENT_FU113 0x07 +#define ES9356_SDCA_ENT_CS113 0x09 +#define ES9356_SDCA_ENT_PPU11 0x0C + +#define ES9356_SDCA_ENT_CS21 0x02 +#define ES9356_SDCA_ENT_PPU21 0x03 +#define ES9356_SDCA_ENT_FU21 0X04 +#define ES9356_SDCA_ENT_XU22 0x06 +#define ES9356_SDCA_ENT_SAPU29 0x03 +#define ES9356_SDCA_ENT_PDE23 0x0B +#define ES9356_SDCA_ENT_HID01 0x01 + +#define ES9356_SDCA_ENT_CS41 0x02 +#define ES9356_SDCA_ENT_FU35 0x04 +#define ES9356_SDCA_ENT_XU42 0x06 +#define ES9356_SDCA_ENT_FU41 0x07 +#define ES9356_SDCA_ENT_PDE47 0x0E +#define ES9356_SDCA_ENT_IT33 0x0F +#define ES9356_SDCA_ENT_PDE34 0x10 +#define ES9356_SDCA_ENT_FU33 0x11 +#define ES9356_SDCA_ENT_XU36 0x13 +#define ES9356_SDCA_ENT_FU36 0x15 +#define ES9356_SDCA_ENT_CS36 0x17 +#define ES9356_SDCA_ENT_GE35 0x18 + +/* ES9356 SDCA control */ +#define ES9356_SDCA_CTL_SAMPLE_FREQ_INDEX 0x10 +#define ES9356_SDCA_CTL_FU_MUTE 0x01 +#define ES9356_SDCA_CTL_FU_VOLUME 0x02 +#define ES9356_SDCA_CTL_HIDTX_CURRENT_OWNER 0x10 +#define ES9356_SDCA_CTL_SELECTED_MODE 0x01 +#define ES9356_SDCA_CTL_DETECTED_MODE 0x02 +#define ES9356_SDCA_CTL_REQ_POWER_STATE 0x01 +#define ES9356_SDCA_CTL_FU_CH_GAIN 0x0b +#define ES9356_SDCA_CTL_FUNC_STATUS 0x10 +#define ES9356_SDCA_CTL_ACTUAL_POWER_STATE 0x10 +#define ES9356_SDCA_CTL_POSTURE_NUMBER 0x00 + +/* ES9356 SDCA channel */ +#define CH_L 0x01 +#define CH_R 0x02 +#define MBQ 0x2000 + +/* ES9356 HID*/ +#define ES9356_BUF_ADDR_HID 0x44000000 +#define ES9356_HID_BYTE2 0x44000001 +#define ES9356_HID_BYTE3 0x44000002 +#define ES9356_HID_BYTE4 0x44000003 + +/* ES9356 Volume Setting*/ +#define ES9356_VU_BASE 768 +#define ES9356_OFFSET_HIGH 0x07F8 +#define ES9356_OFFSET_LOW 0x0007 +#define ES9356_DEFAULT_VOLUME 0x00 +#define ES9356_VOLUME_STEP 32 +#define ES9356_VOLUME_MIN -768 +#define ES9356_VOLUME_MAX 285 +#define ES9356_AMIC_GAIN_STEP 768 +#define ES9356_DMIC_GAIN_STEP 1536 +#define ES9356_GAIN_MIN 0 +#define ES9356_AMIC_GAIN_MAX 10 +#define ES9356_DMIC_GAIN_MAX 3 + +enum { + ES9356_DMIC = 1, /* For dmic */ + ES9356_JACK_IN, /* For headset mic */ + ES9356_AMP, /* For speaker */ + ES9356_JACK_OUT, /* For headphone */ +}; + +enum { + ES9356_SDCA_RATE_16000HZ, + ES9356_SDCA_RATE_24000HZ, + ES9356_SDCA_RATE_32000HZ, + ES9356_SDCA_RATE_44100HZ, + ES9356_SDCA_RATE_48000HZ, + ES9356_SDCA_RATE_88200HZ, + ES9356_SDCA_RATE_96000HZ, +}; + +#endif From 25b905940832c95674d2f11f40e6cea60ccb5a4d Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 14 May 2026 15:52:04 +0800 Subject: [PATCH 899/972] ASoC: Intel: soc-acpi: arl: Add es9356 support Add support for the es9356 codec in the ARL board configuration. Signed-off-by: Zhang Yi Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20260514075206.3483-5-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- .../intel/common/soc-acpi-intel-arl-match.c | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/sound/soc/intel/common/soc-acpi-intel-arl-match.c b/sound/soc/intel/common/soc-acpi-intel-arl-match.c index c952f7d2b2c0e0..9782825dd8bfd1 100644 --- a/sound/soc/intel/common/soc-acpi-intel-arl-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-arl-match.c @@ -192,6 +192,42 @@ static const struct snd_soc_acpi_endpoint cs42l43_endpoints[] = { }, }; +static const struct snd_soc_acpi_endpoint es9356_endpoints[] = { + { /* Jack Playback Endpoint */ + .num = 0, + .aggregated = 0, + .group_position = 0, + .group_id = 0, + }, + { /* DMIC Capture Endpoint */ + .num = 1, + .aggregated = 0, + .group_position = 0, + .group_id = 0, + }, + { /* Jack Capture Endpoint */ + .num = 2, + .aggregated = 0, + .group_position = 0, + .group_id = 0, + }, + { /* Speaker Playback Endpoint */ + .num = 3, + .aggregated = 0, + .group_position = 0, + .group_id = 0, + }, +}; + +static const struct snd_soc_acpi_adr_device es9356_adr[] = { + { + .adr = 0x00013004b3935601ull, + .num_endpoints = ARRAY_SIZE(es9356_endpoints), + .endpoints = es9356_endpoints, + .name_prefix = "es9356" + } +}; + static const struct snd_soc_acpi_adr_device cs42l43_0_adr[] = { { .adr = 0x00003001FA424301ull, @@ -255,6 +291,15 @@ static const struct snd_soc_acpi_adr_device rt1320_2_single_adr[] = { } }; +static const struct snd_soc_acpi_link_adr arl_n_mrd_es9356_link1[] = { + { + .mask = BIT(1), + .num_adr = ARRAY_SIZE(es9356_adr), + .adr_d = es9356_adr, + }, + {} +}; + static const struct snd_soc_acpi_link_adr arl_cs42l43_l0[] = { { .mask = BIT(0), @@ -528,6 +573,13 @@ struct snd_soc_acpi_mach snd_soc_acpi_intel_arl_sdw_machines[] = { .sof_tplg_filename = "sof-arl-rt722-l0_rt1320-l2.tplg", .get_function_tplg_files = sof_sdw_get_tplg_files, }, + { + .link_mask = BIT(1), + .links = arl_n_mrd_es9356_link1, + .drv_name = "sof_sdw", + .sof_tplg_filename = "sof-arl-es9356.tplg", + .get_function_tplg_files = sof_sdw_get_tplg_files, + }, {}, }; EXPORT_SYMBOL_GPL(snd_soc_acpi_intel_arl_sdw_machines); From f1e35606d618e9de98f08f305a06c18f74aae216 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 14 May 2026 15:52:05 +0800 Subject: [PATCH 900/972] ASoC: Intel: sof_sdw: add es9356 support add Everest-semi ES9356 support Signed-off-by: Zhang Yi Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20260514075206.3483-6-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/intel/boards/Kconfig b/sound/soc/intel/boards/Kconfig index d53af8f7e55b80..cddbd2aa424ebb 100644 --- a/sound/soc/intel/boards/Kconfig +++ b/sound/soc/intel/boards/Kconfig @@ -532,6 +532,7 @@ config SND_SOC_INTEL_SOUNDWIRE_SOF_MACH select MFD_CS42L43_SDW select SND_SOC_CS35L56_SPI select SND_SOC_CS35L56_SDW + select SND_SOC_ES9356 select SND_SOC_DMIC select SND_SOC_INTEL_HDA_DSP_COMMON imply SND_SOC_SDW_MOCKUP From 6fc7e8a3b8115294f60f5c89de27330bf1b9c98e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 May 2026 13:46:13 -0300 Subject: [PATCH 901/972] iommu: Fix loss of errno on map failure for classic ops A typo, likely from a rebase, inverted the condition and caused errors to be lost. Fix it to be "if (ret)". This was breaking iommu_create_device_direct_mappings() on drivers that don't use iommupt and don't fully set up their domain in alloc_pages() (i.e., SMMUv2). In this case the first call of iommu_create_device_direct_mappings() should fail due to the incompletely initialized domain. Since it wrongly returns success, the second call to iommu_create_device_direct_mappings() doesn't happen and IOMMU_RESV_DIRECT is never set up. Cc: stable@vger.kernel.org Fixes: d6c65b0fd621 ("iommupt: Avoid rewalking during map") Reported-by: Josua Mayer Closes: https://lore.kernel.org/all/321c2e57-6a17-4aef-ba42-d2ebd577e472@solid-run.com/ Signed-off-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Reviewed-by: Samiullah Khawaja Reviewed-by: Mostafa Saleh Tested-by: Josua Mayer Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index e7bd28cc77eeb4..05f78cfd1f1d0f 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2709,7 +2709,7 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, return 0; } ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, gfp); - if (!ret) + if (ret) return ret; trace_map(iova, paddr, size); From b948a87228482235afbaf5f4d8037860b5c470fd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 May 2026 13:46:14 -0300 Subject: [PATCH 902/972] iommu: Fix up map/unmap debugging for iommupt domains Sashiko noticed a few issues in this path, and a few more were found on review. Tidy them up further. These are intertwined because the debug code depends on some of the WARN_ONs to function right: Lift into iommu_map_nosync(): - The might_sleep_if() - 0 pgsize_bitmap WARN_ON - Promote the illegal domain->type to a WARN_ON - WARN_ON for illegal gfp flags Then remove the return 0 since it is now safe to call iommu_debug_map(). Lift into __iommu_unmap(): - 0 pgsize_bitmap WARN_ON - Promote the illegal domain->type to a WARN_ON - iommu_debug_unmap_begin() This now pairs with the unconditional iommu_debug_map() on the mapping side. Thus iommu debugging now works for iommupt along with some of the other debugging features. Fixes: 99fb8afa16ad ("iommupt: Directly call iommupt's unmap_range()") Fixes: d6c65b0fd621 ("iommupt: Avoid rewalking during map") Signed-off-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Reviewed-by: Samiullah Khawaja Reviewed-by: Mostafa Saleh Tested-by: Josua Mayer Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 05f78cfd1f1d0f..c21d1e3c4876bf 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2623,19 +2623,9 @@ static int __iommu_map_domain_pgtbl(struct iommu_domain *domain, size_t orig_size = size; int ret = 0; - might_sleep_if(gfpflags_allow_blocking(gfp)); - - if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) - return -EINVAL; - - if (WARN_ON(!ops->map_pages || domain->pgsize_bitmap == 0UL)) + if (WARN_ON(!ops->map_pages)) return -ENODEV; - /* Discourage passing strange GFP flags */ - if (WARN_ON_ONCE(gfp & (__GFP_COMP | __GFP_DMA | __GFP_DMA32 | - __GFP_HIGHMEM))) - return -EINVAL; - /* find out the minimum page size supported */ min_pagesz = 1 << __ffs(domain->pgsize_bitmap); @@ -2697,6 +2687,15 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, struct pt_iommu *pt = iommupt_from_domain(domain); int ret; + might_sleep_if(gfpflags_allow_blocking(gfp)); + + /* Discourage passing strange GFP flags or illegal domains */ + if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING) || + !domain->pgsize_bitmap || + (gfp & (__GFP_COMP | __GFP_DMA | __GFP_DMA32 | + __GFP_HIGHMEM)))) + return -EINVAL; + if (pt) { size_t mapped = 0; @@ -2706,11 +2705,12 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, iommu_unmap(domain, iova, mapped); return ret; } - return 0; + } else { + ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, + gfp); + if (ret) + return ret; } - ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, gfp); - if (ret) - return ret; trace_map(iova, paddr, size); iommu_debug_map(domain, paddr, size); @@ -2742,10 +2742,7 @@ __iommu_unmap_domain_pgtbl(struct iommu_domain *domain, unsigned long iova, size_t unmapped_page, unmapped = 0; unsigned int min_pagesz; - if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) - return 0; - - if (WARN_ON(!ops->unmap_pages || domain->pgsize_bitmap == 0UL)) + if (WARN_ON(!ops->unmap_pages)) return 0; /* find out the minimum page size supported */ @@ -2764,8 +2761,6 @@ __iommu_unmap_domain_pgtbl(struct iommu_domain *domain, unsigned long iova, pr_debug("unmap this: iova 0x%lx size 0x%zx\n", iova, size); - iommu_debug_unmap_begin(domain, iova, size); - /* * Keep iterating until we either unmap 'size' bytes (or more) * or we hit an area that isn't mapped. @@ -2801,6 +2796,12 @@ static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long iova, struct pt_iommu *pt = iommupt_from_domain(domain); size_t unmapped; + if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING) || + !domain->pgsize_bitmap)) + return 0; + + iommu_debug_unmap_begin(domain, iova, size); + if (pt) unmapped = pt->ops->unmap_range(pt, iova, size, iotlb_gather); else From 0735c54804c709d1b292f3b6947cfb560b2ce552 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 May 2026 13:46:15 -0300 Subject: [PATCH 903/972] iommu: Handle unmap error when iommu_debug is enabled Sashiko noticed a latent bug where the map error flow called iommu_unmap() which calls iommu_debug_unmap_begin()/iommu_debug_unmap_end() however since this is an error path the map flow never actually established the original iommu_debug_map() it will malfunction. Lift the unmap error handling into iommu_map_nosync() and reorder it so the trace_map()/iommu_debug_map() records the partial mapping and then immediately unmaps it. This avoid creating the unbalanced tracking and provides saner tracing instead of a unmap unmatched to any map. Fixes: ccc21213f013 ("iommu: Add calls for IOMMU_DEBUG_PAGEALLOC") Signed-off-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Reviewed-by: Samiullah Khawaja Reviewed-by: Mostafa Saleh Tested-by: Josua Mayer Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 49 +++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 30 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index c21d1e3c4876bf..d1a9e713d3a05f 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2615,12 +2615,11 @@ static size_t iommu_pgsize(struct iommu_domain *domain, unsigned long iova, static int __iommu_map_domain_pgtbl(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, - size_t size, int prot, gfp_t gfp) + size_t size, int prot, gfp_t gfp, + size_t *mapped) { const struct iommu_domain_ops *ops = domain->ops; - unsigned long orig_iova = iova; unsigned int min_pagesz; - size_t orig_size = size; int ret = 0; if (WARN_ON(!ops->map_pages)) @@ -2643,31 +2642,25 @@ static int __iommu_map_domain_pgtbl(struct iommu_domain *domain, pr_debug("map: iova 0x%lx pa %pa size 0x%zx\n", iova, &paddr, size); while (size) { - size_t pgsize, count, mapped = 0; + size_t pgsize, count, op_mapped = 0; pgsize = iommu_pgsize(domain, iova, paddr, size, &count); pr_debug("mapping: iova 0x%lx pa %pa pgsize 0x%zx count %zu\n", iova, &paddr, pgsize, count); ret = ops->map_pages(domain, iova, paddr, pgsize, count, prot, - gfp, &mapped); + gfp, &op_mapped); /* * Some pages may have been mapped, even if an error occurred, * so we should account for those so they can be unmapped. */ - size -= mapped; - + *mapped += op_mapped; if (ret) - break; - - iova += mapped; - paddr += mapped; - } + return ret; - /* unroll mapping in case something went wrong */ - if (ret) { - iommu_unmap(domain, orig_iova, orig_size - size); - return ret; + size -= op_mapped; + iova += op_mapped; + paddr += op_mapped; } return 0; } @@ -2685,6 +2678,7 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { struct pt_iommu *pt = iommupt_from_domain(domain); + size_t mapped = 0; int ret; might_sleep_if(gfpflags_allow_blocking(gfp)); @@ -2696,24 +2690,19 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, __GFP_HIGHMEM)))) return -EINVAL; - if (pt) { - size_t mapped = 0; - + if (pt) ret = pt->ops->map_range(pt, iova, paddr, size, prot, gfp, &mapped); - if (ret) { - iommu_unmap(domain, iova, mapped); - return ret; - } - } else { + else ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, - gfp); - if (ret) - return ret; - } + gfp, &mapped); - trace_map(iova, paddr, size); - iommu_debug_map(domain, paddr, size); + trace_map(iova, paddr, mapped); + iommu_debug_map(domain, paddr, mapped); + if (ret) { + iommu_unmap(domain, iova, mapped); + return ret; + } return 0; } From 8ef3f77c440005c7f04229a75976bfc078364247 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 May 2026 13:46:16 -0300 Subject: [PATCH 904/972] iommupt: Check for missing PAGE_SIZE in the pgsize_bitmap Sashiko pointed out that the driver could drop PAGE_SIZE from the pgsize_bitmap. That is technically allowed but nothing does it, and such an iommu_domain would not be used with the DMA API today. Still, it is against the design and it is trivial to fix up. Lift the PT_WARN_ON to the if branch and just skip the fast path. Fixes: dcd6a011a8d5 ("iommupt: Add map_pages op") Signed-off-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Reviewed-by: Samiullah Khawaja Tested-by: Josua Mayer Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 19b6daf88f2ab1..4877b05291c9d4 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -920,8 +920,8 @@ static int NS(map_range)(struct pt_iommu *iommu_table, dma_addr_t iova, return ret; /* Calculate target page size and level for the leaves */ - if (pt_has_system_page_size(common) && len == PAGE_SIZE) { - PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE)); + if (pt_has_system_page_size(common) && len == PAGE_SIZE && + likely(pgsize_bitmap & PAGE_SIZE)) { if (log2_mod(iova | paddr, PAGE_SHIFT)) return -ENXIO; map.leaf_pgsize_lg2 = PAGE_SHIFT; From 58829512ad461af8f35941069c209941e3a97b65 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 May 2026 13:46:17 -0300 Subject: [PATCH 905/972] iommupt: Fix the end_index calculation in __map_range_leaf() Sashiko noticed a mismatch of units in this math: num_leaves is actually the number of leaf *entries* (so a 16-item contiguous leaf is one num_leaves), while index is in items. The mismatch in maths causes __map_range_leaf() to exit early instead of efficiently filling a larger range of contiguous PTEs. The early exit is caught by the functions above and then __map_range_leaf() is re-invoked, so there is no functional issue. Correct the misuse of units by adjusting num_leaves with the leaf size and avoid the performance cost of looping externally. There are also some mismatched types for num_leaves; simplify things to remove the duplicated calculations. Fixes: d6c65b0fd621 ("iommupt: Avoid rewalking during map") Signed-off-by: Jason Gunthorpe Reviewed-by: Samiullah Khawaja Reviewd-by: Pranjal Shrivastava Tested-by: Josua Mayer Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 4877b05291c9d4..dc91fb4e2f61cb 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -534,10 +534,12 @@ static int __map_range_leaf(struct pt_range *range, void *arg, struct pt_state pts = pt_init(range, level, table); struct pt_iommu_map_args *map = arg; unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2; + unsigned int leaves_avail; unsigned int start_index; pt_oaddr_t oa = map->oa; - unsigned int num_leaves; + pt_vaddr_t num_leaves; unsigned int orig_end; + unsigned int step_lg2; pt_vaddr_t last_va; unsigned int step; bool need_contig; @@ -546,21 +548,25 @@ static int __map_range_leaf(struct pt_range *range, void *arg, PT_WARN_ON(map->leaf_level != level); PT_WARN_ON(!pt_can_have_leaf(&pts)); - step = log2_to_int_t(unsigned int, - leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts)); - need_contig = leaf_pgsize_lg2 != pt_table_item_lg2sz(&pts); + step_lg2 = leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts); + step = log2_to_int_t(unsigned int, step_lg2); + need_contig = step_lg2 != 0; _pt_iter_first(&pts); start_index = pts.index; orig_end = pts.end_index; - if (pts.index + map->num_leaves < pts.end_index) { + leaves_avail = + log2_div_t(unsigned int, pts.end_index - pts.index, step_lg2); + if (map->num_leaves <= leaves_avail) { /* Need to stop in the middle of the table to change sizes */ - pts.end_index = pts.index + map->num_leaves; + pts.end_index = pts.index + log2_mul(map->num_leaves, step_lg2); num_leaves = 0; } else { - num_leaves = map->num_leaves - (pts.end_index - pts.index); + num_leaves = map->num_leaves - leaves_avail; } + PT_WARN_ON( + log2_mod_t(unsigned int, pts.end_index - pts.index, step_lg2)); do { pts.type = pt_load_entry_raw(&pts); if (pts.type != PT_ENTRY_EMPTY || need_contig) { From 1bb54043ff309795c90ccadd8a6e6b13ac40ec4e Mon Sep 17 00:00:00 2001 From: Tomasz Jeznach Date: Tue, 12 May 2026 10:37:44 -0700 Subject: [PATCH 906/972] MAINTAINERS: update Tomasz Jeznach's email address Switch from the previous work address to a linux.dev account, as the work address is no longer actively monitored. Signed-off-by: Tomasz Jeznach Signed-off-by: Joerg Roedel --- .mailmap | 1 + MAINTAINERS | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index eec4a740f7ca0b..bd6a72f29d9cc5 100644 --- a/.mailmap +++ b/.mailmap @@ -857,6 +857,7 @@ Tobias Klauser Tobias Klauser Tobias Klauser Todor Tomov +Tomasz Jeznach Tony Luck Trilok Soni TripleX Chung diff --git a/MAINTAINERS b/MAINTAINERS index b2040011a38659..55c6ab1a974ab7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22944,7 +22944,7 @@ N: riscv K: riscv RISC-V IOMMU -M: Tomasz Jeznach +M: Tomasz Jeznach L: iommu@lists.linux.dev L: linux-riscv@lists.infradead.org S: Maintained From c0e4fffc0f474b7ed10adee4ab2bc1a66d36fc72 Mon Sep 17 00:00:00 2001 From: Robertus Diawan Chris Date: Fri, 8 May 2026 10:39:14 +0700 Subject: [PATCH 907/972] ALSA: scarlett2: Add missing error check when initialise Autogain Status When initialise new control with scarlett2_add_new_ctl() function for Autogain Status, scarlett2_add_new_ctl() might throw an error. So, add error check after initialise new control for Autogain Status. This is reported by Coverity Scan with CID 1598781 as UNUSED_VALUE. Fixes: 0a995e38dc44 ("ALSA: scarlett2: Add support for software-controllable input gain") Signed-off-by: Robertus Diawan Chris Link: https://patch.msgid.link/20260508033914.111596-1-robertusdchris@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/mixer_scarlett2.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/mixer_scarlett2.c b/sound/usb/mixer_scarlett2.c index 8eaa962227596f..0f83f8981213fb 100644 --- a/sound/usb/mixer_scarlett2.c +++ b/sound/usb/mixer_scarlett2.c @@ -6707,6 +6707,8 @@ static int scarlett2_add_line_in_ctls(struct usb_mixer_interface *mixer) err = scarlett2_add_new_ctl( mixer, &scarlett2_autogain_status_ctl, i, 1, s, &private->autogain_status_ctls[i]); + if (err < 0) + return err; } /* Add autogain target controls */ From 2149c011510cbdcf183a13b26756e4a02071f0f2 Mon Sep 17 00:00:00 2001 From: Lianqin Hu Date: Fri, 8 May 2026 12:49:34 +0000 Subject: [PATCH 908/972] ALSA: usb-audio: Add iface reset and delay quirk for TTGK Technology USB-C Audio Setting up the interface when suspended/resumeing fail on this card. Adding a reset and delay quirk will eliminate this problem. usb 1-1: new full-speed USB device number 2 using xhci-hcd usb 1-1: New USB device found, idVendor=3302, idProduct=17c2 usb 1-1: New USB device strings: Mfr=1, Product=2, SerialNumber=3 usb 1-1: Product: USB-C Audio usb 1-1: Manufacturer: TTGK Technology usb 1-1: SerialNumber: 170120210706 Signed-off-by: Lianqin Hu Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/TYUPR06MB621720E4E8F99A42E162FD51D23D2@TYUPR06MB6217.apcprd06.prod.outlook.com --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 17983d9774f844..31cbe383ae658d 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2479,6 +2479,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_IGNORE_CTL_ERROR), DEVICE_FLG(0x3255, 0x0000, /* Luxman D-10X */ QUIRK_FLAG_ITF_USB_DSD_DAC | QUIRK_FLAG_CTL_MSG_DELAY), + DEVICE_FLG(0x3302, 0x17c2, /* TTGK Technology USB-C Audio */ + QUIRK_FLAG_FORCE_IFACE_RESET | QUIRK_FLAG_IFACE_DELAY), DEVICE_FLG(0x339b, 0x3a07, /* Synaptics HONOR USB-C HEADSET */ QUIRK_FLAG_MIXER_PLAYBACK_MIN_MUTE), DEVICE_FLG(0x3443, 0x930d, /* NexiGo N930W 60fps Webcam */ From dd074f04e04648d89d9d10ae9846cd057c97b385 Mon Sep 17 00:00:00 2001 From: Nicholas Bonello Date: Fri, 8 May 2026 18:55:07 -0400 Subject: [PATCH 909/972] ALSA: hda/realtek: Fix Legion 7 16ITHG6 speaker amp binding The Lenovo Legion 7 16ITHG6 uses codec SSID 17aa:3855, but its PCI SSID is 17aa:3811. The latter is now also used by the Legion S7 15IMH05 quirk, which is matched before codec SSID fallback and incorrectly routes Legion 7 16ITHG6 machines to ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS. That fixup does not bind the CLSA0101 CS35L41 companion amplifiers, making the built-in speakers silent even though playback appears to be active. Add a codec SSID quirk for 17aa:3855 before the conflicting PCI SSID quirk so that the Legion 7 16ITHG6 uses ALC287_FIXUP_LEGION_16ITHG6. This restores CS35L41 firmware loading and binds both speaker amplifiers. Fixes: 67f4c61a73e9 ("ALSA: hda/realtek: Add quirk for Legion S7 15IMH") Cc: stable@vger.kernel.org Tested-by: Nicholas Bonello Assisted-by: Codex:GPT-5 Signed-off-by: Nicholas Bonello Link: https://patch.msgid.link/20260508225507.47667-1-hadobedo@gmail.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 55bb98e2e55a01..4e2c8401c404cf 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7675,11 +7675,12 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3801, "Lenovo Yoga9 14IAP7", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), HDA_CODEC_QUIRK(0x17aa, 0x3802, "DuetITL 2021", ALC287_FIXUP_YOGA7_14ITL_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3802, "Lenovo Yoga Pro 9 14IRP8", ALC287_FIXUP_TAS2781_I2C), - /* Yoga Pro 9 16IMH9 shares PCI SSID 17aa:3811 with Legion S7 15IMH05; - * use codec SSID to distinguish them + /* Yoga Pro 9 16IMH9 and Legion 7 16ITHG6 share PCI SSID 17aa:3811 + * with Legion S7 15IMH05; use codec SSID to distinguish them */ HDA_CODEC_QUIRK(0x17aa, 0x38d5, "Lenovo Yoga Pro 9 16IMH9", ALC287_FIXUP_TAS2781_I2C), HDA_CODEC_QUIRK(0x17aa, 0x38d6, "Lenovo Yoga Pro 9 16IMH9", ALC287_FIXUP_TAS2781_I2C), + HDA_CODEC_QUIRK(0x17aa, 0x3855, "Legion 7 16ITHG6", ALC287_FIXUP_LEGION_16ITHG6), SND_PCI_QUIRK(0x17aa, 0x3811, "Legion S7 15IMH05", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3813, "Legion 7i 15IMHG05", ALC287_FIXUP_LEGION_15IMHG05_SPEAKERS), SND_PCI_QUIRK(0x17aa, 0x3818, "Lenovo C940 / Yoga Duet 7", ALC298_FIXUP_LENOVO_C940_DUET7), From 0d552587a3c0cd247817417d63e9a1a072fb2116 Mon Sep 17 00:00:00 2001 From: Dirga Yuza Date: Sun, 10 May 2026 12:58:01 +0700 Subject: [PATCH 910/972] ALSA: hda/realtek: Add micmute quirk to Acer Nitro AN515-58 The Acer Nitro AN515-58 uses GPIO 4 in active-low configuration to control the mic-mute LED. This patch adds a fixup to register the LED classdev and associate it with the mic-mute trigger, while chaining to the existing headset mic fixup. Signed-off-by: Dirga Yuza Link: https://patch.msgid.link/20260510055951.9035-1-dirgayuza123@gmail.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index d720565db4aaf2..3f7698b28b971a 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -3323,6 +3323,18 @@ static void alc256_fixup_acer_sfg16_micmute_led(struct hda_codec *codec, alc_fixup_hp_gpio_led(codec, action, 0, 0x04); } +static void alc287_fixup_acer_micmute_led(struct hda_codec *codec, + const struct hda_fixup *fix, int action) +{ + struct alc_spec *spec = codec->spec; + + alc_fixup_hp_gpio_led(codec, action, 0, 0x10); + if (action == HDA_FIXUP_ACT_PRE_PROBE) { + spec->micmute_led_polarity = 1; + spec->gpio_mask |= 0x10; + spec->gpio_dir |= 0x10; + } +} /* for alc295_fixup_hp_top_speakers */ #include "../helpers/hp_x360.c" @@ -4112,6 +4124,7 @@ enum { ALC245_FIXUP_CS35L41_I2C_2_MUTE_LED, ALC236_FIXUP_HP_DMIC, ALC256_FIXUP_HONOR_MRB_XXX_M1020_AUDIO, + ALC287_FIXUP_ACER_MICMUTE_LED, }; /* A special fixup for Lenovo C940 and Yoga Duet 7; @@ -6664,6 +6677,12 @@ static const struct hda_fixup alc269_fixups[] = { { 0x1b, 0x90170110 }, { } } + }, + [ALC287_FIXUP_ACER_MICMUTE_LED] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc287_fixup_acer_micmute_led, + .chained = true, + .chain_id = ALC2XX_FIXUP_HEADSET_MIC, } }; @@ -6714,7 +6733,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1025, 0x1466, "Acer Aspire A515-56", ALC255_FIXUP_ACER_HEADPHONE_AND_MIC), SND_PCI_QUIRK(0x1025, 0x1534, "Acer Predator PH315-54", ALC255_FIXUP_ACER_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1025, 0x1539, "Acer Nitro 5 AN515-57", ALC2XX_FIXUP_HEADSET_MIC), - SND_PCI_QUIRK(0x1025, 0x159c, "Acer Nitro 5 AN515-58", ALC2XX_FIXUP_HEADSET_MIC), + SND_PCI_QUIRK(0x1025, 0x159c, "Acer Nitro 5 AN515-58", ALC287_FIXUP_ACER_MICMUTE_LED), SND_PCI_QUIRK(0x1025, 0x1597, "Acer Nitro 5 AN517-55", ALC2XX_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1025, 0x160e, "Acer PT316-51S", ALC2XX_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1025, 0x1679, "Acer Nitro 16 AN16-41", ALC2XX_FIXUP_HEADSET_MIC), From 0a9c56dd387605d17dabeedd9fdd2c4c1d0bab7b Mon Sep 17 00:00:00 2001 From: Myeonghun Pak Date: Wed, 13 May 2026 15:57:00 +0900 Subject: [PATCH 911/972] drm/loongson: Use managed KMS polling lsdc_pci_probe() initializes KMS polling before setting up vblank support, requesting the IRQ and registering the DRM device. If any of those later steps fails, probe returns without finalizing polling. The driver also never finalizes polling on regular removal. Use drmm_kms_helper_poll_init() so polling is tied to the DRM device lifetime and automatically finalized on probe failure and device removal. This issue was identified during our ongoing static-analysis research while reviewing kernel code. Fixes: f39db26c5428 ("drm: Add kms driver for loongson display controller") Cc: stable@vger.kernel.org Co-developed-by: Ijae Kim Signed-off-by: Ijae Kim Reviewed-by: Thomas Zimmermann Acked-by: Jianmin Lv Reviewed-by: Huacai Chen Signed-off-by: Myeonghun Pak Signed-off-by: Thomas Zimmermann Link: https://patch.msgid.link/20260513065706.23803-1-mhun512@gmail.com --- drivers/gpu/drm/loongson/lsdc_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/loongson/lsdc_drv.c b/drivers/gpu/drm/loongson/lsdc_drv.c index 1ece1ea42f7816..34405073c4d48e 100644 --- a/drivers/gpu/drm/loongson/lsdc_drv.c +++ b/drivers/gpu/drm/loongson/lsdc_drv.c @@ -293,7 +293,7 @@ static int lsdc_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) vga_client_register(pdev, lsdc_vga_set_decode); - drm_kms_helper_poll_init(ddev); + drmm_kms_helper_poll_init(ddev); if (loongson_vblank) { ret = drm_vblank_init(ddev, descp->num_of_crtc); From 814b2c9b30e56074e11fc0a6e5419b3fee0639bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Mon, 11 May 2026 01:36:37 -0300 Subject: [PATCH 912/972] ALSA: usb-audio: qcom: Check offload mapping failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit uaudio_transfer_buffer_setup() calls dma_get_sgtable() and then passes the sg_table to uaudio_iommu_map_xfer_buf() without checking whether sg table construction succeeded. If dma_get_sgtable() fails, the sg_table contents are not valid. uaudio_iommu_map_pa() also ignores iommu_map() failures for the event and transfer rings and still returns the allocated IOVA to the QMI response. That can expose an unmapped IOVA to the audio DSP. For transfer rings, the failed mapping also leaves the IOVA allocator state marked in use. Check both operations. Free the coherent transfer buffer when sg table construction fails, free the sg table when transfer-buffer IOMMU mapping fails, and release the transfer-ring IOVA if iommu_map() fails. Also return the existing event-ring IOVA when the event ring is already mapped, matching the pre-split helper behavior. Fixes: 326bbc348298 ("ALSA: usb-audio: qcom: Introduce QC USB SND offloading support") Fixes: 44499ecb4f28 ("ALSA: usb: qcom: Fix false-positive address space check") Cc: stable@vger.kernel.org Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260511-alsa-usb-qcom-offload-map-errors-v1-1-6502695e58bc@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/qcom/qc_audio_offload.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/sound/usb/qcom/qc_audio_offload.c b/sound/usb/qcom/qc_audio_offload.c index 5f993b88448c7e..a0009503b2c592 100644 --- a/sound/usb/qcom/qc_audio_offload.c +++ b/sound/usb/qcom/qc_audio_offload.c @@ -565,6 +565,7 @@ static unsigned long uaudio_iommu_map_pa(enum mem_type mtype, bool dma_coherent, unsigned long iova = 0; bool map = true; int prot = uaudio_iommu_map_prot(dma_coherent); + int ret; switch (mtype) { case MEM_EVENT_RING: @@ -582,10 +583,24 @@ static unsigned long uaudio_iommu_map_pa(enum mem_type mtype, bool dma_coherent, dev_err(uaudio_qdev->data->dev, "unknown mem type %d\n", mtype); } - if (!iova || !map) + if (!iova) return 0; - iommu_map(uaudio_qdev->data->domain, iova, pa, size, prot, GFP_KERNEL); + if (!map) + return iova; + + ret = iommu_map(uaudio_qdev->data->domain, iova, pa, size, prot, + GFP_KERNEL); + if (ret) { + dev_err(uaudio_qdev->data->dev, + "failed to map %zu bytes at iova 0x%08lx: %d\n", + size, iova, ret); + if (mtype == MEM_XFER_RING) + uaudio_put_iova(iova, size, + &uaudio_qdev->xfer_ring_list, + &uaudio_qdev->xfer_ring_iova_size); + return 0; + } return iova; } @@ -1054,15 +1069,17 @@ static int uaudio_transfer_buffer_setup(struct snd_usb_substream *subs, if (!xfer_buf) return -ENOMEM; - dma_get_sgtable(subs->dev->bus->sysdev, &xfer_buf_sgt, xfer_buf, - xfer_buf_dma, len); + ret = dma_get_sgtable(subs->dev->bus->sysdev, &xfer_buf_sgt, xfer_buf, + xfer_buf_dma, len); + if (ret) + goto free_xfer_buf; /* map the physical buffer into sysdev as well */ xfer_buf_dma_sysdev = uaudio_iommu_map_xfer_buf(dma_coherent, len, &xfer_buf_sgt); if (!xfer_buf_dma_sysdev) { ret = -ENOMEM; - goto unmap_sync; + goto free_sgt; } mem_info->dma = xfer_buf_dma; @@ -1073,7 +1090,9 @@ static int uaudio_transfer_buffer_setup(struct snd_usb_substream *subs, return 0; -unmap_sync: +free_sgt: + sg_free_table(&xfer_buf_sgt); +free_xfer_buf: usb_free_coherent(subs->dev, len, xfer_buf, xfer_buf_dma); return ret; From cb02416507bc0c31415f1527aefc280ae96260bd Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Mon, 11 May 2026 00:54:47 -0700 Subject: [PATCH 913/972] ALSA: seq: Use flexible array for MIDI channels Store MIDI channel entries in the MIDI channel set allocation instead of allocating them separately. This ties the channel array lifetime directly to the channel set, removes a separate allocation failure path, and lets __counted_by() describe the array bounds. Move the embedded emux channel set to the end of its containing structure so it can carry the flexible array. Assisted-by: Codex:GPT-5.5 Signed-off-by: Rosen Penev Link: https://patch.msgid.link/20260511075447.445350-1-rosenp@gmail.com Signed-off-by: Takashi Iwai --- include/sound/emux_synth.h | 2 +- include/sound/seq_midi_emul.h | 6 +++--- sound/core/seq/seq_midi_emul.c | 38 ++++++++++------------------------ sound/synth/emux/emux_seq.c | 11 +++------- 4 files changed, 18 insertions(+), 39 deletions(-) diff --git a/include/sound/emux_synth.h b/include/sound/emux_synth.h index 3f7f365ed248cd..2c0a976e00abb4 100644 --- a/include/sound/emux_synth.h +++ b/include/sound/emux_synth.h @@ -125,7 +125,6 @@ struct snd_emux { */ struct snd_emux_port { - struct snd_midi_channel_set chset; struct snd_emux *emu; char port_mode; /* operation mode */ @@ -138,6 +137,7 @@ struct snd_emux_port { #if IS_ENABLED(CONFIG_SND_SEQUENCER_OSS) struct snd_seq_oss_arg *oss_arg; #endif + struct snd_midi_channel_set chset; }; /* port_mode */ diff --git a/include/sound/seq_midi_emul.h b/include/sound/seq_midi_emul.h index 88799d1e1f53df..afc7654378701e 100644 --- a/include/sound/seq_midi_emul.h +++ b/include/sound/seq_midi_emul.h @@ -55,14 +55,14 @@ struct snd_midi_channel_set { int client; /* Client for this port */ int port; /* The port number */ - int max_channels; /* Size of the channels array */ - struct snd_midi_channel *channels; - unsigned char midi_mode; /* MIDI operating mode */ unsigned char gs_master_volume; /* SYSEX master volume: 0-127 */ unsigned char gs_chorus_mode; unsigned char gs_reverb_mode; + int max_channels; /* Size of the channels array */ + struct snd_midi_channel channels[] __counted_by(max_channels); + }; struct snd_midi_op { diff --git a/sound/core/seq/seq_midi_emul.c b/sound/core/seq/seq_midi_emul.c index fd067c85c524fc..a90ebc7b3811b2 100644 --- a/sound/core/seq/seq_midi_emul.c +++ b/sound/core/seq/seq_midi_emul.c @@ -81,9 +81,6 @@ snd_midi_process_event(const struct snd_midi_op *ops, pr_debug("ALSA: seq_midi_emul: ev or chanbase NULL (snd_midi_process_event)\n"); return; } - if (chanset->channels == NULL) - return; - if (snd_seq_ev_is_channel_type(ev)) { dest_channel = ev->data.note.channel; if (dest_channel >= chanset->max_channels) { @@ -642,23 +639,6 @@ static void snd_midi_channel_init(struct snd_midi_channel *p, int n) p->drum_channel = 1; /* Default ch 10 as drums */ } -/* - * Allocate and initialise a set of midi channel control blocks. - */ -static struct snd_midi_channel *snd_midi_channel_init_set(int n) -{ - struct snd_midi_channel *chan; - int i; - - chan = kmalloc_objs(struct snd_midi_channel, n); - if (chan) { - for (i = 0; i < n; i++) - snd_midi_channel_init(chan+i, i); - } - - return chan; -} - /* * reset all midi channels */ @@ -687,13 +667,18 @@ reset_all_channels(struct snd_midi_channel_set *chset) struct snd_midi_channel_set *snd_midi_channel_alloc_set(int n) { struct snd_midi_channel_set *chset; + int i; + + chset = kmalloc_flex(*chset, channels, n); + if (!chset) + return NULL; + + chset->max_channels = n; + chset->private_data = NULL; + + for (i = 0; i < n; i++) + snd_midi_channel_init(&chset->channels[i], i); - chset = kmalloc_obj(*chset); - if (chset) { - chset->channels = snd_midi_channel_init_set(n); - chset->private_data = NULL; - chset->max_channels = n; - } return chset; } EXPORT_SYMBOL(snd_midi_channel_alloc_set); @@ -717,7 +702,6 @@ void snd_midi_channel_free_set(struct snd_midi_channel_set *chset) { if (chset == NULL) return; - kfree(chset->channels); kfree(chset); } EXPORT_SYMBOL(snd_midi_channel_free_set); diff --git a/sound/synth/emux/emux_seq.c b/sound/synth/emux/emux_seq.c index 2ed01e9d79bb90..01ad5b12b680f5 100644 --- a/sound/synth/emux/emux_seq.c +++ b/sound/synth/emux/emux_seq.c @@ -132,19 +132,15 @@ snd_emux_create_port(struct snd_emux *emu, char *name, int i, type, cap; /* Allocate structures for this channel */ - p = kzalloc_obj(*p); + p = kzalloc_flex(*p, chset.channels, max_channels); if (!p) return NULL; - p->chset.channels = kzalloc_objs(*p->chset.channels, max_channels); - if (!p->chset.channels) { - kfree(p); - return NULL; - } + p->chset.max_channels = max_channels; + for (i = 0; i < max_channels; i++) p->chset.channels[i].number = i; p->chset.private_data = p; - p->chset.max_channels = max_channels; p->emu = emu; p->chset.client = emu->client; #ifdef SNDRV_EMUX_USE_RAW_EFFECT @@ -182,7 +178,6 @@ free_port(void *private_data) #ifdef SNDRV_EMUX_USE_RAW_EFFECT snd_emux_delete_effect(p); #endif - kfree(p->chset.channels); kfree(p); } } From 74e8409821ac8cda70bf23eb593f2c7f6e3b5a2f Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 11 May 2026 11:41:48 +0100 Subject: [PATCH 914/972] ALSA: doc: cs35l56: Update path to HDA driver source The HDA drivers were moved to sound/hda/... so update a Documentation reference that still pointed to the old location. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20260511104148.36382-1-rf@opensource.cirrus.com Signed-off-by: Takashi Iwai --- Documentation/sound/codecs/cs35l56.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/sound/codecs/cs35l56.rst b/Documentation/sound/codecs/cs35l56.rst index d5363b08f5152f..b3f8c1c2385185 100644 --- a/Documentation/sound/codecs/cs35l56.rst +++ b/Documentation/sound/codecs/cs35l56.rst @@ -40,7 +40,7 @@ There are two drivers in the kernel *For systems using SoundWire*: sound/soc/codecs/cs35l56.c and associated files -*For systems using HDA*: sound/pci/hda/cs35l56_hda.c +*For systems using HDA*: sound/hda/codecs/side-codecs/cs35l56_hda.c Firmware ======== From d02d2d51a50d1bbf44a50eda094aa2b10fecf023 Mon Sep 17 00:00:00 2001 From: Edson Juliano Drosdeck Date: Mon, 11 May 2026 15:15:58 -0300 Subject: [PATCH 915/972] ALSA: hda/realtek: Limit mic boost on Positivo DN50E The internal mic boost on the Positivo DN50E is too high. Fix this by applying the ALC269_FIXUP_LIMIT_INT_MIC_BOOST fixup to the machine to limit the gain. Signed-off-by: Edson Juliano Drosdeck Link: https://patch.msgid.link/20260511181558.670563-1-edson.drosdeck@gmail.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 4e2c8401c404cf..3323793cdc14a7 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7830,6 +7830,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1d72, 0x1945, "Redmi G", ALC256_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1d72, 0x1947, "RedmiBook Air", ALC255_FIXUP_XIAOMI_HEADSET_MIC), SND_PCI_QUIRK(0x1e39, 0xca14, "MEDION NM14LNL", ALC233_FIXUP_MEDION_MTL_SPK), + SND_PCI_QUIRK(0x1e50, 0x7007, "Positivo DN50E", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x1ee7, 0x2078, "HONOR BRB-X M1010", ALC2XX_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1ee7, 0x2081, "HONOR MRB-XXX M1020", ALC256_FIXUP_HONOR_MRB_XXX_M1020_AUDIO), SND_PCI_QUIRK(0x1f4c, 0xe001, "Minisforum V3 (SE)", ALC245_FIXUP_BASS_HP_DAC), From da5e9f8e552128ba7f7b930986d31567f2b188a3 Mon Sep 17 00:00:00 2001 From: Simon Wood Date: Mon, 11 May 2026 15:24:58 -0600 Subject: [PATCH 916/972] ALSA: M-Audio C600 disable Output Gain Knob The C400/C600 interfaces have a large output gain knob which attenuates the outputs as stereo pairs (1/2, 3/4 and 5/6). The Windows driver/app provides a control to disable this knob/behaviour on any/all outputs, forcing maximum gain on each. The 'disable behaviour' is desirable if any outputs are being used for aux/effects sends, or if interface is being used as a live/matrix mixer. This patch adds a control to select which output pairs are affected by the Output Gain Knob. Default behaviour is to select all outputs. Tested on the C600, likely also works for the C400. Signed-off-by: Simon Wood Link: https://patch.msgid.link/20260511212458.44142-1-simon@mungewell.org Signed-off-by: Takashi Iwai --- sound/usb/mixer_quirks.c | 46 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/sound/usb/mixer_quirks.c b/sound/usb/mixer_quirks.c index 99975c3240a551..628f841b04aaed 100644 --- a/sound/usb/mixer_quirks.c +++ b/sound/usb/mixer_quirks.c @@ -1301,7 +1301,7 @@ static int snd_ftu_eff_switch_init(struct usb_mixer_interface *mixer, err = snd_usb_ctl_msg(dev, usb_rcvctrlpipe(dev, 0), UAC_GET_CUR, USB_RECIP_INTERFACE | USB_TYPE_CLASS | USB_DIR_IN, - pval & 0xff00, + (pval & 0xff00) | ((pval & 0xff0000) >> 16), snd_usb_ctrl_intf(mixer->hostif) | ((pval & 0xff) << 8), value, 2); if (err < 0) @@ -1334,7 +1334,7 @@ static int snd_ftu_eff_switch_update(struct usb_mixer_elem_list *list) usb_sndctrlpipe(chip->dev, 0), UAC_SET_CUR, USB_RECIP_INTERFACE | USB_TYPE_CLASS | USB_DIR_OUT, - pval & 0xff00, + (pval & 0xff00) | ((pval & 0xff0000) >> 16), snd_usb_ctrl_intf(list->mixer->hostif) | ((pval & 0xff) << 8), value, 2); } @@ -1754,6 +1754,44 @@ static int snd_c400_create_effect_ret_vol_ctls(struct usb_mixer_interface *mixer return 0; } +/* output gain knob selectively adjusts outputs as stereo pairs */ +/* reuses functions from FTU effect switch */ +static int snd_c400_knob_switch_info(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo) +{ + static const char *const texts[8] = { + "None", "1/2", "3/4", "1/2 3/4", + "5/6", "1/2 5/6", "3/4 5/6", "1/2 3/4 5/6" + }; + + return snd_ctl_enum_info(uinfo, 1, ARRAY_SIZE(texts), texts); +} + +static int snd_c400_create_knob_switch(struct usb_mixer_interface *mixer, + int validx, int bUnitID) +{ + static struct snd_kcontrol_new template = { + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, + .name = "Output Gain Knob", + .index = 0, + .access = SNDRV_CTL_ELEM_ACCESS_READWRITE, + .info = snd_c400_knob_switch_info, + .get = snd_ftu_eff_switch_get, + .put = snd_ftu_eff_switch_put + }; + struct usb_mixer_elem_list *list; + int err; + + err = add_single_ctl_with_resume(mixer, bUnitID, + snd_ftu_eff_switch_update, + &template, &list); + if (err < 0) + return err; + list->kctl->private_value = (validx << 8) | bUnitID; + snd_ftu_eff_switch_init(mixer, list->kctl); + return 0; +} + static int snd_c400_create_mixer(struct usb_mixer_interface *mixer) { int err; @@ -1786,6 +1824,10 @@ static int snd_c400_create_mixer(struct usb_mixer_interface *mixer) if (err < 0) return err; + err = snd_c400_create_knob_switch(mixer, 0x0900, 0x20); + if (err < 0) + return err; + return 0; } From 57cd11e788bdf68460eedbc414be9fb4ac8d8f4c Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Mon, 11 May 2026 16:00:26 -0700 Subject: [PATCH 917/972] ALSA: ctxfi: Use flexible array for SRCIMP imappers Store the SRCIMP imapper entries in the SRCIMP resource allocation instead of allocating a separate array. This keeps the mapper table tied to the SRCIMP lifetime and removes the extra allocation and cleanup paths. Assisted-by: Codex:GPT-5.5 Signed-off-by: Rosen Penev Link: https://patch.msgid.link/20260511230026.28488-1-rosenp@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/ctxfi/ctsrc.c | 15 +-------------- sound/pci/ctxfi/ctsrc.h | 2 +- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/sound/pci/ctxfi/ctsrc.c b/sound/pci/ctxfi/ctsrc.c index 326997bb885df4..46dc1f509234f0 100644 --- a/sound/pci/ctxfi/ctsrc.c +++ b/sound/pci/ctxfi/ctsrc.c @@ -668,13 +668,6 @@ static int srcimp_rsc_init(struct srcimp *srcimp, if (err) return err; - /* Reserve memory for imapper nodes */ - srcimp->imappers = kzalloc_objs(struct imapper, desc->msr); - if (!srcimp->imappers) { - err = -ENOMEM; - goto error1; - } - /* Set srcimp specific operations */ srcimp->rsc.ops = &srcimp_basic_rsc_ops; srcimp->ops = &srcimp_ops; @@ -683,16 +676,10 @@ static int srcimp_rsc_init(struct srcimp *srcimp, srcimp->rsc.ops->master(&srcimp->rsc); return 0; - -error1: - rsc_uninit(&srcimp->rsc); - return err; } static int srcimp_rsc_uninit(struct srcimp *srcimp) { - kfree(srcimp->imappers); - srcimp->imappers = NULL; srcimp->ops = NULL; srcimp->mgr = NULL; rsc_uninit(&srcimp->rsc); @@ -711,7 +698,7 @@ static int get_srcimp_rsc(struct srcimp_mgr *mgr, *rsrcimp = NULL; /* Allocate mem for SRCIMP resource */ - srcimp = kzalloc(sizeof(*srcimp), GFP_KERNEL); + srcimp = kzalloc_flex(*srcimp, imappers, desc->msr); if (!srcimp) return -ENOMEM; diff --git a/sound/pci/ctxfi/ctsrc.h b/sound/pci/ctxfi/ctsrc.h index e6366cc6a7ae6a..7ba94adde92085 100644 --- a/sound/pci/ctxfi/ctsrc.h +++ b/sound/pci/ctxfi/ctsrc.h @@ -103,10 +103,10 @@ struct srcimp_rsc_ops; struct srcimp { struct rsc rsc; unsigned char idx[8]; - struct imapper *imappers; unsigned int mapped; /* A bit-map indicating which conj rsc is mapped */ struct srcimp_mgr *mgr; const struct srcimp_rsc_ops *ops; + struct imapper imappers[]; }; struct srcimp_rsc_ops { From 3aec8baeb517c3aa391fecd2136ab4a1d8e5fd74 Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Mon, 11 May 2026 16:01:21 -0700 Subject: [PATCH 918/972] ALSA: asihpi: Use flexible array for control cache Store the ASIHPI control-cache lookup table in the control-cache allocation instead of allocating it separately. This keeps the lookup table tied to the cache object lifetime and removes the extra allocation and free path. Assisted-by: Codex:GPT-5.5 Signed-off-by: Rosen Penev Link: https://patch.msgid.link/20260511230121.28606-1-rosenp@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/asihpi/hpicmn.c | 16 ++++------------ sound/pci/asihpi/hpicmn.h | 4 ++-- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/sound/pci/asihpi/hpicmn.c b/sound/pci/asihpi/hpicmn.c index d846777e7462b3..0674fd854cc8bd 100644 --- a/sound/pci/asihpi/hpicmn.c +++ b/sound/pci/asihpi/hpicmn.c @@ -641,17 +641,12 @@ void hpi_cmn_control_cache_sync_to_msg(struct hpi_control_cache *p_cache, struct hpi_control_cache *hpi_alloc_control_cache(const u32 control_count, const u32 size_in_bytes, u8 *p_dsp_control_buffer) { - struct hpi_control_cache *p_cache = kmalloc_obj(*p_cache); + struct hpi_control_cache *p_cache; + + p_cache = kzalloc_flex(*p_cache, p_info, control_count); if (!p_cache) return NULL; - p_cache->p_info = - kzalloc_objs(*p_cache->p_info, control_count); - if (!p_cache->p_info) { - kfree(p_cache); - return NULL; - } - p_cache->cache_size_in_bytes = size_in_bytes; p_cache->control_count = control_count; p_cache->p_cache = p_dsp_control_buffer; @@ -661,10 +656,7 @@ struct hpi_control_cache *hpi_alloc_control_cache(const u32 control_count, void hpi_free_control_cache(struct hpi_control_cache *p_cache) { - if (p_cache) { - kfree(p_cache->p_info); - kfree(p_cache); - } + kfree(p_cache); } static void subsys_message(struct hpi_message *phm, struct hpi_response *phr) diff --git a/sound/pci/asihpi/hpicmn.h b/sound/pci/asihpi/hpicmn.h index 8ec656cf8848c7..40af7329587a12 100644 --- a/sound/pci/asihpi/hpicmn.h +++ b/sound/pci/asihpi/hpicmn.h @@ -37,10 +37,10 @@ struct hpi_control_cache { u16 adap_idx; u32 control_count; u32 cache_size_in_bytes; - /** pointer to allocated memory of lookup pointers. */ - struct hpi_control_cache_info **p_info; /** pointer to DSP's control cache. */ u8 *p_cache; + /** pointer to allocated memory of lookup pointers. */ + struct hpi_control_cache_info *p_info[] __counted_by(control_count); }; struct hpi_adapter_obj *hpi_find_adapter(u16 adapter_index); From 67c73815220784074ff13ec07df955911caf1b73 Mon Sep 17 00:00:00 2001 From: Daniel Schaefer Date: Wed, 13 May 2026 23:55:13 +0800 Subject: [PATCH 919/972] ALSA: hda/realtek: fix mic boost on Framework PTL In addition to the mic jack fix, also need to avoid boosting the internal mic too much, otherwise >50% input volume clips a lot. Also add a second SSID. We have one for the classic chassis/speaker and one for the new Pro chassis/speaker. To: Jaroslav Kysela To: Takashi Iwai To: linux-sound@vger.kernel.org Cc: Dustin L. Howett Cc: linux@frame.work Signed-off-by: Daniel Schaefer Link: https://patch.msgid.link/20260513155513.11683-1-dhs@frame.work Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 3323793cdc14a7..ef1eccda70df82 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -4095,6 +4095,7 @@ enum { ALC245_FIXUP_CS35L41_SPI_4_HP_GPIO_LED, ALC285_FIXUP_HP_SPEAKERS_MICMUTE_LED, ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE, + ALC295_FIXUP_FRAMEWORK_LAPTOP_LIMIT_INT_MIC_BOOST, ALC287_FIXUP_LEGION_16ITHG6, ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK, ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN, @@ -6346,6 +6347,12 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC269_FIXUP_HEADSET_MODE_NO_HP_MIC }, + [ALC295_FIXUP_FRAMEWORK_LAPTOP_LIMIT_INT_MIC_BOOST] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc269_fixup_limit_int_mic_boost, + .chained = true, + .chain_id = ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE, + }, [ALC287_FIXUP_LEGION_16ITHG6] = { .type = HDA_FIXUP_FUNC, .v.func = alc287_fixup_legion_16ithg6_speakers, @@ -7856,7 +7863,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0xf111, 0x0009, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0xf111, 0x000b, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0xf111, 0x000c, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), - SND_PCI_QUIRK(0xf111, 0x000f, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0xf111, 0x000f, "Framework Laptop 13 Pro PTL", ALC295_FIXUP_FRAMEWORK_LAPTOP_LIMIT_INT_MIC_BOOST), + SND_PCI_QUIRK(0xf111, 0x010f, "Framework Laptop 13 PTL", ALC295_FIXUP_FRAMEWORK_LAPTOP_LIMIT_INT_MIC_BOOST), #if 0 /* Below is a quirk table taken from the old code. From 2891bb13ef158281736b6314b1ceaef6d08d57f4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 13 May 2026 18:27:58 +0200 Subject: [PATCH 920/972] ALSA: hda/cs35l56: Drop malformed default N from Kconfig First of all, it has to be 'default n' (small letter n), otherwise it looks for CONFIG_N which is absent and in case of appearance will enable something unrelated. Second and most important is that 'n' *is* the default 'default' already. Hence just drop malformed line. Signed-off-by: Andy Shevchenko Reviewed-by: Richard Fitzgerald Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20260513162758.365972-1-andriy.shevchenko@linux.intel.com --- sound/hda/codecs/side-codecs/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/hda/codecs/side-codecs/Kconfig b/sound/hda/codecs/side-codecs/Kconfig index fc5651e555e3c2..e51964c0a09162 100644 --- a/sound/hda/codecs/side-codecs/Kconfig +++ b/sound/hda/codecs/side-codecs/Kconfig @@ -94,7 +94,6 @@ menu "CS35L56 driver options" config SND_HDA_SCODEC_CS35L56_CAL_DEBUGFS bool "CS35L56 create debugfs for factory calibration" - default N depends on DEBUG_FS select SND_SOC_CS35L56_CAL_DEBUGFS_COMMON help From fd87b510f5f543125ecf51e7c706a9f4bc3352be Mon Sep 17 00:00:00 2001 From: Markus Kramer Date: Thu, 14 May 2026 00:28:18 +0200 Subject: [PATCH 921/972] ALSA: hda/realtek: Add quirk for Samsung Galaxy Book5 360 headphone The Samsung Galaxy Book5 360 (NP750QHA, PCI subsystem ID 0x144d:0xc902) has severe audio distortion on the 3.5mm headphone jack. Applying ALC256_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET corrects the output path configuration, consistent with fixes already applied to other Samsung Galaxy Book models using the same ALC256 codec. Cc: stable@vger.kernel.org Link: https://github.com/thesofproject/linux/issues/5648 Signed-off-by: Markus Kramer Link: https://patch.msgid.link/20260513222818.14351-1-linux@markus-kramer.de Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index ef1eccda70df82..0c501e9fdc2711 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7511,6 +7511,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x144d, 0xc870, "Samsung Galaxy Book2 Pro (NP950XED)", ALC298_FIXUP_SAMSUNG_AMP_V2_2_AMPS), SND_PCI_QUIRK(0x144d, 0xc872, "Samsung Galaxy Book2 Pro (NP950XEE)", ALC298_FIXUP_SAMSUNG_AMP_V2_2_AMPS), SND_PCI_QUIRK(0x144d, 0xc886, "Samsung Galaxy Book3 Pro (NP964XFG)", ALC298_FIXUP_SAMSUNG_AMP_V2_4_AMPS), + SND_PCI_QUIRK(0x144d, 0xc902, "Samsung Galaxy Book5 360 (NP750QHA)", ALC256_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET), SND_PCI_QUIRK(0x144d, 0xc1ca, "Samsung Galaxy Book3 Pro 360 (NP960QFG)", ALC298_FIXUP_SAMSUNG_AMP_V2_4_AMPS), SND_PCI_QUIRK(0x144d, 0xc1cb, "Samsung Galaxy Book3 Pro 360 (NP965QFG)", ALC298_FIXUP_SAMSUNG_AMP_V2_4_AMPS), SND_PCI_QUIRK(0x144d, 0xc1cc, "Samsung Galaxy Book3 Ultra (NT960XFH)", ALC298_FIXUP_SAMSUNG_AMP_V2_4_AMPS), From 9921941929ab014038c357b30567d93d20393a94 Mon Sep 17 00:00:00 2001 From: Quan Sun <2022090917019@std.uestc.edu.cn> Date: Thu, 14 May 2026 21:22:45 +0800 Subject: [PATCH 922/972] ALSA: hda: Fix NULL pointer dereference in snd_hda_ctl_add() snd_hda_ctl_add() dereferences kctl->id.subdevice without checking whether kctl is NULL. Multiple callers in sound/hda/codecs/ca0132.c pass the return value of snd_ctl_new1() directly to snd_hda_ctl_add() without a NULL check: return snd_hda_ctl_add(codec, nid, snd_ctl_new1(&knew, codec)); snd_ctl_new1() returns NULL when the underlying snd_ctl_new() fails on memory allocation (kzalloc_flex),which can occur under memory pressure or via fault injection. Add a NULL check at the entry of snd_hda_ctl_add(), matching the pattern already used by snd_ctl_add_replace() at the same call path (sound/core/control.c:515). Return -EINVAL to let callers handle the error gracefully. Fixes: 44f0c9782cc6 ("ALSA: hda/ca0132: Add tuning controls") Signed-off-by: Quan Sun <2022090917019@std.uestc.edu.cn> Link: https://patch.msgid.link/20260514132245.3062884-1-2022090917019@std.uestc.edu.cn Signed-off-by: Takashi Iwai --- sound/hda/common/codec.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/hda/common/codec.c b/sound/hda/common/codec.c index c2af2511a83160..81f266b9b850f7 100644 --- a/sound/hda/common/codec.c +++ b/sound/hda/common/codec.c @@ -1699,6 +1699,9 @@ int snd_hda_ctl_add(struct hda_codec *codec, hda_nid_t nid, unsigned short flags = 0; struct hda_nid_item *item; + if (!kctl) + return -EINVAL; + if (kctl->id.subdevice & HDA_SUBDEV_AMP_FLAG) { flags |= HDA_NID_ITEM_AMP; if (nid == 0) From 83dca2530fb3ba63f47bad339d890bc30aa06ab5 Mon Sep 17 00:00:00 2001 From: Jackie Dong Date: Thu, 14 May 2026 23:39:40 +0800 Subject: [PATCH 923/972] ALSA: hda/realtek: ALC269 fixup for Lenovo Yoga Pro 7 15ASH111 audio Volume control for the speakers on the Lenovo Yoga Pro 7 15ASH11 laptop doesn't work. The DAC routing is the same as on the ThinkPad X1 Gen7 function, so reuse the alc285_fixup_thinkpad_x1_gen7 to get it working. Signed-off-by: Jackie Dong Link: https://patch.msgid.link/20260514153940.7320-1-xy-jackie@139.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 0c501e9fdc2711..9946ae185f4abd 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7762,6 +7762,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x38df, "Y990 YG DUAL", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x38f9, "Thinkbook 16P Gen5", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), SND_PCI_QUIRK(0x17aa, 0x38fa, "Thinkbook 16P Gen5", ALC287_FIXUP_MG_RTKC_CSAMP_CS35L41_I2C_THINKPAD), + SND_PCI_QUIRK(0x17aa, 0x38fc, "Lenovo Yoga Pro 7 15ASH11", ALC245_FIXUP_BASS_HP_DAC), SND_PCI_QUIRK(0x17aa, 0x38fd, "ThinkBook plus Gen5 Hybrid", ALC287_FIXUP_TAS2781_I2C), SND_PCI_QUIRK(0x17aa, 0x3902, "Lenovo E50-80", ALC269_FIXUP_DMIC_THINKPAD_ACPI), SND_PCI_QUIRK(0x17aa, 0x390d, "Lenovo Yoga Pro 7 14ASP10", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), From 7d1051ad68df3d584b5f24bfa1fb19f3a24db278 Mon Sep 17 00:00:00 2001 From: Adrien Burnett Date: Thu, 14 May 2026 18:59:05 +0200 Subject: [PATCH 924/972] ALSA: hda/realtek: Add mute LED quirk for HP Pavilion Laptop 16-ag0xxx Add a SND_PCI_QUIRK entry for the HP Pavilion Laptop 16-ag0xxx (subsystem 0x103c:0x8cbc, Realtek ALC245). The ALC245_FIXUP_HP_X360_MUTE_LEDS fixup is already used by the neighbouring HP Pavilion Aero Laptop 13-bg0xxx (0x103c:0x8cbd); it chains the master-mute COEF handler with the GPIO mic-mute LED handler, which is what this machine needs. Tested on the affected hardware: both the mute and mic-mute key LEDs respond correctly to the keyboard hotkeys after this change. Cc: Signed-off-by: Adrien Burnett Link: https://patch.msgid.link/20260514165905.21175-1-an.arctic.pigeon@gmail.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 9946ae185f4abd..2f5d13032fd7ac 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7200,6 +7200,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8ca4, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ca7, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8caf, "HP Elite mt645 G8 Mobile Thin Client", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8cbc, "HP Pavilion Laptop 16-ag0xxx", ALC245_FIXUP_HP_X360_MUTE_LEDS), SND_PCI_QUIRK(0x103c, 0x8cbd, "HP Pavilion Aero Laptop 13-bg0xxx", ALC245_FIXUP_HP_X360_MUTE_LEDS), SND_PCI_QUIRK(0x103c, 0x8cdd, "HP Spectre", ALC245_FIXUP_HP_SPECTRE_X360_EU0XXX), SND_PCI_QUIRK(0x103c, 0x8cde, "HP OmniBook Ultra Flip Laptop 14t", ALC245_FIXUP_HP_SPECTRE_X360_EU0XXX), From 6fd9f6e870ea285f05102e8e00e6a7f4495a9a02 Mon Sep 17 00:00:00 2001 From: Matt DeVillier Date: Thu, 7 May 2026 09:58:41 -0500 Subject: [PATCH 925/972] ALSA: hda/ca0132: Disable auto-detect on manual output select Commit 778031e1658d ("ALSA: hda/ca0132: Set HP/Speaker auto-detect default from headphone pin verb") enables HP/Speaker auto-detect by default when the headphone pin supports presence detect. With auto-detect enabled, ca0132_select_out() and ca0132_alt_select_out() choose the output from jack presence instead of the manual HP/Speaker selection. This means selecting speaker output while headphones are plugged in updates the control state, but audio still routes to the headphones. Treat an explicit manual output selection as a request to leave auto-detect mode. Clear the HP/Speaker auto-detect switch before applying the manual selection, and notify userspace so the auto-detect control state is updated in mixers. Do this for both the normal HP/Speaker Playback Switch and the alternate Output Select control used by desktop cards. This keeps auto-detect enabled by default for devices with jack presence detection, while preserving the expected behavior that a manual output choice takes effect immediately. Fixes: 778031e1658d ("ALSA: hda/ca0132: Set HP/Speaker auto-detect default from headphone pin verb") Signed-off-by: Matt DeVillier Link: https://lore.kernel.org/CAFTm+6AfeXKf=b2frG4xC5yC4jjM9TkD6c8+dOWWFw6BDjDESw@mail.gmail.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/ca0132.c | 44 +++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/sound/hda/codecs/ca0132.c b/sound/hda/codecs/ca0132.c index ad533b04ab29cf..be565ffaade0ab 100644 --- a/sound/hda/codecs/ca0132.c +++ b/sound/hda/codecs/ca0132.c @@ -5498,6 +5498,30 @@ static int zxr_headphone_gain_set(struct hda_codec *codec, long val) return 0; } +/* + * Manual output selection (HP/Speaker Playback Switch or alt Output Select) + * is meaningful only when HP/Speaker auto-detect is disabled, since the + * select_out path always prefers jack presence when auto-detect is on. When + * the user explicitly chooses an output, turn auto-detect off so the manual + * choice actually takes effect, and notify userspace so the auto-detect + * control reflects the new state. + */ +static void ca0132_disable_hp_auto_detect(struct hda_codec *codec) +{ + struct ca0132_spec *spec = codec->spec; + struct snd_kcontrol *kctl; + + if (!spec->vnode_lswitch[VNID_HP_ASEL - VNODE_START_NID]) + return; + + spec->vnode_lswitch[VNID_HP_ASEL - VNODE_START_NID] = 0; + kctl = snd_hda_find_mixer_ctl(codec, + "HP/Speaker Auto Detect Playback Switch"); + if (kctl) + snd_ctl_notify(codec->card, SNDRV_CTL_EVENT_MASK_VALUE, + &kctl->id); +} + static int ca0132_vnode_switch_set(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { @@ -5510,14 +5534,11 @@ static int ca0132_vnode_switch_set(struct snd_kcontrol *kcontrol, int auto_jack; if (nid == VNID_HP_SEL) { - auto_jack = - spec->vnode_lswitch[VNID_HP_ASEL - VNODE_START_NID]; - if (!auto_jack) { - if (ca0132_use_alt_functions(spec)) - ca0132_alt_select_out(codec); - else - ca0132_select_out(codec); - } + ca0132_disable_hp_auto_detect(codec); + if (ca0132_use_alt_functions(spec)) + ca0132_alt_select_out(codec); + else + ca0132_select_out(codec); return 1; } @@ -5978,7 +5999,6 @@ static int ca0132_alt_output_select_put(struct snd_kcontrol *kcontrol, struct ca0132_spec *spec = codec->spec; int sel = ucontrol->value.enumerated.item[0]; unsigned int items = NUM_OF_OUTPUTS; - unsigned int auto_jack; if (sel >= items) return 0; @@ -5988,10 +6008,8 @@ static int ca0132_alt_output_select_put(struct snd_kcontrol *kcontrol, spec->out_enum_val = sel; - auto_jack = spec->vnode_lswitch[VNID_HP_ASEL - VNODE_START_NID]; - - if (!auto_jack) - ca0132_alt_select_out(codec); + ca0132_disable_hp_auto_detect(codec); + ca0132_alt_select_out(codec); return 1; } From c77a6cbb36ff8cbc1f084d94f8dcda5250935271 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Thu, 7 May 2026 11:28:30 -0300 Subject: [PATCH 926/972] ALSA: virtio: Validate control metadata from the device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit virtio-snd control handling trusts the device-provided control type and value count returned by the device. That metadata is then used directly to index g_v2a_type_map[] in virtsnd_kctl_info(), and to size loops and memcpy() operations in virtsnd_kctl_get() and virtsnd_kctl_put() against fixed-size virtio_snd_ctl_value and snd_ctl_elem_value arrays. A buggy or malicious device can therefore trigger out-of-bounds access by advertising an invalid control type or an oversized value count. Validate control type and count once in virtsnd_kctl_parse_cfg(), before querying enumerated items or exposing the control to ALSA. Fixes: d6568e3de42d ("ALSA: virtio: add support for audio controls") Cc: stable@vger.kernel.org Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260507-alsa-virtio-validate-kctl-info-v1-1-7404fb12ec37@gmail.com Signed-off-by: Takashi Iwai --- sound/virtio/virtio_kctl.c | 50 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/sound/virtio/virtio_kctl.c b/sound/virtio/virtio_kctl.c index ffb903d56297a7..45f7b6a5b3080a 100644 --- a/sound/virtio/virtio_kctl.c +++ b/sound/virtio/virtio_kctl.c @@ -18,6 +18,21 @@ static const snd_ctl_elem_type_t g_v2a_type_map[] = { [VIRTIO_SND_CTL_TYPE_IEC958] = SNDRV_CTL_ELEM_TYPE_IEC958 }; +/* Map for converting VirtIO types to maximum value counts. */ +static const unsigned int g_v2a_count_map[] = { + [VIRTIO_SND_CTL_TYPE_BOOLEAN] = + ARRAY_SIZE(((struct virtio_snd_ctl_value *)0)->value.integer), + [VIRTIO_SND_CTL_TYPE_INTEGER] = + ARRAY_SIZE(((struct virtio_snd_ctl_value *)0)->value.integer), + [VIRTIO_SND_CTL_TYPE_INTEGER64] = + ARRAY_SIZE(((struct virtio_snd_ctl_value *)0)->value.integer64), + [VIRTIO_SND_CTL_TYPE_ENUMERATED] = + ARRAY_SIZE(((struct virtio_snd_ctl_value *)0)->value.enumerated), + [VIRTIO_SND_CTL_TYPE_BYTES] = + ARRAY_SIZE(((struct virtio_snd_ctl_value *)0)->value.bytes), + [VIRTIO_SND_CTL_TYPE_IEC958] = 1 +}; + /* Map for converting VirtIO access rights to ALSA access rights. */ static const unsigned int g_v2a_access_map[] = { [VIRTIO_SND_CTL_ACCESS_READ] = SNDRV_CTL_ELEM_ACCESS_READ, @@ -36,6 +51,37 @@ static const unsigned int g_v2a_mask_map[] = { [VIRTIO_SND_CTL_EVT_MASK_TLV] = SNDRV_CTL_EVENT_MASK_TLV }; +static int virtsnd_kctl_validate_info(struct virtio_snd *snd, u32 cid, + struct virtio_snd_ctl_info *kinfo) +{ + struct virtio_device *vdev = snd->vdev; + unsigned int type = le32_to_cpu(kinfo->type); + unsigned int count = le32_to_cpu(kinfo->count); + + if (type >= ARRAY_SIZE(g_v2a_type_map)) { + dev_err(&vdev->dev, "control #%u: unknown type %u\n", + cid, type); + return -EINVAL; + } + + if (count > g_v2a_count_map[type] || + (type == VIRTIO_SND_CTL_TYPE_IEC958 && count != 1)) { + dev_err(&vdev->dev, "control #%u: invalid count %u for type %u\n", + cid, count, type); + return -EINVAL; + } + + if (type == VIRTIO_SND_CTL_TYPE_ENUMERATED && + !le32_to_cpu(kinfo->value.enumerated.items)) { + dev_err(&vdev->dev, + "control #%u: no items for enumerated control\n", + cid); + return -EINVAL; + } + + return 0; +} + /** * virtsnd_kctl_info() - Returns information about the control. * @kcontrol: ALSA control element. @@ -385,6 +431,10 @@ int virtsnd_kctl_parse_cfg(struct virtio_snd *snd) struct virtio_snd_ctl_info *kinfo = &snd->kctl_infos[i]; unsigned int type = le32_to_cpu(kinfo->type); + rc = virtsnd_kctl_validate_info(snd, i, kinfo); + if (rc) + return rc; + if (type == VIRTIO_SND_CTL_TYPE_ENUMERATED) { rc = virtsnd_kctl_get_enum_items(snd, i); if (rc) From c84179a1d36bebe99d9694502737ae9f3a90d2bc Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Fri, 15 May 2026 16:30:43 +0800 Subject: [PATCH 927/972] ASoC: Intel: sof_sdw: append dai type to dai link name unconditionally The dai_type is used to select function topologies. Since the topology stream name and DAI link name use partial matching, unconditionally appending the dai_type provides necessary selection metadata without breaking existing topologies. Signed-off-by: Bard Liao Reviewed-by: Kai Vehmanen Reviewed-by: Liam Girdwood Link: https://patch.msgid.link/20260515083043.1864426-1-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/boards/sof_sdw.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c index c18ec607e02967..5c4682f9f7c9b1 100644 --- a/sound/soc/intel/boards/sof_sdw.c +++ b/sound/soc/intel/boards/sof_sdw.c @@ -901,10 +901,16 @@ static int create_sdw_dailink(struct snd_soc_card *card, } } + /* + * The dai_type is used to select function topologies. Since the topology stream name + * and DAI link name use partial matching, unconditionally appending the dai_type provides + * necessary selection metadata without breaking existing topologies. Although + * ctx->append_dai_type is not checked here, we overwrite it to ensure consistency in case + * it is referenced elsewhere. + */ + ctx->append_dai_type = true; for_each_pcm_streams(stream) { static const char * const sdw_stream_name[] = { - "SDW%d-Playback", - "SDW%d-Capture", "SDW%d-Playback-%s", "SDW%d-Capture-%s", }; @@ -932,15 +938,10 @@ static int create_sdw_dailink(struct snd_soc_card *card, } /* create stream name according to first link id */ - if (ctx->append_dai_type) - name = devm_kasprintf(dev, GFP_KERNEL, - sdw_stream_name[stream + 2], - ffs(sof_end->link_mask) - 1, - type_strings[sof_end->dai_info->dai_type]); - else - name = devm_kasprintf(dev, GFP_KERNEL, - sdw_stream_name[stream], - ffs(sof_end->link_mask) - 1); + name = devm_kasprintf(dev, GFP_KERNEL, + sdw_stream_name[stream], + ffs(sof_end->link_mask) - 1, + type_strings[sof_end->dai_info->dai_type]); if (!name) return -ENOMEM; From b65020d5398f499c09498c9786dba6d67ae57664 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Mon, 11 May 2026 01:29:34 -0300 Subject: [PATCH 928/972] ALSA: hda/cs35l41: Fix firmware load work teardown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cs35l41_hda creates ALSA controls whose private data points at the cs35l41_hda object. The firmware load control can also queue fw_load_work. Those controls are not removed on component unbind, and device remove only cancels fw_load_work through cs35l41_remove_dsp(). That helper is skipped when halo_initialized is false. With firmware_autostart disabled, a firmware load can be requested before the DSP has been initialized. If the component or device is removed before the queued work runs, the worker can run after teardown and dereference driver state that is no longer valid. Track the created controls and remove them on unbind so no new control callback can reach the driver data or queue more work. Then cancel fw_load_work to drain any request that was already queued. Also cancel the work unconditionally during device remove before runtime PM teardown. Fixes: 47ceabd99a28 ("ALSA: hda: cs35l41: Support Firmware switching and reloading") Fixes: 4c870513fbb0 ("ALSA: hda: cs35l41: Add read-only ALSA control for forced mute") Cc: stable@vger.kernel.org Signed-off-by: Cássio Gabriel Reviewed-by: Stefan Binding Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20260511-alsa-hda-cs35l41-fw-work-teardown-v1-1-1184e9bc4f25@gmail.com --- sound/hda/codecs/side-codecs/cs35l41_hda.c | 77 +++++++++++++++------- sound/hda/codecs/side-codecs/cs35l41_hda.h | 5 ++ 2 files changed, 60 insertions(+), 22 deletions(-) diff --git a/sound/hda/codecs/side-codecs/cs35l41_hda.c b/sound/hda/codecs/side-codecs/cs35l41_hda.c index acfccc848f82d8..64a5bd895fd1e6 100644 --- a/sound/hda/codecs/side-codecs/cs35l41_hda.c +++ b/sound/hda/codecs/side-codecs/cs35l41_hda.c @@ -1325,6 +1325,43 @@ static int cs35l41_fw_type_ctl_info(struct snd_kcontrol *kcontrol, struct snd_ct return snd_ctl_enum_info(uinfo, 1, ARRAY_SIZE(cs35l41_hda_fw_ids), cs35l41_hda_fw_ids); } +static void cs35l41_remove_controls(struct cs35l41_hda *cs35l41) +{ + if (!cs35l41->codec) + return; + + snd_ctl_remove(cs35l41->codec->card, cs35l41->mute_override_ctl); + cs35l41->mute_override_ctl = NULL; + + snd_ctl_remove(cs35l41->codec->card, cs35l41->fw_load_ctl); + cs35l41->fw_load_ctl = NULL; + + snd_ctl_remove(cs35l41->codec->card, cs35l41->fw_type_ctl); + cs35l41->fw_type_ctl = NULL; +} + +static int cs35l41_add_control(struct cs35l41_hda *cs35l41, + struct snd_kcontrol_new *ctl, + struct snd_kcontrol **kctl) +{ + int ret; + + *kctl = snd_ctl_new1(ctl, cs35l41); + if (!*kctl) + return -ENOMEM; + + ret = snd_ctl_add(cs35l41->codec->card, *kctl); + if (ret) { + dev_err(cs35l41->dev, "Failed to add KControl %s = %d\n", ctl->name, ret); + *kctl = NULL; + return ret; + } + + dev_dbg(cs35l41->dev, "Added Control %s\n", ctl->name); + + return 0; +} + static int cs35l41_create_controls(struct cs35l41_hda *cs35l41) { char fw_type_ctl_name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN]; @@ -1360,32 +1397,23 @@ static int cs35l41_create_controls(struct cs35l41_hda *cs35l41) scnprintf(mute_override_ctl_name, SNDRV_CTL_ELEM_ID_NAME_MAXLEN, "%s Forced Mute Status", cs35l41->amp_name); - ret = snd_ctl_add(cs35l41->codec->card, snd_ctl_new1(&fw_type_ctl, cs35l41)); - if (ret) { - dev_err(cs35l41->dev, "Failed to add KControl %s = %d\n", fw_type_ctl.name, ret); - return ret; - } - - dev_dbg(cs35l41->dev, "Added Control %s\n", fw_type_ctl.name); - - ret = snd_ctl_add(cs35l41->codec->card, snd_ctl_new1(&fw_load_ctl, cs35l41)); - if (ret) { - dev_err(cs35l41->dev, "Failed to add KControl %s = %d\n", fw_load_ctl.name, ret); - return ret; - } - - dev_dbg(cs35l41->dev, "Added Control %s\n", fw_load_ctl.name); + ret = cs35l41_add_control(cs35l41, &fw_type_ctl, &cs35l41->fw_type_ctl); + if (ret) + goto err; - ret = snd_ctl_add(cs35l41->codec->card, snd_ctl_new1(&mute_override_ctl, cs35l41)); - if (ret) { - dev_err(cs35l41->dev, "Failed to add KControl %s = %d\n", mute_override_ctl.name, - ret); - return ret; - } + ret = cs35l41_add_control(cs35l41, &fw_load_ctl, &cs35l41->fw_load_ctl); + if (ret) + goto err; - dev_dbg(cs35l41->dev, "Added Control %s\n", mute_override_ctl.name); + ret = cs35l41_add_control(cs35l41, &mute_override_ctl, &cs35l41->mute_override_ctl); + if (ret) + goto err; return 0; + +err: + cs35l41_remove_controls(cs35l41); + return ret; } static bool cs35l41_dsm_supported(acpi_handle handle, unsigned int commands) @@ -1522,6 +1550,10 @@ static void cs35l41_hda_unbind(struct device *dev, struct device *master, void * device_link_remove(&cs35l41->codec->core.dev, cs35l41->dev); unlock_system_sleep(sleep_flags); memset(comp, 0, sizeof(*comp)); + + cs35l41_remove_controls(cs35l41); + cancel_work_sync(&cs35l41->fw_load_work); + cs35l41->codec = NULL; } } @@ -2060,6 +2092,7 @@ void cs35l41_hda_remove(struct device *dev) struct cs35l41_hda *cs35l41 = dev_get_drvdata(dev); component_del(cs35l41->dev, &cs35l41_hda_comp_ops); + cancel_work_sync(&cs35l41->fw_load_work); pm_runtime_get_sync(cs35l41->dev); pm_runtime_dont_use_autosuspend(cs35l41->dev); diff --git a/sound/hda/codecs/side-codecs/cs35l41_hda.h b/sound/hda/codecs/side-codecs/cs35l41_hda.h index 7d003c598e93e5..56ec07c0bb7456 100644 --- a/sound/hda/codecs/side-codecs/cs35l41_hda.h +++ b/sound/hda/codecs/side-codecs/cs35l41_hda.h @@ -57,6 +57,8 @@ enum control_bus { SPI }; +struct snd_kcontrol; + struct cs35l41_hda { struct device *dev; struct regmap *regmap; @@ -75,6 +77,9 @@ struct cs35l41_hda { int speaker_id; struct mutex fw_mutex; struct work_struct fw_load_work; + struct snd_kcontrol *fw_type_ctl; + struct snd_kcontrol *fw_load_ctl; + struct snd_kcontrol *mute_override_ctl; struct regmap_irq_chip_data *irq_data; bool firmware_running; From 96350db80e0acd733e9b9ef61c0d910790b27289 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 15 May 2026 12:57:09 +0200 Subject: [PATCH 929/972] ring-buffer remote: Avoid unexpected symbol warnings (arm, s390) The now more verbose check found more architecture specific symbol missing from the whitelist, during randconfig testing on s390 and 32-bit arm: Unexpected symbols in kernel/trace/simple_ring_buffer.o: U __aeabi_unwind_cpp_pr1 Unexpected symbols in kernel/trace/simple_ring_buffer.o: U __s390_indirect_jump_r1 U __s390_indirect_jump_r10 U __s390_indirect_jump_r14 U __s390_indirect_jump_r2 U __s390_indirect_jump_r5 U __s390_indirect_jump_r7 U __s390_indirect_jump_r8 U __s390_indirect_jump_r9 make[6]: *** [/home/arnd/arm-soc/kernel/trace/Makefile:160: kernel/trace/simple_ring_buffer.o.checked] Error 1 Add these to the list and keep it roughly sorted into sanitizer and architecture symbols. Cc: Masami Hiramatsu Cc: Marc Zyngier Cc: Nathan Chancellor Cc: Vincent Donnefort Cc: Mathieu Desnoyers Cc: Paolo Bonzini Link: https://patch.msgid.link/20260515105717.1023007-1-arnd@kernel.org Fixes: 1211907ac0b5 ("tracing: Generate undef symbols allowlist for simple_ring_buffer") Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt --- kernel/trace/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 1decdce8cbef25..9b0834134cae24 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -143,8 +143,8 @@ obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o targets += undefsyms_base.o KASAN_SANITIZE_undefsyms_base.o := y -UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __x86_indirect_thunk \ - __msan simple_ring_buffer \ +UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __msan \ + __aeabi_unwind_cpp __s390_indirect_jump __x86_indirect_thunk simple_ring_buffer \ $(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}') quiet_cmd_check_undefined = NM $< From dc366607c41c45fd0ae6f3db090f31dd611b644a Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Wed, 13 May 2026 12:30:50 +0800 Subject: [PATCH 930/972] drm: Replace old pointer to new idr Commit 5e28b7b94408 introduced a logical error by failing to replace the newly generated IDR pointer to old id's pointer at the correct location within the "change handle" logic; this resulted in the issue reported by syzbot [1]. Specifically, the new IDR object pointer is intended to replace the original id's pointer during the normal execution flow. Additionally, an unnecessary conditional check for the ret exit path has been removed. [1] !RB_EMPTY_ROOT(&prime_fpriv->dmabufs) WARNING: drivers/gpu/drm/drm_prime.c:224 at drm_prime_destroy_file_private+0x48/0x60 drivers/gpu/drm/drm_prime.c:224, CPU#0: syz.0.17/5833 Call Trace: drm_file_free.part.0+0x7e6/0xcc0 drivers/gpu/drm/drm_file.c:269 drm_file_free drivers/gpu/drm/drm_file.c:237 [inline] drm_close_helper.isra.0+0x186/0x200 drivers/gpu/drm/drm_file.c:290 drm_release+0x1ab/0x360 drivers/gpu/drm/drm_file.c:438 Fixes: 5e28b7b94408 ("drm: Set old handle to NULL before prime swap in change_handle") Reported-by: syzbot+d7c9eed171647e421013@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=d7c9eed171647e421013 Cc: stable@vger.kernel.org Tested-by: syzbot+d7c9eed171647e421013@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Signed-off-by: Dave Airlie Link: https://patch.msgid.link/tencent_C267296443AAA4567771176886DFF364A305@qq.com --- drivers/gpu/drm/drm_gem.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index 51a887cc7fd744..8afab57fc0554e 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -1067,17 +1067,12 @@ int drm_gem_change_handle_ioctl(struct drm_device *dev, void *data, spin_unlock(&file_priv->table_lock); - if (ret < 0) - goto out_unlock; - if (obj->dma_buf) { ret = drm_prime_add_buf_handle(&file_priv->prime, obj->dma_buf, handle); if (ret < 0) { spin_lock(&file_priv->table_lock); idr_remove(&file_priv->object_idr, handle); - idrobj = idr_replace(&file_priv->object_idr, obj, handle); - WARN_ON(idrobj != NULL); spin_unlock(&file_priv->table_lock); goto out_unlock; } @@ -1089,7 +1084,9 @@ int drm_gem_change_handle_ioctl(struct drm_device *dev, void *data, spin_lock(&file_priv->table_lock); idr_remove(&file_priv->object_idr, args->handle); + idrobj = idr_replace(&file_priv->object_idr, obj, handle); spin_unlock(&file_priv->table_lock); + WARN_ON(idrobj != NULL); out_unlock: mutex_unlock(&file_priv->prime.lock); From 7e68ba282165b8880d11eac8a816d54d449b7d80 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Thu, 14 May 2026 09:06:07 +0000 Subject: [PATCH 931/972] ASoC: qcom: q6apm-dai: Allocate an extra page for PCM buffers Some Old DSP firmware versions use 32-bit address arithmetic and size for validating the PCM buffer address range. If a buffer is allocated near the top of the 32-bit address space, arithmetic calculations involving the end address can overflow and fail checks. Work around this by increasing the preallocated PCM buffer size by one page. The DSP is still passed the usable buffer size, excluding the extra page, which prevents the firmware from seeing an end address that crosses the 32-bit boundary. This was not hit before because PCM buffer allocation and DSP-side mapping happened at different points, and the size mapped on the DSP was usually nperiods * period_size. Therefore the mapped size was unlikely to match the full preallocated buffer size exactly, although the issue was still possible. With early buffer mapping on the DSP, the full preallocated buffer is mapped during PCM creation, making the failure reproducible at boot. Fixes: 8ea6e25c8536 ("ASoC: qcom: q6apm: Add support for early buffer mapping on DSP") Cc: Stable@vger.kernel.org Reported-by: Jens Glathe Closes: https://lore.kernel.org/all/7f10abbd-fb78-4c3a-ab90-7ca78239891a@oldschoolsolutions.biz/ Signed-off-by: Srinivas Kandagatla Tested-by: Jens Glathe Link: https://patch.msgid.link/20260514090607.2435484-1-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- sound/soc/qcom/qdsp6/q6apm-dai.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sound/soc/qcom/qdsp6/q6apm-dai.c b/sound/soc/qcom/qdsp6/q6apm-dai.c index ede19fdea6e9ee..3a1be41df096cb 100644 --- a/sound/soc/qcom/qdsp6/q6apm-dai.c +++ b/sound/soc/qcom/qdsp6/q6apm-dai.c @@ -497,7 +497,12 @@ static int q6apm_dai_pcm_new(struct snd_soc_component *component, struct snd_soc { struct snd_soc_dai *cpu_dai = snd_soc_rtd_to_cpu(rtd, 0); struct snd_pcm *pcm = rtd->pcm; - int size = BUFFER_BYTES_MAX; + /* + * Allocate one extra page as a workaround for a DSP bug where 32-bit + * address arithmetic can overflow when the buffer is placed near the + * end of the addressable range. + */ + int size = BUFFER_BYTES_MAX + PAGE_SIZE; int graph_id, ret; struct snd_pcm_substream *substream; From 1afd8f06dcb1d561af3b239c5b14a88b87c13454 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Mon, 11 May 2026 13:42:02 -0300 Subject: [PATCH 932/972] ASoC: amd: acp-sdw-legacy: check CPU DAI name before logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit devm_kasprintf() can fail and return NULL. The legacy AMD SoundWire machine driver logs cpus->dai_name before checking the allocation result. Move the debug print after the NULL check, matching the ordering used by the SOF AMD SoundWire path after commit 5726b68473f7 ("ASoC: amd/sdw_utils: avoid NULL deref when devm_kasprintf() fails"). Fixes: 2981d9b0789c ("ASoC: amd: acp: add soundwire machine driver for legacy stack") Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260511-asoc-amd-acp-sdw-legacy-dai-name-null-v1-1-dc6151b6da8a@gmail.com Signed-off-by: Mark Brown --- sound/soc/amd/acp/acp-sdw-legacy-mach.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/amd/acp/acp-sdw-legacy-mach.c b/sound/soc/amd/acp/acp-sdw-legacy-mach.c index 0f21e5f64531a2..09b475c83c4966 100644 --- a/sound/soc/amd/acp/acp-sdw-legacy-mach.c +++ b/sound/soc/amd/acp/acp-sdw-legacy-mach.c @@ -260,9 +260,9 @@ static int create_sdw_dailink(struct snd_soc_card *card, cpus->dai_name = devm_kasprintf(dev, GFP_KERNEL, "SDW%d Pin%d", link_num, cpu_pin_id); - dev_dbg(dev, "cpu->dai_name:%s\n", cpus->dai_name); if (!cpus->dai_name) return -ENOMEM; + dev_dbg(dev, "cpu->dai_name:%s\n", cpus->dai_name); codec_maps[j].cpu = 0; codec_maps[j].codec = j; From 6e4bfd9da8851f562f04503d8e43221957f96eeb Mon Sep 17 00:00:00 2001 From: Jasper Smet Date: Wed, 13 May 2026 07:21:37 +0200 Subject: [PATCH 933/972] ASoC: amd: acp: Add DMI quirk for ASUS Zenbook S16 UM5606GA The ASUS Zenbook S16 (UM5606GA) with AMD Ryzen AI 9 465 (Strix Point, ACP 7.0) has a BIOS that incorrectly sets the ACPI property 'acp-audio-config-flag' to 0x10 (FLAG_AMD_LEGACY_ONLY_DMIC) for the ACP device. This prevents snd_pci_ps from probing the SoundWire bus, resulting in no internal audio (dummy output only). The hardware uses a Cirrus Logic CS42L43 (headphone/jack) and four CS35L56 smart amplifiers (speakers), all on SoundWire link 1. The corresponding machine table entry (acp70_cs42l43_l1u0_cs35l56x4_l1u0123) already exists in amd-acp70-acpi-match.c and correctly describes this topology. Add a DMI quirk to override the flag to 0, consistent with the existing entry for the HN7306EA. Signed-off-by: Jasper Smet Link: https://patch.msgid.link/20260513052137.56703-1-josbeir@gmail.com Signed-off-by: Mark Brown --- sound/soc/amd/acp-config.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/soc/amd/acp-config.c b/sound/soc/amd/acp-config.c index 1604ed679224ba..309dc9ed6e0d2b 100644 --- a/sound/soc/amd/acp-config.c +++ b/sound/soc/amd/acp-config.c @@ -30,6 +30,13 @@ static const struct dmi_system_id acp70_acpi_flag_override_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "HN7306EA"), }, }, + { + /* ASUS Zenbook S16 UM5606GA (Strix Point, ACP 7.0) */ + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "Zenbook S16 UM5606GA"), + }, + }, {} }; From fea3df9eab74863b6d388fc71357342e25c5e877 Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Mon, 11 May 2026 16:13:13 -0700 Subject: [PATCH 934/972] ASoC: pcm6240: Use flexible array for config blocks Store the per-config block pointer table in the config allocation instead of allocating it separately. This ties the table to the config object lifetime and removes the extra allocation and free path. Assisted-by: Codex:GPT-5.5 Signed-off-by: Rosen Penev Link: https://patch.msgid.link/20260511231313.31929-1-rosenp@gmail.com Signed-off-by: Mark Brown --- sound/soc/codecs/pcm6240.c | 36 +++++++++++++++--------------------- sound/soc/codecs/pcm6240.h | 2 +- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/sound/soc/codecs/pcm6240.c b/sound/soc/codecs/pcm6240.c index 78b21fbfad50e0..667596fc1781b6 100644 --- a/sound/soc/codecs/pcm6240.c +++ b/sound/soc/codecs/pcm6240.c @@ -1230,15 +1230,11 @@ static struct pcmdevice_config_info *pcmdevice_add_config(void *ctxt, int *status) { struct pcmdevice_priv *pcm_dev = (struct pcmdevice_priv *)ctxt; - struct pcmdevice_config_info *cfg_info; + struct pcmdevice_config_info *cfg_info = NULL; struct pcmdevice_block_data **bk_da; + char cfg_name[64] = {}; unsigned int config_offset = 0, i; - - cfg_info = kzalloc_obj(struct pcmdevice_config_info); - if (!cfg_info) { - *status = -ENOMEM; - goto out; - } + unsigned int nblocks; if (pcm_dev->regbin.fw_hdr.binary_version_num >= 0x105) { if (config_offset + 64 > (int)config_size) { @@ -1247,7 +1243,7 @@ static struct pcmdevice_config_info *pcmdevice_add_config(void *ctxt, "%s: cfg_name out of boundary\n", __func__); goto out; } - memcpy(cfg_info->cfg_name, &config_data[config_offset], 64); + memcpy(cfg_name, &config_data[config_offset], 64); config_offset += 64; } @@ -1257,16 +1253,17 @@ static struct pcmdevice_config_info *pcmdevice_add_config(void *ctxt, __func__); goto out; } - cfg_info->nblocks = - get_unaligned_be32(&config_data[config_offset]); + nblocks = get_unaligned_be32(&config_data[config_offset]); config_offset += 4; - bk_da = cfg_info->blk_data = kzalloc_objs(struct pcmdevice_block_data *, - cfg_info->nblocks); - if (!bk_da) { + cfg_info = kzalloc_flex(*cfg_info, blk_data, nblocks); + if (!cfg_info) { *status = -ENOMEM; goto out; } + cfg_info->nblocks = nblocks; + memcpy(cfg_info->cfg_name, cfg_name, sizeof(cfg_info->cfg_name)); + bk_da = cfg_info->blk_data; cfg_info->real_nblocks = 0; for (i = 0; i < cfg_info->nblocks; i++) { if (config_offset + 12 > config_size) { @@ -1449,14 +1446,11 @@ static void pcmdevice_config_info_remove(void *ctxt) for (i = 0; i < regbin->ncfgs; i++) { if (!cfg_info[i]) continue; - if (cfg_info[i]->blk_data) { - for (j = 0; j < (int)cfg_info[i]->real_nblocks; j++) { - if (!cfg_info[i]->blk_data[j]) - continue; - kfree(cfg_info[i]->blk_data[j]->regdata); - kfree(cfg_info[i]->blk_data[j]); - } - kfree(cfg_info[i]->blk_data); + for (j = 0; j < (int)cfg_info[i]->real_nblocks; j++) { + if (!cfg_info[i]->blk_data[j]) + continue; + kfree(cfg_info[i]->blk_data[j]->regdata); + kfree(cfg_info[i]->blk_data[j]); } kfree(cfg_info[i]); } diff --git a/sound/soc/codecs/pcm6240.h b/sound/soc/codecs/pcm6240.h index 2d8f9e798139ac..e27afd33910c4b 100644 --- a/sound/soc/codecs/pcm6240.h +++ b/sound/soc/codecs/pcm6240.h @@ -199,7 +199,7 @@ struct pcmdevice_config_info { unsigned int nblocks; unsigned int real_nblocks; unsigned char active_dev; - struct pcmdevice_block_data **blk_data; + struct pcmdevice_block_data *blk_data[] __counted_by(nblocks); }; struct pcmdevice_regbin { From e0014849e9af1e08aaaf2b7bc26b1918d8275f24 Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Mon, 11 May 2026 16:03:51 -0700 Subject: [PATCH 935/972] ASoC: sigmadsp: Use flexible array for control cache Allocate SigmaDSP controls with kzalloc_flex() for the trailing cache data instead of open-coding the size calculation. Annotate the cache array with its existing byte count so the allocation helper can initialize the counter. Assisted-by: Codex:GPT-5.5 Signed-off-by: Rosen Penev Link: https://patch.msgid.link/20260511230351.28868-1-rosenp@gmail.com Signed-off-by: Mark Brown --- sound/soc/codecs/sigmadsp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/sigmadsp.c b/sound/soc/codecs/sigmadsp.c index 3343dbb63d2b70..2e08fde3989c2f 100644 --- a/sound/soc/codecs/sigmadsp.c +++ b/sound/soc/codecs/sigmadsp.c @@ -35,7 +35,7 @@ struct sigmadsp_control { struct snd_kcontrol *kcontrol; bool is_readback; bool cached; - uint8_t cache[]; + u8 cache[] __counted_by(num_bytes); }; struct sigmadsp_data { @@ -223,7 +223,7 @@ static int sigma_fw_load_control(struct sigmadsp *sigmadsp, return -EINVAL; num_bytes = le16_to_cpu(ctrl_chunk->num_bytes); - ctrl = kzalloc(sizeof(*ctrl) + num_bytes, GFP_KERNEL); + ctrl = kzalloc_flex(*ctrl, cache, num_bytes); if (!ctrl) return -ENOMEM; From 83fbbcb7935ec6d2c8ba3bc133e8a0ead2ab0b2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Fri, 15 May 2026 10:32:25 -0300 Subject: [PATCH 936/972] ALSA: virtio: Add missing 384 kHz PCM rate mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VirtIO sound UAPI defines VIRTIO_SND_PCM_RATE_384000, and ALSA has SNDRV_PCM_RATE_384000. However, virtio-snd's rate conversion tables stop at 192 kHz. A device advertising only 384 kHz is rejected as having no supported PCM frame rates. A device advertising 384 kHz together with lower rates does not expose 384 kHz through the ALSA hardware constraints. The selected ALSA rate also needs a reverse mapping for SET_PARAMS. Add the missing 384 kHz entries to both conversion tables. Fixes: 29b96bf50ba9 ("ALSA: virtio: build PCM devices and substream hardware descriptors") Fixes: da76e9f3e43a ("ALSA: virtio: PCM substream operators") Cc: stable@vger.kernel.org Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260515-alsa-virtio-384k-rate-v1-1-35ecb5df835c@gmail.com Signed-off-by: Takashi Iwai --- sound/virtio/virtio_pcm.c | 3 ++- sound/virtio/virtio_pcm_ops.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sound/virtio/virtio_pcm.c b/sound/virtio/virtio_pcm.c index eb9cc8131905da..be3893de40a559 100644 --- a/sound/virtio/virtio_pcm.c +++ b/sound/virtio/virtio_pcm.c @@ -77,7 +77,8 @@ static const struct virtsnd_v2a_rate g_v2a_rate_map[] = { [VIRTIO_SND_PCM_RATE_88200] = { SNDRV_PCM_RATE_88200, 88200 }, [VIRTIO_SND_PCM_RATE_96000] = { SNDRV_PCM_RATE_96000, 96000 }, [VIRTIO_SND_PCM_RATE_176400] = { SNDRV_PCM_RATE_176400, 176400 }, - [VIRTIO_SND_PCM_RATE_192000] = { SNDRV_PCM_RATE_192000, 192000 } + [VIRTIO_SND_PCM_RATE_192000] = { SNDRV_PCM_RATE_192000, 192000 }, + [VIRTIO_SND_PCM_RATE_384000] = { SNDRV_PCM_RATE_384000, 384000 } }; /** diff --git a/sound/virtio/virtio_pcm_ops.c b/sound/virtio/virtio_pcm_ops.c index 6297a9c61e70b0..1105e7ff3523a0 100644 --- a/sound/virtio/virtio_pcm_ops.c +++ b/sound/virtio/virtio_pcm_ops.c @@ -90,7 +90,8 @@ static const struct virtsnd_a2v_rate g_a2v_rate_map[] = { { 88200, VIRTIO_SND_PCM_RATE_88200 }, { 96000, VIRTIO_SND_PCM_RATE_96000 }, { 176400, VIRTIO_SND_PCM_RATE_176400 }, - { 192000, VIRTIO_SND_PCM_RATE_192000 } + { 192000, VIRTIO_SND_PCM_RATE_192000 }, + { 384000, VIRTIO_SND_PCM_RATE_384000 } }; static int virtsnd_pcm_sync_stop(struct snd_pcm_substream *substream); From b09a45601094c7f4ec4db8090b825fa61e169d93 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 14 May 2026 14:31:49 -0700 Subject: [PATCH 937/972] hwmon: (lm90) Stop work before releasing hwmon device Sashiko reports: In lm90_probe(), the devm action to cancel the alert_work and report_work (lm90_restore_conf) is registered in lm90_init_client() before devm_hwmon_device_register_with_info() is called. Because devm executes cleanup actions in reverse order during module unbind or probe failure, the hwmon device is unregistered and freed first. If lm90_alert_work() or lm90_report_alarms() runs in the window between the hwmon device being freed and the delayed works being cancelled, lm90_update_alarms() will dereference the freed data->hwmon_dev here. Fix the problem by canceling the workers separately after registering the hwmon device and before registering the interrupt handler. This ensures that the workers are canceled after interrupts are disabled and before the hwmon device is released. Add "shutdown" flag to indicate that device shutdown is in progress to prevent workers from being re-armed. Fixes: f6d0775119fb9 ("hwmon: (lm90) Rework alarm/status handling") Reported-by: Sashiko Signed-off-by: Guenter Roeck --- drivers/hwmon/lm90.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/hwmon/lm90.c b/drivers/hwmon/lm90.c index 3c10a5066b53de..c4a9dafff81d68 100644 --- a/drivers/hwmon/lm90.c +++ b/drivers/hwmon/lm90.c @@ -736,6 +736,7 @@ struct lm90_data { struct hwmon_chip_info chip; struct delayed_work alert_work; struct work_struct report_work; + bool shutdown; /* true if shutting down */ bool valid; /* true if register values are valid */ bool alarms_valid; /* true if status register values are valid */ unsigned long last_updated; /* in jiffies */ @@ -1154,6 +1155,9 @@ static void lm90_report_alarms(struct work_struct *work) static int lm90_update_alarms_locked(struct lm90_data *data, bool force) { + if (data->shutdown) + return 0; + if (force || !data->alarms_valid || time_after(jiffies, data->alarms_updated + msecs_to_jiffies(data->update_interval))) { struct i2c_client *client = data->client; @@ -2584,15 +2588,23 @@ static void lm90_restore_conf(void *_data) struct lm90_data *data = _data; struct i2c_client *client = data->client; - cancel_delayed_work_sync(&data->alert_work); - cancel_work_sync(&data->report_work); - /* Restore initial configuration */ if (data->flags & LM90_HAVE_CONVRATE) lm90_write_convrate(data, data->convrate_orig); lm90_write_reg(client, LM90_REG_CONFIG1, data->config_orig); } +static void lm90_stop_work(void *_data) +{ + struct lm90_data *data = _data; + + hwmon_lock(data->hwmon_dev); + data->shutdown = true; + hwmon_unlock(data->hwmon_dev); + cancel_delayed_work_sync(&data->alert_work); + cancel_work_sync(&data->report_work); +} + static int lm90_init_client(struct i2c_client *client, struct lm90_data *data) { struct device_node *np = client->dev.of_node; @@ -2902,6 +2914,10 @@ static int lm90_probe(struct i2c_client *client) data->hwmon_dev = hwmon_dev; + err = devm_add_action_or_reset(&client->dev, lm90_stop_work, data); + if (err) + return err; + if (client->irq) { dev_dbg(dev, "IRQ: %d\n", client->irq); err = devm_request_threaded_irq(dev, client->irq, @@ -2930,7 +2946,7 @@ static void lm90_alert(struct i2c_client *client, enum i2c_alert_protocol type, */ struct lm90_data *data = i2c_get_clientdata(client); - if ((data->flags & LM90_HAVE_BROKEN_ALERT) && + if (!data->shutdown && (data->flags & LM90_HAVE_BROKEN_ALERT) && (data->current_alarms & data->alert_alarms)) { if (!(data->config & 0x80)) { dev_dbg(&client->dev, "Disabling ALERT#\n"); From 873e919e3101063a7a75989510ccfc125a4391cf Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 14 May 2026 14:41:00 -0700 Subject: [PATCH 938/972] hwmon: (lm90) Add lock protection to lm90_alert Sashiko reports: lm90_alert() executes in the smbus alert context and calls lm90_update_confreg() to disable the hardware alert line, without acquiring hwmon_lock. Concurrently, sysfs write operations (such as lm90_write_convrate) hold the hwmon_lock, temporarily modify data->config, and then restore it. If an alert interrupt occurs concurrently with a sysfs write, the sysfs path will overwrite the alert handler's modifications to data->config and the hardware register. This unintentionally re-enables the hardware alert line while the alarm is still active, causing an interrupt storm. Add the missing lock to lm90_alert() to solve the problem. Fixes: 7a1d220ccb0cc ("hwmon: (lm90) Introduce function to update configuration register") Reported-by: Sashiko Signed-off-by: Guenter Roeck --- drivers/hwmon/lm90.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/hwmon/lm90.c b/drivers/hwmon/lm90.c index c4a9dafff81d68..1eeb608e59039d 100644 --- a/drivers/hwmon/lm90.c +++ b/drivers/hwmon/lm90.c @@ -2946,6 +2946,7 @@ static void lm90_alert(struct i2c_client *client, enum i2c_alert_protocol type, */ struct lm90_data *data = i2c_get_clientdata(client); + hwmon_lock(data->hwmon_dev); if (!data->shutdown && (data->flags & LM90_HAVE_BROKEN_ALERT) && (data->current_alarms & data->alert_alarms)) { if (!(data->config & 0x80)) { @@ -2955,6 +2956,7 @@ static void lm90_alert(struct i2c_client *client, enum i2c_alert_protocol type, schedule_delayed_work(&data->alert_work, max_t(int, HZ, msecs_to_jiffies(data->update_interval))); } + hwmon_unlock(data->hwmon_dev); } else { dev_dbg(&client->dev, "Everything OK\n"); } From 55a0005518195fdea1fd2991b07644f8dc97ea8e Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Fri, 15 May 2026 21:16:16 +0100 Subject: [PATCH 939/972] tracing: Fix desc in error path for the trace remote test module During initialisation in remote_test_load(), if one of the simple_ring_buffer fails to initialise, the error path attempts to rollback initialised buffers. However, the rollback incorrectly uses the global pointer to the trace descriptor, which is only set upon successful load completion. Fix the error path by using the local pointer to the descriptor. Link: https://patch.msgid.link/20260515201616.337469-1-vdonnefort@google.com Fixes: ea908a2b79c8 ("tracing: Add a trace remote module for testing") Reported-by: Sashiko Signed-off-by: Vincent Donnefort base-commit: 5d6919055dec134de3c40167a490f33c74c12581 Signed-off-by: Steven Rostedt --- kernel/trace/remote_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/remote_test.c b/kernel/trace/remote_test.c index 6c1b7701ddae82..a3e2c9b606eb1f 100644 --- a/kernel/trace/remote_test.c +++ b/kernel/trace/remote_test.c @@ -110,9 +110,9 @@ static struct trace_buffer_desc *remote_test_load(unsigned long size, void *unus return remote_test_buffer_desc; err_unload: - for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc) + for_each_ring_buffer_desc(rb_desc, cpu, desc) remote_test_unload_simple_rb(rb_desc->cpu); - trace_remote_free_buffer(remote_test_buffer_desc); + trace_remote_free_buffer(desc); err_free_desc: kfree(desc); From 8a7fe10eec64bfb7cf4091bca540de4c55d56bfa Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Thu, 14 May 2026 22:16:25 +0800 Subject: [PATCH 940/972] soundwire: intel_ace2x: release bpt_stream when close it The BPT stream was allocated in intel_ace2x_bpt_open_stream(), we need to free it in intel_ace2x_bpt_close_stream(). Fixes: 4c1ce9f37d8a8 ("soundwire: intel_ace2x: add BPT send_async/wait callbacks") Signed-off-by: Bard Liao Reviewed-by: Simon Trimmer Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20260514141625.1834216-1-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- drivers/soundwire/intel_ace2x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/soundwire/intel_ace2x.c b/drivers/soundwire/intel_ace2x.c index 0a97253fe0c278..b37933efac5d26 100644 --- a/drivers/soundwire/intel_ace2x.c +++ b/drivers/soundwire/intel_ace2x.c @@ -317,6 +317,7 @@ static void intel_ace2x_bpt_close_stream(struct sdw_intel *sdw, struct sdw_slave dev_err(cdns->dev, "%s: remove slave failed: %d\n", __func__, ret); + sdw_release_stream(cdns->bus.bpt_stream); cdns->bus.bpt_stream = NULL; } From 23e6a1ca04ae44806439a5a446e62e4d42e80bb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20L=C3=B3pez?= Date: Tue, 12 May 2026 12:00:41 +0200 Subject: [PATCH 941/972] virt: sev-guest: Do not use host-controlled page order in cleanup path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When issuing an extended guest request (SVM_VMGEXIT_EXT_GUEST_REQUEST), get_ext_report() allocates a buffer to retrieve a certificate blob from the host, keeping track of its size in report_req->certs_len. However, the host may return SNP_GUEST_VMM_ERR_INVALID_LEN, indicating an invalid buffer size, as well as the expected length of such buffer. get_ext_report() subsequently updates report_req->certs_len with the host-controlled value, and cleans up the buffer by computing a page order from such value. This is incorrect, as the host-provided length may not match the page order of the original allocation, potentially resulting in corruption in the page allocator. Fix this by using alloc_pages_exact() instead, and reusing @npages to compute the size passed to free_pages_exact(). For consistency, also use @npages to compute the size when allocating the pages, even though this last change has no functional effect. Fixes: 3e385c0d6ce8 ("virt: sev-guest: Move SNP Guest Request data pages handling under snp_cmd_mutex") Signed-off-by: Carlos López Signed-off-by: Borislav Petkov (AMD) Tested-by: Michael Roth Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- drivers/virt/coco/sev-guest/sev-guest.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c index e001e6769a43fc..910a1de0d5a72f 100644 --- a/drivers/virt/coco/sev-guest/sev-guest.c +++ b/drivers/virt/coco/sev-guest/sev-guest.c @@ -176,7 +176,6 @@ static int get_ext_report(struct snp_guest_dev *snp_dev, struct snp_guest_reques struct snp_guest_req req = {}; int ret, npages = 0, resp_len; sockptr_t certs_address; - struct page *page; if (sockptr_is_null(io->req_data) || sockptr_is_null(io->resp_data)) return -EINVAL; @@ -211,16 +210,15 @@ static int get_ext_report(struct snp_guest_dev *snp_dev, struct snp_guest_reques * zeros to indicate that certificate data was not provided. */ npages = report_req->certs_len >> PAGE_SHIFT; - page = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, - get_order(report_req->certs_len)); - if (!page) + req.certs_data = alloc_pages_exact(npages << PAGE_SHIFT, + GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!req.certs_data) return -ENOMEM; - req.certs_data = page_address(page); ret = set_memory_decrypted((unsigned long)req.certs_data, npages); if (ret) { pr_err("failed to mark page shared, ret=%d\n", ret); - __free_pages(page, get_order(report_req->certs_len)); + free_pages_exact(req.certs_data, npages << PAGE_SHIFT); return -EFAULT; } @@ -277,7 +275,7 @@ static int get_ext_report(struct snp_guest_dev *snp_dev, struct snp_guest_reques if (set_memory_encrypted((unsigned long)req.certs_data, npages)) WARN_ONCE(ret, "failed to restore encryption mask (leak it)\n"); else - __free_pages(page, get_order(report_req->certs_len)); + free_pages_exact(req.certs_data, npages << PAGE_SHIFT); } return ret; } From 5200f5f493f79f14bbdc349e402a40dfb32f23c8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 17 May 2026 13:59:58 -0700 Subject: [PATCH 942/972] Linux 7.1-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b7b80e84e1eb26..9f59598d3a085a 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 7 PATCHLEVEL = 1 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Baby Opossum Posse # *DOCUMENTATION* From 456ad3cdaf04229fe5052eaeb5deb48936d7a917 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Sun, 17 May 2026 23:33:49 -0300 Subject: [PATCH 943/972] ALSA: ctxfi: Keep line/mic notification controls per mixer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ctxfi stores the Line Capture Switch and Mic Capture Switch controls in a file-scope kctls[] array so do_line_mic_switch() can notify the opposite control when the shared line/mic input selection changes. That storage is shared by all ctxfi cards. If more than one X-Fi card is present, a later card can overwrite the pointers saved by an earlier one. A control update on one card can then use another card's kcontrol object for snd_ctl_notify(). If that other card is removed, the saved pointer can also become stale. Store those notification targets in struct ct_mixer instead. The mixer is per-card state and matches the lifetime of the controls created for that card. Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260517-alsa-ctxfi-mixer-kctls-v1-1-6e4f81f6b658@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/ctxfi/ctmixer.c | 17 +++++++---------- sound/pci/ctxfi/ctmixer.h | 3 +++ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sound/pci/ctxfi/ctmixer.c b/sound/pci/ctxfi/ctmixer.c index 50ab69aca2fa31..9abe1d53c3c5e5 100644 --- a/sound/pci/ctxfi/ctmixer.c +++ b/sound/pci/ctxfi/ctmixer.c @@ -221,10 +221,6 @@ ct_mixer_recording_select(struct ct_mixer *mixer, enum CT_AMIXER_CTL type); static void ct_mixer_recording_unselect(struct ct_mixer *mixer, enum CT_AMIXER_CTL type); -/* FIXME: this static looks like it would fail if more than one card was */ -/* installed. */ -static struct snd_kcontrol *kctls[2] = {NULL}; - static enum CT_AMIXER_CTL get_amixer_index(enum CTALSA_MIXER_CTL alsa_index) { switch (alsa_index) { @@ -481,17 +477,18 @@ static struct snd_kcontrol_new mic_source_ctl = { static void do_line_mic_switch(struct ct_atc *atc, enum CTALSA_MIXER_CTL type) { + struct ct_mixer *mixer = atc->mixer; if (MIXER_LINEIN_C_S == type) { atc->select_line_in(atc); - set_switch_state(atc->mixer, MIXER_MIC_C_S, 0); + set_switch_state(mixer, MIXER_MIC_C_S, 0); snd_ctl_notify(atc->card, SNDRV_CTL_EVENT_MASK_VALUE, - &kctls[1]->id); + &mixer->line_mic_kctls[1]->id); } else if (MIXER_MIC_C_S == type) { atc->select_mic_in(atc); - set_switch_state(atc->mixer, MIXER_LINEIN_C_S, 0); + set_switch_state(mixer, MIXER_LINEIN_C_S, 0); snd_ctl_notify(atc->card, SNDRV_CTL_EVENT_MASK_VALUE, - &kctls[0]->id); + &mixer->line_mic_kctls[0]->id); } } @@ -778,9 +775,9 @@ ct_mixer_kcontrol_new(struct ct_mixer *mixer, struct snd_kcontrol_new *new) switch (new->private_value) { case MIXER_LINEIN_C_S: - kctls[0] = kctl; break; + mixer->line_mic_kctls[0] = kctl; break; case MIXER_MIC_C_S: - kctls[1] = kctl; break; + mixer->line_mic_kctls[1] = kctl; break; default: break; } diff --git a/sound/pci/ctxfi/ctmixer.h b/sound/pci/ctxfi/ctmixer.h index dd23d227aeb5aa..0c0963bba99c8a 100644 --- a/sound/pci/ctxfi/ctmixer.h +++ b/sound/pci/ctxfi/ctmixer.h @@ -17,6 +17,8 @@ #include "ctatc.h" #include "ctresource.h" +struct snd_kcontrol; + #define INIT_VOL 0x1c00 enum MIXER_PORT_T { @@ -42,6 +44,7 @@ struct ct_mixer { struct ct_atc *atc; struct sum **sums; /* sum resources for signal collection */ + struct snd_kcontrol *line_mic_kctls[2]; /* line/mic capture switch controls */ unsigned int switch_state; /* A bit-map to indicate state of switches */ int (*get_output_ports)(struct ct_mixer *mixer, enum MIXER_PORT_T type, From e38353138a091554242f0a3da883250fdc4696b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Sun, 17 May 2026 23:41:07 -0300 Subject: [PATCH 944/972] ASoC: mediatek: mt8196: Fix probe resource cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MT8196 AFE probe assigns reserved memory with of_reserved_mem_device_init(), but never releases it. This leaks the reserved memory assignment on driver removal and on later probe failures. The same probe path also uses unchecked pm_runtime_get_sync() calls. A failure while resuming the device can leave the runtime PM usage count in an unexpected state. The regmap error path returns directly while the device is still runtime active, and the remove path drops a runtime PM reference even though successful probe has already released its temporary reference. Register a devm cleanup action for the reserved memory assignment, use pm_runtime_resume_and_get(), and only drop runtime PM references on paths where they are actually held. Fixes: 57513aabfe5b ("ASoC: mediatek: mt8196: add platform driver") Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260517-asoc-mt8196-probe-cleanup-v1-1-a5d26949d7fe@gmail.com Signed-off-by: Mark Brown --- sound/soc/mediatek/mt8196/mt8196-afe-pcm.c | 44 +++++++++++++++------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/sound/soc/mediatek/mt8196/mt8196-afe-pcm.c b/sound/soc/mediatek/mt8196/mt8196-afe-pcm.c index 511e888567be87..a1ae8322d8b6cf 100644 --- a/sound/soc/mediatek/mt8196/mt8196-afe-pcm.c +++ b/sound/soc/mediatek/mt8196/mt8196-afe-pcm.c @@ -2242,13 +2242,16 @@ static int mt8196_afe_runtime_resume(struct device *dev) static int mt8196_afe_component_probe(struct snd_soc_component *component) { struct mtk_base_afe *afe = snd_soc_component_get_drvdata(component); + int ret; + + /* enable clock for regcache get default value from hw */ + ret = pm_runtime_resume_and_get(afe->dev); + if (ret) + return dev_err_probe(afe->dev, ret, "failed to resume device\n"); + + mtk_afe_add_sub_dai_control(component); + pm_runtime_put_sync(afe->dev); - if (component) { - /* enable clock for regcache get default value from hw */ - pm_runtime_get_sync(afe->dev); - mtk_afe_add_sub_dai_control(component); - pm_runtime_put_sync(afe->dev); - } return 0; } @@ -2306,6 +2309,11 @@ static const struct reg_sequence mt8196_cg_patch[] = { { AUDIO_TOP_CON4, 0x361c }, }; +static void mt8196_afe_release_reserved_mem(void *data) +{ + of_reserved_mem_device_release(data); +} + static int mt8196_afe_pcm_dev_probe(struct platform_device *pdev) { int ret, i; @@ -2320,8 +2328,13 @@ static int mt8196_afe_pcm_dev_probe(struct platform_device *pdev) return ret; ret = of_reserved_mem_device_init(dev); - if (ret) + if (ret) { dev_err(dev, "failed to assign memory region: %d\n", ret); + } else { + ret = devm_add_action_or_reset(dev, mt8196_afe_release_reserved_mem, dev); + if (ret) + return ret; + } afe = devm_kzalloc(dev, sizeof(*afe), GFP_KERNEL); if (!afe) @@ -2422,18 +2435,22 @@ static int mt8196_afe_pcm_dev_probe(struct platform_device *pdev) dev_pm_syscore_device(dev, true); /* enable clock for regcache get default value from hw */ - pm_runtime_get_sync(dev); + ret = pm_runtime_resume_and_get(dev); + if (ret) + return dev_err_probe(dev, ret, "failed to resume device\n"); afe->regmap = devm_regmap_init_mmio(dev, afe->base_addr, &mt8196_afe_regmap_config); - if (IS_ERR(afe->regmap)) - return PTR_ERR(afe->regmap); + if (IS_ERR(afe->regmap)) { + ret = PTR_ERR(afe->regmap); + goto err_pm_put; + } ret = regmap_register_patch(afe->regmap, mt8196_cg_patch, ARRAY_SIZE(mt8196_cg_patch)); if (ret < 0) { dev_err(dev, "Failed to apply cg patch\n"); - goto err_pm_disable; + goto err_pm_put; } regmap_read(afe->regmap, AFE_IRQ_MCU_EN, &tmp_reg); @@ -2452,12 +2469,12 @@ static int mt8196_afe_pcm_dev_probe(struct platform_device *pdev) afe->num_dai_drivers); if (ret) { dev_err(dev, "afe component err\n"); - goto err_pm_disable; + return ret; } return 0; -err_pm_disable: +err_pm_put: pm_runtime_put_sync(dev); return ret; } @@ -2467,7 +2484,6 @@ static void mt8196_afe_pcm_dev_remove(struct platform_device *pdev) struct mtk_base_afe *afe = platform_get_drvdata(pdev); struct device *dev = &pdev->dev; - pm_runtime_put_sync(dev); if (!pm_runtime_status_suspended(dev)) mt8196_afe_runtime_suspend(dev); From fd3b95866d86844ae747fce9b3438d73ed5f1e7a Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Tue, 12 May 2026 14:52:52 +0800 Subject: [PATCH 945/972] ASoC: fsl_sai: Eliminate possible interrupt storm during probe When the SAI peripheral is left in a running state by the bootloader, the driver can experience an interrupt storm during probe that prevents successful initialization. This occurs because the current code registers the IRQ handler before resetting the hardware to a known state. The issue manifests as: - Continuous interrupts firing immediately after devm_request_irq() - Driver probe failure or system hang - Error messages about unhandled interrupts This is particularly problematic on systems where U-Boot or other bootloaders enable SAI for boot-time audio feedback or diagnostics and don't properly disable it before handing control to Linux. Fix this by reordering the probe sequence: 1. Add fsl_sai_reset_hw() to clear TCSR/RCSR control registers, which disables the transmitter/receiver and all interrupt sources 2. Move devm_request_irq() to after hardware initialization This ensures the SAI is in a clean reset state before the interrupt handler can be invoked, preventing the storm while maintaining proper error handling and cleanup paths. Signed-off-by: Shengjiu Wang Link: https://patch.msgid.link/20260512065252.75859-1-shengjiu.wang@nxp.com Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_sai.c | 43 ++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c index bd336d2e4cb383..e364552c1f47e4 100644 --- a/sound/soc/fsl/fsl_sai.c +++ b/sound/soc/fsl/fsl_sai.c @@ -1370,6 +1370,31 @@ static int fsl_sai_check_version(struct device *dev) return 0; } +static int fsl_sai_reset_hw(struct device *dev) +{ + struct fsl_sai *sai = dev_get_drvdata(dev); + unsigned char ofs = sai->soc_data->reg_offset; + int ret; + + /* + * Clear TCSR/RCSR to reset SAI and disable all interrupts. + * Bootloader may leave SAI running causing interrupt storm. + */ + ret = regmap_write(sai->regmap, FSL_SAI_TCSR(ofs), 0); + if (ret) { + dev_err(dev, "Failed to clear TCSR: %d\n", ret); + return ret; + } + + ret = regmap_write(sai->regmap, FSL_SAI_RCSR(ofs), 0); + if (ret) { + dev_err(dev, "Failed to clear RCSR: %d\n", ret); + return ret; + } + + return 0; +} + /* * Calculate the offset between first two datalines, don't * different offset in one case. @@ -1575,13 +1600,6 @@ static int fsl_sai_probe(struct platform_device *pdev) if (irq < 0) return irq; - ret = devm_request_irq(dev, irq, fsl_sai_isr, IRQF_SHARED, - np->name, sai); - if (ret) { - dev_err(dev, "failed to claim irq %u\n", irq); - return ret; - } - memcpy(&sai->cpu_dai_drv, fsl_sai_dai_template, sizeof(*fsl_sai_dai_template) * ARRAY_SIZE(fsl_sai_dai_template)); @@ -1656,6 +1674,10 @@ static int fsl_sai_probe(struct platform_device *pdev) if (ret < 0) dev_warn(dev, "Error reading SAI version: %d\n", ret); + ret = fsl_sai_reset_hw(dev); + if (ret < 0) + dev_warn(dev, "Failed to reset hardware: %d\n", ret); + /* Select MCLK direction */ if (sai->mclk_direction_output && sai->soc_data->max_register >= FSL_SAI_MCTL) { @@ -1667,6 +1689,13 @@ static int fsl_sai_probe(struct platform_device *pdev) if (ret < 0 && ret != -ENOSYS) goto err_pm_get_sync; + ret = devm_request_irq(dev, irq, fsl_sai_isr, IRQF_SHARED, + np->name, sai); + if (ret) { + dev_err(dev, "failed to claim irq %u\n", irq); + goto err_pm_get_sync; + } + if (of_device_is_compatible(np, "fsl,imx952-sai") && !of_property_read_string(np, "fsl,sai-amix-mode", &str)) { if (!strcmp(str, "bypass")) From 37dba38ac8d0910f31788368bce4d93aa4059040 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Wed, 13 May 2026 10:16:45 +0200 Subject: [PATCH 946/972] of: Introduce of_property_read_s32_index() Signed integers can be read from single value properties using of_property_read_s32() but nothing exist to read signed integers from multi-value properties. Fix this lack adding of_property_read_s32_index(). Signed-off-by: Herve Codina Acked-by: Rob Herring (Arm) Link: https://patch.msgid.link/20260513081702.317117-2-herve.codina@bootlin.com Signed-off-by: Mark Brown --- include/linux/of.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/of.h b/include/linux/of.h index 959786f8f19662..28153616e616a3 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1476,6 +1476,13 @@ static inline int of_property_read_s32(const struct device_node *np, return of_property_read_u32(np, propname, (u32*) out_value); } +static inline int of_property_read_s32_index(const struct device_node *np, + const char *propname, u32 index, + s32 *out_value) +{ + return of_property_read_u32_index(np, propname, index, (u32 *)out_value); +} + #define of_for_each_phandle(it, err, np, ln, cn, cc) \ for (of_phandle_iterator_init((it), (np), (ln), (cn), (cc)), \ err = of_phandle_iterator_next(it); \ From 04a1fe8e6582ac2417c6d68258db5dc47f4dae3e Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Wed, 13 May 2026 10:16:46 +0200 Subject: [PATCH 947/972] ASoC: dt-bindings: Add support for the GPIOs driven amplifier Some amplifiers based on analog switches and op-amps can be present in the audio path and can be driven by GPIOs in order to control their gain value, their mute and/or bypass functions. Those components needs to be viewed as audio components in order to be fully integrated in the audio path. gpio-audio-amp allows to consider these GPIO driven amplifiers as auxiliary audio devices. Signed-off-by: Herve Codina Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20260513081702.317117-3-herve.codina@bootlin.com Signed-off-by: Mark Brown --- .../bindings/sound/gpio-audio-amp.yaml | 270 ++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100644 Documentation/devicetree/bindings/sound/gpio-audio-amp.yaml diff --git a/Documentation/devicetree/bindings/sound/gpio-audio-amp.yaml b/Documentation/devicetree/bindings/sound/gpio-audio-amp.yaml new file mode 100644 index 00000000000000..3690f3d1628c90 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/gpio-audio-amp.yaml @@ -0,0 +1,270 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/sound/gpio-audio-amp.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Audio amplifier driven by GPIOs + +maintainers: + - Herve Codina + +description: | + Audio GPIO amplifiers are driven by GPIO in order to control the gain value + of the amplifier, its mute function and/or its bypass function. + + Those amplifiers are based on discrete components (analog switches, op-amps + and more) where some of them, mostly analog switches, are controlled by GPIOs + to adjust the gain value of the whole amplifier and/or to control + the mute and/or bypass function. + + For instance, the following piece of hardware is a GPIO amplifier + + +5VA + ^ + |\ | + | \ + Vin >---------------------------|+ \ + | +-------+-----> Vout + .--\/\/\/--+------------|- / | + | | | / | + v | |/ | | + GND o v | + \ GND | + gpio >-----------> \ | + o o | + | | | + | '--\/\/\/--. | + | +--\/\/\/--' + '---------------' + +properties: + compatible: + oneOf: + - const: gpio-audio-amp-mono + description: + A single channel amplifier. All features apply to this sole channel. + + - const: gpio-audio-amp-stereo + description: + A dual channel amplifier (left and right). All features apply to both + channels producing the same effect on both channels at the same time. + + vdd-supply: + description: Main power supply of the amplifier + + vddio-supply: + description: Power supply related to the control path + + vdda1-supply: + description: Analog power supply + + vdda2-supply: + description: Additional analog power supply + + mute-gpios: + description: GPIO to control the mute function + maxItems: 1 + + bypass-gpios: + description: GPIO to control the bypass function + maxItems: 1 + + gain-gpios: + description: | + GPIOs to control the amplifier gain + + The gain value is computed from GPIOs value from 0 to 2^N-1 with N the + number of GPIO described. The first GPIO described is the lsb of the gain + value. + + For instance assuming 2 gpios + gain-gpios = <&gpio1 GPIO_ACTIVE_HIGH> <&gpio2 GPIO_ACTIVE_HIGH>; + The gain value will be the following: + + gpio1 | gpio2 | gain + ------+-------+----- + 0 | 0 | 0b00 -> 0 + 1 | 0 | 0b01 -> 1 + 0 | 1 | 0b10 -> 2 + 1 | 1 | 0b11 -> 3 + ------+-------+----- + + Note: The gain value, bits set to 1 or 0, indicate the state active (bit + set) or the state inactive (bit unset) of the related GPIO. The + physical voltage corresponding to this active/inactive state is + given by the GPIO_ACTIVE_HIGH and GPIO_ACTIVE_LOW flags. + + minItems: 1 + maxItems: 16 + + gain-ranges: + $ref: /schemas/types.yaml#/definitions/int32-matrix + description: | + A list of one or more ranges of possible values. Each range is defined by + the first and last point in the range. Each point is defined by the pair + (GPIOs value, Gain in 0.01 dB unit). + + Ranges can be contiguous or holes can be present between ranges if some + gpios value should not be used. Also in a range the first point and the + last point can be identical. In that case, the range contains only one + item, the given point. + + items: + items: + - description: GPIOs value of the first point in the range + - description: Gain in 0.01 dB unit of the first point in the range + - description: GPIOs value of the last point in the range + - description: Gain in 0.01 dB unit of the last point in the range + description: | + A range defines a linear function (linear in dB) from the first point + to the last point, both included. The number of items in the range is + N = abs(first_point.gpio_value - last_point.gpio_value) + 1 + + It allows to define the gain range from the first_point.gain to + the last_point.gain, both points included. + + Gain (0.01 dB unit) + ^ + | last + +- - - - - - - - - - + point + | + . + | + . + | + . + +- - - - + . + | first . . + | point . . + | . . + +--------+-----------+---> gpios + value + + Note: Even if first_point.gpio_value is lower than last_point.gpio_value + and first_point.gain is lower than last_point.gain in the above + graphic, all combination of values are supported leading to an + increasing or a decreasing linear segment. + + minItems: 1 + maxItems: 65536 + + gain-labels: + $ref: /schemas/types.yaml#/definitions/string-array + minItems: 2 + maxItems: 65536 + description: | + List of the gain labels attached to the combination of GPIOs controlling + the gain. The first label is related to the gain value 0, the second label + is related to the gain value 1 and so on. + + With 2 GPIOs controlling the gain, GPIOs value can be 0, 1, 2 and 3. + Assuming that gain value set the hardware according to the following + table: + + GPIOs | Hardware + value | amplification + ------+-------------- + 0 | Low + 1 | Middle + 2 | High + 3 | Max + ------+-------------- + + The description using gain labels can be: + gain-labels = "Low", "Middle", "High", "Max"; + +dependencies: + gain-ranges: [ gain-gpios ] + gain-labels: [ gain-gpios ] + +required: + - compatible + - vdd-supply + +anyOf: + - required: + - gain-gpios + - required: + - mute-gpios + - required: + - bypass-gpios + +allOf: + - $ref: component-common.yaml# + - if: + required: + - gain-ranges + then: + properties: + gain-labels: false + - if: + required: + - gain-labels + then: + properties: + gain-ranges: false + +unevaluatedProperties: false + +examples: + - | + #include + + /* Gain controlled by gpios */ + amplifier-0 { + compatible = "gpio-audio-amp-mono"; + vdd-supply = <®ulator>; + gain-gpios = <&gpio 0 GPIO_ACTIVE_HIGH>, <&gpio 1 GPIO_ACTIVE_HIGH>; + }; + + /* Gain controlled by gpio using a simple range on a stereo amplifier */ + amplifier-1 { + compatible = "gpio-audio-amp-stereo"; + vdd-supply = <®ulator>; + gain-gpios = <&gpio 0 GPIO_ACTIVE_HIGH>, <&gpio 1 GPIO_ACTIVE_HIGH>; + gain-ranges = <0 (-300) 3 600>; + }; + + /* Gain controlled by gpio with labels */ + amplifier-3 { + compatible = "gpio-audio-amp-mono"; + vdd-supply = <®ulator>; + gain-gpios = <&gpio 0 GPIO_ACTIVE_HIGH>; + gain-labels = "Low", "High"; + }; + + /* A mutable stereo amplifier without any gain control */ + amplifier-4 { + compatible = "gpio-audio-amp-stereo"; + vdd-supply = <®ulator>; + mute-gpios = <&gpio 0 GPIO_ACTIVE_HIGH>; + }; + + /* + * Several supplies, gain controlled using more complex ranges, mute and + * bypass. + * + * Assuming 3 gpios for controlling the gain with the following table + * gpios value Gain + * 0b000 Do not use (gpios value not allowed) + * 0b001 - 3dB + * 0b010 + 3dB + * 0b011 + 10dB + * 0b100 Do not use (gpios value not allowed) + * 0b101 + 6dB + * 0b110 + 7dB + * 0b111 + 8dB + */ + amplifier-5 { + compatible = "gpio-audio-amp-mono"; + vdd-supply = <®ulator>; + vddio-supply = <®ulator1>; + vdda1-supply = <®ulator2>; + gain-gpios = <&gpio 0 GPIO_ACTIVE_HIGH>, + <&gpio 1 GPIO_ACTIVE_HIGH>, + <&gpio 2 GPIO_ACTIVE_HIGH>; + gain-ranges = <1 (-300) 2 300>, + <3 1000 3 1000>, + <5 600 7 800>; + mute-gpios = <&gpio 3 GPIO_ACTIVE_HIGH>; + bypass-gpios = <&gpio 4 GPIO_ACTIVE_HIGH>; + }; +... From 28188a6d8869d2350f9e271e1b25e85077c70cee Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Wed, 13 May 2026 10:16:47 +0200 Subject: [PATCH 948/972] ASoC: simple-amplifier: Remove DRV_NAME defined value DRV_NAME is defined and used only in the simple-amplifier driver declaration. Remove the useless defined and use directly the value in the driver declaration itself. Signed-off-by: Herve Codina Link: https://patch.msgid.link/20260513081702.317117-4-herve.codina@bootlin.com Signed-off-by: Mark Brown --- sound/soc/codecs/simple-amplifier.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sound/soc/codecs/simple-amplifier.c b/sound/soc/codecs/simple-amplifier.c index d306c585b52b09..41485445727dde 100644 --- a/sound/soc/codecs/simple-amplifier.c +++ b/sound/soc/codecs/simple-amplifier.c @@ -9,8 +9,6 @@ #include #include -#define DRV_NAME "simple-amplifier" - struct simple_amp { struct gpio_desc *gpiod_enable; }; @@ -97,7 +95,7 @@ MODULE_DEVICE_TABLE(of, simple_amp_ids); static struct platform_driver simple_amp_driver = { .driver = { - .name = DRV_NAME, + .name = "simple-amplifier", .of_match_table = of_match_ptr(simple_amp_ids), }, .probe = simple_amp_probe, From 23cf9e3ca498b2b3d6f2f4cf39d6db10a6c3e8e9 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Wed, 13 May 2026 10:16:48 +0200 Subject: [PATCH 949/972] ASoC: simple-amplifier: Add missing headers The simple-amplifier driver is a platform device driver. Add missing include files related to this kind of driver. Signed-off-by: Herve Codina Link: https://patch.msgid.link/20260513081702.317117-5-herve.codina@bootlin.com Signed-off-by: Mark Brown --- sound/soc/codecs/simple-amplifier.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/codecs/simple-amplifier.c b/sound/soc/codecs/simple-amplifier.c index 41485445727dde..5b44bcfef49e98 100644 --- a/sound/soc/codecs/simple-amplifier.c +++ b/sound/soc/codecs/simple-amplifier.c @@ -5,7 +5,9 @@ */ #include +#include #include +#include #include #include From 2c562adb159ea3f4e5fdf4074e512ae369b4c22b Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Wed, 13 May 2026 10:16:49 +0200 Subject: [PATCH 950/972] ASoC: simple-amplifier: Remove CONFIG_OF flag and of_match_ptr() The simple-amplifier Use CONFIG_OF flag for its of_device_id table and of_match_ptr() when it assigns the table in the driver declaration. This is no more needed. Drop them. Signed-off-by: Herve Codina Link: https://patch.msgid.link/20260513081702.317117-6-herve.codina@bootlin.com Signed-off-by: Mark Brown --- sound/soc/codecs/simple-amplifier.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sound/soc/codecs/simple-amplifier.c b/sound/soc/codecs/simple-amplifier.c index 5b44bcfef49e98..215318ff62fc65 100644 --- a/sound/soc/codecs/simple-amplifier.c +++ b/sound/soc/codecs/simple-amplifier.c @@ -86,19 +86,17 @@ static int simple_amp_probe(struct platform_device *pdev) NULL, 0); } -#ifdef CONFIG_OF static const struct of_device_id simple_amp_ids[] = { { .compatible = "dioo,dio2125", }, { .compatible = "simple-audio-amplifier", }, { } }; MODULE_DEVICE_TABLE(of, simple_amp_ids); -#endif static struct platform_driver simple_amp_driver = { .driver = { .name = "simple-amplifier", - .of_match_table = of_match_ptr(simple_amp_ids), + .of_match_table = simple_amp_ids, }, .probe = simple_amp_probe, }; From dc8beaad292646cc7a019aad50762591e2939632 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Wed, 13 May 2026 10:16:50 +0200 Subject: [PATCH 951/972] ASoC: simple-amplifier: Rename drv_event() function The drv_event() is used to handle power events related to the DRV item. Later, with the support for gpio-audio-amp, this function will be also used to handle power events related to the PGA item. Also, more functions will be added in the driver and it is a common usage to prefix functions based on the driver name. Rename the drv_event() function to simple_amp_power_event() to follow common usage and get rid of the 'drv' term. Signed-off-by: Herve Codina Link: https://patch.msgid.link/20260513081702.317117-7-herve.codina@bootlin.com Signed-off-by: Mark Brown --- sound/soc/codecs/simple-amplifier.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/soc/codecs/simple-amplifier.c b/sound/soc/codecs/simple-amplifier.c index 215318ff62fc65..8f2daec55134de 100644 --- a/sound/soc/codecs/simple-amplifier.c +++ b/sound/soc/codecs/simple-amplifier.c @@ -15,8 +15,8 @@ struct simple_amp { struct gpio_desc *gpiod_enable; }; -static int drv_event(struct snd_soc_dapm_widget *w, - struct snd_kcontrol *control, int event) +static int simple_amp_power_event(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *control, int event) { struct snd_soc_component *c = snd_soc_dapm_to_component(w->dapm); struct simple_amp *priv = snd_soc_component_get_drvdata(c); @@ -42,7 +42,7 @@ static int drv_event(struct snd_soc_dapm_widget *w, static const struct snd_soc_dapm_widget simple_amp_dapm_widgets[] = { SND_SOC_DAPM_INPUT("INL"), SND_SOC_DAPM_INPUT("INR"), - SND_SOC_DAPM_OUT_DRV_E("DRV", SND_SOC_NOPM, 0, 0, NULL, 0, drv_event, + SND_SOC_DAPM_OUT_DRV_E("DRV", SND_SOC_NOPM, 0, 0, NULL, 0, simple_amp_power_event, (SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD)), SND_SOC_DAPM_OUTPUT("OUTL"), SND_SOC_DAPM_OUTPUT("OUTR"), From 34ddd2d368c3b30f899b6b882b1a0284358826dc Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Wed, 13 May 2026 10:16:51 +0200 Subject: [PATCH 952/972] ASoC: simple-amplifier: Use 'simple_amp' variable name instead of 'priv' The simple-amplifier driver use 'priv' as variable name for its private data (struct simple_amp). With the support for gpio-audio-amp, more functions and data structures will be added. Those future additions will add more complexity in data manipulation and will make the 'priv' term error prone. In order to clearly identify the struct simple_amp private data, use 'simple_amp' as variable name when this structure is involved. Signed-off-by: Herve Codina Link: https://patch.msgid.link/20260513081702.317117-8-herve.codina@bootlin.com Signed-off-by: Mark Brown --- sound/soc/codecs/simple-amplifier.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sound/soc/codecs/simple-amplifier.c b/sound/soc/codecs/simple-amplifier.c index 8f2daec55134de..231e84ab4c0e3e 100644 --- a/sound/soc/codecs/simple-amplifier.c +++ b/sound/soc/codecs/simple-amplifier.c @@ -19,7 +19,7 @@ static int simple_amp_power_event(struct snd_soc_dapm_widget *w, struct snd_kcontrol *control, int event) { struct snd_soc_component *c = snd_soc_dapm_to_component(w->dapm); - struct simple_amp *priv = snd_soc_component_get_drvdata(c); + struct simple_amp *simple_amp = snd_soc_component_get_drvdata(c); int val; switch (event) { @@ -34,7 +34,7 @@ static int simple_amp_power_event(struct snd_soc_dapm_widget *w, return -EINVAL; } - gpiod_set_value_cansleep(priv->gpiod_enable, val); + gpiod_set_value_cansleep(simple_amp->gpiod_enable, val); return 0; } @@ -68,17 +68,17 @@ static const struct snd_soc_component_driver simple_amp_component_driver = { static int simple_amp_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct simple_amp *priv; + struct simple_amp *simple_amp; - priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); - if (priv == NULL) + simple_amp = devm_kzalloc(dev, sizeof(*simple_amp), GFP_KERNEL); + if (!simple_amp) return -ENOMEM; - platform_set_drvdata(pdev, priv); + platform_set_drvdata(pdev, simple_amp); - priv->gpiod_enable = devm_gpiod_get_optional(dev, "enable", - GPIOD_OUT_LOW); - if (IS_ERR(priv->gpiod_enable)) - return dev_err_probe(dev, PTR_ERR(priv->gpiod_enable), + simple_amp->gpiod_enable = devm_gpiod_get_optional(dev, "enable", + GPIOD_OUT_LOW); + if (IS_ERR(simple_amp->gpiod_enable)) + return dev_err_probe(dev, PTR_ERR(simple_amp->gpiod_enable), "Failed to get 'enable' gpio"); return devm_snd_soc_register_component(dev, From 4d84b75e5eecd729e31ed5981353f84baa351c49 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Wed, 13 May 2026 10:16:52 +0200 Subject: [PATCH 953/972] ASoC: simple-amplifier: Remove DAPM widgets and routes from the ASoC component driver The simple-amplifier set the DAPM wigets and routes table in the ASoC component driver. This is perfectly fine when the component has well known DAPM tables. The simple-amplifier is going to handle several kind of components based on the driver compatible string. The DAPM table will not be the same for all components supported by the driver. In order to have different DAPM table based on matching compatible strings, move those tables from the ASoC component driver to the device compatible string matching data. Add those DAPM widgets and routes dynamically during the ASoC component probe operation. Signed-off-by: Herve Codina Link: https://patch.msgid.link/20260513081702.317117-9-herve.codina@bootlin.com Signed-off-by: Mark Brown --- sound/soc/codecs/simple-amplifier.c | 59 ++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/sound/soc/codecs/simple-amplifier.c b/sound/soc/codecs/simple-amplifier.c index 231e84ab4c0e3e..3e644c1c269606 100644 --- a/sound/soc/codecs/simple-amplifier.c +++ b/sound/soc/codecs/simple-amplifier.c @@ -11,7 +11,15 @@ #include #include +struct simple_amp_data { + const struct snd_soc_dapm_widget *dapm_widgets; + unsigned int num_dapm_widgets; + const struct snd_soc_dapm_route *dapm_routes; + unsigned int num_dapm_routes; +}; + struct simple_amp { + const struct simple_amp_data *data; struct gpio_desc *gpiod_enable; }; @@ -58,11 +66,39 @@ static const struct snd_soc_dapm_route simple_amp_dapm_routes[] = { { "OUTR", NULL, "DRV" }, }; +static int simple_amp_add_basic_dapm(struct snd_soc_component *component) +{ + struct snd_soc_dapm_context *dapm = snd_soc_component_to_dapm(component); + struct simple_amp *simple_amp = snd_soc_component_get_drvdata(component); + struct device *dev = component->dev; + int ret; + + /* Add basic dapm widgets and routes */ + ret = snd_soc_dapm_new_controls(dapm, simple_amp->data->dapm_widgets, + simple_amp->data->num_dapm_widgets); + if (ret) { + dev_err(dev, "Failed to add basic dapm widgets (%d)\n", ret); + return ret; + } + + ret = snd_soc_dapm_add_routes(dapm, simple_amp->data->dapm_routes, + simple_amp->data->num_dapm_routes); + if (ret) { + dev_err(dev, "Failed to add basic dapm routes (%d)\n", ret); + return ret; + } + + return 0; +} + +static int simple_amp_component_probe(struct snd_soc_component *component) +{ + /* Add basic dapm widgets and routes */ + return simple_amp_add_basic_dapm(component); +} + static const struct snd_soc_component_driver simple_amp_component_driver = { - .dapm_widgets = simple_amp_dapm_widgets, - .num_dapm_widgets = ARRAY_SIZE(simple_amp_dapm_widgets), - .dapm_routes = simple_amp_dapm_routes, - .num_dapm_routes = ARRAY_SIZE(simple_amp_dapm_routes), + .probe = simple_amp_component_probe, }; static int simple_amp_probe(struct platform_device *pdev) @@ -75,6 +111,10 @@ static int simple_amp_probe(struct platform_device *pdev) return -ENOMEM; platform_set_drvdata(pdev, simple_amp); + simple_amp->data = of_device_get_match_data(dev); + if (!simple_amp->data) + return -EINVAL; + simple_amp->gpiod_enable = devm_gpiod_get_optional(dev, "enable", GPIOD_OUT_LOW); if (IS_ERR(simple_amp->gpiod_enable)) @@ -86,9 +126,16 @@ static int simple_amp_probe(struct platform_device *pdev) NULL, 0); } +static const struct simple_amp_data simple_audio_amplifier_data = { + .dapm_widgets = simple_amp_dapm_widgets, + .num_dapm_widgets = ARRAY_SIZE(simple_amp_dapm_widgets), + .dapm_routes = simple_amp_dapm_routes, + .num_dapm_routes = ARRAY_SIZE(simple_amp_dapm_routes), +}; + static const struct of_device_id simple_amp_ids[] = { - { .compatible = "dioo,dio2125", }, - { .compatible = "simple-audio-amplifier", }, + { .compatible = "dioo,dio2125", .data = &simple_audio_amplifier_data}, + { .compatible = "simple-audio-amplifier", .data = &simple_audio_amplifier_data}, { } }; MODULE_DEVICE_TABLE(of, simple_amp_ids); From 41e3ebbfcab1eb5c6403e24130bc1690dae4f108 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Wed, 13 May 2026 10:16:53 +0200 Subject: [PATCH 954/972] ASoC: simple-amplifier: Introduce support for gpio-audio-amp Improve the simple-amplifier introducing preliminary support for gpio-audio-amp. Those amplifiers are amplifiers driven by gpios. This support introduction doesn't handle any GPIO yet but introduces the compatible strings and the related DAPM table. Two gpio-audio-amp are available: A mono and a stereo version. The mono version has only one audio channel and gpio settings impact features such as the gain or mute of this sole channel. The stereo version has two channels (left and right). Gpio settings impact both channels in the same manner and at the same time. For instance, the gain setting set the gain of both channels as well as the mute setting mutes both channels. Signed-off-by: Herve Codina Link: https://patch.msgid.link/20260513081702.317117-10-herve.codina@bootlin.com Signed-off-by: Mark Brown --- sound/soc/codecs/simple-amplifier.c | 54 +++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/sound/soc/codecs/simple-amplifier.c b/sound/soc/codecs/simple-amplifier.c index 3e644c1c269606..1704cdbb7de5d9 100644 --- a/sound/soc/codecs/simple-amplifier.c +++ b/sound/soc/codecs/simple-amplifier.c @@ -4,6 +4,7 @@ * Author: Jerome Brunet */ +#include #include #include #include @@ -12,6 +13,9 @@ #include struct simple_amp_data { + unsigned int supports; +#define SIMPLE_AUDIO_SUPPORT_PGA BIT(0) + const struct snd_soc_dapm_widget *dapm_widgets; unsigned int num_dapm_widgets; const struct snd_soc_dapm_route *dapm_routes; @@ -66,6 +70,38 @@ static const struct snd_soc_dapm_route simple_amp_dapm_routes[] = { { "OUTR", NULL, "DRV" }, }; +static const struct snd_soc_dapm_widget simple_amp_mono_pga_dapm_widgets[] = { + SND_SOC_DAPM_INPUT("IN"), + SND_SOC_DAPM_OUTPUT("OUT"), + SND_SOC_DAPM_PGA_E("PGA", SND_SOC_NOPM, 0, 0, NULL, 0, simple_amp_power_event, + (SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD)), + SND_SOC_DAPM_REGULATOR_SUPPLY("vdd", 0, 0), +}; + +static const struct snd_soc_dapm_route simple_amp_mono_pga_dapm_routes[] = { + { "PGA", NULL, "IN" }, + { "PGA", NULL, "vdd" }, + { "OUT", NULL, "PGA" }, +}; + +static const struct snd_soc_dapm_widget simple_amp_stereo_pga_dapm_widgets[] = { + SND_SOC_DAPM_INPUT("INL"), + SND_SOC_DAPM_INPUT("INR"), + SND_SOC_DAPM_OUTPUT("OUTL"), + SND_SOC_DAPM_OUTPUT("OUTR"), + SND_SOC_DAPM_PGA_E("PGA", SND_SOC_NOPM, 0, 0, NULL, 0, simple_amp_power_event, + (SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD)), + SND_SOC_DAPM_REGULATOR_SUPPLY("vdd", 0, 0), +}; + +static const struct snd_soc_dapm_route simple_amp_stereo_pga_dapm_routes[] = { + { "PGA", NULL, "INL" }, + { "PGA", NULL, "INR" }, + { "PGA", NULL, "vdd" }, + { "OUTL", NULL, "PGA" }, + { "OUTR", NULL, "PGA" }, +}; + static int simple_amp_add_basic_dapm(struct snd_soc_component *component) { struct snd_soc_dapm_context *dapm = snd_soc_component_to_dapm(component); @@ -133,9 +169,27 @@ static const struct simple_amp_data simple_audio_amplifier_data = { .num_dapm_routes = ARRAY_SIZE(simple_amp_dapm_routes), }; +static const struct simple_amp_data simple_audio_mono_pga_data = { + .supports = SIMPLE_AUDIO_SUPPORT_PGA, + .dapm_widgets = simple_amp_mono_pga_dapm_widgets, + .num_dapm_widgets = ARRAY_SIZE(simple_amp_mono_pga_dapm_widgets), + .dapm_routes = simple_amp_mono_pga_dapm_routes, + .num_dapm_routes = ARRAY_SIZE(simple_amp_mono_pga_dapm_routes), +}; + +static const struct simple_amp_data simple_audio_stereo_pga_data = { + .supports = SIMPLE_AUDIO_SUPPORT_PGA, + .dapm_widgets = simple_amp_stereo_pga_dapm_widgets, + .num_dapm_widgets = ARRAY_SIZE(simple_amp_stereo_pga_dapm_widgets), + .dapm_routes = simple_amp_stereo_pga_dapm_routes, + .num_dapm_routes = ARRAY_SIZE(simple_amp_stereo_pga_dapm_routes), +}; + static const struct of_device_id simple_amp_ids[] = { { .compatible = "dioo,dio2125", .data = &simple_audio_amplifier_data}, { .compatible = "simple-audio-amplifier", .data = &simple_audio_amplifier_data}, + { .compatible = "gpio-audio-amp-mono", .data = &simple_audio_mono_pga_data}, + { .compatible = "gpio-audio-amp-stereo", .data = &simple_audio_stereo_pga_data}, { } }; MODULE_DEVICE_TABLE(of, simple_amp_ids); From f68933d0eb674397d67121794e5bc1f73fe1c4bd Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Wed, 13 May 2026 10:16:54 +0200 Subject: [PATCH 955/972] ASoC: simple-amplifier: gpio-audio-amp: Add support for extra power supplies The gpio-audio-amp devices can use additional power supplies: - vddio, - vdda1, - vdda2 Add support for those additional power supplies. Signed-off-by: Herve Codina Link: https://patch.msgid.link/20260513081702.317117-11-herve.codina@bootlin.com Signed-off-by: Mark Brown --- sound/soc/codecs/simple-amplifier.c | 85 ++++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 3 deletions(-) diff --git a/sound/soc/codecs/simple-amplifier.c b/sound/soc/codecs/simple-amplifier.c index 1704cdbb7de5d9..89e7e9e53b0374 100644 --- a/sound/soc/codecs/simple-amplifier.c +++ b/sound/soc/codecs/simple-amplifier.c @@ -15,6 +15,7 @@ struct simple_amp_data { unsigned int supports; #define SIMPLE_AUDIO_SUPPORT_PGA BIT(0) +#define SIMPLE_AUDIO_SUPPORT_POWER_SUPPLIES BIT(1) const struct snd_soc_dapm_widget *dapm_widgets; unsigned int num_dapm_widgets; @@ -127,10 +128,86 @@ static int simple_amp_add_basic_dapm(struct snd_soc_component *component) return 0; } +struct simple_amp_supply { + const char *prop_name; + const struct snd_soc_dapm_widget dapm_widget; + const struct snd_soc_dapm_route dapm_route; +}; + +static const struct simple_amp_supply simple_amp_supplies[] = { + { + .prop_name = "vddio-supply", + .dapm_widget = SND_SOC_DAPM_REGULATOR_SUPPLY("vddio", 0, 0), + .dapm_route = { "PGA", NULL, "vddio" }, + }, { + .prop_name = "vdda1-supply", + .dapm_widget = SND_SOC_DAPM_REGULATOR_SUPPLY("vdda1", 0, 0), + .dapm_route = { "PGA", NULL, "vdda1" }, + }, { + .prop_name = "vdda2-supply", + .dapm_widget = SND_SOC_DAPM_REGULATOR_SUPPLY("vdda2", 0, 0), + .dapm_route = { "PGA", NULL, "vdda2" }, + }, + { /* End of list */} +}; + +static int simple_amp_add_power_supplies(struct snd_soc_component *component) +{ + struct snd_soc_dapm_context *dapm = snd_soc_component_to_dapm(component); + struct simple_amp *simple_amp = snd_soc_component_get_drvdata(component); + const struct simple_amp_supply *supply; + struct device *dev = component->dev; + int ret; + + /* + * Those additional power supplies are attached to the PGA. + * If PGA is not supported, simply skipped them. + */ + if (!(simple_amp->data->supports & SIMPLE_AUDIO_SUPPORT_PGA)) { + dev_err(dev, "Extra power supplied need PGA\n"); + return -EINVAL; + } + + supply = simple_amp_supplies; + do { + if (!of_property_present(dev->of_node, supply->prop_name)) + continue; + + ret = snd_soc_dapm_new_controls(dapm, &supply->dapm_widget, 1); + if (ret) { + dev_err(dev, "Failed to add control for '%s' (%d)\n", + supply->prop_name, ret); + return ret; + } + ret = snd_soc_dapm_add_routes(dapm, &supply->dapm_route, 1); + if (ret) { + dev_err(dev, "Failed to add route for '%s' (%d)\n", + supply->prop_name, ret); + return ret; + } + } while ((++supply)->prop_name); + + return 0; +} + static int simple_amp_component_probe(struct snd_soc_component *component) { + struct simple_amp *simple_amp = snd_soc_component_get_drvdata(component); + int ret; + /* Add basic dapm widgets and routes */ - return simple_amp_add_basic_dapm(component); + ret = simple_amp_add_basic_dapm(component); + if (ret) + return ret; + + /* Add additional power supplies */ + if (simple_amp->data->supports & SIMPLE_AUDIO_SUPPORT_POWER_SUPPLIES) { + ret = simple_amp_add_power_supplies(component); + if (ret) + return ret; + } + + return 0; } static const struct snd_soc_component_driver simple_amp_component_driver = { @@ -170,7 +247,8 @@ static const struct simple_amp_data simple_audio_amplifier_data = { }; static const struct simple_amp_data simple_audio_mono_pga_data = { - .supports = SIMPLE_AUDIO_SUPPORT_PGA, + .supports = SIMPLE_AUDIO_SUPPORT_PGA | + SIMPLE_AUDIO_SUPPORT_POWER_SUPPLIES, .dapm_widgets = simple_amp_mono_pga_dapm_widgets, .num_dapm_widgets = ARRAY_SIZE(simple_amp_mono_pga_dapm_widgets), .dapm_routes = simple_amp_mono_pga_dapm_routes, @@ -178,7 +256,8 @@ static const struct simple_amp_data simple_audio_mono_pga_data = { }; static const struct simple_amp_data simple_audio_stereo_pga_data = { - .supports = SIMPLE_AUDIO_SUPPORT_PGA, + .supports = SIMPLE_AUDIO_SUPPORT_PGA | + SIMPLE_AUDIO_SUPPORT_POWER_SUPPLIES, .dapm_widgets = simple_amp_stereo_pga_dapm_widgets, .num_dapm_widgets = ARRAY_SIZE(simple_amp_stereo_pga_dapm_widgets), .dapm_routes = simple_amp_stereo_pga_dapm_routes, From 432331970eca53a7417e52aec6333dbb6e059e1c Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Wed, 13 May 2026 10:16:55 +0200 Subject: [PATCH 956/972] ASoC: simple-amplifier: gpio-audio-amp: Add support for mute gpio A gpio can be used to control the amplifier mute feature. Add support for this mute gpio. Signed-off-by: Herve Codina Link: https://patch.msgid.link/20260513081702.317117-12-herve.codina@bootlin.com Signed-off-by: Mark Brown --- sound/soc/codecs/simple-amplifier.c | 127 +++++++++++++++++++++++++++- 1 file changed, 125 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/simple-amplifier.c b/sound/soc/codecs/simple-amplifier.c index 89e7e9e53b0374..c8991e340def53 100644 --- a/sound/soc/codecs/simple-amplifier.c +++ b/sound/soc/codecs/simple-amplifier.c @@ -12,10 +12,18 @@ #include #include +struct simple_amp_single { + struct gpio_desc *gpio; + bool is_inverted; + int kctrl_val; + const char *control_name; +}; + struct simple_amp_data { unsigned int supports; #define SIMPLE_AUDIO_SUPPORT_PGA BIT(0) #define SIMPLE_AUDIO_SUPPORT_POWER_SUPPLIES BIT(1) +#define SIMPLE_AUDIO_SUPPORT_MUTE BIT(2) const struct snd_soc_dapm_widget *dapm_widgets; unsigned int num_dapm_widgets; @@ -26,6 +34,7 @@ struct simple_amp_data { struct simple_amp { const struct simple_amp_data *data; struct gpio_desc *gpiod_enable; + struct simple_amp_single mute; }; static int simple_amp_power_event(struct snd_soc_dapm_widget *w, @@ -103,6 +112,78 @@ static const struct snd_soc_dapm_route simple_amp_stereo_pga_dapm_routes[] = { { "OUTR", NULL, "PGA" }, }; +static int simple_amp_single_kctrl_write_gpio(struct simple_amp_single *single, + int kctrl_val) +{ + int gpio_val; + + gpio_val = single->is_inverted ? !kctrl_val : kctrl_val; + + return gpiod_set_value_cansleep(single->gpio, gpio_val); +} + +static int simple_amp_single_kctrl_info(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo) +{ + uinfo->count = 1; + uinfo->value.integer.min = 0; + uinfo->value.integer.max = 1; + uinfo->type = SNDRV_CTL_ELEM_TYPE_BOOLEAN; + return 0; +} + +static int simple_amp_single_kctrl_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct simple_amp_single *single = (struct simple_amp_single *)kcontrol->private_value; + + ucontrol->value.integer.value[0] = single->kctrl_val; + + return 0; +} + +static int simple_amp_single_kctrl_put(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct simple_amp_single *single = (struct simple_amp_single *)kcontrol->private_value; + int kctrl_val; + int err; + + kctrl_val = ucontrol->value.integer.value[0] ? 1 : 0; + + if (kctrl_val == single->kctrl_val) + return 0; + + err = simple_amp_single_kctrl_write_gpio(single, kctrl_val); + if (err) + return err; + + single->kctrl_val = kctrl_val; + + return 1; /* The value changed */ +} + +static int simple_amp_single_add_kcontrol(struct snd_soc_component *component, + struct simple_amp_single *single) +{ + struct snd_kcontrol_new control = { + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, + .name = single->control_name, + .info = simple_amp_single_kctrl_info, + .get = simple_amp_single_kctrl_get, + .put = simple_amp_single_kctrl_put, + .private_value = (unsigned long)single, + }; + int ret; + + /* Be consistent between single->kctrl_val value and the GPIO value */ + ret = simple_amp_single_kctrl_write_gpio(single, single->kctrl_val); + if (ret) + return ret; + + return snd_soc_add_component_controls(component, &control, 1); +} + static int simple_amp_add_basic_dapm(struct snd_soc_component *component) { struct snd_soc_dapm_context *dapm = snd_soc_component_to_dapm(component); @@ -207,6 +288,21 @@ static int simple_amp_component_probe(struct snd_soc_component *component) return ret; } + if (simple_amp->mute.gpio) { + /* + * The name of the GPIO used is mute. According to this name, 1 + * means muted and 0 means un-muted. + * + * An inversion is expected by ALSA. Indeed from ALSA point of + * view, 1 means 'on' (un-muted) and 0 means 'off' (muted). + */ + simple_amp->mute.is_inverted = true; + simple_amp->mute.kctrl_val = 1; /* Un-muted */ + ret = simple_amp_single_add_kcontrol(component, &simple_amp->mute); + if (ret) + return ret; + } + return 0; } @@ -214,10 +310,26 @@ static const struct snd_soc_component_driver simple_amp_component_driver = { .probe = simple_amp_component_probe, }; +static int simple_amp_parse_single_gpio(struct device *dev, + struct simple_amp_single *single, + const char *gpio_property) +{ + /* Start with the inactive value */ + single->is_inverted = false; + single->kctrl_val = 0; + single->gpio = devm_gpiod_get_optional(dev, gpio_property, GPIOD_OUT_LOW); + if (IS_ERR(single->gpio)) + return dev_err_probe(dev, PTR_ERR(single->gpio), + "Failed to get '%s' gpio\n", + gpio_property); + return 0; +} + static int simple_amp_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct simple_amp *simple_amp; + int ret; simple_amp = devm_kzalloc(dev, sizeof(*simple_amp), GFP_KERNEL); if (!simple_amp) @@ -234,6 +346,15 @@ static int simple_amp_probe(struct platform_device *pdev) return dev_err_probe(dev, PTR_ERR(simple_amp->gpiod_enable), "Failed to get 'enable' gpio"); + if (simple_amp->data->supports & SIMPLE_AUDIO_SUPPORT_MUTE) { + ret = simple_amp_parse_single_gpio(dev, &simple_amp->mute, "mute"); + if (ret) + return ret; + } + + /* Set controls name */ + simple_amp->mute.control_name = "Switch"; + return devm_snd_soc_register_component(dev, &simple_amp_component_driver, NULL, 0); @@ -248,7 +369,8 @@ static const struct simple_amp_data simple_audio_amplifier_data = { static const struct simple_amp_data simple_audio_mono_pga_data = { .supports = SIMPLE_AUDIO_SUPPORT_PGA | - SIMPLE_AUDIO_SUPPORT_POWER_SUPPLIES, + SIMPLE_AUDIO_SUPPORT_POWER_SUPPLIES | + SIMPLE_AUDIO_SUPPORT_MUTE, .dapm_widgets = simple_amp_mono_pga_dapm_widgets, .num_dapm_widgets = ARRAY_SIZE(simple_amp_mono_pga_dapm_widgets), .dapm_routes = simple_amp_mono_pga_dapm_routes, @@ -257,7 +379,8 @@ static const struct simple_amp_data simple_audio_mono_pga_data = { static const struct simple_amp_data simple_audio_stereo_pga_data = { .supports = SIMPLE_AUDIO_SUPPORT_PGA | - SIMPLE_AUDIO_SUPPORT_POWER_SUPPLIES, + SIMPLE_AUDIO_SUPPORT_POWER_SUPPLIES | + SIMPLE_AUDIO_SUPPORT_MUTE, .dapm_widgets = simple_amp_stereo_pga_dapm_widgets, .num_dapm_widgets = ARRAY_SIZE(simple_amp_stereo_pga_dapm_widgets), .dapm_routes = simple_amp_stereo_pga_dapm_routes, From 46c21e55c3bf347dda1cc8e1e474f8983106092e Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Wed, 13 May 2026 10:16:56 +0200 Subject: [PATCH 957/972] ASoC: simple-amplifier: gpio-audio-amp: Add support for bypass gpio A gpio can be used to control the amplifier bypass feature. Add support for this bypass gpio in the same way as it has been done for the mute gpio. Signed-off-by: Herve Codina Link: https://patch.msgid.link/20260513081702.317117-13-herve.codina@bootlin.com Signed-off-by: Mark Brown --- sound/soc/codecs/simple-amplifier.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/simple-amplifier.c b/sound/soc/codecs/simple-amplifier.c index c8991e340def53..b9c09a04832a8b 100644 --- a/sound/soc/codecs/simple-amplifier.c +++ b/sound/soc/codecs/simple-amplifier.c @@ -24,6 +24,7 @@ struct simple_amp_data { #define SIMPLE_AUDIO_SUPPORT_PGA BIT(0) #define SIMPLE_AUDIO_SUPPORT_POWER_SUPPLIES BIT(1) #define SIMPLE_AUDIO_SUPPORT_MUTE BIT(2) +#define SIMPLE_AUDIO_SUPPORT_BYPASS BIT(3) const struct snd_soc_dapm_widget *dapm_widgets; unsigned int num_dapm_widgets; @@ -35,6 +36,7 @@ struct simple_amp { const struct simple_amp_data *data; struct gpio_desc *gpiod_enable; struct simple_amp_single mute; + struct simple_amp_single bypass; }; static int simple_amp_power_event(struct snd_soc_dapm_widget *w, @@ -303,6 +305,12 @@ static int simple_amp_component_probe(struct snd_soc_component *component) return ret; } + if (simple_amp->bypass.gpio) { + ret = simple_amp_single_add_kcontrol(component, &simple_amp->bypass); + if (ret) + return ret; + } + return 0; } @@ -352,8 +360,15 @@ static int simple_amp_probe(struct platform_device *pdev) return ret; } + if (simple_amp->data->supports & SIMPLE_AUDIO_SUPPORT_BYPASS) { + ret = simple_amp_parse_single_gpio(dev, &simple_amp->bypass, "bypass"); + if (ret) + return ret; + } + /* Set controls name */ simple_amp->mute.control_name = "Switch"; + simple_amp->bypass.control_name = "Bypass Switch"; return devm_snd_soc_register_component(dev, &simple_amp_component_driver, @@ -370,7 +385,8 @@ static const struct simple_amp_data simple_audio_amplifier_data = { static const struct simple_amp_data simple_audio_mono_pga_data = { .supports = SIMPLE_AUDIO_SUPPORT_PGA | SIMPLE_AUDIO_SUPPORT_POWER_SUPPLIES | - SIMPLE_AUDIO_SUPPORT_MUTE, + SIMPLE_AUDIO_SUPPORT_MUTE | + SIMPLE_AUDIO_SUPPORT_BYPASS, .dapm_widgets = simple_amp_mono_pga_dapm_widgets, .num_dapm_widgets = ARRAY_SIZE(simple_amp_mono_pga_dapm_widgets), .dapm_routes = simple_amp_mono_pga_dapm_routes, @@ -380,7 +396,8 @@ static const struct simple_amp_data simple_audio_mono_pga_data = { static const struct simple_amp_data simple_audio_stereo_pga_data = { .supports = SIMPLE_AUDIO_SUPPORT_PGA | SIMPLE_AUDIO_SUPPORT_POWER_SUPPLIES | - SIMPLE_AUDIO_SUPPORT_MUTE, + SIMPLE_AUDIO_SUPPORT_MUTE | + SIMPLE_AUDIO_SUPPORT_BYPASS, .dapm_widgets = simple_amp_stereo_pga_dapm_widgets, .num_dapm_widgets = ARRAY_SIZE(simple_amp_stereo_pga_dapm_widgets), .dapm_routes = simple_amp_stereo_pga_dapm_routes, From 5796026b8e9096158075d2a94667bfbb664d1f89 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Wed, 13 May 2026 10:16:57 +0200 Subject: [PATCH 958/972] ASoC: simple-amplifier: gpio-audio-amp: Add support for basic gain Several gpios can be used to control the amplifier gain. Add basic support for those gpios. This basic support doesn't include any mapping between the GPIOs value and the physical gain value (dB). The support for this kind of mapping will be added later on. Signed-off-by: Herve Codina Link: https://patch.msgid.link/20260513081702.317117-14-herve.codina@bootlin.com Signed-off-by: Mark Brown --- sound/soc/codecs/simple-amplifier.c | 125 ++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/sound/soc/codecs/simple-amplifier.c b/sound/soc/codecs/simple-amplifier.c index b9c09a04832a8b..5b172e520dcd0e 100644 --- a/sound/soc/codecs/simple-amplifier.c +++ b/sound/soc/codecs/simple-amplifier.c @@ -4,6 +4,7 @@ * Author: Jerome Brunet */ +#include #include #include #include @@ -19,6 +20,13 @@ struct simple_amp_single { const char *control_name; }; +struct simple_amp_multi { + struct gpio_descs *gpios; + u32 kctrl_val; + u32 kctrl_max; + const char *control_name; +}; + struct simple_amp_data { unsigned int supports; #define SIMPLE_AUDIO_SUPPORT_PGA BIT(0) @@ -37,6 +45,7 @@ struct simple_amp { struct gpio_desc *gpiod_enable; struct simple_amp_single mute; struct simple_amp_single bypass; + struct simple_amp_multi gain; }; static int simple_amp_power_event(struct snd_soc_dapm_widget *w, @@ -186,6 +195,84 @@ static int simple_amp_single_add_kcontrol(struct snd_soc_component *component, return snd_soc_add_component_controls(component, &control, 1); } +static int simple_amp_multi_kctrl_write_gpios(struct simple_amp_multi *multi, + u32 kctrl_val) +{ + DECLARE_BITMAP(bm, 32); + u32 gpio_val; + + if (kctrl_val > multi->kctrl_max) + return -EINVAL; + + gpio_val = kctrl_val; + bitmap_from_arr32(bm, &gpio_val, multi->gpios->ndescs); + + return gpiod_multi_set_value_cansleep(multi->gpios, bm); +} + +static int simple_amp_multi_kctrl_int_info(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo) +{ + struct simple_amp_multi *multi = (struct simple_amp_multi *)kcontrol->private_value; + + uinfo->count = 1; + uinfo->value.integer.min = 0; + uinfo->value.integer.max = multi->kctrl_max; + uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; + return 0; +} + +static int simple_amp_multi_kctrl_int_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct simple_amp_multi *multi = (struct simple_amp_multi *)kcontrol->private_value; + + ucontrol->value.integer.value[0] = multi->kctrl_val; + return 0; +} + +static int simple_amp_multi_kctrl_int_put(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct simple_amp_multi *multi = (struct simple_amp_multi *)kcontrol->private_value; + u32 kctrl_val; + int ret; + + kctrl_val = ucontrol->value.integer.value[0]; + + if (kctrl_val == multi->kctrl_val) + return 0; + + ret = simple_amp_multi_kctrl_write_gpios(multi, kctrl_val); + if (ret) + return ret; + + multi->kctrl_val = kctrl_val; + + return 1; /* The value changed */ +} + +static int simple_amp_multi_add_kcontrol(struct snd_soc_component *component, + struct simple_amp_multi *multi) +{ + struct snd_kcontrol_new control = { + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, + .name = multi->control_name, + .info = simple_amp_multi_kctrl_int_info, + .get = simple_amp_multi_kctrl_int_get, + .put = simple_amp_multi_kctrl_int_put, + .private_value = (unsigned long)multi, + }; + int ret; + + /* Be consistent between multi->kctrl_val value and the GPIOs value */ + ret = simple_amp_multi_kctrl_write_gpios(multi, multi->kctrl_val); + if (ret) + return ret; + + return snd_soc_add_component_controls(component, &control, 1); +} + static int simple_amp_add_basic_dapm(struct snd_soc_component *component) { struct snd_soc_dapm_context *dapm = snd_soc_component_to_dapm(component); @@ -311,6 +398,12 @@ static int simple_amp_component_probe(struct snd_soc_component *component) return ret; } + if (simple_amp->gain.gpios) { + ret = simple_amp_multi_add_kcontrol(component, &simple_amp->gain); + if (ret) + return ret; + } + return 0; } @@ -333,6 +426,31 @@ static int simple_amp_parse_single_gpio(struct device *dev, return 0; } +static int simple_amp_parse_multi_gpio(struct device *dev, + struct simple_amp_multi *multi, + const char *gpios_property) +{ + /* Start with the value 0 (GPIO inactive). Can be changed later */ + multi->kctrl_val = 0; + multi->gpios = devm_gpiod_get_array_optional(dev, gpios_property, GPIOD_OUT_LOW); + if (IS_ERR(multi->gpios)) + return dev_err_probe(dev, PTR_ERR(multi->gpios), + "Failed to get '%s' gpios\n", + gpios_property); + if (!multi->gpios) + return 0; + + if (multi->gpios->ndescs > 16) + return dev_err_probe(dev, -EINVAL, + "Number of '%s' gpios limited to 16\n", + gpios_property); + + /* Set default value for the kctrl_max. Can be changed later */ + multi->kctrl_max = (1 << multi->gpios->ndescs) - 1; + + return 0; +} + static int simple_amp_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -366,7 +484,14 @@ static int simple_amp_probe(struct platform_device *pdev) return ret; } + if (simple_amp->data->supports & SIMPLE_AUDIO_SUPPORT_PGA) { + ret = simple_amp_parse_multi_gpio(dev, &simple_amp->gain, "gain"); + if (ret) + return ret; + } + /* Set controls name */ + simple_amp->gain.control_name = "Volume"; simple_amp->mute.control_name = "Switch"; simple_amp->bypass.control_name = "Bypass Switch"; From d25e00ecd5c5be3b24cc2637b3e2403da3e1aaa1 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Wed, 13 May 2026 10:16:58 +0200 Subject: [PATCH 959/972] ASoC: simple-amplifier: gpio-audio-amp: Add support for gain-ranges The mapping between physical gain values and gpio values can be expressed using ranges described in the gain-ranges property. This gain-ranges property is an array of ranges. Each range in the array is defined by the first point and last point in the range. Those points are a pair of values, the gpios value and the related gain (dB) value. With that, a given range defines N possible items (from the first point gpios value to the last point gpios value) in order to set a gain from the first point gain value to the last point gain value. Handle this description and the related kcontrol. Signed-off-by: Herve Codina Link: https://patch.msgid.link/20260513081702.317117-15-herve.codina@bootlin.com Signed-off-by: Mark Brown --- sound/soc/codecs/simple-amplifier.c | 317 +++++++++++++++++++++++++++- 1 file changed, 312 insertions(+), 5 deletions(-) diff --git a/sound/soc/codecs/simple-amplifier.c b/sound/soc/codecs/simple-amplifier.c index 5b172e520dcd0e..5759f9bc2f4ffe 100644 --- a/sound/soc/codecs/simple-amplifier.c +++ b/sound/soc/codecs/simple-amplifier.c @@ -7,11 +7,16 @@ #include #include #include +#include +#include #include #include #include #include +#include #include +#include +#include struct simple_amp_single { struct gpio_desc *gpio; @@ -20,11 +25,35 @@ struct simple_amp_single { const char *control_name; }; +struct simple_amp_point { + u32 gpio_val; + int gain_db; +}; + +struct simple_amp_range { + unsigned int nb_points; + struct simple_amp_point min; + struct simple_amp_point max; +}; + +struct simple_amp_ranges { + unsigned int nb_ranges; + struct simple_amp_range *tab_ranges; +}; + +enum simple_amp_mode { + SIMPLE_AMP_MODE_NONE, + SIMPLE_AMP_MODE_RANGES, +}; + struct simple_amp_multi { struct gpio_descs *gpios; u32 kctrl_val; u32 kctrl_max; const char *control_name; + unsigned int *tlv_array; + enum simple_amp_mode mode; + struct simple_amp_ranges ranges; }; struct simple_amp_data { @@ -195,6 +224,32 @@ static int simple_amp_single_add_kcontrol(struct snd_soc_component *component, return snd_soc_add_component_controls(component, &control, 1); } +static u32 simple_amp_multi_ranges_kctrl_to_gpio(u32 kctrl_val, + struct simple_amp_ranges *ranges) +{ + struct simple_amp_range *range; + u32 index = kctrl_val; + unsigned int i; + + for (i = 0; i < ranges->nb_ranges; i++) { + range = &ranges->tab_ranges[i]; + + if (index < range->nb_points) + return (range->max.gpio_val >= range->min.gpio_val) ? + range->min.gpio_val + index : + range->min.gpio_val - index; + + index -= range->nb_points; + } + + /* + * Given index out of possible ranges. This is shouldn't happen. + * Signal the issue and return the maximum value + */ + WARN(1, "kctrl_val %u out of ranges\n", kctrl_val); + return ranges->tab_ranges[ranges->nb_ranges - 1].max.gpio_val; +} + static int simple_amp_multi_kctrl_write_gpios(struct simple_amp_multi *multi, u32 kctrl_val) { @@ -204,7 +259,12 @@ static int simple_amp_multi_kctrl_write_gpios(struct simple_amp_multi *multi, if (kctrl_val > multi->kctrl_max) return -EINVAL; - gpio_val = kctrl_val; + if (multi->mode == SIMPLE_AMP_MODE_RANGES) + gpio_val = simple_amp_multi_ranges_kctrl_to_gpio(kctrl_val, + &multi->ranges); + else + gpio_val = kctrl_val; + bitmap_from_arr32(bm, &gpio_val, multi->gpios->ndescs); return gpiod_multi_set_value_cansleep(multi->gpios, bm); @@ -252,6 +312,38 @@ static int simple_amp_multi_kctrl_int_put(struct snd_kcontrol *kcontrol, return 1; /* The value changed */ } +static unsigned int *simple_amp_alloc_tlv_ranges(const struct simple_amp_ranges *ranges) +{ + unsigned int index; + unsigned int *tlv; + unsigned int *t; + unsigned int i; + + tlv = kzalloc_objs(*tlv, 2 + ranges->nb_ranges * 6, GFP_KERNEL); + if (!tlv) + return NULL; + + t = tlv; + + /* Fill first TLV */ + *t++ = SNDRV_CTL_TLVT_DB_RANGE; /* Tag */ + *t++ = ranges->nb_ranges * 6 * sizeof(*tlv); /* Len */ + /* Ranges are sorted from lower to higher value */ + index = 0; + for (i = 0; i < ranges->nb_ranges; i++) { + /* Fill range item i */ + *t++ = index; /* min */ + index += ranges->tab_ranges[i].nb_points; + *t++ = index - 1; /* max */ + *t++ = SNDRV_CTL_TLVT_DB_MINMAX; /* Tag */ + *t++ = 2 * sizeof(*tlv); /* Len */ + *t++ = ranges->tab_ranges[i].min.gain_db; /* min_dB */ + *t++ = ranges->tab_ranges[i].max.gain_db; /* max_dB */ + } + + return tlv; +} + static int simple_amp_multi_add_kcontrol(struct snd_soc_component *component, struct simple_amp_multi *multi) { @@ -265,12 +357,39 @@ static int simple_amp_multi_add_kcontrol(struct snd_soc_component *component, }; int ret; + switch (multi->mode) { + case SIMPLE_AMP_MODE_RANGES: + multi->tlv_array = simple_amp_alloc_tlv_ranges(&multi->ranges); + if (!multi->tlv_array) + return -ENOMEM; + + control.access = SNDRV_CTL_ELEM_ACCESS_TLV_READ | + SNDRV_CTL_ELEM_ACCESS_READWRITE; + control.tlv.p = multi->tlv_array; + break; + + case SIMPLE_AMP_MODE_NONE: + /* Already set control configuration is enough */ + break; + + default: + return -EINVAL; + } + /* Be consistent between multi->kctrl_val value and the GPIOs value */ ret = simple_amp_multi_kctrl_write_gpios(multi, multi->kctrl_val); if (ret) - return ret; + goto err_free_tlv_array; - return snd_soc_add_component_controls(component, &control, 1); + ret = snd_soc_add_component_controls(component, &control, 1); + if (ret) + goto err_free_tlv_array; + + return 0; + +err_free_tlv_array: + kfree(multi->tlv_array); + return ret; } static int simple_amp_add_basic_dapm(struct snd_soc_component *component) @@ -407,8 +526,17 @@ static int simple_amp_component_probe(struct snd_soc_component *component) return 0; } +static void simple_amp_component_remove(struct snd_soc_component *component) +{ + struct simple_amp *simple_amp = snd_soc_component_get_drvdata(component); + + kfree(simple_amp->gain.tlv_array); + simple_amp->gain.tlv_array = NULL; +} + static const struct snd_soc_component_driver simple_amp_component_driver = { .probe = simple_amp_component_probe, + .remove = simple_amp_component_remove, }; static int simple_amp_parse_single_gpio(struct device *dev, @@ -426,10 +554,179 @@ static int simple_amp_parse_single_gpio(struct device *dev, return 0; } +static int simple_amp_cmp_ranges(const void *a, const void *b) +{ + const struct simple_amp_range *a_range = a; + const struct simple_amp_range *b_range = b; + + /* Ranges a and b don't overlap. This has been already checked */ + + return a_range->min.gain_db - b_range->max.gain_db; +} + +static int simple_amp_check_new_range(const struct simple_amp_range *new_range, + const struct simple_amp_range *tab_ranges, + unsigned int nb_ranges) +{ + unsigned int i; + + for (i = 0; i < nb_ranges; i++) { + /* Check for range overlaps */ + if (new_range->min.gain_db >= tab_ranges[i].min.gain_db && + new_range->min.gain_db <= tab_ranges[i].max.gain_db) + return -EINVAL; + + if (new_range->max.gain_db >= tab_ranges[i].min.gain_db && + new_range->max.gain_db <= tab_ranges[i].max.gain_db) + return -EINVAL; + + if (new_range->min.gain_db <= tab_ranges[i].min.gain_db && + new_range->max.gain_db >= tab_ranges[i].max.gain_db) + return -EINVAL; + } + return 0; +} + +static int simple_amp_parse_ranges(struct device *dev, + struct simple_amp_multi *multi, + const char *ranges_property) +{ + struct simple_amp_ranges *ranges = &multi->ranges; + struct simple_amp_range *range; + struct device_node *np = dev->of_node; + struct simple_amp_point first_point; + unsigned int max_gpio_val; + unsigned int i; + int ret; + u32 u; + s32 s; + + max_gpio_val = (1 << multi->gpios->ndescs) - 1; + + ret = of_property_count_u32_elems(np, ranges_property); + if (ret < 0) + return ret; + + /* The ranges array cannot be empty */ + if (ret == 0) + return -EINVAL; + /* + * One range item is composed of 2 points and each point is composed of + * 2 values. + */ + if (ret % 4) + return -EINVAL; + + ranges->nb_ranges = ret / 4; + + /* The worst case is one range per possible gpio value */ + if (ranges->nb_ranges > max_gpio_val + 1) + return -EINVAL; + + ranges->tab_ranges = devm_kcalloc(dev, ranges->nb_ranges, + sizeof(*ranges->tab_ranges), + GFP_KERNEL); + if (!ranges->tab_ranges) + return -ENOMEM; + + multi->kctrl_max = 0; + for (i = 0; i < ranges->nb_ranges; i++) { + range = &ranges->tab_ranges[i]; + + /* First gpios value */ + ret = of_property_read_u32_index(np, ranges_property, i * 4, &u); + if (ret) + return ret; + if (u > max_gpio_val) + return -EINVAL; + + range->min.gpio_val = u; + + /* First Gain value */ + ret = of_property_read_s32_index(np, ranges_property, i * 4 + 1, &s); + if (ret) + return ret; + + range->min.gain_db = s; + + /* Second gpios value */ + ret = of_property_read_u32_index(np, ranges_property, i * 4 + 2, &u); + if (ret) + return ret; + if (u > max_gpio_val) + return -EINVAL; + + range->max.gpio_val = u; + + /* Second Gain value */ + ret = of_property_read_s32_index(np, ranges_property, i * 4 + 3, &s); + if (ret) + return ret; + + range->max.gain_db = s; + + /* Save the first point for later usage */ + if (i == 0) + first_point = range->min; + + /* Fix min and max if needed */ + if (range->min.gain_db > range->max.gain_db) + swap(range->min, range->max); + + ret = simple_amp_check_new_range(range, ranges->tab_ranges, i); + if (ret) + return ret; + + range->nb_points = abs_diff(range->min.gpio_val, + range->max.gpio_val) + 1; + + multi->kctrl_max += range->nb_points; + } + + multi->kctrl_max -= 1; + + /* Sort the tab_range array by gain_db value */ + sort(ranges->tab_ranges, ranges->nb_ranges, sizeof(*ranges->tab_ranges), + simple_amp_cmp_ranges, NULL); + + /* + * multi->kctrl_val is the index in tab_ranges. + * + * Choose to have the initial amplification value set to the first point + * available in the first range available in the tab_ranges array before + * sorting. + * + * This first point has been identified before sorting. Search for it in + * the sorted array in order to set the multi->kctrl_val initial value. + */ + multi->kctrl_val = 0; + for (i = 0; i < ranges->nb_ranges; i++) { + range = &ranges->tab_ranges[i]; + + if (range->min.gpio_val == first_point.gpio_val && + range->min.gain_db == first_point.gain_db) + break; + + multi->kctrl_val += range->nb_points; + + if (range->max.gpio_val == first_point.gpio_val && + range->max.gain_db == first_point.gain_db) { + multi->kctrl_val--; + break; + } + } + + return 0; +} + static int simple_amp_parse_multi_gpio(struct device *dev, struct simple_amp_multi *multi, - const char *gpios_property) + const char *gpios_property, + const char *ranges_property) { + struct device_node *np = dev->of_node; + int ret; + /* Start with the value 0 (GPIO inactive). Can be changed later */ multi->kctrl_val = 0; multi->gpios = devm_gpiod_get_array_optional(dev, gpios_property, GPIOD_OUT_LOW); @@ -448,6 +745,15 @@ static int simple_amp_parse_multi_gpio(struct device *dev, /* Set default value for the kctrl_max. Can be changed later */ multi->kctrl_max = (1 << multi->gpios->ndescs) - 1; + multi->mode = SIMPLE_AMP_MODE_NONE; + if (of_property_present(np, ranges_property)) { + ret = simple_amp_parse_ranges(dev, multi, ranges_property); + if (ret < 0) + return dev_err_probe(dev, ret, "Failed to parse '%s'\n", + ranges_property); + multi->mode = SIMPLE_AMP_MODE_RANGES; + } + return 0; } @@ -485,7 +791,8 @@ static int simple_amp_probe(struct platform_device *pdev) } if (simple_amp->data->supports & SIMPLE_AUDIO_SUPPORT_PGA) { - ret = simple_amp_parse_multi_gpio(dev, &simple_amp->gain, "gain"); + ret = simple_amp_parse_multi_gpio(dev, &simple_amp->gain, "gain", + "gain-ranges"); if (ret) return ret; } From fb757dfa039fd2adaab5bf0de2df1530531b4060 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Wed, 13 May 2026 10:16:59 +0200 Subject: [PATCH 960/972] ASoC: simple-amplifier: gpio-audio-amp: Add support for gain-labels The possible gain values can be described using labels instead of gain values in dB. Those different labels are attached to a gpio values using the gain-labels property. Using the gain-labels description is mutually exclusive with gain-ranges description used to describe the relationship between gpios values and gain values. Handle the gain-labels description and the related kcontrol. Signed-off-by: Herve Codina Link: https://patch.msgid.link/20260513081702.317117-16-herve.codina@bootlin.com Signed-off-by: Mark Brown --- sound/soc/codecs/simple-amplifier.c | 116 +++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 3 deletions(-) diff --git a/sound/soc/codecs/simple-amplifier.c b/sound/soc/codecs/simple-amplifier.c index 5759f9bc2f4ffe..3785044235bb52 100644 --- a/sound/soc/codecs/simple-amplifier.c +++ b/sound/soc/codecs/simple-amplifier.c @@ -41,9 +41,15 @@ struct simple_amp_ranges { struct simple_amp_range *tab_ranges; }; +struct simple_amp_labels { + unsigned int nb_labels; + const char **tab_labels; +}; + enum simple_amp_mode { SIMPLE_AMP_MODE_NONE, SIMPLE_AMP_MODE_RANGES, + SIMPLE_AMP_MODE_LABELS, }; struct simple_amp_multi { @@ -53,7 +59,10 @@ struct simple_amp_multi { const char *control_name; unsigned int *tlv_array; enum simple_amp_mode mode; - struct simple_amp_ranges ranges; + union { + struct simple_amp_ranges ranges; + struct simple_amp_labels labels; + }; }; struct simple_amp_data { @@ -312,6 +321,45 @@ static int simple_amp_multi_kctrl_int_put(struct snd_kcontrol *kcontrol, return 1; /* The value changed */ } +static int simple_amp_multi_kctrl_enum_info(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo) +{ + struct simple_amp_multi *multi = (struct simple_amp_multi *)kcontrol->private_value; + + return snd_ctl_enum_info(uinfo, 1, multi->labels.nb_labels, + multi->labels.tab_labels); +} + +static int simple_amp_multi_kctrl_enum_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct simple_amp_multi *multi = (struct simple_amp_multi *)kcontrol->private_value; + + ucontrol->value.enumerated.item[0] = multi->kctrl_val; + return 0; +} + +static int simple_amp_multi_kctrl_enum_put(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct simple_amp_multi *multi = (struct simple_amp_multi *)kcontrol->private_value; + u32 kctrl_val; + int ret; + + kctrl_val = ucontrol->value.enumerated.item[0]; + + if (kctrl_val == multi->kctrl_val) + return 0; + + ret = simple_amp_multi_kctrl_write_gpios(multi, kctrl_val); + if (ret) + return ret; + + multi->kctrl_val = kctrl_val; + + return 1; /* The value changed */ +} + static unsigned int *simple_amp_alloc_tlv_ranges(const struct simple_amp_ranges *ranges) { unsigned int index; @@ -368,6 +416,13 @@ static int simple_amp_multi_add_kcontrol(struct snd_soc_component *component, control.tlv.p = multi->tlv_array; break; + case SIMPLE_AMP_MODE_LABELS: + /* Use enumerated values */ + control.info = simple_amp_multi_kctrl_enum_info; + control.get = simple_amp_multi_kctrl_enum_get; + control.put = simple_amp_multi_kctrl_enum_put; + break; + case SIMPLE_AMP_MODE_NONE: /* Already set control configuration is enough */ break; @@ -719,10 +774,44 @@ static int simple_amp_parse_ranges(struct device *dev, return 0; } +static int simple_amp_parse_labels(struct device *dev, + struct simple_amp_multi *multi, + const char *labels_property) +{ + struct simple_amp_labels *labels = &multi->labels; + struct device_node *np = dev->of_node; + int ret; + + ret = of_property_count_strings(np, labels_property); + if (ret < 0) + return ret; + + /* The labels array cannot be empty */ + if (ret == 0) + return -EINVAL; + + labels->nb_labels = ret; + if (labels->nb_labels > (1 << multi->gpios->ndescs)) + return -EINVAL; + + labels->tab_labels = devm_kcalloc(dev, labels->nb_labels, + sizeof(*labels->tab_labels), + GFP_KERNEL); + if (!labels->tab_labels) + return -ENOMEM; + + multi->kctrl_max = labels->nb_labels - 1; + multi->kctrl_val = 0; + + return of_property_read_string_array(np, labels_property, labels->tab_labels, + labels->nb_labels); +} + static int simple_amp_parse_multi_gpio(struct device *dev, struct simple_amp_multi *multi, const char *gpios_property, - const char *ranges_property) + const char *ranges_property, + const char *labels_property) { struct device_node *np = dev->of_node; int ret; @@ -752,6 +841,13 @@ static int simple_amp_parse_multi_gpio(struct device *dev, return dev_err_probe(dev, ret, "Failed to parse '%s'\n", ranges_property); multi->mode = SIMPLE_AMP_MODE_RANGES; + } else if (of_property_present(np, labels_property)) { + ret = simple_amp_parse_labels(dev, multi, labels_property); + if (ret < 0) + return dev_err_probe(dev, ret, "Failed to parse '%s'\n", + labels_property); + + multi->mode = SIMPLE_AMP_MODE_LABELS; } return 0; @@ -792,7 +888,7 @@ static int simple_amp_probe(struct platform_device *pdev) if (simple_amp->data->supports & SIMPLE_AUDIO_SUPPORT_PGA) { ret = simple_amp_parse_multi_gpio(dev, &simple_amp->gain, "gain", - "gain-ranges"); + "gain-ranges", "gain-labels"); if (ret) return ret; } @@ -802,6 +898,20 @@ static int simple_amp_probe(struct platform_device *pdev) simple_amp->mute.control_name = "Switch"; simple_amp->bypass.control_name = "Bypass Switch"; + if (simple_amp->gain.mode == SIMPLE_AMP_MODE_LABELS) { + /* + * The gain widget control will use enumerated values. + * + * Having just "Voltage" and "Switch" widget names with + * enumerated values and boolean value can confuse ALSA in terms + * of possible values (strings). + * + * Make things clear and avoid the just "Switch" name in that + * case. + */ + simple_amp->mute.control_name = "Out Switch"; + } + return devm_snd_soc_register_component(dev, &simple_amp_component_driver, NULL, 0); From 7b295a6fc63254381504b4abd37ff150eec19bea Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Wed, 13 May 2026 10:17:00 +0200 Subject: [PATCH 961/972] ASoC: simple-amplifier: Update author and copyright After reworking the simple-amplifier driver and adding support for gpio-audio-amp in the driver, add myself as the author of the gpio-audio-amp part of the driver and add a related copyright. Signed-off-by: Herve Codina Link: https://patch.msgid.link/20260513081702.317117-17-herve.codina@bootlin.com Signed-off-by: Mark Brown --- sound/soc/codecs/simple-amplifier.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/simple-amplifier.c b/sound/soc/codecs/simple-amplifier.c index 3785044235bb52..ca0e6ce8cc379b 100644 --- a/sound/soc/codecs/simple-amplifier.c +++ b/sound/soc/codecs/simple-amplifier.c @@ -1,7 +1,12 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (c) 2017 BayLibre, SAS. - * Author: Jerome Brunet + * Support for gpio amplifier + * Copyright 2026 CS GROUP France + * Author: Herve Codina + * + * Basic simple amplifier driver + * Copyright (c) 2017 BayLibre, SAS. + * Author: Jerome Brunet */ #include @@ -967,4 +972,5 @@ module_platform_driver(simple_amp_driver); MODULE_DESCRIPTION("ASoC Simple Audio Amplifier driver"); MODULE_AUTHOR("Jerome Brunet "); +MODULE_AUTHOR("Herve Codina "); MODULE_LICENSE("GPL"); From 5bd8c4b7d3bbe03a56f915f5da95330cb74fa3f6 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Wed, 13 May 2026 10:17:01 +0200 Subject: [PATCH 962/972] MAINTAINERS: Add the ASoC gpio audio amplifier entry After contributing the component, add myself as the maintainer for the ASoC gpio audio amplifier component. Signed-off-by: Herve Codina Link: https://patch.msgid.link/20260513081702.317117-18-herve.codina@bootlin.com Signed-off-by: Mark Brown --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index c2c6d79275c6eb..b7be25c9e6198b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -25032,6 +25032,13 @@ F: include/sound/dmaengine_pcm.h F: sound/core/pcm_dmaengine.c F: sound/soc/soc-generic-dmaengine-pcm.c +SOUND - SOC LAYER / GPIO AUDIO AMPLIFIER +M: Herve Codina +L: linux-sound@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/sound/gpio-audio-amp.yaml +F: sound/soc/codecs/simple-amplifier.c + SOUND - SOC LAYER / DYNAMIC AUDIO POWER MANAGEMENT (ASoC) M: Liam Girdwood M: Mark Brown From e79615b05c78d19b085c8eb7971c82cb5b0f22d1 Mon Sep 17 00:00:00 2001 From: Ezio Galeazzi Date: Sun, 17 May 2026 17:42:36 +0200 Subject: [PATCH 963/972] ALSA: hda/ca0132: add QUIRK_GENERIC path for Gigabyte GA-Z170X-Gaming G1 Some CA0132 implementations (e.g. Gigabyte GA-Z170X-Gaming G1) produce white noise when using the DSP firmware path. Add a QUIRK_GENERIC path that uses the standard HDA generic parser instead, with custom pin configs. This patch applies against v6.18.24. Signed-off-by: Ezio Galeazzi Link: https://lore.kernel.org/0c0b781f-1595-4595-921a-66d83cf5930b@gmail.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/Kconfig | 1 + sound/hda/codecs/ca0132.c | 111 +++++++++++++++++++++++++++++++++----- 2 files changed, 99 insertions(+), 13 deletions(-) diff --git a/sound/hda/codecs/Kconfig b/sound/hda/codecs/Kconfig index addbc94243365b..dcf340e5a0c1ad 100644 --- a/sound/hda/codecs/Kconfig +++ b/sound/hda/codecs/Kconfig @@ -69,6 +69,7 @@ comment "Set to Y if you want auto-loading the codec driver" config SND_HDA_CODEC_CA0132 tristate "Build Creative CA0132 codec support" + select SND_HDA_GENERIC help Say Y or M here to include Creative CA0132 codec support in snd-hda-intel driver. diff --git a/sound/hda/codecs/ca0132.c b/sound/hda/codecs/ca0132.c index ad533b04ab29cf..7d165f2dbd4190 100644 --- a/sound/hda/codecs/ca0132.c +++ b/sound/hda/codecs/ca0132.c @@ -24,6 +24,7 @@ #include "hda_local.h" #include "hda_auto_parser.h" #include "hda_jack.h" +#include "generic.h" #include "ca0132_regs.h" @@ -1060,6 +1061,8 @@ enum dsp_download_state { */ struct ca0132_spec { + struct hda_gen_spec gen; + const struct snd_kcontrol_new *mixers[5]; unsigned int num_mixers; const struct hda_verb *base_init_verbs; @@ -1174,6 +1177,7 @@ enum { QUIRK_R3D, QUIRK_AE5, QUIRK_AE7, + QUIRK_GENERIC, QUIRK_NONE = HDA_FIXUP_ID_NOT_SET, }; @@ -1292,6 +1296,20 @@ static const struct hda_pintbl ae7_pincfgs[] = { {} }; +static const struct hda_pintbl ca0132_generic_pincfgs[] = { + { 0x0b, 0x41014111 }, + { 0x0c, 0x414520f0 }, /* SPDIF out */ + { 0x0d, 0x01014010 }, /* lineout */ + { 0x0e, 0x41c501f0 }, + { 0x0f, 0x411111f0 }, /* disabled */ + { 0x10, 0x411111f0 }, /* disabled */ + { 0x11, 0x41012014 }, + { 0x12, 0x37a790f0 }, /* mic */ + { 0x13, 0x77a701f0 }, + { 0x18, 0x500000f0 }, + {} +}; + static const struct hda_quirk ca0132_quirks[] = { SND_PCI_QUIRK(0x1028, 0x057b, "Alienware M17x R4", QUIRK_ALIENWARE_M17XR4), SND_PCI_QUIRK(0x1028, 0x0685, "Alienware 15 2015", QUIRK_ALIENWARE), @@ -1304,6 +1322,7 @@ static const struct hda_quirk ca0132_quirks[] = { SND_PCI_QUIRK(0x1458, 0xA016, "Recon3Di", QUIRK_R3DI), SND_PCI_QUIRK(0x1458, 0xA026, "Gigabyte G1.Sniper Z97", QUIRK_R3DI), SND_PCI_QUIRK(0x1458, 0xA036, "Gigabyte GA-Z170X-Gaming 7", QUIRK_R3DI), + SND_PCI_QUIRK(0x1458, 0xA046, "Gigabyte GA-Z170X-Gaming G1", QUIRK_GENERIC), SND_PCI_QUIRK(0x3842, 0x1038, "EVGA X99 Classified", QUIRK_R3DI), SND_PCI_QUIRK(0x3842, 0x104b, "EVGA X299 Dark", QUIRK_R3DI), SND_PCI_QUIRK(0x3842, 0x1055, "EVGA Z390 DARK", QUIRK_R3DI), @@ -1325,6 +1344,7 @@ static const struct hda_model_fixup ca0132_quirk_models[] = { { .id = QUIRK_R3D, .name = "r3d" }, { .id = QUIRK_AE5, .name = "ae5" }, { .id = QUIRK_AE7, .name = "ae7" }, + { .id = QUIRK_GENERIC, .name = "generic" }, {} }; @@ -9879,14 +9899,57 @@ static void sbz_detect_quirk(struct hda_codec *codec) } } +static void ca0132_generic_init_hook(struct hda_codec *codec) +{ + struct ca0132_spec *spec = codec->spec; + + snd_hda_sequence_write(codec, spec->spec_init_verbs); +} + +static int ca0132_generic_probe(struct hda_codec *codec) +{ + struct ca0132_spec *spec = codec->spec; + struct auto_pin_cfg *cfg = &spec->gen.autocfg; + int err; + + snd_hda_gen_spec_init(&spec->gen); + + snd_hda_apply_pincfgs(codec, ca0132_generic_pincfgs); + + ca0132_init_chip(codec); + + err = ca0132_prepare_verbs(codec); + if (err < 0) + return err; + + err = snd_hda_parse_pin_def_config(codec, cfg, NULL); + if (err < 0) + return err; + err = snd_hda_gen_parse_auto_config(codec, cfg); + if (err < 0) + return err; + + spec->gen.init_hook = ca0132_generic_init_hook; + spec->gen.automute_speaker = 0; + spec->gen.automute_lo = 0; + + snd_hda_sequence_write(codec, spec->spec_init_verbs); + return 0; +} + static void ca0132_codec_remove(struct hda_codec *codec) { struct ca0132_spec *spec = codec->spec; - if (ca0132_quirk(spec) == QUIRK_ZXR_DBPRO) + switch (ca0132_quirk(spec)) { + case QUIRK_GENERIC: + snd_hda_gen_remove(codec); + return; + case QUIRK_ZXR_DBPRO: return dbpro_free(codec); - else + default: return ca0132_free(codec); + } } static int ca0132_codec_probe(struct hda_codec *codec, @@ -9903,14 +9966,21 @@ static int ca0132_codec_probe(struct hda_codec *codec, codec->spec = spec; spec->codec = codec; - /* Detect codec quirk */ - snd_hda_pick_fixup(codec, ca0132_quirk_models, ca0132_quirks, NULL); - if (ca0132_quirk(spec) == QUIRK_SBZ) - sbz_detect_quirk(codec); - + /* These must be set before any path is taken */ codec->pcm_format_first = 1; codec->no_sticky_stream = 1; + /* Detect codec quirk */ + snd_hda_pick_fixup(codec, ca0132_quirk_models, ca0132_quirks, NULL); + switch (ca0132_quirk(spec)) { + case QUIRK_SBZ: + sbz_detect_quirk(codec); + break; + case QUIRK_GENERIC: + return ca0132_generic_probe(codec); + default: + break; + } spec->dsp_state = DSP_DOWNLOAD_INIT; spec->num_mixers = 1; @@ -10011,36 +10081,51 @@ static int ca0132_codec_build_controls(struct hda_codec *codec) { struct ca0132_spec *spec = codec->spec; - if (ca0132_quirk(spec) == QUIRK_ZXR_DBPRO) + switch (ca0132_quirk(spec)) { + case QUIRK_GENERIC: + return snd_hda_gen_build_controls(codec); + case QUIRK_ZXR_DBPRO: return dbpro_build_controls(codec); - else + default: return ca0132_build_controls(codec); + } } static int ca0132_codec_build_pcms(struct hda_codec *codec) { struct ca0132_spec *spec = codec->spec; - if (ca0132_quirk(spec) == QUIRK_ZXR_DBPRO) + switch (ca0132_quirk(spec)) { + case QUIRK_GENERIC: + return snd_hda_gen_build_pcms(codec); + case QUIRK_ZXR_DBPRO: return dbpro_build_pcms(codec); - else + default: return ca0132_build_pcms(codec); + } } static int ca0132_codec_init(struct hda_codec *codec) { struct ca0132_spec *spec = codec->spec; - if (ca0132_quirk(spec) == QUIRK_ZXR_DBPRO) + switch (ca0132_quirk(spec)) { + case QUIRK_GENERIC: + return snd_hda_gen_init(codec); + case QUIRK_ZXR_DBPRO: return dbpro_init(codec); - else + default: return ca0132_init(codec); + } } static int ca0132_codec_suspend(struct hda_codec *codec) { struct ca0132_spec *spec = codec->spec; + if (ca0132_quirk(spec) == QUIRK_GENERIC) + return 0; + cancel_delayed_work_sync(&spec->unsol_hp_work); return 0; } From 1ab8e422dc779a91acba2d0aafc47b0db6680b4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Mon, 18 May 2026 11:32:05 -0300 Subject: [PATCH 964/972] ALSA: ice1724: Fix blocking open for independent surround PCMs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The independent surround playback open path rejects a substream when the matching PDMA channel is reserved by the multi-channel PDMA0 stream. It currently returns -EBUSY for that case, although the driver has carried a FIXME noting that blocking mode is not handled properly. ALSA PCM open waits and retries only when the low-level open callback returns -EAGAIN. Returning -EBUSY therefore makes blocking opens fail immediately, the same as nonblocking opens. Return -EAGAIN for the temporary PDMA0 reservation conflict. The PCM core continues to report -EBUSY for O_NONBLOCK callers, while blocking callers sleep and retry. Also wake the independent surround PCM wait queue when hw_free releases a PDMA reservation. The reservation can be released by the pro PCM, while waiters are sleeping on the independent surround PCM, so waking the current substream PCM is not sufficient for this cross-PCM reservation. Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260518-ice1724-blocking-open-v1-1-1bfa3e5aa7cf@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/ice1712/ice1724.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/sound/pci/ice1712/ice1724.c b/sound/pci/ice1712/ice1724.c index 2e64f9c020e5a1..79d57938a1c8cb 100644 --- a/sound/pci/ice1712/ice1724.c +++ b/sound/pci/ice1712/ice1724.c @@ -730,13 +730,22 @@ static int snd_vt1724_pcm_hw_params(struct snd_pcm_substream *substream, static int snd_vt1724_pcm_hw_free(struct snd_pcm_substream *substream) { struct snd_ice1712 *ice = snd_pcm_substream_chip(substream); + bool released = false; int i; - guard(mutex)(&ice->open_mutex); - /* unmark surround channels */ - for (i = 0; i < 3; i++) - if (ice->pcm_reserved[i] == substream) + scoped_guard(mutex, &ice->open_mutex) { + /* unmark surround channels */ + for (i = 0; i < 3; i++) { + if (ice->pcm_reserved[i] != substream) + continue; ice->pcm_reserved[i] = NULL; + released = true; + } + } + + if (released && ice->pcm_ds) + wake_up(&ice->pcm_ds->open_wait); + return 0; } @@ -1364,7 +1373,7 @@ static int snd_vt1724_playback_indep_open(struct snd_pcm_substream *substream) scoped_guard(mutex, &ice->open_mutex) { /* already used by PDMA0? */ if (ice->pcm_reserved[substream->number]) - return -EBUSY; /* FIXME: should handle blocking mode properly */ + return -EAGAIN; } runtime->private_data = (void *)&vt1724_playback_dma_regs[substream->number]; ice->playback_con_substream_ds[substream->number] = substream; From 2c515c221a7a2fd0dcb335b9866a98d0bedd9ca5 Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Mon, 18 May 2026 17:46:47 -0700 Subject: [PATCH 965/972] ALSA: oss: Use flexible allocation for PCM plugins Allocate PCM plugin objects with kzalloc_flex() for the trailing extra data area instead of open-coding the size calculation. This keeps the allocation tied to the existing flexible array member without changing the plugin lifetime. Assisted-by: Codex:GPT-5.5 Signed-off-by: Rosen Penev Link: https://patch.msgid.link/20260519004647.627429-1-rosenp@gmail.com Signed-off-by: Takashi Iwai --- sound/core/oss/pcm_plugin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/oss/pcm_plugin.c b/sound/core/oss/pcm_plugin.c index 14b4a390a2192f..5f4d6945a7df64 100644 --- a/sound/core/oss/pcm_plugin.c +++ b/sound/core/oss/pcm_plugin.c @@ -146,7 +146,7 @@ int snd_pcm_plugin_build(struct snd_pcm_substream *plug, return -ENXIO; if (snd_BUG_ON(!src_format || !dst_format)) return -ENXIO; - plugin = kzalloc(sizeof(*plugin) + extra, GFP_KERNEL); + plugin = kzalloc_flex(*plugin, extra_data, extra); if (plugin == NULL) return -ENOMEM; plugin->name = name; From cce8ec8209892b972c743d66180cbe64f19ba83d Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Mon, 18 May 2026 17:48:34 -0700 Subject: [PATCH 966/972] ALSA: usb-audio: Use flexible allocation for FCP packets Allocate FCP request and response packets with kmalloc_flex() for the trailing packet data instead of passing the computed struct size directly to kmalloc(). Keep the computed packet sizes for the USB transfer length checks. Assisted-by: Codex:GPT-5.5 Signed-off-by: Rosen Penev Link: https://patch.msgid.link/20260519004834.627676-1-rosenp@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/fcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/usb/fcp.c b/sound/usb/fcp.c index 0fc4d063c48a58..ea746bdb36ffcf 100644 --- a/sound/usb/fcp.c +++ b/sound/usb/fcp.c @@ -191,13 +191,13 @@ static int fcp_usb(struct usb_mixer_interface *mixer, u32 opcode, struct fcp_usb_packet *req __free(kfree) = NULL; size_t req_buf_size = struct_size(req, data, req_size); - req = kmalloc(req_buf_size, GFP_KERNEL); + req = kmalloc_flex(*req, data, req_size); if (!req) return -ENOMEM; struct fcp_usb_packet *resp __free(kfree) = NULL; size_t resp_buf_size = struct_size(resp, data, resp_size); - resp = kmalloc(resp_buf_size, GFP_KERNEL); + resp = kmalloc_flex(*resp, data, resp_size); if (!resp) return -ENOMEM; From 0c09413bde6ffa07b4b91a660fb6393dfdead82a Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Mon, 18 May 2026 17:49:35 -0700 Subject: [PATCH 967/972] ALSA: usb-audio: Use flexible allocation for Scarlett2 packets Allocate Scarlett2 USB packets and request buffers with the flex allocation helpers for their trailing data arrays. Keep the computed packet sizes where they are still needed for USB transfer lengths. Assisted-by: Codex:GPT-5.5 Signed-off-by: Rosen Penev Link: https://patch.msgid.link/20260519004935.627797-1-rosenp@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/mixer_scarlett2.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sound/usb/mixer_scarlett2.c b/sound/usb/mixer_scarlett2.c index 89dbb403d25534..7c43ca51938e30 100644 --- a/sound/usb/mixer_scarlett2.c +++ b/sound/usb/mixer_scarlett2.c @@ -2614,13 +2614,13 @@ static int scarlett2_usb( struct scarlett2_usb_packet *req __free(kfree) = NULL; size_t req_buf_size = struct_size(req, data, req_size); - req = kmalloc(req_buf_size, GFP_KERNEL); + req = kmalloc_flex(*req, data, req_size); if (!req) return -ENOMEM; struct scarlett2_usb_packet *resp __free(kfree) = NULL; size_t resp_buf_size = struct_size(resp, data, resp_size); - resp = kmalloc(resp_buf_size, GFP_KERNEL); + resp = kmalloc_flex(*resp, data, resp_size); if (!resp) return -ENOMEM; @@ -2830,9 +2830,9 @@ static int scarlett2_usb_set_data_buf( u8 data[]; } __packed *req; int err; - int buf_size = struct_size(req, data, bytes); + size_t buf_size = struct_size(req, data, bytes); - req = kmalloc(buf_size, GFP_KERNEL); + req = kmalloc_flex(*req, data, bytes); if (!req) return -ENOMEM; @@ -9646,7 +9646,7 @@ static long scarlett2_hwdep_write(struct snd_hwdep *hw, /* Create and send the request */ len = struct_size(req, data, count); - req = kzalloc(len, GFP_KERNEL); + req = kzalloc_flex(*req, data, count); if (!req) return -ENOMEM; From 18977c0dd722f52217027ff75de2811c53cce2cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssio=20Gabriel?= Date: Tue, 19 May 2026 00:20:41 -0300 Subject: [PATCH 968/972] ALSA: usx2y: Drain pending US-428 pipe-4 output commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The US-428 pipe-4 output path submits at most one pending p4out entry from the shared-memory ring per input interrupt. If userspace queues more than one command before the interrupt handler runs, later commands remain pending until later input interrupts, even when async pipe-4 URBs are available. Drain pending entries while idle async URBs are available. Copy each command into the existing per-URB async buffer before submission, so the submitted transfer does not depend on a userspace-mapped ring slot remaining unchanged after p4out_sent is advanced. Also update p4out_sent only after usb_submit_urb() succeeds, so a failed submission is not reported as sent. This keeps the shared-memory ABI unchanged and fixes only the local queue-draining behavior. Signed-off-by: Cássio Gabriel Link: https://patch.msgid.link/20260519-alsa-usx2y-p4out-drain-v1-1-8f0a4550bae2@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/usx2y/usbusx2y.c | 39 ++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/sound/usb/usx2y/usbusx2y.c b/sound/usb/usx2y/usbusx2y.c index f34e78910200aa..4190227c5a2a58 100644 --- a/sound/usb/usx2y/usbusx2y.c +++ b/sound/usb/usx2y/usbusx2y.c @@ -180,7 +180,7 @@ static void i_usx2y_in04_int(struct urb *urb) struct usx2ydev *usx2y = urb->context; struct us428ctls_sharedmem *us428ctls = usx2y->us428ctls_sharedmem; struct us428_p4out *p4out; - int i, j, n, diff, send; + int i, j, n, diff, send, len; usx2y->in04_int_calls++; @@ -222,24 +222,31 @@ static void i_usx2y_in04_int(struct urb *urb) } while (!err && usx2y->us04->submitted < usx2y->us04->len); } } else { - if (us428ctls && us428ctls->p4out_last >= 0 && us428ctls->p4out_last < N_US428_P4OUT_BUFS) { - if (us428ctls->p4out_last != us428ctls->p4out_sent) { - send = us428ctls->p4out_sent + 1; - if (send >= N_US428_P4OUT_BUFS) - send = 0; - for (j = 0; j < URBS_ASYNC_SEQ && !err; ++j) { - if (!usx2y->as04.urb[j]->status) { - p4out = us428ctls->p4out + send; // FIXME if more than 1 p4out is new, 1 gets lost. - usb_fill_bulk_urb(usx2y->as04.urb[j], usx2y->dev, - usb_sndbulkpipe(usx2y->dev, 0x04), &p4out->val.vol, - p4out->type == ELT_LIGHT ? sizeof(struct us428_lights) : 5, - i_usx2y_out04_int, usx2y); - err = usb_submit_urb(usx2y->as04.urb[j], GFP_ATOMIC); + while (us428ctls && + us428ctls->p4out_last >= 0 && + us428ctls->p4out_last < N_US428_P4OUT_BUFS && + us428ctls->p4out_last != us428ctls->p4out_sent) { + for (j = 0; j < URBS_ASYNC_SEQ && !err; ++j) { + if (!usx2y->as04.urb[j]->status) { + send = us428ctls->p4out_sent + 1; + if (send >= N_US428_P4OUT_BUFS) + send = 0; + + p4out = us428ctls->p4out + send; + len = p4out->type == ELT_LIGHT ? + sizeof(struct us428_lights) : 5; + memcpy(usx2y->as04.urb[j]->transfer_buffer, + &p4out->val.vol, len); + usx2y->as04.urb[j]->transfer_buffer_length = len; + err = usb_submit_urb(usx2y->as04.urb[j], GFP_ATOMIC); + if (!err) us428ctls->p4out_sent = send; - break; - } + + break; } } + if (j >= URBS_ASYNC_SEQ || err) + break; } } From 75a3e43339a2e66605deb8e726bb5d6372b1798a Mon Sep 17 00:00:00 2001 From: Sheetal Date: Tue, 19 May 2026 09:48:58 +0000 Subject: [PATCH 969/972] ASoC: tegra: tegra210-mixer: Reject too-short fade durations DURATION_INV_N3 is computed from BIT_ULL(31 + prescalar) divided by the configured duration, and then written as a 32-bit RAM value. Durations of 32 samples or less overflow that value, so reject them and advertise the valid range through the fade duration control. Validate the ALSA integer control value as a long before storing it in the driver's u32 duration field. Use a u32 temporary for duration RAM writes because the largest valid inverse value can still exceed INT_MAX. Fixes: 36645381b864 ("ASoC: tegra: Add per-stream Mixer Fade controls") Signed-off-by: Sheetal Link: https://patch.msgid.link/20260519094858.1426057-1-sheetal@nvidia.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra210_mixer.c | 33 +++++++++++++++++++++++++------- sound/soc/tegra/tegra210_mixer.h | 4 +++- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/sound/soc/tegra/tegra210_mixer.c b/sound/soc/tegra/tegra210_mixer.c index bfdd457f740c18..c237ba7531de03 100644 --- a/sound/soc/tegra/tegra210_mixer.c +++ b/sound/soc/tegra/tegra210_mixer.c @@ -150,7 +150,7 @@ static int tegra210_mixer_configure_gain(struct snd_soc_component *cmpnt, /* Write duration parameters */ for (i = 0; i < NUM_DURATION_PARMS; i++) { - int val; + u32 val; if (instant_gain) { val = 1; @@ -181,6 +181,17 @@ static int tegra210_mixer_configure_gain(struct snd_soc_component *cmpnt, return err; } +static int tegra210_mixer_fade_duration_info(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo) +{ + uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; + uinfo->count = 1; + uinfo->value.integer.min = TEGRA210_MIXER_FADE_DURATION_MIN; + uinfo->value.integer.max = TEGRA210_MIXER_FADE_DURATION_MAX; + + return 0; +} + static int tegra210_mixer_get_fade_duration(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { @@ -202,9 +213,10 @@ static int tegra210_mixer_put_fade_duration(struct snd_kcontrol *kcontrol, struct snd_soc_component *cmpnt = snd_kcontrol_chip(kcontrol); struct tegra210_mixer *mixer = snd_soc_component_get_drvdata(cmpnt); unsigned int id = mc->reg; - u32 duration = ucontrol->value.integer.value[0]; + long duration = ucontrol->value.integer.value[0]; - if (duration == 0 || duration > TEGRA210_MIXER_FADE_DURATION_MAX) + if (duration < (long)TEGRA210_MIXER_FADE_DURATION_MIN || + duration > TEGRA210_MIXER_FADE_DURATION_MAX) return -EINVAL; if (mixer->duration[id] == duration) @@ -614,10 +626,17 @@ static int tegra210_mixer_fade_status_info(struct snd_kcontrol *kcontrol, } #define FADE_CTRL(id) \ - SOC_SINGLE_EXT("RX" #id " Fade Duration", (id) - 1, 0, \ - TEGRA210_MIXER_FADE_DURATION_MAX, 0, \ - tegra210_mixer_get_fade_duration, \ - tegra210_mixer_put_fade_duration), \ + { \ + .iface = SNDRV_CTL_ELEM_IFACE_MIXER, \ + .name = "RX" #id " Fade Duration", \ + .info = tegra210_mixer_fade_duration_info, \ + .get = tegra210_mixer_get_fade_duration, \ + .put = tegra210_mixer_put_fade_duration, \ + .private_value = SOC_SINGLE_VALUE((id) - 1, 0, \ + TEGRA210_MIXER_FADE_DURATION_MIN, \ + TEGRA210_MIXER_FADE_DURATION_MAX, \ + 0, 0), \ + }, \ SOC_SINGLE_EXT("RX" #id " Fade Gain", (id) - 1, 0, \ TEGRA210_MIXER_GAIN_MAX, 0, \ tegra210_mixer_get_fade_gain, \ diff --git a/sound/soc/tegra/tegra210_mixer.h b/sound/soc/tegra/tegra210_mixer.h index bcbad08cbb9d32..d9c8fa6124ded4 100644 --- a/sound/soc/tegra/tegra210_mixer.h +++ b/sound/soc/tegra/tegra210_mixer.h @@ -87,9 +87,11 @@ #define NUM_GAIN_POLY_COEFFS 9 #define TEGRA210_MIXER_GAIN_MAX 0x20000 -#define TEGRA210_MIXER_FADE_DURATION_MAX 0x7fffffff #define TEGRA210_MIXER_PRESCALAR 6 +#define TEGRA210_MIXER_FADE_DURATION_MIN \ + (BIT(TEGRA210_MIXER_PRESCALAR - 1) + 1) +#define TEGRA210_MIXER_FADE_DURATION_MAX 0x7fffffff #define TEGRA210_MIXER_FADE_IDLE 0 #define TEGRA210_MIXER_FADE_ACTIVE 1 From a6f7c21accb5fca895ee0d490e395deecaddf12f Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 19 May 2026 14:54:33 +0100 Subject: [PATCH 970/972] ASoC: cs35l56-shared-test: Subtract reg_base offset in dummy regmap Subtract the value of cs35l56 regmap_config->reg_base from addresses passed into the mock regmap bus. Chip register addresses transferred over SoundWire are offset by 0x8000 to move them after the address range reserved in the SoundWire spec. This commit prepares for changing the cs35l56-sdw driver to use regmap_config->reg_base to add this offset. When that is done the addresses passed into the mock regmap_bus will include this offset. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20260519135435.479949-2-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l56-shared-test.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sound/soc/codecs/cs35l56-shared-test.c b/sound/soc/codecs/cs35l56-shared-test.c index cfe7938065f90e..8060a275d32bd4 100644 --- a/sound/soc/codecs/cs35l56-shared-test.c +++ b/sound/soc/codecs/cs35l56-shared-test.c @@ -29,6 +29,7 @@ struct cs35l56_shared_test_priv { struct faux_device *gpio_dev; struct cs35l56_shared_test_mock_gpio *gpio_priv; struct regmap *registers; + unsigned int reg_offset; struct cs35l56_base *cs35l56_base; u8 applied_pad_pull_state[CS35L56_MAX_GPIO]; }; @@ -194,6 +195,8 @@ static int cs35l56_shared_test_reg_read(void *context, unsigned int reg, unsigne { struct cs35l56_shared_test_priv *priv = context; + reg -= priv->reg_offset; + switch (reg) { case CS35L56_SYNC_GPIO1_CFG ... CS35L56_ASP2_DIO_GPIO13_CFG: case CS35L56_GPIO1_CTRL1 ... CS35L56_GPIO13_CTRL1: @@ -214,6 +217,8 @@ static int cs35l56_shared_test_reg_write(void *context, unsigned int reg, unsign { struct cs35l56_shared_test_priv *priv = context; + reg -= priv->reg_offset; + switch (reg) { case CS35L56_UPDATE_REGS: return cs35l56_shared_test_updt_gpio_pres(priv, reg, val); @@ -657,6 +662,7 @@ static int cs35l56_shared_test_case_base_init(struct kunit *test, u8 type, u8 re test->priv = priv; priv->test = test; + priv->reg_offset = regmap_config->reg_base; /* Create dummy amp driver dev */ priv->amp_dev = faux_device_create("cs35l56_shared_test_drv", NULL, NULL); From 7f9ae6d20525d81ecefce5ffd6d982c8b79e7cbc Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 19 May 2026 14:54:34 +0100 Subject: [PATCH 971/972] ASoC: cs35l56: Use reg_base to offset addresses on SoundWire Set the reg_base member of regmap_config for SoundWire so that the regmap core will apply the 0x8000 offset to addresses, instead of doing it within our low-level regmap read/write callbacks. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20260519135435.479949-3-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l56-sdw.c | 7 +------ sound/soc/codecs/cs35l56-shared.c | 2 ++ 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/sound/soc/codecs/cs35l56-sdw.c b/sound/soc/codecs/cs35l56-sdw.c index d5b9e5f71de29d..d9dcca1e952fe3 100644 --- a/sound/soc/codecs/cs35l56-sdw.c +++ b/sound/soc/codecs/cs35l56-sdw.c @@ -59,8 +59,6 @@ static int cs35l56_sdw_slow_read(struct sdw_slave *peripheral, unsigned int reg, { int ret, i; - reg += CS35L56_SDW_ADDR_OFFSET; - for (i = 0; i < val_size; i += sizeof(u32)) { /* Poll for bus bridge idle */ ret = cs35l56_sdw_poll_mem_status(peripheral, @@ -123,11 +121,9 @@ static int cs35l56_sdw_read(void *context, const void *reg_buf, reg = le32_to_cpu(*(const __le32 *)reg_buf); - if (cs35l56_is_otp_register(reg)) + if (cs35l56_is_otp_register(reg - CS35L56_SDW_ADDR_OFFSET)) return cs35l56_sdw_slow_read(peripheral, reg, buf8, val_size); - reg += CS35L56_SDW_ADDR_OFFSET; - if (val_size == 4) return cs35l56_sdw_read_one(peripheral, reg, val_buf); @@ -186,7 +182,6 @@ static int cs35l56_sdw_gather_write(void *context, int ret; reg = le32_to_cpu(*(const __le32 *)reg_buf); - reg += CS35L56_SDW_ADDR_OFFSET; if (val_size == 4) return cs35l56_sdw_write_one(peripheral, reg, src_be); diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c index 795e2764d67ec8..8e3538e28fade3 100644 --- a/sound/soc/codecs/cs35l56-shared.c +++ b/sound/soc/codecs/cs35l56-shared.c @@ -1880,6 +1880,7 @@ EXPORT_SYMBOL_NS_GPL(cs35l56_regmap_spi, "SND_SOC_CS35L56_SHARED"); const struct regmap_config cs35l56_regmap_sdw = { .reg_bits = 32, + .reg_base = 0x8000, .val_bits = 32, .reg_stride = 4, .reg_format_endian = REGMAP_ENDIAN_LITTLE, @@ -1915,6 +1916,7 @@ const struct regmap_config cs35l63_regmap_sdw = { .reg_bits = 32, .val_bits = 32, .reg_stride = 4, + .reg_base = 0x8000, .reg_format_endian = REGMAP_ENDIAN_LITTLE, .val_format_endian = REGMAP_ENDIAN_BIG, .max_register = CS35L56_DSP1_PMEM_5114, From eb65b5ad932431609b34cfe41f49b55a16e42d58 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 19 May 2026 14:54:35 +0100 Subject: [PATCH 972/972] ASoC: cs35l56: Use standard SoundWire regmap implementation Use the regmap_sdw implementation for SoundWire instead of re-implementing the low-level bus transactions in cs35l56-sdw.c The cs35l56 registers are big-endian on I2C and SPI but little-endian over SoundWire. The firmware files are all big-endian and contain opaque blobs in big-endian order. So these must be endian-swapped to transfer over SoundWire. A custom regmap bus implementation is used to do this endian-swapping. The original implementation of this custom regmap bus was a complete bus backend, performing the endian swapping and low-level SoundWire bus read/write. This commit changes the custom regmap bus to only perform the endian-swap. It uses an underlying simple uncached regmap_sdw bus to deal with transferring the 32-bit registers over the SoundWire bus. Although this adds a small amount of overhead, from passing through the regmap APIs twice, it avoids having a local duplicate implementation of what regmap_sdw already does. The slow-read handling for OTP registers must access 8-bit SoundWire registers so it still uses low-level SoundWire bus reads. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20260519135435.479949-4-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/Kconfig | 2 +- sound/soc/codecs/cs35l56-sdw.c | 132 ++++++++++++--------------------- sound/soc/codecs/cs35l56.h | 1 + 3 files changed, 51 insertions(+), 84 deletions(-) diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig index 5fdd0334c3559a..a7c61f7c7f4c5c 100644 --- a/sound/soc/codecs/Kconfig +++ b/sound/soc/codecs/Kconfig @@ -887,7 +887,7 @@ config SND_SOC_CS35L56_SPI config SND_SOC_CS35L56_SDW tristate "Cirrus Logic CS35L56 CODEC (SDW)" depends on SOUNDWIRE - select REGMAP + select REGMAP_SOUNDWIRE select SND_SOC_CS35L56 select SND_SOC_CS35L56_SHARED help diff --git a/sound/soc/codecs/cs35l56-sdw.c b/sound/soc/codecs/cs35l56-sdw.c index d9dcca1e952fe3..d2b82a846ae8f8 100644 --- a/sound/soc/codecs/cs35l56-sdw.c +++ b/sound/soc/codecs/cs35l56-sdw.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "cs35l56.h" @@ -95,55 +96,23 @@ static int cs35l56_sdw_slow_read(struct sdw_slave *peripheral, unsigned int reg, return 0; } -static int cs35l56_sdw_read_one(struct sdw_slave *peripheral, unsigned int reg, void *buf) -{ - int ret; - - ret = sdw_nread_no_pm(peripheral, reg, 4, (u8 *)buf); - if (ret != 0) { - dev_err(&peripheral->dev, "Read failed @%#x:%d\n", reg, ret); - return ret; - } - - swab32s((u32 *)buf); - - return 0; -} - static int cs35l56_sdw_read(void *context, const void *reg_buf, const size_t reg_size, void *val_buf, size_t val_size) { struct sdw_slave *peripheral = context; - u8 *buf8 = val_buf; - unsigned int reg, bytes; + struct cs35l56_private *cs35l56 = dev_get_drvdata(&peripheral->dev); + unsigned int reg_addr = get_unaligned_le32(reg_buf); int ret; - reg = le32_to_cpu(*(const __le32 *)reg_buf); + if (cs35l56_is_otp_register(reg_addr - CS35L56_SDW_ADDR_OFFSET)) + return cs35l56_sdw_slow_read(peripheral, reg_addr, (u8 *)val_buf, val_size); - if (cs35l56_is_otp_register(reg - CS35L56_SDW_ADDR_OFFSET)) - return cs35l56_sdw_slow_read(peripheral, reg, buf8, val_size); - - if (val_size == 4) - return cs35l56_sdw_read_one(peripheral, reg, val_buf); - - while (val_size) { - bytes = SDW_REG_NO_PAGE - (reg & SDW_REGADDR); /* to end of page */ - if (bytes > val_size) - bytes = val_size; - - ret = sdw_nread_no_pm(peripheral, reg, bytes, buf8); - if (ret != 0) { - dev_err(&peripheral->dev, "Read failed @%#x..%#x:%d\n", - reg, reg + bytes - 1, ret); - return ret; - } + ret = regmap_raw_read(cs35l56->sdw_bus_regmap, reg_addr, val_buf, val_size); + if (ret) + return ret; - swab32_array((u32 *)buf8, bytes / 4); - val_size -= bytes; - reg += bytes; - buf8 += bytes; - } + swab32_array((u32 *)val_buf, val_size / sizeof(u32)); return 0; } @@ -157,57 +126,34 @@ static inline void cs35l56_swab_copy(void *dest, const void *src, size_t nbytes) *dest32++ = swab32(*src32++); } -static int cs35l56_sdw_write_one(struct sdw_slave *peripheral, unsigned int reg, const void *buf) -{ - u32 val_le = swab32(*(u32 *)buf); - int ret; - - ret = sdw_nwrite_no_pm(peripheral, reg, 4, (u8 *)&val_le); - if (ret != 0) { - dev_err(&peripheral->dev, "Write failed @%#x:%d\n", reg, ret); - return ret; - } - - return 0; -} - static int cs35l56_sdw_gather_write(void *context, const void *reg_buf, size_t reg_size, const void *val_buf, size_t val_size) { struct sdw_slave *peripheral = context; - const u8 *src_be = val_buf; - u32 val_le_buf[64]; /* Define u32 so it is 32-bit aligned */ - unsigned int reg, bytes; + struct cs35l56_private *cs35l56 = dev_get_drvdata(&peripheral->dev); + unsigned int reg_addr = get_unaligned_le32(reg_buf); + u32 swab_buf[64]; /* Define u32 so it is 32-bit aligned */ int ret; - reg = le32_to_cpu(*(const __le32 *)reg_buf); - - if (val_size == 4) - return cs35l56_sdw_write_one(peripheral, reg, src_be); - - while (val_size) { - bytes = SDW_REG_NO_PAGE - (reg & SDW_REGADDR); /* to end of page */ - if (bytes > val_size) - bytes = val_size; - if (bytes > sizeof(val_le_buf)) - bytes = sizeof(val_le_buf); - - cs35l56_swab_copy(val_le_buf, src_be, bytes); - - ret = sdw_nwrite_no_pm(peripheral, reg, bytes, (u8 *)val_le_buf); - if (ret != 0) { - dev_err(&peripheral->dev, "Write failed @%#x..%#x:%d\n", - reg, reg + bytes - 1, ret); + while (val_size > sizeof(swab_buf)) { + cs35l56_swab_copy(swab_buf, val_buf, sizeof(swab_buf)); + ret = regmap_raw_write(cs35l56->sdw_bus_regmap, reg_addr, + swab_buf, sizeof(swab_buf)); + if (ret) return ret; - } - val_size -= bytes; - reg += bytes; - src_be += bytes; + val_size -= sizeof(swab_buf); + reg_addr += sizeof(swab_buf); + val_buf += sizeof(swab_buf); } - return 0; + if (val_size == 0) + return 0; + + cs35l56_swab_copy(swab_buf, val_buf, val_size); + + return regmap_raw_write(cs35l56->sdw_bus_regmap, reg_addr, swab_buf, val_size); } static int cs35l56_sdw_write(void *context, const void *val_buf, size_t val_size) @@ -226,7 +172,7 @@ static int cs35l56_sdw_write(void *context, const void *val_buf, size_t val_size * byte controls always have the same byte order, and firmware file blobs * can be written verbatim. */ -static const struct regmap_bus cs35l56_regmap_bus_sdw = { +static const struct regmap_bus cs35l56_regmap_swab_bus_sdw = { .read = cs35l56_sdw_read, .write = cs35l56_sdw_write, .gather_write = cs35l56_sdw_gather_write, @@ -234,6 +180,18 @@ static const struct regmap_bus cs35l56_regmap_bus_sdw = { .val_format_endian_default = REGMAP_ENDIAN_BIG, }; +/* Low-level SoundWire regmap to transfer the data over the bus */ +static const struct regmap_config cs35l56_sdw_bus_regmap = { + .name = "sdw-le32", + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, + .reg_format_endian = REGMAP_ENDIAN_LITTLE, + .val_format_endian = REGMAP_ENDIAN_LITTLE, + .max_register = CS35L56_DSP1_PMEM_5114 + 0x8000, + .cache_type = REGCACHE_NONE, +}; + static int cs35l56_sdw_get_unique_id(struct cs35l56_private *cs35l56) { int ret; @@ -555,8 +513,16 @@ static int cs35l56_sdw_probe(struct sdw_slave *peripheral, const struct sdw_devi cs35l56->base.type = ((unsigned int)id->driver_data) & 0xff; - cs35l56->base.regmap = devm_regmap_init(dev, &cs35l56_regmap_bus_sdw, - peripheral, regmap_config); + /* Low-level regmap to transfer read/writes over SoundWire bus */ + cs35l56->sdw_bus_regmap = devm_regmap_init_sdw(peripheral, &cs35l56_sdw_bus_regmap); + if (IS_ERR(cs35l56->sdw_bus_regmap)) { + ret = PTR_ERR(cs35l56->sdw_bus_regmap); + return dev_err_probe(dev, ret, "Failed to allocate bus register map\n"); + } + + /* Wrapper regmap to simulate big-endian ordering */ + cs35l56->base.regmap = devm_regmap_init(dev, &cs35l56_regmap_swab_bus_sdw, + peripheral, regmap_config); if (IS_ERR(cs35l56->base.regmap)) { ret = PTR_ERR(cs35l56->base.regmap); return dev_err_probe(dev, ret, "Failed to allocate register map\n"); diff --git a/sound/soc/codecs/cs35l56.h b/sound/soc/codecs/cs35l56.h index cd71b23b2a3ace..d029fa3f86565f 100644 --- a/sound/soc/codecs/cs35l56.h +++ b/sound/soc/codecs/cs35l56.h @@ -37,6 +37,7 @@ struct cs35l56_private { struct snd_soc_component *component; struct regulator_bulk_data supplies[CS35L56_NUM_BULK_SUPPLIES]; struct sdw_slave *sdw_peripheral; + struct regmap *sdw_bus_regmap; const char *fallback_fw_suffix; struct work_struct sdw_irq_work; bool sdw_irq_no_unmask;