From 247981eecd3dd6ff51bd0a0223deba8af39c5498 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Wed, 10 Sep 2025 20:37:16 +0000 Subject: [PATCH 01/51] net: Use NAPI_* in test_bit when stopping napi kthread napi_stop_kthread waits for the NAPI_STATE_SCHED_THREADED to be unset before stopping the kthread. But it uses test_bit with the NAPIF_STATE_SCHED_THREADED and that might stop the kthread early before the flag is unset. Use the NAPI_* variant of the NAPI state bits in test_bit instead. Tested: ./tools/testing/selftests/net/nl_netdev.py TAP version 13 1..7 ok 1 nl_netdev.empty_check ok 2 nl_netdev.lo_check ok 3 nl_netdev.page_pool_check ok 4 nl_netdev.napi_list_check ok 5 nl_netdev.dev_set_threaded ok 6 nl_netdev.napi_set_threaded ok 7 nl_netdev.nsim_rxq_reset_down # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0 ./tools/testing/selftests/drivers/net/napi_threaded.py TAP version 13 1..2 ok 1 napi_threaded.change_num_queues ok 2 napi_threaded.enable_dev_threaded_disable_napi_threaded # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0 Fixes: 689883de94dd ("net: stop napi kthreads when THREADED napi is disabled") Signed-off-by: Samiullah Khawaja Link: https://patch.msgid.link/20250910203716.1016546-1-skhawaja@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 93a25d87b86b..8d49b2198d07 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6965,7 +6965,7 @@ static void napi_stop_kthread(struct napi_struct *napi) * the kthread. */ while (true) { - if (!test_bit(NAPIF_STATE_SCHED_THREADED, &napi->state)) + if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) break; msleep(20); From 5577352b55833d0f4350eb5d62eda2df09e84922 Mon Sep 17 00:00:00 2001 From: Li Tian Date: Wed, 10 Sep 2025 08:37:32 +0800 Subject: [PATCH 02/51] net/mlx5: Not returning mlx5_link_info table when speed is unknown Because mlx5e_link_info and mlx5e_ext_link_info have holes e.g. Azure mlx5 reports PTYS 19. Do not return it unless speed is retrieved successfully. Fixes: 65a5d35571849 ("net/mlx5: Refactor link speed handling with mlx5_link_info struct") Suggested-by: Vitaly Kuznetsov Signed-off-by: Li Tian Reviewed-by: Tariq Toukan Link: https://patch.msgid.link/20250910003732.5973-1-litian@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/port.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 2d7adf7444ba..aa9f2b0a77d3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -1170,7 +1170,11 @@ const struct mlx5_link_info *mlx5_port_ptys2info(struct mlx5_core_dev *mdev, mlx5e_port_get_link_mode_info_arr(mdev, &table, &max_size, force_legacy); i = find_first_bit(&temp, max_size); - if (i < max_size) + + /* mlx5e_link_info has holes. Check speed + * is not zero as indication of one. + */ + if (i < max_size && table[i].speed) return &table[i]; return NULL; From 2690cb089502b80b905f2abdafd1bf2d54e1abef Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Wed, 10 Sep 2025 17:48:25 +0300 Subject: [PATCH 03/51] dpaa2-switch: fix buffer pool seeding for control traffic Starting with commit c50e7475961c ("dpaa2-switch: Fix error checking in dpaa2_switch_seed_bp()"), the probing of a second DPSW object errors out like below. fsl_dpaa2_switch dpsw.1: fsl_mc_driver_probe failed: -12 fsl_dpaa2_switch dpsw.1: probe with driver fsl_dpaa2_switch failed with error -12 The aforementioned commit brought to the surface the fact that seeding buffers into the buffer pool destined for control traffic is not successful and an access violation recoverable error can be seen in the MC firmware log: [E, qbman_rec_isr:391, QBMAN] QBMAN recoverable event 0x1000000 This happens because the driver incorrectly used the ID of the DPBP object instead of the hardware buffer pool ID when trying to release buffers into it. This is because any DPSW object uses two buffer pools, one managed by the Linux driver and destined for control traffic packet buffers and the other one managed by the MC firmware and destined only for offloaded traffic. And since the buffer pool managed by the MC firmware does not have an external facing DPBP equivalent, any subsequent DPBP objects created after the first DPSW will have a DPBP id different to the underlying hardware buffer ID. The issue was not caught earlier because these two numbers can be identical when all DPBP objects are created before the DPSW objects are. This is the case when the DPL file is used to describe the entire DPAA2 object layout and objects are created at boot time and it's also true for the first DPSW being created dynamically using ls-addsw. Fix this by using the buffer pool ID instead of the DPBP id when releasing buffers into the pool. Fixes: 2877e4f7e189 ("staging: dpaa2-switch: setup buffer pool and RX path rings") Signed-off-by: Ioana Ciornei Link: https://patch.msgid.link/20250910144825.2416019-1-ioana.ciornei@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c index 4643a3380618..b1e1ad9e4b48 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c @@ -2736,7 +2736,7 @@ static int dpaa2_switch_setup_dpbp(struct ethsw_core *ethsw) dev_err(dev, "dpsw_ctrl_if_set_pools() failed\n"); goto err_get_attr; } - ethsw->bpid = dpbp_attrs.id; + ethsw->bpid = dpbp_attrs.bpid; return 0; From 201825fb4278accb6ace42915566c22391a0900d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 11 Sep 2025 15:43:15 +0100 Subject: [PATCH 04/51] net: ethtool: handle EOPNOTSUPP from ethtool get_ts_info() method Network drivers sometimes return -EOPNOTSUPP from their get_ts_info() method, and this should not cause the reporting of PHY timestamping information to be prohibited. Handle this error code, and also arrange for ethtool_net_get_ts_info_by_phc() to return -EOPNOTSUPP when the method is not implemented. This allows e.g. PHYs connected to DSA switches which support timestamping to report their timestamping capabilities. Fixes: b9e3f7dc9ed9 ("net: ethtool: tsinfo: Enhance tsinfo to support several hwtstamp by net topology") Signed-off-by: Russell King (Oracle) Reviewed-by: Kory Maincent Link: https://patch.msgid.link/E1uwiW3-00000004jRF-3CnC@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- net/ethtool/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 4f58648a27ad..92e6a681c797 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -905,7 +905,7 @@ int ethtool_net_get_ts_info_by_phc(struct net_device *dev, int err; if (!ops->get_ts_info) - return -ENODEV; + return -EOPNOTSUPP; /* Does ptp comes from netdev */ ethtool_init_tsinfo(info); @@ -973,7 +973,7 @@ int ethtool_get_ts_info_by_phc(struct net_device *dev, int err; err = ethtool_net_get_ts_info_by_phc(dev, info, hwprov_desc); - if (err == -ENODEV) { + if (err == -ENODEV || err == -EOPNOTSUPP) { struct phy_device *phy; phy = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc); From 2e7bba08923ebc675b1f0e0e0959e68e53047838 Mon Sep 17 00:00:00 2001 From: Anderson Nascimento Date: Thu, 11 Sep 2025 20:07:44 -0300 Subject: [PATCH 05/51] net/tcp: Fix a NULL pointer dereference when using TCP-AO with TCP_REPAIR A NULL pointer dereference can occur in tcp_ao_finish_connect() during a connect() system call on a socket with a TCP-AO key added and TCP_REPAIR enabled. The function is called with skb being NULL and attempts to dereference it on tcp_hdr(skb)->seq without a prior skb validation. Fix this by checking if skb is NULL before dereferencing it. The commentary is taken from bpf_skops_established(), which is also called in the same flow. Unlike the function being patched, bpf_skops_established() validates the skb before dereferencing it. int main(void){ struct sockaddr_in sockaddr; struct tcp_ao_add tcp_ao; int sk; int one = 1; memset(&sockaddr,'\0',sizeof(sockaddr)); memset(&tcp_ao,'\0',sizeof(tcp_ao)); sk = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); sockaddr.sin_family = AF_INET; memcpy(tcp_ao.alg_name,"cmac(aes128)",12); memcpy(tcp_ao.key,"ABCDEFGHABCDEFGH",16); tcp_ao.keylen = 16; memcpy(&tcp_ao.addr,&sockaddr,sizeof(sockaddr)); setsockopt(sk, IPPROTO_TCP, TCP_AO_ADD_KEY, &tcp_ao, sizeof(tcp_ao)); setsockopt(sk, IPPROTO_TCP, TCP_REPAIR, &one, sizeof(one)); sockaddr.sin_family = AF_INET; sockaddr.sin_port = htobe16(123); inet_aton("127.0.0.1", &sockaddr.sin_addr); connect(sk,(struct sockaddr *)&sockaddr,sizeof(sockaddr)); return 0; } $ gcc tcp-ao-nullptr.c -o tcp-ao-nullptr -Wall $ unshare -Urn BUG: kernel NULL pointer dereference, address: 00000000000000b6 PGD 1f648d067 P4D 1f648d067 PUD 1982e8067 PMD 0 Oops: Oops: 0000 [#1] SMP NOPTI Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 11/12/2020 RIP: 0010:tcp_ao_finish_connect (net/ipv4/tcp_ao.c:1182) Fixes: 7c2ffaf21bd6 ("net/tcp: Calculate TCP-AO traffic keys") Signed-off-by: Anderson Nascimento Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250911230743.2551-3-anderson@allelesecurity.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ao.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index bbb8d5f0eae7..3338b6cc85c4 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -1178,7 +1178,9 @@ void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb) if (!ao) return; - WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq); + /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ + if (skb) + WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq); ao->rcv_sne = 0; hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) From 70d99623d5c11e1a9bcc564b8fbad6fa916913d8 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Fri, 12 Sep 2025 11:33:31 +0200 Subject: [PATCH 06/51] dpll: fix clock quality level reporting The DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_EPRC is not reported via netlink due to bug in dpll_msg_add_clock_quality_level(). The usage of DPLL_CLOCK_QUALITY_LEVEL_MAX for both DECLARE_BITMAP() and for_each_set_bit() is not correct because these macros requires bitmap size and not the highest valid bit in the bitmap. Use correct bitmap size to fix this issue. Fixes: a1afb959add1 ("dpll: add clock quality level attribute and op") Signed-off-by: Ivan Vecera Reviewed-by: Arkadiusz Kubalewski Link: https://patch.msgid.link/20250912093331.862333-1-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- drivers/dpll/dpll_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index 036f21cac0a9..0a852011653c 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -211,8 +211,8 @@ static int dpll_msg_add_clock_quality_level(struct sk_buff *msg, struct dpll_device *dpll, struct netlink_ext_ack *extack) { + DECLARE_BITMAP(qls, DPLL_CLOCK_QUALITY_LEVEL_MAX + 1) = { 0 }; const struct dpll_device_ops *ops = dpll_device_ops(dpll); - DECLARE_BITMAP(qls, DPLL_CLOCK_QUALITY_LEVEL_MAX) = { 0 }; enum dpll_clock_quality_level ql; int ret; @@ -221,7 +221,7 @@ dpll_msg_add_clock_quality_level(struct sk_buff *msg, struct dpll_device *dpll, ret = ops->clock_quality_level_get(dpll, dpll_priv(dpll), qls, extack); if (ret) return ret; - for_each_set_bit(ql, qls, DPLL_CLOCK_QUALITY_LEVEL_MAX) + for_each_set_bit(ql, qls, DPLL_CLOCK_QUALITY_LEVEL_MAX + 1) if (nla_put_u32(msg, DPLL_A_CLOCK_QUALITY_LEVEL, ql)) return -EMSGSIZE; From 64863f4ca4945bdb62ce2b30823f39ea9fe95415 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 11 Sep 2025 23:58:16 +0100 Subject: [PATCH 07/51] rxrpc: Fix unhandled errors in rxgk_verify_packet_integrity() rxgk_verify_packet_integrity() may get more errors than just -EPROTO from rxgk_verify_mic_skb(). Pretty much anything other than -ENOMEM constitutes an unrecoverable error. In the case of -ENOMEM, we can just drop the packet and wait for a retransmission. Similar happens with rxgk_decrypt_skb() and its callers. Fix rxgk_decrypt_skb() or rxgk_verify_mic_skb() to return a greater variety of abort codes and fix their callers to abort the connection on any error apart from -ENOMEM. Also preclear the variables used to hold the abort code returned from rxgk_decrypt_skb() or rxgk_verify_mic_skb() to eliminate uninitialised variable warnings. Fixes: 9d1d2b59341f ("rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)") Reported-by: Dan Carpenter Closes: https://lists.infradead.org/pipermail/linux-afs/2025-April/009739.html Closes: https://lists.infradead.org/pipermail/linux-afs/2025-April/009740.html Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Reviewed-by: Simon Horman Link: https://patch.msgid.link/2038804.1757631496@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/rxgk.c | 18 ++++++++++-------- net/rxrpc/rxgk_app.c | 10 ++++++---- net/rxrpc/rxgk_common.h | 14 ++++++++++++-- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 1e19c605bcc8..dce5a3d8a964 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -475,7 +475,7 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, struct krb5_buffer metadata; unsigned int offset = sp->offset, len = sp->len; size_t data_offset = 0, data_len = len; - u32 ac; + u32 ac = 0; int ret = -ENOMEM; _enter(""); @@ -499,9 +499,10 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata, skb, &offset, &len, &ac); kfree(hdr); - if (ret == -EPROTO) { - rxrpc_abort_eproto(call, skb, ac, - rxgk_abort_1_verify_mic_eproto); + if (ret < 0) { + if (ret != -ENOMEM) + rxrpc_abort_eproto(call, skb, ac, + rxgk_abort_1_verify_mic_eproto); } else { sp->offset = offset; sp->len = len; @@ -524,15 +525,16 @@ static int rxgk_verify_packet_encrypted(struct rxrpc_call *call, struct rxgk_header hdr; unsigned int offset = sp->offset, len = sp->len; int ret; - u32 ac; + u32 ac = 0; _enter(""); ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac); - if (ret == -EPROTO) - rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); - if (ret < 0) + if (ret < 0) { + if (ret != -ENOMEM) + rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); goto error; + } if (len < sizeof(hdr)) { ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c index b94b77a1c317..df684b5a8531 100644 --- a/net/rxrpc/rxgk_app.c +++ b/net/rxrpc/rxgk_app.c @@ -187,7 +187,7 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, struct key *server_key; unsigned int ticket_offset, ticket_len; u32 kvno, enctype; - int ret, ec; + int ret, ec = 0; struct { __be32 kvno; @@ -236,9 +236,11 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, &ticket_offset, &ticket_len, &ec); crypto_free_aead(token_enc); token_enc = NULL; - if (ret < 0) - return rxrpc_abort_conn(conn, skb, ec, ret, - rxgk_abort_resp_tok_dec); + if (ret < 0) { + if (ret != -ENOMEM) + return rxrpc_abort_conn(conn, skb, ec, ret, + rxgk_abort_resp_tok_dec); + } ret = conn->security->default_decode_ticket(conn, skb, ticket_offset, ticket_len, _key); diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h index 7370a5655985..80164d89e19c 100644 --- a/net/rxrpc/rxgk_common.h +++ b/net/rxrpc/rxgk_common.h @@ -88,11 +88,16 @@ int rxgk_decrypt_skb(const struct krb5_enctype *krb5, *_offset += offset; *_len = len; break; + case -EBADMSG: /* Checksum mismatch. */ case -EPROTO: - case -EBADMSG: *_error_code = RXGK_SEALEDINCON; break; + case -EMSGSIZE: + *_error_code = RXGK_PACKETSHORT; + break; + case -ENOPKG: /* Would prefer RXGK_BADETYPE, but not available for YFS. */ default: + *_error_code = RXGK_INCONSISTENCY; break; } @@ -127,11 +132,16 @@ int rxgk_verify_mic_skb(const struct krb5_enctype *krb5, *_offset += offset; *_len = len; break; + case -EBADMSG: /* Checksum mismatch */ case -EPROTO: - case -EBADMSG: *_error_code = RXGK_SEALEDINCON; break; + case -EMSGSIZE: + *_error_code = RXGK_PACKETSHORT; + break; + case -ENOPKG: /* Would prefer RXGK_BADETYPE, but not available for YFS. */ default: + *_error_code = RXGK_INCONSISTENCY; break; } From 2429a197648178cd4dc930a9d87c13c547460564 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 12 Sep 2025 00:06:17 +0100 Subject: [PATCH 08/51] rxrpc: Fix untrusted unsigned subtract Fix the following Smatch static checker warning: net/rxrpc/rxgk_app.c:65 rxgk_yfs_decode_ticket() warn: untrusted unsigned subtract. 'ticket_len - 10 * 4' by prechecking the length of what we're trying to extract in two places in the token and decoding for a response packet. Also use sizeof() on the struct we're extracting rather specifying the size numerically to be consistent with the other related statements. Fixes: 9d1d2b59341f ("rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)") Reported-by: Dan Carpenter Closes: https://lists.infradead.org/pipermail/linux-afs/2025-September/010135.html Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Reviewed-by: Simon Horman Link: https://patch.msgid.link/2039268.1757631977@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/rxgk_app.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c index df684b5a8531..30275cb5ba3e 100644 --- a/net/rxrpc/rxgk_app.c +++ b/net/rxrpc/rxgk_app.c @@ -54,6 +54,10 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, _enter(""); + if (ticket_len < 10 * sizeof(__be32)) + return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_tkt); + /* Get the session key length */ ret = skb_copy_bits(skb, ticket_offset, tmp, sizeof(tmp)); if (ret < 0) @@ -195,22 +199,23 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, __be32 token_len; } container; + if (token_len < sizeof(container)) + goto short_packet; + /* Decode the RXGK_TokenContainer object. This tells us which server * key we should be using. We can then fetch the key, get the secret * and set up the crypto to extract the token. */ if (skb_copy_bits(skb, token_offset, &container, sizeof(container)) < 0) - return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, - rxgk_abort_resp_tok_short); + goto short_packet; kvno = ntohl(container.kvno); enctype = ntohl(container.enctype); ticket_len = ntohl(container.token_len); ticket_offset = token_offset + sizeof(container); - if (xdr_round_up(ticket_len) > token_len - 3 * 4) - return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, - rxgk_abort_resp_tok_short); + if (xdr_round_up(ticket_len) > token_len - sizeof(container)) + goto short_packet; _debug("KVNO %u", kvno); _debug("ENC %u", enctype); @@ -285,4 +290,8 @@ temporary_error: * also come out this way if the ticket decryption fails. */ return ret; + +short_packet: + return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_tok_short); } From af82e857df5dd883a4867bcaf5dde041e57a4e33 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Thu, 11 Sep 2025 18:36:10 -0400 Subject: [PATCH 09/51] octeon_ep: Validate the VF ID Add a helper to validate the VF ID and use it in the VF ndo ops to prevent accessing out-of-range entries. Without this check, users can run commands such as: # ip link show dev enp135s0 2: enp135s0: mtu 1500 qdisc mq state UP mode DEFAULT group default qlen 1000 link/ether 00:00:00:01:01:00 brd ff:ff:ff:ff:ff:ff vf 0 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff, spoof checking on, link-state enable, trust off vf 1 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff, spoof checking on, link-state enable, trust off # ip link set dev enp135s0 vf 4 mac 00:00:00:00:00:14 # echo $? 0 even though VF 4 does not exist, which results in silent success instead of returning an error. Fixes: 8a241ef9b9b8 ("octeon_ep: add ndo ops for VFs in PF driver") Signed-off-by: Kamal Heib Reviewed-by: Michal Swiatkowski Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250911223610.1803144-1-kheib@redhat.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/marvell/octeon_ep/octep_main.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c index 24499bb36c00..bcea3fc26a8c 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c @@ -1124,11 +1124,24 @@ static int octep_set_features(struct net_device *dev, netdev_features_t features return err; } +static bool octep_is_vf_valid(struct octep_device *oct, int vf) +{ + if (vf >= CFG_GET_ACTIVE_VFS(oct->conf)) { + netdev_err(oct->netdev, "Invalid VF ID %d\n", vf); + return false; + } + + return true; +} + static int octep_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivi) { struct octep_device *oct = netdev_priv(dev); + if (!octep_is_vf_valid(oct, vf)) + return -EINVAL; + ivi->vf = vf; ether_addr_copy(ivi->mac, oct->vf_info[vf].mac_addr); ivi->spoofchk = true; @@ -1143,6 +1156,9 @@ static int octep_set_vf_mac(struct net_device *dev, int vf, u8 *mac) struct octep_device *oct = netdev_priv(dev); int err; + if (!octep_is_vf_valid(oct, vf)) + return -EINVAL; + if (!is_valid_ether_addr(mac)) { dev_err(&oct->pdev->dev, "Invalid MAC Address %pM\n", mac); return -EADDRNOTAVAIL; From 56c0a2a9ddc2f5b5078c5fb0f81ab76bbc3d4c37 Mon Sep 17 00:00:00 2001 From: Jamie Bainbridge Date: Wed, 10 Sep 2025 16:29:16 +1000 Subject: [PATCH 10/51] qed: Don't collect too many protection override GRC elements In the protection override dump path, the firmware can return far too many GRC elements, resulting in attempting to write past the end of the previously-kmalloc'ed dump buffer. This will result in a kernel panic with reason: BUG: unable to handle kernel paging request at ADDRESS where "ADDRESS" is just past the end of the protection override dump buffer. The start address of the buffer is: p_hwfn->cdev->dbg_features[DBG_FEATURE_PROTECTION_OVERRIDE].dump_buf and the size of the buffer is buf_size in the same data structure. The panic can be arrived at from either the qede Ethernet driver path: [exception RIP: qed_grc_dump_addr_range+0x108] qed_protection_override_dump at ffffffffc02662ed [qed] qed_dbg_protection_override_dump at ffffffffc0267792 [qed] qed_dbg_feature at ffffffffc026aa8f [qed] qed_dbg_all_data at ffffffffc026b211 [qed] qed_fw_fatal_reporter_dump at ffffffffc027298a [qed] devlink_health_do_dump at ffffffff82497f61 devlink_health_report at ffffffff8249cf29 qed_report_fatal_error at ffffffffc0272baf [qed] qede_sp_task at ffffffffc045ed32 [qede] process_one_work at ffffffff81d19783 or the qedf storage driver path: [exception RIP: qed_grc_dump_addr_range+0x108] qed_protection_override_dump at ffffffffc068b2ed [qed] qed_dbg_protection_override_dump at ffffffffc068c792 [qed] qed_dbg_feature at ffffffffc068fa8f [qed] qed_dbg_all_data at ffffffffc0690211 [qed] qed_fw_fatal_reporter_dump at ffffffffc069798a [qed] devlink_health_do_dump at ffffffff8aa95e51 devlink_health_report at ffffffff8aa9ae19 qed_report_fatal_error at ffffffffc0697baf [qed] qed_hw_err_notify at ffffffffc06d32d7 [qed] qed_spq_post at ffffffffc06b1011 [qed] qed_fcoe_destroy_conn at ffffffffc06b2e91 [qed] qedf_cleanup_fcport at ffffffffc05e7597 [qedf] qedf_rport_event_handler at ffffffffc05e7bf7 [qedf] fc_rport_work at ffffffffc02da715 [libfc] process_one_work at ffffffff8a319663 Resolve this by clamping the firmware's return value to the maximum number of legal elements the firmware should return. Fixes: d52c89f120de8 ("qed*: Utilize FW 8.37.2.0") Signed-off-by: Jamie Bainbridge Link: https://patch.msgid.link/f8e1182934aa274c18d0682a12dbaf347595469c.1757485536.git.jamie.bainbridge@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qed/qed_debug.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c index 9c3d3dd2f847..1f0cea3cae92 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_debug.c +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -4462,10 +4462,11 @@ static enum dbg_status qed_protection_override_dump(struct qed_hwfn *p_hwfn, goto out; } - /* Add override window info to buffer */ + /* Add override window info to buffer, preventing buffer overflow */ override_window_dwords = - qed_rd(p_hwfn, p_ptt, GRC_REG_NUMBER_VALID_OVERRIDE_WINDOW) * - PROTECTION_OVERRIDE_ELEMENT_DWORDS; + min(qed_rd(p_hwfn, p_ptt, GRC_REG_NUMBER_VALID_OVERRIDE_WINDOW) * + PROTECTION_OVERRIDE_ELEMENT_DWORDS, + PROTECTION_OVERRIDE_DEPTH_DWORDS); if (override_window_dwords) { addr = BYTES_TO_DWORDS(GRC_REG_PROTECTION_OVERRIDE_WINDOW); offset += qed_grc_dump_addr_range(p_hwfn, From a9888628cb2c768202a4530e2816da1889cc3165 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Sep 2025 18:54:15 +0200 Subject: [PATCH 11/51] net: dst_metadata: fix IP_DF bit not extracted from tunnel headers Both OVS and TC flower allow extracting and matching on the DF bit of the outer IP header via OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT in the OVS_KEY_ATTR_TUNNEL and TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT in the TCA_FLOWER_KEY_ENC_FLAGS respectively. Flow dissector extracts this information as FLOW_DIS_F_TUNNEL_DONT_FRAGMENT from the tunnel info key. However, the IP_TUNNEL_DONT_FRAGMENT_BIT in the tunnel key is never actually set, because the tunneling code doesn't actually extract it from the IP header. OAM and CRIT_OPT are extracted by the tunnel implementation code, same code also sets the KEY flag, if present. UDP tunnel core takes care of setting the CSUM flag if the checksum is present in the UDP header, but the DONT_FRAGMENT is not handled at any layer. Fix that by checking the bit and setting the corresponding flag while populating the tunnel info in the IP layer where it belongs. Not using __assign_bit as we don't really need to clear the bit in a just initialized field. It also doesn't seem like using __assign_bit will make the code look better. Clearly, users didn't rely on this functionality for anything very important until now. The reason why this doesn't break OVS logic is that it only matches on what kernel previously parsed out and if kernel consistently reports this bit as zero, OVS will only match on it to be zero, which sort of works. But it is still a bug that the uAPI reports and allows matching on the field that is not actually checked in the packet. And this is causing misleading -df reporting in OVS datapath flows, while the tunnel traffic actually has the bit set in most cases. This may also cause issues if a hardware properly implements support for tunnel flag matching as it will disagree with the implementation in a software path of TC flower. Fixes: 7d5437c709de ("openvswitch: Add tunneling interface.") Fixes: 1d17568e74de ("net/sched: cls_flower: add support for matching tunnel control flags") Signed-off-by: Ilya Maximets Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250909165440.229890-2-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- include/net/dst_metadata.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 4160731dcb6e..1fc2fb03ce3f 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -3,6 +3,7 @@ #define __NET_DST_METADATA_H 1 #include +#include #include #include #include @@ -220,9 +221,15 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, int md_size) { const struct iphdr *iph = ip_hdr(skb); + struct metadata_dst *tun_dst; - return __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, - 0, flags, tunnel_id, md_size); + tun_dst = __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, + 0, flags, tunnel_id, md_size); + + if (tun_dst && (iph->frag_off & htons(IP_DF))) + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, + tun_dst->u.tun_info.key.tun_flags); + return tun_dst; } static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr, From 6cafb93c1f2aec7875f7a024a53e15f0683df699 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 9 Sep 2025 18:54:16 +0200 Subject: [PATCH 12/51] selftests: openvswitch: add a simple test for tunnel metadata This test ensures that upon receiving decapsulated packets from a tunnel interface in openvswitch, the tunnel metadata fields are properly populated. This partially covers interoperability of the kernel tunnel ports and openvswitch tunnels (LWT) and parsing and formatting of the tunnel metadata fields of the openvswitch netlink uAPI. Doing so, this test also ensures that fields and flags are properly extracted during decapsulation by the tunnel core code, serving as a regression test for the previously fixed issue with the DF bit not being extracted from the outer IP header. The ovs-dpctl.py script already supports all that is necessary for the tunnel ports for this test, so we only need to adjust the ovs_add_if() function to pass the '-t' port type argument in order to be able to create tunnel ports in the openvswitch datapath. Reviewed-by: Aaron Conole Signed-off-by: Ilya Maximets Link: https://patch.msgid.link/20250909165440.229890-3-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- .../selftests/net/openvswitch/openvswitch.sh | 88 +++++++++++++++++-- 1 file changed, 81 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh index 3c8d3455d8e7..b327d3061ed5 100755 --- a/tools/testing/selftests/net/openvswitch/openvswitch.sh +++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh @@ -25,6 +25,7 @@ tests=" nat_related_v4 ip4-nat-related: ICMP related matches work with SNAT netlink_checks ovsnl: validate netlink attrs and settings upcall_interfaces ovs: test the upcall interfaces + tunnel_metadata ovs: test extraction of tunnel metadata drop_reason drop: test drop reasons are emitted psample psample: Sampling packets with psample" @@ -113,13 +114,13 @@ ovs_add_dp () { } ovs_add_if () { - info "Adding IF to DP: br:$2 if:$3" - if [ "$4" != "-u" ]; then - ovs_sbx "$1" python3 $ovs_base/ovs-dpctl.py add-if "$2" "$3" \ - || return 1 + info "Adding IF to DP: br:$3 if:$4 ($2)" + if [ "$5" != "-u" ]; then + ovs_sbx "$1" python3 $ovs_base/ovs-dpctl.py add-if \ + -t "$2" "$3" "$4" || return 1 else python3 $ovs_base/ovs-dpctl.py add-if \ - -u "$2" "$3" >$ovs_dir/$3.out 2>$ovs_dir/$3.err & + -u -t "$2" "$3" "$4" >$ovs_dir/$4.out 2>$ovs_dir/$4.err & pid=$! on_exit "ovs_sbx $1 kill -TERM $pid 2>/dev/null" fi @@ -166,9 +167,9 @@ ovs_add_netns_and_veths () { fi if [ "$7" != "-u" ]; then - ovs_add_if "$1" "$2" "$4" || return 1 + ovs_add_if "$1" "netdev" "$2" "$4" || return 1 else - ovs_add_if "$1" "$2" "$4" -u || return 1 + ovs_add_if "$1" "netdev" "$2" "$4" -u || return 1 fi if [ $TRACING -eq 1 ]; then @@ -756,6 +757,79 @@ test_upcall_interfaces() { return 0 } +ovs_add_kernel_tunnel() { + local sbxname=$1; shift + local ns=$1; shift + local tnl_type=$1; shift + local name=$1; shift + local addr=$1; shift + + info "setting up kernel ${tnl_type} tunnel ${name}" + ovs_sbx "${sbxname}" ip -netns ${ns} link add dev ${name} type ${tnl_type} $* || return 1 + on_exit "ovs_sbx ${sbxname} ip -netns ${ns} link del ${name} >/dev/null 2>&1" + ovs_sbx "${sbxname}" ip -netns ${ns} addr add dev ${name} ${addr} || return 1 + ovs_sbx "${sbxname}" ip -netns ${ns} link set dev ${name} mtu 1450 up || return 1 +} + +test_tunnel_metadata() { + which arping >/dev/null 2>&1 || return $ksft_skip + + sbxname="test_tunnel_metadata" + sbx_add "${sbxname}" || return 1 + + info "setting up new DP" + ovs_add_dp "${sbxname}" tdp0 -V 2:1 || return 1 + + ovs_add_netns_and_veths "${sbxname}" tdp0 tns left0 l0 \ + 172.31.110.1/24 || return 1 + + info "removing veth interface from openvswitch and setting IP" + ovs_del_if "${sbxname}" tdp0 left0 || return 1 + ovs_sbx "${sbxname}" ip addr add 172.31.110.2/24 dev left0 || return 1 + ovs_sbx "${sbxname}" ip link set left0 up || return 1 + + info "setting up tunnel port in openvswitch" + ovs_add_if "${sbxname}" "vxlan" tdp0 ovs-vxlan0 -u || return 1 + on_exit "ovs_sbx ${sbxname} ip link del ovs-vxlan0" + ovs_wait ip link show ovs-vxlan0 &>/dev/null || return 1 + ovs_sbx "${sbxname}" ip link set ovs-vxlan0 up || return 1 + + configs=$(echo ' + 1 172.31.221.1/24 1155332 32 set udpcsum flags\(df\|csum\) + 2 172.31.222.1/24 1234567 45 set noudpcsum flags\(df\) + 3 172.31.223.1/24 1020304 23 unset udpcsum flags\(csum\) + 4 172.31.224.1/24 1357986 15 unset noudpcsum' | sed '/^$/d') + + while read -r i addr id ttl df csum flags; do + ovs_add_kernel_tunnel "${sbxname}" tns vxlan vxlan${i} ${addr} \ + remote 172.31.110.2 id ${id} dstport 4789 \ + ttl ${ttl} df ${df} ${csum} || return 1 + done <<< "${configs}" + + ovs_wait grep -q 'listening on upcall packet handler' \ + ${ovs_dir}/ovs-vxlan0.out || return 1 + + info "sending arping" + for i in 1 2 3 4; do + ovs_sbx "${sbxname}" ip netns exec tns \ + arping -I vxlan${i} 172.31.22${i}.2 -c 1 \ + >${ovs_dir}/arping.stdout 2>${ovs_dir}/arping.stderr + done + + info "checking that received decapsulated packets carry correct metadata" + while read -r i addr id ttl df csum flags; do + arp_hdr="arp\\(sip=172.31.22${i}.1,tip=172.31.22${i}.2,op=1,sha=" + addrs="src=172.31.110.1,dst=172.31.110.2" + ports="tp_src=[0-9]*,tp_dst=4789" + tnl_md="tunnel\\(tun_id=${id},${addrs},ttl=${ttl},${ports},${flags}\\)" + + ovs_sbx "${sbxname}" grep -qE "MISS upcall.*${tnl_md}.*${arp_hdr}" \ + ${ovs_dir}/ovs-vxlan0.out || return 1 + done <<< "${configs}" + + return 0 +} + run_test() { ( tname="$1" From a38108a23ab558b834d71d542d32c05ab0fb64d4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 15 Sep 2025 10:30:52 +0300 Subject: [PATCH 13/51] wifi: iwlwifi: pcie: fix byte count table for some devices In my previous fix for this condition, I erroneously listed 9000 instead of 7000 family, when 7000/8000 were already using iwlmvm. Thus the condition ended up wrong, causing the issue I had fixed for older devices to suddenly appear on 7000/8000 family devices. Correct the condition accordingly. Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/r/20250909165811.10729-1-00107082@163.com/ Fixes: 586e3cb33ba6 ("wifi: iwlwifi: fix byte count table for old devices") Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250915102743.777aaafbcc6c.I84404edfdfbf400501f6fb06def5b86c501da198@changeid Signed-off-by: Miri Korenblit --- drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c index d912e709a92c..bb03dad4a300 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/tx.c @@ -2092,7 +2092,7 @@ static void iwl_txq_gen1_update_byte_cnt_tbl(struct iwl_trans *trans, break; } - if (trans->mac_cfg->device_family >= IWL_DEVICE_FAMILY_9000 && + if (trans->mac_cfg->device_family >= IWL_DEVICE_FAMILY_7000 && trans->mac_cfg->device_family < IWL_DEVICE_FAMILY_AX210) len = DIV_ROUND_UP(len, 4); From 4351ca3fcb3ffecf12631b4996bf085a2dad0db6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5kon=20Bugge?= Date: Thu, 11 Sep 2025 15:33:34 +0200 Subject: [PATCH 14/51] rds: ib: Increment i_fastreg_wrs before bailing out MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to increment i_fastreg_wrs before we bail out from rds_ib_post_reg_frmr(). We have a fixed budget of how many FRWR operations that can be outstanding using the dedicated QP used for memory registrations and de-registrations. This budget is enforced by the atomic_t i_fastreg_wrs. If we bail out early in rds_ib_post_reg_frmr(), we will "leak" the possibility of posting an FRWR operation, and if that accumulates, no FRWR operation can be carried out. Fixes: 1659185fb4d0 ("RDS: IB: Support Fastreg MR (FRMR) memory registration mode") Fixes: 3a2886cca703 ("net/rds: Keep track of and wait for FRWR segments in use upon shutdown") Cc: stable@vger.kernel.org Signed-off-by: Håkon Bugge Reviewed-by: Allison Henderson Link: https://patch.msgid.link/20250911133336.451212-1-haakon.bugge@oracle.com Signed-off-by: Jakub Kicinski --- net/rds/ib_frmr.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 28c1b0022178..bd861191157b 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -133,12 +133,15 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_dma_len, &off, PAGE_SIZE); - if (unlikely(ret != ibmr->sg_dma_len)) - return ret < 0 ? ret : -EINVAL; + if (unlikely(ret != ibmr->sg_dma_len)) { + ret = ret < 0 ? ret : -EINVAL; + goto out_inc; + } - if (cmpxchg(&frmr->fr_state, - FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE) - return -EBUSY; + if (cmpxchg(&frmr->fr_state, FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE) { + ret = -EBUSY; + goto out_inc; + } atomic_inc(&ibmr->ic->i_fastreg_inuse_count); @@ -166,11 +169,10 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) /* Failure here can be because of -ENOMEM as well */ rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE); - atomic_inc(&ibmr->ic->i_fastreg_wrs); if (printk_ratelimit()) pr_warn("RDS/IB: %s returned error(%d)\n", __func__, ret); - goto out; + goto out_inc; } /* Wait for the registration to complete in order to prevent an invalid @@ -179,8 +181,10 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) */ wait_event(frmr->fr_reg_done, !frmr->fr_reg); -out: + return ret; +out_inc: + atomic_inc(&ibmr->ic->i_fastreg_wrs); return ret; } From 35ae4e86292ef7dfe4edbb9942955c884e984352 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 10 Sep 2025 02:43:34 +0000 Subject: [PATCH 15/51] bonding: set random address only when slaves already exist After commit 5c3bf6cba791 ("bonding: assign random address if device address is same as bond"), bonding will erroneously randomize the MAC address of the first interface added to the bond if fail_over_mac = follow. Correct this by additionally testing for the bond being empty before randomizing the MAC. Fixes: 5c3bf6cba791 ("bonding: assign random address if device address is same as bond") Reported-by: Qiuling Ren Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250910024336.400253-1-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 257333c88710..8832bc9f107b 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2132,6 +2132,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, memcpy(ss.__data, bond_dev->dev_addr, bond_dev->addr_len); } else if (bond->params.fail_over_mac == BOND_FOM_FOLLOW && BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && + bond_has_slaves(bond) && memcmp(slave_dev->dev_addr, bond_dev->dev_addr, bond_dev->addr_len) == 0) { /* Set slave to random address to avoid duplicate mac * address in later fail over. From 71379e1c95af2c57567fcac24184c94cb7de4cd6 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 10 Sep 2025 02:43:35 +0000 Subject: [PATCH 16/51] selftests: bonding: add fail_over_mac testing Add a test to check each value of bond fail_over_mac option. Also fix a minor garp_test print issue. Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250910024336.400253-2-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- .../drivers/net/bonding/bond_options.sh | 139 +++++++++++++++++- .../drivers/net/bonding/bond_topo_2d1c.sh | 3 + .../drivers/net/bonding/bond_topo_3d1c.sh | 2 + 3 files changed, 142 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh index 7bc148889ca7..e3f3cc803b56 100755 --- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh @@ -7,6 +7,7 @@ ALL_TESTS=" prio arp_validate num_grat_arp + fail_over_mac " lib_dir=$(dirname "$0") @@ -352,8 +353,8 @@ garp_test() exp_num=$(echo "${param}" | cut -f6 -d ' ') active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") - slowwait_for_counter $((exp_num + 5)) $exp_num \ - tc_rule_handle_stats_get "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}" + slowwait_for_counter $((exp_num + 5)) $exp_num tc_rule_handle_stats_get \ + "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}" &> /dev/null # check result real_num=$(tc_rule_handle_stats_get "dev s${active_slave#eth} ingress" 101 ".packets" "-n ${g_ns}") @@ -376,6 +377,140 @@ num_grat_arp() done } +check_all_mac_same() +{ + RET=0 + # all slaves should have same mac address (with the first port's mac) + local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]') + local eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]') + local eth1_mac=$(ip -n "$s_ns" -j link show eth1 | jq -r '.[]["address"]') + local eth2_mac=$(ip -n "$s_ns" -j link show eth2 | jq -r '.[]["address"]') + if [ "$bond_mac" != "${mac[0]}" ] || [ "$eth0_mac" != "$bond_mac" ] || \ + [ "$eth1_mac" != "$bond_mac" ] || [ "$eth2_mac" != "$bond_mac" ]; then + RET=1 + fi +} + +check_bond_mac_same_with_first() +{ + RET=0 + # bond mac address should be same with the first added slave + local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]') + if [ "$bond_mac" != "${mac[0]}" ]; then + RET=1 + fi +} + +check_bond_mac_same_with_active() +{ + RET=0 + # bond mac address should be same with active slave + local bond_mac=$(ip -n "$s_ns" -j link show bond0 | jq -r '.[]["address"]') + local active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") + local active_slave_mac=$(ip -n "$s_ns" -j link show "$active_slave" | jq -r '.[]["address"]') + if [ "$bond_mac" != "$active_slave_mac" ]; then + RET=1 + fi +} + +check_backup_slave_mac_not_change() +{ + RET=0 + # backup slave's mac address is not changed + if ip -n "$s_ns" -d -j link show type bond_slave | jq -e '.[] + | select(.linkinfo.info_slave_data.state=="BACKUP") + | select(.address != .linkinfo.info_slave_data.perm_hwaddr)' &> /dev/null; then + RET=1 + fi +} + +check_backup_slave_mac_inherit() +{ + local backup_mac + RET=0 + + # backup slaves should use mac[1] or mac[2] + local backup_macs=$(ip -n "$s_ns" -d -j link show type bond_slave | \ + jq -r '.[] | select(.linkinfo.info_slave_data.state=="BACKUP") | .address') + for backup_mac in $backup_macs; do + if [ "$backup_mac" != "${mac[1]}" ] && [ "$backup_mac" != "${mac[2]}" ]; then + RET=1 + fi + done +} + +check_first_slave_random_mac() +{ + RET=0 + # remove the first added slave and added it back + ip -n "$s_ns" link set eth0 nomaster + ip -n "$s_ns" link set eth0 master bond0 + + # the first slave should use random mac address + eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]') + [ "$eth0_mac" = "${mac[0]}" ] && RET=1 + log_test "bond fail_over_mac follow" "random first slave mac" + + # remove the first slave, the permanent MAC address should be restored back + ip -n "$s_ns" link set eth0 nomaster + eth0_mac=$(ip -n "$s_ns" -j link show eth0 | jq -r '.[]["address"]') + [ "$eth0_mac" != "${mac[0]}" ] && RET=1 +} + +do_active_backup_failover() +{ + local active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") + ip -n ${s_ns} link set ${active_slave} down + slowwait 2 active_slave_changed $active_slave + ip -n ${s_ns} link set ${active_slave} up +} + +fail_over_mac() +{ + # Bring down the first interface on the switch to force the bond to + # select another active interface instead of the first one that joined. + ip -n "$g_ns" link set s0 down + + # fail_over_mac none + bond_reset "mode active-backup miimon 100 fail_over_mac 0" + check_all_mac_same + log_test "fail_over_mac 0" "all slaves have same mac" + do_active_backup_failover + check_all_mac_same + log_test "fail_over_mac 0" "failover: all slaves have same mac" + + # fail_over_mac active + bond_reset "mode active-backup miimon 100 fail_over_mac 1" + check_bond_mac_same_with_active + log_test "fail_over_mac 1" "bond mac is same with active slave mac" + check_backup_slave_mac_not_change + log_test "fail_over_mac 1" "backup slave mac is not changed" + do_active_backup_failover + check_bond_mac_same_with_active + log_test "fail_over_mac 1" "failover: bond mac is same with active slave mac" + check_backup_slave_mac_not_change + log_test "fail_over_mac 1" "failover: backup slave mac is not changed" + + # fail_over_mac follow + bond_reset "mode active-backup miimon 100 fail_over_mac 2" + check_bond_mac_same_with_first + log_test "fail_over_mac 2" "bond mac is same with first slave mac" + check_bond_mac_same_with_active + log_test "fail_over_mac 2" "bond mac is same with active slave mac" + check_backup_slave_mac_inherit + log_test "fail_over_mac 2" "backup slave mac inherit" + do_active_backup_failover + check_bond_mac_same_with_first + log_test "fail_over_mac 2" "failover: bond mac is same with first slave mac" + check_bond_mac_same_with_active + log_test "fail_over_mac 2" "failover: bond mac is same with active slave mac" + check_backup_slave_mac_inherit + log_test "fail_over_mac 2" "failover: backup slave mac inherit" + check_first_slave_random_mac + log_test "fail_over_mac 2" "first slave mac random" + +} + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh index 195ef83cfbf1..167aa4a4a12a 100644 --- a/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_2d1c.sh @@ -39,6 +39,8 @@ g_ip4="192.0.2.254" s_ip6="2001:db8::1" c_ip6="2001:db8::10" g_ip6="2001:db8::254" +mac[0]="00:0a:0b:0c:0d:01" +mac[1]="00:0a:0b:0c:0d:02" gateway_create() { @@ -62,6 +64,7 @@ server_create() for i in $(seq 0 1); do ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns} + ip -n "${s_ns}" link set "eth${i}" addr "${mac[$i]}" ip -n ${g_ns} link set s${i} up ip -n ${g_ns} link set s${i} master br0 diff --git a/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh index 3a1333d9a85b..23a2932301cc 100644 --- a/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_topo_3d1c.sh @@ -26,6 +26,7 @@ # +-------------------------------------+ source bond_topo_2d1c.sh +mac[2]="00:0a:0b:0c:0d:03" setup_prepare() { @@ -36,6 +37,7 @@ setup_prepare() # Add the extra device as we use 3 down links for bond0 local i=2 ip -n ${s_ns} link add eth${i} type veth peer name s${i} netns ${g_ns} + ip -n "${s_ns}" link set "eth${i}" addr "${mac[$i]}" ip -n ${g_ns} link set s${i} up ip -n ${g_ns} link set s${i} master br0 ip -n ${s_ns} link set eth${i} master bond0 From f755be0b1ff429a2ecf709beeb1bcd7abc111c2b Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:25:50 +0200 Subject: [PATCH 17/51] mptcp: propagate shutdown to subflows when possible When the MPTCP DATA FIN have been ACKed, there is no more MPTCP related metadata to exchange, and all subflows can be safely shutdown. Before this patch, the subflows were actually terminated at 'close()' time. That's certainly fine most of the time, but not when the userspace 'shutdown()' a connection, without close()ing it. When doing so, the subflows were staying in LAST_ACK state on one side -- and consequently in FIN_WAIT2 on the other side -- until the 'close()' of the MPTCP socket. Now, when the DATA FIN have been ACKed, all subflows are shutdown. A consequence of this is that the TCP 'FIN' flag can be set earlier now, but the end result is the same. This affects the packetdrill tests looking at the end of the MPTCP connections, but for a good reason. Note that tcp_shutdown() will check the subflow state, so no need to do that again before calling it. Fixes: 3721b9b64676 ("mptcp: Track received DATA_FIN sequence number and add related helpers") Cc: stable@vger.kernel.org Fixes: 16a9a9da1723 ("mptcp: Add helper to process acks of DATA_FIN") Reviewed-by: Mat Martineau Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-fix-sft-connect-v1-1-d40e77cbbf02@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e6fd97b21e9e..5e497a83e967 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -371,6 +371,20 @@ static void mptcp_close_wake_up(struct sock *sk) sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } +static void mptcp_shutdown_subflows(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow; + + slow = lock_sock_fast(ssk); + tcp_shutdown(ssk, SEND_SHUTDOWN); + unlock_sock_fast(ssk, slow); + } +} + /* called under the msk socket lock */ static bool mptcp_pending_data_fin_ack(struct sock *sk) { @@ -395,6 +409,7 @@ static void mptcp_check_data_fin_ack(struct sock *sk) break; case TCP_CLOSING: case TCP_LAST_ACK: + mptcp_shutdown_subflows(msk); mptcp_set_state(sk, TCP_CLOSE); break; } @@ -563,6 +578,7 @@ static bool mptcp_check_data_fin(struct sock *sk) mptcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: + mptcp_shutdown_subflows(msk); mptcp_set_state(sk, TCP_CLOSE); break; default: From 14e22b43df25dbd4301351b882486ea38892ae4f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:25:51 +0200 Subject: [PATCH 18/51] selftests: mptcp: connect: catch IO errors on listen side IO errors were correctly printed to stderr, and propagated up to the main loop for the server side, but the returned value was ignored. As a consequence, the program for the listener side was no longer exiting with an error code in case of IO issues. Because of that, some issues might not have been seen. But very likely, most issues either had an effect on the client side, or the file transfer was not the expected one, e.g. the connection got reset before the end. Still, it is better to fix this. The main consequence of this issue is the error that was reported by the selftests: the received and sent files were different, and the MIB counters were not printed. Also, when such errors happened during the 'disconnect' tests, the program tried to continue until the timeout. Now when an IO error is detected, the program exits directly with an error. Fixes: 05be5e273c84 ("selftests: mptcp: add disconnect tests") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-fix-sft-connect-v1-2-d40e77cbbf02@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 4f07ac9fa207..1408698df099 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -1093,6 +1093,7 @@ int main_loop_s(int listensock) struct pollfd polls; socklen_t salen; int remotesock; + int err = 0; int fd = 0; again: @@ -1125,7 +1126,7 @@ again: SOCK_TEST_TCPULP(remotesock, 0); memset(&winfo, 0, sizeof(winfo)); - copyfd_io(fd, remotesock, 1, true, &winfo); + err = copyfd_io(fd, remotesock, 1, true, &winfo); } else { perror("accept"); return 1; @@ -1134,10 +1135,10 @@ again: if (cfg_input) close(fd); - if (--cfg_repeat > 0) + if (!err && --cfg_repeat > 0) goto again; - return 0; + return err; } static void init_rng(void) From 8708c5d8b3fb3f6d5d3b9e6bfe01a505819f519a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:25:52 +0200 Subject: [PATCH 19/51] selftests: mptcp: avoid spurious errors on TCP disconnect The disconnect test-case, with 'plain' TCP sockets generates spurious errors, e.g. 07 ns1 TCP -> ns1 (dead:beef:1::1:10006) MPTCP read: Connection reset by peer read: Connection reset by peer (duration 155ms) [FAIL] client exit code 3, server 3 netns ns1-FloSdv (listener) socket stat for 10006: TcpActiveOpens 2 0.0 TcpPassiveOpens 2 0.0 TcpEstabResets 2 0.0 TcpInSegs 274 0.0 TcpOutSegs 276 0.0 TcpOutRsts 3 0.0 TcpExtPruneCalled 2 0.0 TcpExtRcvPruned 1 0.0 TcpExtTCPPureAcks 104 0.0 TcpExtTCPRcvCollapsed 2 0.0 TcpExtTCPBacklogCoalesce 42 0.0 TcpExtTCPRcvCoalesce 43 0.0 TcpExtTCPChallengeACK 1 0.0 TcpExtTCPFromZeroWindowAdv 42 0.0 TcpExtTCPToZeroWindowAdv 41 0.0 TcpExtTCPWantZeroWindowAdv 13 0.0 TcpExtTCPOrigDataSent 164 0.0 TcpExtTCPDelivered 165 0.0 TcpExtTCPRcvQDrop 1 0.0 In the failing scenarios (TCP -> MPTCP), the involved sockets are actually plain TCP ones, as fallbacks for passive sockets at 2WHS time cause the MPTCP listeners to actually create 'plain' TCP sockets. Similar to commit 218cc166321f ("selftests: mptcp: avoid spurious errors on disconnect"), the root cause is in the user-space bits: the test program tries to disconnect as soon as all the pending data has been spooled, generating an RST. If such option reaches the peer before the connection has reached the closed status, the TCP socket will report an error to the user-space, as per protocol specification, causing the above failure. Note that it looks like this issue got more visible since the "tcp: receiver changes" series from commit 06baf9bfa6ca ("Merge branch 'tcp-receiver-changes'"). Address the issue by explicitly waiting for the TCP sockets (-t) to reach a closed status before performing the disconnect. More precisely, the test program now waits for plain TCP sockets or TCP subflows in addition to the MPTCP sockets that were already monitored. While at it, use 'ss' with '-n' to avoid resolving service names, which is not needed here. Fixes: 218cc166321f ("selftests: mptcp: avoid spurious errors on disconnect") Cc: stable@vger.kernel.org Suggested-by: Paolo Abeni Reviewed-by: Mat Martineau Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-fix-sft-connect-v1-3-d40e77cbbf02@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 1408698df099..b148cadb96d0 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -1248,7 +1248,7 @@ void xdisconnect(int fd) else xerror("bad family"); - strcpy(cmd, "ss -M | grep -q "); + strcpy(cmd, "ss -Mnt | grep -q "); cmdlen = strlen(cmd); if (!inet_ntop(addr.ss_family, raw_addr, &cmd[cmdlen], sizeof(cmd) - cmdlen)) @@ -1258,7 +1258,7 @@ void xdisconnect(int fd) /* * wait until the pending data is completely flushed and all - * the MPTCP sockets reached the closed status. + * the sockets reached the closed status. * disconnect will bypass/ignore/drop any pending data. */ for (i = 0; ; i += msec_sleep) { From a17c5aa3a32373f80b4714b411bd8d4ffee6fc6a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:25:53 +0200 Subject: [PATCH 20/51] selftests: mptcp: print trailing bytes with od This is better than printing random bytes in the terminal. Note that Jakub suggested 'hexdump', but Mat found out this tool is not often installed by default. 'od' can do a similar job, and it is in the POSIX specs and available in coreutils, so it should be on more systems. While at it, display a few more bytes, just to fill in the two lines. And no need to display the 3rd only line showing the next number of bytes: 0000040. Suggested-by: Jakub Kicinski Suggested-by: Mat Martineau Reviewed-by: Mat Martineau Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-fix-sft-connect-v1-4-d40e77cbbf02@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_lib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 09cd24b2ae46..d62e653d48b0 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -384,7 +384,7 @@ mptcp_lib_make_file() { mptcp_lib_print_file_err() { ls -l "${1}" 1>&2 echo "Trailing bytes are: " - tail -c 27 "${1}" + tail -c 32 "${1}" | od -x | head -n2 } # $1: input file ; $2: output file ; $3: what kind of file From cf74e0aa0eb024741c8b5cb7e839d2824ca77336 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:25:54 +0200 Subject: [PATCH 21/51] selftests: mptcp: connect: print pcap prefix To be able to find which capture files have been produced after several runs. This prefix was not printed anywhere before. While at it, always use the same prefix by taking info from ns1, instead of "$connector_ns", which is sometimes ns1, sometimes ns2 in the subtests. Reviewed-by: Mat Martineau Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-fix-sft-connect-v1-5-d40e77cbbf02@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index c2ab9f7f0d21..47ecb5b3836e 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -211,6 +211,11 @@ if $checksum; then done fi +if $capture; then + rndh="${ns1:4}" + mptcp_lib_pr_info "Packet capture files will have this prefix: ${rndh}-" +fi + set_ethtool_flags() { local ns="$1" local dev="$2" @@ -361,7 +366,6 @@ do_transfer() if $capture; then local capuser - local rndh="${connector_ns:4}" if [ -z $SUDO_USER ] ; then capuser="" else From 96939cec994070aa5df852c10fad5fc303a97ea3 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:52:20 +0200 Subject: [PATCH 22/51] mptcp: set remote_deny_join_id0 on SYN recv When a SYN containing the 'C' flag (deny join id0) was received, this piece of information was not propagated to the path-manager. Even if this flag is mainly set on the server side, a client can also tell the server it cannot try to establish new subflows to the client's initial IP address and port. The server's PM should then record such info when received, and before sending events about the new connection. Fixes: df377be38725 ("mptcp: add deny_join_id0 in mptcp_options_received") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-1-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3f1b62a9fe88..f31a3a79531a 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -883,6 +883,10 @@ create_child: ctx->subflow_id = 1; owner = mptcp_sk(ctx->conn); + + if (mp_opt.deny_join_id0) + WRITE_ONCE(owner->pm.remote_deny_join_id0, true); + mptcp_pm_new_connection(owner, child, 1); /* with OoO packets we can reach here without ingress From 2293c57484ae64c9a3c847c8807db8c26a3a4d41 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:52:21 +0200 Subject: [PATCH 23/51] mptcp: pm: nl: announce deny-join-id0 flag During the connection establishment, a peer can tell the other one that it cannot establish new subflows to the initial IP address and port by setting the 'C' flag [1]. Doing so makes sense when the sender is behind a strict NAT, operating behind a legacy Layer 4 load balancer, or using anycast IP address for example. When this 'C' flag is set, the path-managers must then not try to establish new subflows to the other peer's initial IP address and port. The in-kernel PM has access to this info, but the userspace PM didn't. The RFC8684 [1] is strict about that: (...) therefore the receiver MUST NOT try to open any additional subflows toward this address and port. So it is important to tell the userspace about that as it is responsible for the respect of this flag. When a new connection is created and established, the Netlink events now contain the existing but not currently used 'flags' attribute. When MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 is set, it means no other subflows to the initial IP address and port -- info that are also part of the event -- can be established. Link: https://datatracker.ietf.org/doc/html/rfc8684#section-3.1-20.6 [1] Fixes: 702c2f646d42 ("mptcp: netlink: allow userspace-driven subflow establishment") Reported-by: Marek Majkowski Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/532 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-2-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 4 ++-- include/uapi/linux/mptcp.h | 2 ++ include/uapi/linux/mptcp_pm.h | 4 ++-- net/mptcp/pm_netlink.c | 7 +++++++ 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index d15335684ec3..d1b4829b580a 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -28,13 +28,13 @@ definitions: traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side. + dport, server-side, [flags]. - name: established doc: >- A MPTCP connection is established (can start new subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side. + dport, server-side, [flags]. - name: closed doc: >- diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 67d015df8893..5fd5b4cf75ca 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -31,6 +31,8 @@ #define MPTCP_INFO_FLAG_FALLBACK _BITUL(0) #define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1) +#define MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 _BITUL(0) + #define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) #define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) #define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 6ac84b2f636c..7359d34da446 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -16,10 +16,10 @@ * good time to allocate memory and send ADD_ADDR if needed. Depending on the * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED * is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side. + * sport, dport, server-side, [flags]. * @MPTCP_EVENT_ESTABLISHED: A MPTCP connection is established (can start new * subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side. + * sport, dport, server-side, [flags]. * @MPTCP_EVENT_CLOSED: A MPTCP connection has stopped. Attribute: token. * @MPTCP_EVENT_ANNOUNCED: A new address has been announced by the peer. * Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 50aaf259959a..ce7d42d3bd00 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -408,6 +408,7 @@ static int mptcp_event_created(struct sk_buff *skb, const struct sock *ssk) { int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)); + u16 flags = 0; if (err) return err; @@ -415,6 +416,12 @@ static int mptcp_event_created(struct sk_buff *skb, if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) return -EMSGSIZE; + if (READ_ONCE(msk->pm.remote_deny_join_id0)) + flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0; + + if (flags && nla_put_u16(skb, MPTCP_ATTR_FLAGS, flags)) + return -EMSGSIZE; + return mptcp_event_add_subflow(skb, ssk); } From 24733e193a0d68f20d220e86da0362460c9aa812 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:52:22 +0200 Subject: [PATCH 24/51] selftests: mptcp: userspace pm: validate deny-join-id0 flag The previous commit adds the MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 flag. Make sure it is correctly announced by the other peer when it has been received. pm_nl_ctl will now display 'deny_join_id0:1' when monitoring the events, and when this flag was set by the other peer. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: 702c2f646d42 ("mptcp: netlink: allow userspace-driven subflow establishment") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-3-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/pm_nl_ctl.c | 7 +++++++ tools/testing/selftests/net/mptcp/userspace_pm.sh | 14 +++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 994a556f46c1..93fea3442216 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -188,6 +188,13 @@ static int capture_events(int fd, int event_group) fprintf(stderr, ",error:%u", *(__u8 *)RTA_DATA(attrs)); else if (attrs->rta_type == MPTCP_ATTR_SERVER_SIDE) fprintf(stderr, ",server_side:%u", *(__u8 *)RTA_DATA(attrs)); + else if (attrs->rta_type == MPTCP_ATTR_FLAGS) { + __u16 flags = *(__u16 *)RTA_DATA(attrs); + + /* only print when present, easier */ + if (flags & MPTCP_PM_EV_FLAG_DENY_JOIN_ID0) + fprintf(stderr, ",deny_join_id0:1"); + } attrs = RTA_NEXT(attrs, msg_len); } diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 970c329735ff..3d45991f24ed 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -201,6 +201,9 @@ make_connection() is_v6="v4" fi + # set this on the client side only: will not affect the rest + ip netns exec "$ns2" sysctl -q net.mptcp.allow_join_initial_addr_port=0 + :>"$client_evts" :>"$server_evts" @@ -223,23 +226,28 @@ make_connection() local client_token local client_port local client_serverside + local client_nojoin local server_token local server_serverside + local server_nojoin client_token=$(mptcp_lib_evts_get_info token "$client_evts") client_port=$(mptcp_lib_evts_get_info sport "$client_evts") client_serverside=$(mptcp_lib_evts_get_info server_side "$client_evts") + client_nojoin=$(mptcp_lib_evts_get_info deny_join_id0 "$client_evts") server_token=$(mptcp_lib_evts_get_info token "$server_evts") server_serverside=$(mptcp_lib_evts_get_info server_side "$server_evts") + server_nojoin=$(mptcp_lib_evts_get_info deny_join_id0 "$server_evts") print_test "Established IP${is_v6} MPTCP Connection ns2 => ns1" - if [ "$client_token" != "" ] && [ "$server_token" != "" ] && [ "$client_serverside" = 0 ] && - [ "$server_serverside" = 1 ] + if [ "${client_token}" != "" ] && [ "${server_token}" != "" ] && + [ "${client_serverside}" = 0 ] && [ "${server_serverside}" = 1 ] && + [ "${client_nojoin:-0}" = 0 ] && [ "${server_nojoin:-0}" = 1 ] then test_pass print_title "Connection info: ${client_addr}:${client_port} -> ${connect_addr}:${app_port}" else - test_fail "Expected tokens (c:${client_token} - s:${server_token}) and server (c:${client_serverside} - s:${server_serverside})" + test_fail "Expected tokens (c:${client_token} - s:${server_token}), server (c:${client_serverside} - s:${server_serverside}), nojoin (c:${client_nojoin} - s:${server_nojoin})" mptcp_lib_result_print_all_tap exit ${KSFT_FAIL} fi From 92da495cb65719583aa06bc946aeb18a10e1e6e2 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:52:23 +0200 Subject: [PATCH 25/51] mptcp: tfo: record 'deny join id0' info When TFO is used, the check to see if the 'C' flag (deny join id0) was set was bypassed. This flag can be set when TFO is used, so the check should also be done when TFO is used. Note that the set_fully_established label is also used when a 4th ACK is received. In this case, deny_join_id0 will not be set. Fixes: dfc8d0603033 ("mptcp: implement delayed seq generation for passive fastopen") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-4-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 2a8ea28442b2..1103b3341a70 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -985,13 +985,13 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, return false; } - if (mp_opt->deny_join_id0) - WRITE_ONCE(msk->pm.remote_deny_join_id0, true); - if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); set_fully_established: + if (mp_opt->deny_join_id0) + WRITE_ONCE(msk->pm.remote_deny_join_id0, true); + mptcp_data_lock((struct sock *)msk); __mptcp_subflow_fully_established(msk, subflow, mp_opt); mptcp_data_unlock((struct sock *)msk); From b86418beade11d45540a2d20c4ec1128849b6c27 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 12 Sep 2025 14:52:24 +0200 Subject: [PATCH 26/51] selftests: mptcp: sockopt: fix error messages This patch fixes several issues in the error reporting of the MPTCP sockopt selftest: 1. Fix diff not printed: The error messages for counter mismatches had the actual difference ('diff') as argument, but it was missing in the format string. Displaying it makes the debugging easier. 2. Fix variable usage: The error check for 'mptcpi_bytes_acked' incorrectly used 'ret2' (sent bytes) for both the expected value and the difference calculation. It now correctly uses 'ret' (received bytes), which is the expected value for bytes_acked. 3. Fix off-by-one in diff: The calculation for the 'mptcpi_rcv_delta' diff was 's.mptcpi_rcv_delta - ret', which is off-by-one. It has been corrected to 's.mptcpi_rcv_delta - (ret + 1)' to match the expected value in the condition above it. Fixes: 5dcff89e1455 ("selftests: mptcp: explicitly tests aggregate counters") Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-5-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/mptcp/mptcp_sockopt.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c index e934dd26a59d..112c07c4c37a 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c @@ -667,22 +667,26 @@ static void process_one_client(int fd, int pipefd) do_getsockopts(&s, fd, ret, ret2); if (s.mptcpi_rcv_delta != (uint64_t)ret + 1) - xerror("mptcpi_rcv_delta %" PRIu64 ", expect %" PRIu64, s.mptcpi_rcv_delta, ret + 1, s.mptcpi_rcv_delta - ret); + xerror("mptcpi_rcv_delta %" PRIu64 ", expect %" PRIu64 ", diff %" PRId64, + s.mptcpi_rcv_delta, ret + 1, s.mptcpi_rcv_delta - (ret + 1)); /* be nice when running on top of older kernel */ if (s.pkt_stats_avail) { if (s.last_sample.mptcpi_bytes_sent != ret2) - xerror("mptcpi_bytes_sent %" PRIu64 ", expect %" PRIu64, + xerror("mptcpi_bytes_sent %" PRIu64 ", expect %" PRIu64 + ", diff %" PRId64, s.last_sample.mptcpi_bytes_sent, ret2, s.last_sample.mptcpi_bytes_sent - ret2); if (s.last_sample.mptcpi_bytes_received != ret) - xerror("mptcpi_bytes_received %" PRIu64 ", expect %" PRIu64, + xerror("mptcpi_bytes_received %" PRIu64 ", expect %" PRIu64 + ", diff %" PRId64, s.last_sample.mptcpi_bytes_received, ret, s.last_sample.mptcpi_bytes_received - ret); if (s.last_sample.mptcpi_bytes_acked != ret) - xerror("mptcpi_bytes_acked %" PRIu64 ", expect %" PRIu64, - s.last_sample.mptcpi_bytes_acked, ret2, - s.last_sample.mptcpi_bytes_acked - ret2); + xerror("mptcpi_bytes_acked %" PRIu64 ", expect %" PRIu64 + ", diff %" PRId64, + s.last_sample.mptcpi_bytes_acked, ret, + s.last_sample.mptcpi_bytes_acked - ret); } close(fd); From 93ab4881a4e2b9657bdce4b8940073bfb4ed5eab Mon Sep 17 00:00:00 2001 From: Yeounsu Moon Date: Sat, 13 Sep 2025 15:01:36 +0900 Subject: [PATCH 27/51] net: natsemi: fix `rx_dropped` double accounting on `netif_rx()` failure `netif_rx()` already increments `rx_dropped` core stat when it fails. The driver was also updating `ndev->stats.rx_dropped` in the same path. Since both are reported together via `ip -s -s` command, this resulted in drops being counted twice in user-visible stats. Keep the driver update on `if (unlikely(!skb))`, but skip it after `netif_rx()` errors. Fixes: caf586e5f23c ("net: add a core netdev->rx_dropped counter") Signed-off-by: Yeounsu Moon Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250913060135.35282-3-yyyynoom@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/natsemi/ns83820.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/natsemi/ns83820.c b/drivers/net/ethernet/natsemi/ns83820.c index 56d5464222d9..cdbf82affa7b 100644 --- a/drivers/net/ethernet/natsemi/ns83820.c +++ b/drivers/net/ethernet/natsemi/ns83820.c @@ -820,7 +820,7 @@ static void rx_irq(struct net_device *ndev) struct ns83820 *dev = PRIV(ndev); struct rx_info *info = &dev->rx_info; unsigned next_rx; - int rx_rc, len; + int len; u32 cmdsts; __le32 *desc; unsigned long flags; @@ -881,8 +881,10 @@ static void rx_irq(struct net_device *ndev) if (likely(CMDSTS_OK & cmdsts)) { #endif skb_put(skb, len); - if (unlikely(!skb)) + if (unlikely(!skb)) { + ndev->stats.rx_dropped++; goto netdev_mangle_me_harder_failed; + } if (cmdsts & CMDSTS_DEST_MULTI) ndev->stats.multicast++; ndev->stats.rx_packets++; @@ -901,15 +903,12 @@ static void rx_irq(struct net_device *ndev) __vlan_hwaccel_put_tag(skb, htons(ETH_P_IPV6), tag); } #endif - rx_rc = netif_rx(skb); - if (NET_RX_DROP == rx_rc) { -netdev_mangle_me_harder_failed: - ndev->stats.rx_dropped++; - } + netif_rx(skb); } else { dev_kfree_skb_irq(skb); } +netdev_mangle_me_harder_failed: nr++; next_rx = info->next_rx; desc = info->descs + (DESC_SIZE * next_rx); From 84bf1ac85af84d354c7a2fdbdc0d4efc8aaec34b Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 25 Aug 2025 16:00:14 -0700 Subject: [PATCH 28/51] ice: fix Rx page leak on multi-buffer frames The ice_put_rx_mbuf() function handles calling ice_put_rx_buf() for each buffer in the current frame. This function was introduced as part of handling multi-buffer XDP support in the ice driver. It works by iterating over the buffers from first_desc up to 1 plus the total number of fragments in the frame, cached from before the XDP program was executed. If the hardware posts a descriptor with a size of 0, the logic used in ice_put_rx_mbuf() breaks. Such descriptors get skipped and don't get added as fragments in ice_add_xdp_frag. Since the buffer isn't counted as a fragment, we do not iterate over it in ice_put_rx_mbuf(), and thus we don't call ice_put_rx_buf(). Because we don't call ice_put_rx_buf(), we don't attempt to re-use the page or free it. This leaves a stale page in the ring, as we don't increment next_to_alloc. The ice_reuse_rx_page() assumes that the next_to_alloc has been incremented properly, and that it always points to a buffer with a NULL page. Since this function doesn't check, it will happily recycle a page over the top of the next_to_alloc buffer, losing track of the old page. Note that this leak only occurs for multi-buffer frames. The ice_put_rx_mbuf() function always handles at least one buffer, so a single-buffer frame will always get handled correctly. It is not clear precisely why the hardware hands us descriptors with a size of 0 sometimes, but it happens somewhat regularly with "jumbo frames" used by 9K MTU. To fix ice_put_rx_mbuf(), we need to make sure to call ice_put_rx_buf() on all buffers between first_desc and next_to_clean. Borrow the logic of a similar function in i40e used for this same purpose. Use the same logic also in ice_get_pgcnts(). Instead of iterating over just the number of fragments, use a loop which iterates until the current index reaches to the next_to_clean element just past the current frame. Unlike i40e, the ice_put_rx_mbuf() function does call ice_put_rx_buf() on the last buffer of the frame indicating the end of packet. For non-linear (multi-buffer) frames, we need to take care when adjusting the pagecnt_bias. An XDP program might release fragments from the tail of the frame, in which case that fragment page is already released. Only update the pagecnt_bias for the first descriptor and fragments still remaining post-XDP program. Take care to only access the shared info for fragmented buffers, as this avoids a significant cache miss. The xdp_xmit value only needs to be updated if an XDP program is run, and only once per packet. Drop the xdp_xmit pointer argument from ice_put_rx_mbuf(). Instead, set xdp_xmit in the ice_clean_rx_irq() function directly. This avoids needing to pass the argument and avoids an extra bit-wise OR for each buffer in the frame. Move the increment of the ntc local variable to ensure its updated *before* all calls to ice_get_pgcnts() or ice_put_rx_mbuf(), as the loop logic requires the index of the element just after the current frame. Now that we use an index pointer in the ring to identify the packet, we no longer need to track or cache the number of fragments in the rx_ring. Cc: Christoph Petrausch Cc: Jesper Dangaard Brouer Reported-by: Jaroslav Pulchart Closes: https://lore.kernel.org/netdev/CAK8fFZ4hY6GUJNENz3wY9jaYLZXGfpr7dnZxzGMYoE44caRbgw@mail.gmail.com/ Fixes: 743bbd93cf29 ("ice: put Rx buffers after being done with current frame") Tested-by: Michal Kubiak Signed-off-by: Jacob Keller Acked-by: Jesper Dangaard Brouer Tested-by: Priya Singh Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_txrx.c | 80 ++++++++++------------- drivers/net/ethernet/intel/ice/ice_txrx.h | 1 - 2 files changed, 34 insertions(+), 47 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index d2871757ec94..41e7e29879a3 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -894,10 +894,6 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page, rx_buf->page_offset, size); sinfo->xdp_frags_size += size; - /* remember frag count before XDP prog execution; bpf_xdp_adjust_tail() - * can pop off frags but driver has to handle it on its own - */ - rx_ring->nr_frags = sinfo->nr_frags; if (page_is_pfmemalloc(rx_buf->page)) xdp_buff_set_frag_pfmemalloc(xdp); @@ -968,20 +964,20 @@ ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size, /** * ice_get_pgcnts - grab page_count() for gathered fragments * @rx_ring: Rx descriptor ring to store the page counts on + * @ntc: the next to clean element (not included in this frame!) * * This function is intended to be called right before running XDP * program so that the page recycling mechanism will be able to take * a correct decision regarding underlying pages; this is done in such * way as XDP program can change the refcount of page */ -static void ice_get_pgcnts(struct ice_rx_ring *rx_ring) +static void ice_get_pgcnts(struct ice_rx_ring *rx_ring, unsigned int ntc) { - u32 nr_frags = rx_ring->nr_frags + 1; u32 idx = rx_ring->first_desc; struct ice_rx_buf *rx_buf; u32 cnt = rx_ring->count; - for (int i = 0; i < nr_frags; i++) { + while (idx != ntc) { rx_buf = &rx_ring->rx_buf[idx]; rx_buf->pgcnt = page_count(rx_buf->page); @@ -1154,62 +1150,51 @@ ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf) } /** - * ice_put_rx_mbuf - ice_put_rx_buf() caller, for all frame frags + * ice_put_rx_mbuf - ice_put_rx_buf() caller, for all buffers in frame * @rx_ring: Rx ring with all the auxiliary data * @xdp: XDP buffer carrying linear + frags part - * @xdp_xmit: XDP_TX/XDP_REDIRECT verdict storage - * @ntc: a current next_to_clean value to be stored at rx_ring + * @ntc: the next to clean element (not included in this frame!) * @verdict: return code from XDP program execution * - * Walk through gathered fragments and satisfy internal page - * recycle mechanism; we take here an action related to verdict - * returned by XDP program; + * Called after XDP program is completed, or on error with verdict set to + * ICE_XDP_CONSUMED. + * + * Walk through buffers from first_desc to the end of the frame, releasing + * buffers and satisfying internal page recycle mechanism. The action depends + * on verdict from XDP program. */ static void ice_put_rx_mbuf(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, - u32 *xdp_xmit, u32 ntc, u32 verdict) + u32 ntc, u32 verdict) { - u32 nr_frags = rx_ring->nr_frags + 1; u32 idx = rx_ring->first_desc; u32 cnt = rx_ring->count; - u32 post_xdp_frags = 1; struct ice_rx_buf *buf; - int i; + u32 xdp_frags = 0; + int i = 0; if (unlikely(xdp_buff_has_frags(xdp))) - post_xdp_frags += xdp_get_shared_info_from_buff(xdp)->nr_frags; + xdp_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags; - for (i = 0; i < post_xdp_frags; i++) { + while (idx != ntc) { buf = &rx_ring->rx_buf[idx]; + if (++idx == cnt) + idx = 0; - if (verdict & (ICE_XDP_TX | ICE_XDP_REDIR)) { + /* An XDP program could release fragments from the end of the + * buffer. For these, we need to keep the pagecnt_bias as-is. + * To do this, only adjust pagecnt_bias for fragments up to + * the total remaining after the XDP program has run. + */ + if (verdict != ICE_XDP_CONSUMED) ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); - *xdp_xmit |= verdict; - } else if (verdict & ICE_XDP_CONSUMED) { + else if (i++ <= xdp_frags) buf->pagecnt_bias++; - } else if (verdict == ICE_XDP_PASS) { - ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); - } ice_put_rx_buf(rx_ring, buf); - - if (++idx == cnt) - idx = 0; - } - /* handle buffers that represented frags released by XDP prog; - * for these we keep pagecnt_bias as-is; refcount from struct page - * has been decremented within XDP prog and we do not have to increase - * the biased refcnt - */ - for (; i < nr_frags; i++) { - buf = &rx_ring->rx_buf[idx]; - ice_put_rx_buf(rx_ring, buf); - if (++idx == cnt) - idx = 0; } xdp->data = NULL; rx_ring->first_desc = ntc; - rx_ring->nr_frags = 0; } /** @@ -1317,6 +1302,10 @@ static int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) /* retrieve a buffer from the ring */ rx_buf = ice_get_rx_buf(rx_ring, size, ntc); + /* Increment ntc before calls to ice_put_rx_mbuf() */ + if (++ntc == cnt) + ntc = 0; + if (!xdp->data) { void *hard_start; @@ -1325,24 +1314,23 @@ static int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) xdp_prepare_buff(xdp, hard_start, offset, size, !!offset); xdp_buff_clear_frags_flag(xdp); } else if (ice_add_xdp_frag(rx_ring, xdp, rx_buf, size)) { - ice_put_rx_mbuf(rx_ring, xdp, NULL, ntc, ICE_XDP_CONSUMED); + ice_put_rx_mbuf(rx_ring, xdp, ntc, ICE_XDP_CONSUMED); break; } - if (++ntc == cnt) - ntc = 0; /* skip if it is NOP desc */ if (ice_is_non_eop(rx_ring, rx_desc)) continue; - ice_get_pgcnts(rx_ring); + ice_get_pgcnts(rx_ring, ntc); xdp_verdict = ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_desc); if (xdp_verdict == ICE_XDP_PASS) goto construct_skb; total_rx_bytes += xdp_get_buff_len(xdp); total_rx_pkts++; - ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict); + ice_put_rx_mbuf(rx_ring, xdp, ntc, xdp_verdict); + xdp_xmit |= xdp_verdict & (ICE_XDP_TX | ICE_XDP_REDIR); continue; construct_skb: @@ -1355,7 +1343,7 @@ construct_skb: rx_ring->ring_stats->rx_stats.alloc_buf_failed++; xdp_verdict = ICE_XDP_CONSUMED; } - ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict); + ice_put_rx_mbuf(rx_ring, xdp, ntc, xdp_verdict); if (!skb) break; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index fef750c5f288..2fd8e78178a2 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -358,7 +358,6 @@ struct ice_rx_ring { struct ice_tx_ring *xdp_ring; struct ice_rx_ring *next; /* pointer to next ring in q_vector */ struct xsk_buff_pool *xsk_pool; - u32 nr_frags; u16 max_frame; u16 rx_buf_len; dma_addr_t dma; /* physical address of ring */ From e37084a26070c546ae7961ee135bbfb15fbe13fd Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Fri, 22 Aug 2025 17:16:17 +0200 Subject: [PATCH 29/51] i40e: remove redundant memory barrier when cleaning Tx descs i40e has a feature which writes to memory location last descriptor successfully sent. Memory barrier in i40e_clean_tx_irq() was used to avoid forward-reading descriptor fields in case DD bit was not set. Having mentioned feature in place implies that such situation will not happen as we know in advance how many descriptors HW has dealt with. Besides, this barrier placement was wrong. Idea is to have this protection *after* reading DD bit from HW descriptor, not before. Digging through git history showed me that indeed barrier was before DD bit check, anyways the commit introducing i40e_get_head() should have wiped it out altogether. Also, there was one commit doing s/read_barrier_depends/smp_rmb when get head feature was already in place, but it was only theoretical based on ixgbe experiences, which is different in these terms as that driver has to read DD bit from HW descriptor. Fixes: 1943d8ba9507 ("i40e/i40evf: enable hardware feature head write back") Signed-off-by: Maciej Fijalkowski Reviewed-by: Aleksandr Loktionov Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 048c33039130..b194eae03208 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -948,9 +948,6 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi, if (!eop_desc) break; - /* prevent any other reads prior to eop_desc */ - smp_rmb(); - i40e_trace(clean_tx_irq, tx_ring, tx_desc, tx_buf); /* we have caught up to head, no work left to do */ if (tx_head == tx_desc) From b85936e95a4bd2a07e134af71e2c0750a69d2b8b Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Mon, 8 Sep 2025 13:26:28 +0200 Subject: [PATCH 30/51] ixgbe: initialize aci.lock before it's used Currently aci.lock is initialized too late. A bunch of ACI callbacks using the lock are called prior it's initialized. Commit 337369f8ce9e ("locking/mutex: Add MUTEX_WARN_ON() into fast path") highlights that issue what results in call trace. [ 4.092899] DEBUG_LOCKS_WARN_ON(lock->magic != lock) [ 4.092910] WARNING: CPU: 0 PID: 578 at kernel/locking/mutex.c:154 mutex_lock+0x6d/0x80 [ 4.098757] Call Trace: [ 4.098847] [ 4.098922] ixgbe_aci_send_cmd+0x8c/0x1e0 [ixgbe] [ 4.099108] ? hrtimer_try_to_cancel+0x18/0x110 [ 4.099277] ixgbe_aci_get_fw_ver+0x52/0xa0 [ixgbe] [ 4.099460] ixgbe_check_fw_error+0x1fc/0x2f0 [ixgbe] [ 4.099650] ? usleep_range_state+0x69/0xd0 [ 4.099811] ? usleep_range_state+0x8c/0xd0 [ 4.099964] ixgbe_probe+0x3b0/0x12d0 [ixgbe] [ 4.100132] local_pci_probe+0x43/0xa0 [ 4.100267] work_for_cpu_fn+0x13/0x20 [ 4.101647] Move aci.lock mutex initialization to ixgbe_sw_init() before any ACI command is sent. Along with that move also related SWFW semaphore in order to reduce size of ixgbe_probe() and that way all locks are initialized in ixgbe_sw_init(). Reviewed-by: Michal Swiatkowski Reviewed-by: Przemek Kitszel Reviewed-by: Simon Horman Reviewed-by: Aleksandr Loktionov Fixes: 4600cdf9f5ac ("ixgbe: Enable link management in E610 device") Signed-off-by: Jedrzej Jagielski Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 80e6a2ef1350..9a6a67a6d644 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6973,6 +6973,13 @@ static int ixgbe_sw_init(struct ixgbe_adapter *adapter, break; } + /* Make sure the SWFW semaphore is in a valid state */ + if (hw->mac.ops.init_swfw_sync) + hw->mac.ops.init_swfw_sync(hw); + + if (hw->mac.type == ixgbe_mac_e610) + mutex_init(&hw->aci.lock); + #ifdef IXGBE_FCOE /* FCoE support exists, always init the FCoE lock */ spin_lock_init(&adapter->fcoe.lock); @@ -11643,10 +11650,6 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) goto err_sw_init; - /* Make sure the SWFW semaphore is in a valid state */ - if (hw->mac.ops.init_swfw_sync) - hw->mac.ops.init_swfw_sync(hw); - if (ixgbe_check_fw_error(adapter)) return ixgbe_recovery_probe(adapter); @@ -11850,8 +11853,6 @@ skip_sriov: ether_addr_copy(hw->mac.addr, hw->mac.perm_addr); ixgbe_mac_set_default_filter(adapter); - if (hw->mac.type == ixgbe_mac_e610) - mutex_init(&hw->aci.lock); timer_setup(&adapter->service_timer, ixgbe_service_timer, 0); if (ixgbe_removed(hw->hw_addr)) { @@ -12007,9 +12008,9 @@ err_register: devl_unlock(adapter->devlink); ixgbe_release_hw_control(adapter); ixgbe_clear_interrupt_scheme(adapter); +err_sw_init: if (hw->mac.type == ixgbe_mac_e610) mutex_destroy(&adapter->hw.aci.lock); -err_sw_init: ixgbe_disable_sriov(adapter); adapter->flags2 &= ~IXGBE_FLAG2_SEARCH_FOR_SFP; iounmap(adapter->io_addr); From 316ba68175b04a9f6f75295764789ea94e31d48c Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Mon, 8 Sep 2025 13:26:29 +0200 Subject: [PATCH 31/51] ixgbe: destroy aci.lock later within ixgbe_remove path There's another issue with aci.lock and previous patch uncovers it. aci.lock is being destroyed during removing ixgbe while some of the ixgbe closing routines are still ongoing. These routines use Admin Command Interface which require taking aci.lock which has been already destroyed what leads to call trace. [ +0.000004] DEBUG_LOCKS_WARN_ON(lock->magic != lock) [ +0.000007] WARNING: CPU: 12 PID: 10277 at kernel/locking/mutex.c:155 mutex_lock+0x5f/0x70 [ +0.000002] Call Trace: [ +0.000003] [ +0.000006] ixgbe_aci_send_cmd+0xc8/0x220 [ixgbe] [ +0.000049] ? try_to_wake_up+0x29d/0x5d0 [ +0.000009] ixgbe_disable_rx_e610+0xc4/0x110 [ixgbe] [ +0.000032] ixgbe_disable_rx+0x3d/0x200 [ixgbe] [ +0.000027] ixgbe_down+0x102/0x3b0 [ixgbe] [ +0.000031] ixgbe_close_suspend+0x28/0x90 [ixgbe] [ +0.000028] ixgbe_close+0xfb/0x100 [ixgbe] [ +0.000025] __dev_close_many+0xae/0x220 [ +0.000005] dev_close_many+0xc2/0x1a0 [ +0.000004] ? kernfs_should_drain_open_files+0x2a/0x40 [ +0.000005] unregister_netdevice_many_notify+0x204/0xb00 [ +0.000006] ? __kernfs_remove.part.0+0x109/0x210 [ +0.000006] ? kobj_kset_leave+0x4b/0x70 [ +0.000008] unregister_netdevice_queue+0xf6/0x130 [ +0.000006] unregister_netdev+0x1c/0x40 [ +0.000005] ixgbe_remove+0x216/0x290 [ixgbe] [ +0.000021] pci_device_remove+0x42/0xb0 [ +0.000007] device_release_driver_internal+0x19c/0x200 [ +0.000008] driver_detach+0x48/0x90 [ +0.000003] bus_remove_driver+0x6d/0xf0 [ +0.000006] pci_unregister_driver+0x2e/0xb0 [ +0.000005] ixgbe_exit_module+0x1c/0xc80 [ixgbe] Same as for the previous commit, the issue has been highlighted by the commit 337369f8ce9e ("locking/mutex: Add MUTEX_WARN_ON() into fast path"). Move destroying aci.lock to the end of ixgbe_remove(), as this simply fixes the issue. Fixes: 4600cdf9f5ac ("ixgbe: Enable link management in E610 device") Signed-off-by: Jedrzej Jagielski Reviewed-by: Przemek Kitszel Reviewed-by: Aleksandr Loktionov Reviewed-by: Simon Horman Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 9a6a67a6d644..6218bdb7f941 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -12061,10 +12061,8 @@ static void ixgbe_remove(struct pci_dev *pdev) set_bit(__IXGBE_REMOVING, &adapter->state); cancel_work_sync(&adapter->service_task); - if (adapter->hw.mac.type == ixgbe_mac_e610) { + if (adapter->hw.mac.type == ixgbe_mac_e610) ixgbe_disable_link_status_events(adapter); - mutex_destroy(&adapter->hw.aci.lock); - } if (adapter->mii_bus) mdiobus_unregister(adapter->mii_bus); @@ -12124,6 +12122,9 @@ static void ixgbe_remove(struct pci_dev *pdev) disable_dev = !test_and_set_bit(__IXGBE_DISABLED, &adapter->state); free_netdev(netdev); + if (adapter->hw.mac.type == ixgbe_mac_e610) + mutex_destroy(&adapter->hw.aci.lock); + if (disable_dev) pci_disable_device(pdev); } From 528eb4e19ec0df30d0c9ae4074ce945667dde919 Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Wed, 10 Sep 2025 22:47:21 +0900 Subject: [PATCH 32/51] igc: don't fail igc_probe() on LED setup error When igc_led_setup() fails, igc_probe() fails and triggers kernel panic in free_netdev() since unregister_netdev() is not called. [1] This behavior can be tested using fault-injection framework, especially the failslab feature. [2] Since LED support is not mandatory, treat LED setup failures as non-fatal and continue probe with a warning message, consequently avoiding the kernel panic. [1] kernel BUG at net/core/dev.c:12047! Oops: invalid opcode: 0000 [#1] SMP NOPTI CPU: 0 UID: 0 PID: 937 Comm: repro-igc-led-e Not tainted 6.17.0-rc4-enjuk-tnguy-00865-gc4940196ab02 #64 PREEMPT(voluntary) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:free_netdev+0x278/0x2b0 [...] Call Trace: igc_probe+0x370/0x910 local_pci_probe+0x3a/0x80 pci_device_probe+0xd1/0x200 [...] [2] #!/bin/bash -ex FAILSLAB_PATH=/sys/kernel/debug/failslab/ DEVICE=0000:00:05.0 START_ADDR=$(grep " igc_led_setup" /proc/kallsyms \ | awk '{printf("0x%s", $1)}') END_ADDR=$(printf "0x%x" $((START_ADDR + 0x100))) echo $START_ADDR > $FAILSLAB_PATH/require-start echo $END_ADDR > $FAILSLAB_PATH/require-end echo 1 > $FAILSLAB_PATH/times echo 100 > $FAILSLAB_PATH/probability echo N > $FAILSLAB_PATH/ignore-gfp-wait echo $DEVICE > /sys/bus/pci/drivers/igc/bind Fixes: ea578703b03d ("igc: Add support for LEDs on i225/i226") Signed-off-by: Kohei Enju Reviewed-by: Paul Menzel Reviewed-by: Aleksandr Loktionov Reviewed-by: Vitaly Lifshits Reviewed-by: Kurt Kanzenbach Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc.h | 1 + drivers/net/ethernet/intel/igc/igc_main.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 266bfcf2a28f..a427f05814c1 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -345,6 +345,7 @@ struct igc_adapter { /* LEDs */ struct mutex led_mutex; struct igc_led_classdev *leds; + bool leds_available; }; void igc_up(struct igc_adapter *adapter); diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index e79b14d50b24..728d7ca5338b 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7335,8 +7335,14 @@ static int igc_probe(struct pci_dev *pdev, if (IS_ENABLED(CONFIG_IGC_LEDS)) { err = igc_led_setup(adapter); - if (err) - goto err_register; + if (err) { + netdev_warn_once(netdev, + "LED init failed (%d); continuing without LED support\n", + err); + adapter->leds_available = false; + } else { + adapter->leds_available = true; + } } return 0; @@ -7392,7 +7398,7 @@ static void igc_remove(struct pci_dev *pdev) cancel_work_sync(&adapter->watchdog_task); hrtimer_cancel(&adapter->hrtimer); - if (IS_ENABLED(CONFIG_IGC_LEDS)) + if (IS_ENABLED(CONFIG_IGC_LEDS) && adapter->leds_available) igc_led_free(adapter); /* Release control of h/w to f/w. If f/w is AMT enabled, this From 109f8b51543d106aee50dfe911f439e43fb30c7a Mon Sep 17 00:00:00 2001 From: "Remy D. Farley" Date: Sat, 13 Sep 2025 14:05:28 +0000 Subject: [PATCH 33/51] doc/netlink: Fix typos in operation attributes I'm trying to generate Rust bindings for netlink using the yaml spec. It looks like there's a typo in conntrack spec: attribute set conntrack-attrs defines attributes "counters-{orig,reply}" (plural), while get operation references "counter-{orig,reply}" (singular). The latter should be fixed, as it denotes multiple counters (packet and byte). The corresonding C define is CTA_COUNTERS_ORIG. Also, dump request references "nfgen-family" attribute, which neither exists in conntrack-attrs attrset nor ctattr_type enum. There's member of nfgenmsg struct with the same name, which is where family value is actually taken from. > static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl, > struct sk_buff *skb, > const struct nlmsghdr *nlh, > const struct nlattr * const cda[], > struct netlink_ext_ack *extack) > { > int err; > struct nfgenmsg *nfmsg = nlmsg_data(nlh); > u_int8_t u3 = nfmsg->nfgen_family; ^^^^^^^^^^^^ Signed-off-by: Remy D. Farley Fixes: 23fc9311a526 ("netlink: specs: add conntrack dump and stats dump support") Link: https://patch.msgid.link/20250913140515.1132886-1-one-d-wide@protonmail.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/conntrack.yaml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Documentation/netlink/specs/conntrack.yaml b/Documentation/netlink/specs/conntrack.yaml index c6832633ab7b..591e22a2ee43 100644 --- a/Documentation/netlink/specs/conntrack.yaml +++ b/Documentation/netlink/specs/conntrack.yaml @@ -575,8 +575,8 @@ operations: - nat-dst - timeout - mark - - counter-orig - - counter-reply + - counters-orig + - counters-reply - use - id - nat-dst @@ -591,7 +591,6 @@ operations: request: value: 0x101 attributes: - - nfgen-family - mark - filter - status @@ -608,8 +607,8 @@ operations: - nat-dst - timeout - mark - - counter-orig - - counter-reply + - counters-orig + - counters-reply - use - id - nat-dst From 94ff1ed3030e88cfe4e34c1d47c5832995c953c8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 15 Sep 2025 16:42:55 -0700 Subject: [PATCH 34/51] MAINTAINERS: make the DPLL entry cover drivers DPLL maintainers should probably be CCed on driver patches, too. Remove the *, which makes the pattern only match files directly under drivers/dpll but not its sub-directories. Acked-by: Jiri Pirko Acked-by: Vadim Fedorenko Acked-by: Arkadiusz Kubalewski Link: https://patch.msgid.link/20250915234255.1306612-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index ea95802d87e1..086b4a7bd04f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7430,7 +7430,7 @@ S: Supported F: Documentation/devicetree/bindings/dpll/dpll-device.yaml F: Documentation/devicetree/bindings/dpll/dpll-pin.yaml F: Documentation/driver-api/dpll.rst -F: drivers/dpll/* +F: drivers/dpll/ F: include/linux/dpll.h F: include/uapi/linux/dpll.h From 6b4be64fd9fec16418f365c2d8e47a7566e9eba5 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 15 Sep 2025 15:24:32 +0300 Subject: [PATCH 35/51] net/mlx5e: Harden uplink netdev access against device unbind The function mlx5_uplink_netdev_get() gets the uplink netdevice pointer from mdev->mlx5e_res.uplink_netdev. However, the netdevice can be removed and its pointer cleared when unbound from the mlx5_core.eth driver. This results in a NULL pointer, causing a kernel panic. BUG: unable to handle page fault for address: 0000000000001300 at RIP: 0010:mlx5e_vport_rep_load+0x22a/0x270 [mlx5_core] Call Trace: mlx5_esw_offloads_rep_load+0x68/0xe0 [mlx5_core] esw_offloads_enable+0x593/0x910 [mlx5_core] mlx5_eswitch_enable_locked+0x341/0x420 [mlx5_core] mlx5_devlink_eswitch_mode_set+0x17e/0x3a0 [mlx5_core] devlink_nl_eswitch_set_doit+0x60/0xd0 genl_family_rcv_msg_doit+0xe0/0x130 genl_rcv_msg+0x183/0x290 netlink_rcv_skb+0x4b/0xf0 genl_rcv+0x24/0x40 netlink_unicast+0x255/0x380 netlink_sendmsg+0x1f3/0x420 __sock_sendmsg+0x38/0x60 __sys_sendto+0x119/0x180 do_syscall_64+0x53/0x1d0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Ensure the pointer is valid before use by checking it for NULL. If it is valid, immediately call netdev_hold() to take a reference, and preventing the netdevice from being freed while it is in use. Fixes: 7a9fb35e8c3a ("net/mlx5e: Do not reload ethernet ports when changing eswitch mode") Signed-off-by: Jianbo Liu Reviewed-by: Cosmin Ratiu Reviewed-by: Jiri Pirko Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1757939074-617281-2-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en_rep.c | 27 +++++++++++++++---- .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 1 + .../ethernet/mellanox/mlx5/core/lib/mlx5.h | 15 ++++++++++- include/linux/mlx5/driver.h | 1 + 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 63a7a788fb0d..cd0242eb008c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1506,12 +1506,21 @@ static const struct mlx5e_profile mlx5e_uplink_rep_profile = { static int mlx5e_vport_uplink_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) { - struct mlx5e_priv *priv = netdev_priv(mlx5_uplink_netdev_get(dev)); struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); + struct net_device *netdev; + struct mlx5e_priv *priv; + int err; + netdev = mlx5_uplink_netdev_get(dev); + if (!netdev) + return 0; + + priv = netdev_priv(netdev); rpriv->netdev = priv->netdev; - return mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile, - rpriv); + err = mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile, + rpriv); + mlx5_uplink_netdev_put(dev, netdev); + return err; } static void @@ -1638,8 +1647,16 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep) { struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); struct net_device *netdev = rpriv->netdev; - struct mlx5e_priv *priv = netdev_priv(netdev); - void *ppriv = priv->ppriv; + struct mlx5e_priv *priv; + void *ppriv; + + if (!netdev) { + ppriv = rpriv; + goto free_ppriv; + } + + priv = netdev_priv(netdev); + ppriv = priv->ppriv; if (rep->vport == MLX5_VPORT_UPLINK) { mlx5e_vport_uplink_rep_unload(rpriv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 8b4977650183..5f2d6c35f1ad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -1515,6 +1515,7 @@ static u32 mlx5_esw_qos_lag_link_speed_get_locked(struct mlx5_core_dev *mdev) speed = lksettings.base.speed; out: + mlx5_uplink_netdev_put(mdev, slave); return speed; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h index b111ccd03b02..74ea5da58b7e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h @@ -47,7 +47,20 @@ int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data); static inline struct net_device *mlx5_uplink_netdev_get(struct mlx5_core_dev *mdev) { - return mdev->mlx5e_res.uplink_netdev; + struct mlx5e_resources *mlx5e_res = &mdev->mlx5e_res; + struct net_device *netdev; + + mutex_lock(&mlx5e_res->uplink_netdev_lock); + netdev = mlx5e_res->uplink_netdev; + netdev_hold(netdev, &mlx5e_res->tracker, GFP_KERNEL); + mutex_unlock(&mlx5e_res->uplink_netdev_lock); + return netdev; +} + +static inline void mlx5_uplink_netdev_put(struct mlx5_core_dev *mdev, + struct net_device *netdev) +{ + netdev_put(netdev, &mdev->mlx5e_res.tracker); } struct mlx5_sd; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8c5fbfb85749..10fe492e1fed 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -663,6 +663,7 @@ struct mlx5e_resources { bool tisn_valid; } hw_objs; struct net_device *uplink_netdev; + netdevice_tracker tracker; struct mutex uplink_netdev_lock; struct mlx5_crypto_dek_priv *dek_priv; }; From 7601a0a46216f4ba05adff2de75923b4e8e585c2 Mon Sep 17 00:00:00 2001 From: Lama Kayal Date: Mon, 15 Sep 2025 15:24:34 +0300 Subject: [PATCH 36/51] net/mlx5e: Add a miss level for ipsec crypto offload The cited commit adds a miss table for switchdev mode. But it uses the same level as policy table. Will hit the following error when running command: # ip xfrm state add src 192.168.1.22 dst 192.168.1.21 proto \ esp spi 1001 reqid 10001 aead 'rfc4106(gcm(aes))' \ 0x3a189a7f9374955d3817886c8587f1da3df387ff 128 \ mode tunnel offload dev enp8s0f0 dir in Error: mlx5_core: Device failed to offload this state. The dmesg error is: mlx5_core 0000:03:00.0: ipsec_miss_create:578:(pid 311797): fail to create IPsec miss_rule err=-22 Fix it by adding a new miss level to avoid the error. Fixes: 7d9e292ecd67 ("net/mlx5e: Move IPSec policy check after decryption") Signed-off-by: Jianbo Liu Signed-off-by: Chris Mi Signed-off-by: Lama Kayal Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1757939074-617281-4-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/fs.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 4 ++-- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h index 9560fcba643f..ac65e3191480 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h @@ -92,6 +92,7 @@ enum { MLX5E_ACCEL_FS_ESP_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1, MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL, MLX5E_ACCEL_FS_POL_FT_LEVEL, + MLX5E_ACCEL_FS_POL_MISS_FT_LEVEL, MLX5E_ACCEL_FS_ESP_FT_ROCE_LEVEL, #endif }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h index ffcd0cdeb775..23703f28386a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h @@ -185,6 +185,7 @@ struct mlx5e_ipsec_rx_create_attr { u32 family; int prio; int pol_level; + int pol_miss_level; int sa_level; int status_level; enum mlx5_flow_namespace_type chains_ns; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c index 98b6a3a623f9..65dc3529283b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c @@ -747,6 +747,7 @@ static void ipsec_rx_create_attr_set(struct mlx5e_ipsec *ipsec, attr->family = family; attr->prio = MLX5E_NIC_PRIO; attr->pol_level = MLX5E_ACCEL_FS_POL_FT_LEVEL; + attr->pol_miss_level = MLX5E_ACCEL_FS_POL_MISS_FT_LEVEL; attr->sa_level = MLX5E_ACCEL_FS_ESP_FT_LEVEL; attr->status_level = MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL; attr->chains_ns = MLX5_FLOW_NAMESPACE_KERNEL; @@ -833,7 +834,7 @@ static int ipsec_rx_chains_create_miss(struct mlx5e_ipsec *ipsec, ft_attr.max_fte = 1; ft_attr.autogroup.max_num_groups = 1; - ft_attr.level = attr->pol_level; + ft_attr.level = attr->pol_miss_level; ft_attr.prio = attr->prio; ft = mlx5_create_auto_grouped_flow_table(attr->ns, &ft_attr); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index cb165085a4c1..db552c012b4f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -114,9 +114,9 @@ #define ETHTOOL_NUM_PRIOS 11 #define ETHTOOL_MIN_LEVEL (KERNEL_MIN_LEVEL + ETHTOOL_NUM_PRIOS) /* Vlan, mac, ttc, inner ttc, {UDP/ANY/aRFS/accel/{esp, esp_err}}, IPsec policy, - * {IPsec RoCE MPV,Alias table},IPsec RoCE policy + * IPsec policy miss, {IPsec RoCE MPV,Alias table},IPsec RoCE policy */ -#define KERNEL_NIC_PRIO_NUM_LEVELS 10 +#define KERNEL_NIC_PRIO_NUM_LEVELS 11 #define KERNEL_NIC_NUM_PRIOS 1 /* One more level for tc, and one more for promisc */ #define KERNEL_MIN_LEVEL (KERNEL_NIC_PRIO_NUM_LEVELS + 2) From b6f56a44e4c1014b08859dcf04ed246500e310e5 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 13 Sep 2025 13:35:15 +0200 Subject: [PATCH 37/51] net: rfkill: gpio: Fix crash due to dereferencering uninitialized pointer Since commit 7d5e9737efda ("net: rfkill: gpio: get the name and type from device property") rfkill_find_type() gets called with the possibly uninitialized "const char *type_name;" local variable. On x86 systems when rfkill-gpio binds to a "BCM4752" or "LNV4752" acpi_device, the rfkill->type is set based on the ACPI acpi_device_id: rfkill->type = (unsigned)id->driver_data; and there is no "type" property so device_property_read_string() will fail and leave type_name uninitialized, leading to a potential crash. rfkill_find_type() does accept a NULL pointer, fix the potential crash by initializing type_name to NULL. Note likely sofar this has not been caught because: 1. Not many x86 machines actually have a "BCM4752"/"LNV4752" acpi_device 2. The stack happened to contain NULL where type_name is stored Fixes: 7d5e9737efda ("net: rfkill: gpio: get the name and type from device property") Cc: stable@vger.kernel.org Cc: Heikki Krogerus Signed-off-by: Hans de Goede Reviewed-by: Heikki Krogerus Link: https://patch.msgid.link/20250913113515.21698-1-hansg@kernel.org Signed-off-by: Johannes Berg --- net/rfkill/rfkill-gpio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 41e657e97761..cf2dcec6ce5a 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -94,10 +94,10 @@ static const struct dmi_system_id rfkill_gpio_deny_table[] = { static int rfkill_gpio_probe(struct platform_device *pdev) { struct rfkill_gpio_data *rfkill; - struct gpio_desc *gpio; + const char *type_name = NULL; const char *name_property; const char *type_property; - const char *type_name; + struct gpio_desc *gpio; int ret; if (dmi_check_system(rfkill_gpio_deny_table)) From a8ba87f04ca9cdec06776ce92dce1395026dc3bb Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 16 Sep 2025 08:01:26 +0000 Subject: [PATCH 38/51] bonding: don't set oif to bond dev when getting NS target destination Unlike IPv4, IPv6 routing strictly requires the source address to be valid on the outgoing interface. If the NS target is set to a remote VLAN interface, and the source address is also configured on a VLAN over a bond interface, setting the oif to the bond device will fail to retrieve the correct destination route. Fix this by not setting the oif to the bond device when retrieving the NS target destination. This allows the correct destination device (the VLAN interface) to be determined, so that bond_verify_device_path can return the proper VLAN tags for sending NS messages. Reported-by: David Wilder Closes: https://lore.kernel.org/netdev/aGOKggdfjv0cApTO@fedora/ Suggested-by: Jay Vosburgh Tested-by: David Wilder Acked-by: Jay Vosburgh Fixes: 4e24be018eb9 ("bonding: add new parameter ns_targets") Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250916080127.430626-1-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 8832bc9f107b..57be04f6cb11 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3356,7 +3356,6 @@ static void bond_ns_send_all(struct bonding *bond, struct slave *slave) /* Find out through which dev should the packet go */ memset(&fl6, 0, sizeof(struct flowi6)); fl6.daddr = targets[i]; - fl6.flowi6_oif = bond->dev->ifindex; dst = ip6_route_output(dev_net(bond->dev), NULL, &fl6); if (dst->error) { From dc5f94b1ec8f9f65d1ad7a5f45fcac740544e35f Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 16 Sep 2025 08:01:27 +0000 Subject: [PATCH 39/51] selftests: bonding: add vlan over bond testing Add a vlan over bond testing to make sure arp/ns target works. Also change all the configs to mudules. Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250916080127.430626-2-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- .../drivers/net/bonding/bond_options.sh | 58 +++++++++++++++++++ .../selftests/drivers/net/bonding/config | 1 + 2 files changed, 59 insertions(+) diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh index e3f3cc803b56..187b478d0ddf 100755 --- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh @@ -8,6 +8,7 @@ ALL_TESTS=" arp_validate num_grat_arp fail_over_mac + vlan_over_bond " lib_dir=$(dirname "$0") @@ -508,7 +509,64 @@ fail_over_mac() log_test "fail_over_mac 2" "failover: backup slave mac inherit" check_first_slave_random_mac log_test "fail_over_mac 2" "first slave mac random" +} +vlan_over_bond_arp() +{ + local mode="$1" + RET=0 + + bond_reset "mode $mode arp_interval 100 arp_ip_target 192.0.3.10" + ip -n "${s_ns}" link add bond0.3 link bond0 type vlan id 3 + ip -n "${s_ns}" link set bond0.3 up + ip -n "${s_ns}" addr add 192.0.3.1/24 dev bond0.3 + ip -n "${s_ns}" addr add 2001:db8::3:1/64 dev bond0.3 + + slowwait_for_counter 5 5 tc_rule_handle_stats_get \ + "dev eth0.3 ingress" 101 ".packets" "-n ${c_ns}" &> /dev/null || RET=1 + log_test "vlan over bond arp" "$mode" +} + +vlan_over_bond_ns() +{ + local mode="$1" + RET=0 + + if skip_ns; then + log_test_skip "vlan_over_bond ns" "$mode" + return 0 + fi + + bond_reset "mode $mode arp_interval 100 ns_ip6_target 2001:db8::3:10" + ip -n "${s_ns}" link add bond0.3 link bond0 type vlan id 3 + ip -n "${s_ns}" link set bond0.3 up + ip -n "${s_ns}" addr add 192.0.3.1/24 dev bond0.3 + ip -n "${s_ns}" addr add 2001:db8::3:1/64 dev bond0.3 + + slowwait_for_counter 5 5 tc_rule_handle_stats_get \ + "dev eth0.3 ingress" 102 ".packets" "-n ${c_ns}" &> /dev/null || RET=1 + log_test "vlan over bond ns" "$mode" +} + +vlan_over_bond() +{ + # add vlan 3 for client + ip -n "${c_ns}" link add eth0.3 link eth0 type vlan id 3 + ip -n "${c_ns}" link set eth0.3 up + ip -n "${c_ns}" addr add 192.0.3.10/24 dev eth0.3 + ip -n "${c_ns}" addr add 2001:db8::3:10/64 dev eth0.3 + + # Add tc rule to check the vlan pkts + tc -n "${c_ns}" qdisc add dev eth0.3 clsact + tc -n "${c_ns}" filter add dev eth0.3 ingress protocol arp \ + handle 101 flower skip_hw arp_op request \ + arp_sip 192.0.3.1 arp_tip 192.0.3.10 action pass + tc -n "${c_ns}" filter add dev eth0.3 ingress protocol ipv6 \ + handle 102 flower skip_hw ip_proto icmpv6 \ + type 135 src_ip 2001:db8::3:1 action pass + + vlan_over_bond_arp "active-backup" + vlan_over_bond_ns "active-backup" } trap cleanup EXIT diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index 4d16a69ffc65..832fa1caeb66 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -10,3 +10,4 @@ CONFIG_NET_CLS_MATCHALL=m CONFIG_NET_SCH_INGRESS=y CONFIG_NLMON=y CONFIG_VETH=y +CONFIG_VLAN_8021Q=m From a72175c985132885573593222a7b088cf49b07ae Mon Sep 17 00:00:00 2001 From: Sathesh B Edara Date: Tue, 16 Sep 2025 06:32:07 -0700 Subject: [PATCH 40/51] octeon_ep: fix VF MAC address lifecycle handling Currently, VF MAC address info is not updated when the MAC address is configured from VF, and it is not cleared when the VF is removed. This leads to stale or missing MAC information in the PF, which may cause incorrect state tracking or inconsistencies when VFs are hot-plugged or reassigned. Fix this by: - storing the VF MAC address in the PF when it is set from VF - clearing the stored VF MAC address when the VF is removed This ensures that the PF always has correct VF MAC state. Fixes: cde29af9e68e ("octeon_ep: add PF-VF mailbox communication") Signed-off-by: Sathesh B Edara Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250916133207.21737-1-sedara@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c index ebecdd29f3bd..0867fab61b19 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c @@ -196,6 +196,7 @@ static void octep_pfvf_get_mac_addr(struct octep_device *oct, u32 vf_id, vf_id); return; } + ether_addr_copy(oct->vf_info[vf_id].mac_addr, rsp->s_set_mac.mac_addr); rsp->s_set_mac.type = OCTEP_PFVF_MBOX_TYPE_RSP_ACK; } @@ -205,6 +206,8 @@ static void octep_pfvf_dev_remove(struct octep_device *oct, u32 vf_id, { int err; + /* Reset VF-specific information maintained by the PF */ + memset(&oct->vf_info[vf_id], 0, sizeof(struct octep_pfvf_info)); err = octep_ctrl_net_dev_remove(oct, vf_id); if (err) { rsp->s.type = OCTEP_PFVF_MBOX_TYPE_RSP_NACK; From 45c8a6cc2bcd780e634a6ba8e46bffbdf1fc5c01 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 15 Sep 2025 17:56:46 +0000 Subject: [PATCH 41/51] tcp: Clear tcp_sk(sk)->fastopen_rsk in tcp_disconnect(). syzbot reported the splat below where a socket had tcp_sk(sk)->fastopen_rsk in the TCP_ESTABLISHED state. [0] syzbot reused the server-side TCP Fast Open socket as a new client before the TFO socket completes 3WHS: 1. accept() 2. connect(AF_UNSPEC) 3. connect() to another destination As of accept(), sk->sk_state is TCP_SYN_RECV, and tcp_disconnect() changes it to TCP_CLOSE and makes connect() possible, which restarts timers. Since tcp_disconnect() forgot to clear tcp_sk(sk)->fastopen_rsk, the retransmit timer triggered the warning and the intended packet was not retransmitted. Let's call reqsk_fastopen_remove() in tcp_disconnect(). [0]: WARNING: CPU: 2 PID: 0 at net/ipv4/tcp_timer.c:542 tcp_retransmit_timer (net/ipv4/tcp_timer.c:542 (discriminator 7)) Modules linked in: CPU: 2 UID: 0 PID: 0 Comm: swapper/2 Not tainted 6.17.0-rc5-g201825fb4278 #62 PREEMPT(voluntary) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:tcp_retransmit_timer (net/ipv4/tcp_timer.c:542 (discriminator 7)) Code: 41 55 41 54 55 53 48 8b af b8 08 00 00 48 89 fb 48 85 ed 0f 84 55 01 00 00 0f b6 47 12 3c 03 74 0c 0f b6 47 12 3c 04 74 04 90 <0f> 0b 90 48 8b 85 c0 00 00 00 48 89 ef 48 8b 40 30 e8 6a 4f 06 3e RSP: 0018:ffffc900002f8d40 EFLAGS: 00010293 RAX: 0000000000000002 RBX: ffff888106911400 RCX: 0000000000000017 RDX: 0000000002517619 RSI: ffffffff83764080 RDI: ffff888106911400 RBP: ffff888106d5c000 R08: 0000000000000001 R09: ffffc900002f8de8 R10: 00000000000000c2 R11: ffffc900002f8ff8 R12: ffff888106911540 R13: ffff888106911480 R14: ffff888106911840 R15: ffffc900002f8de0 FS: 0000000000000000(0000) GS:ffff88907b768000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8044d69d90 CR3: 0000000002c30003 CR4: 0000000000370ef0 Call Trace: tcp_write_timer (net/ipv4/tcp_timer.c:738) call_timer_fn (kernel/time/timer.c:1747) __run_timers (kernel/time/timer.c:1799 kernel/time/timer.c:2372) timer_expire_remote (kernel/time/timer.c:2385 kernel/time/timer.c:2376 kernel/time/timer.c:2135) tmigr_handle_remote_up (kernel/time/timer_migration.c:944 kernel/time/timer_migration.c:1035) __walk_groups.isra.0 (kernel/time/timer_migration.c:533 (discriminator 1)) tmigr_handle_remote (kernel/time/timer_migration.c:1096) handle_softirqs (./arch/x86/include/asm/jump_label.h:36 ./include/trace/events/irq.h:142 kernel/softirq.c:580) irq_exit_rcu (kernel/softirq.c:614 kernel/softirq.c:453 kernel/softirq.c:680 kernel/softirq.c:696) sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1050 (discriminator 35) arch/x86/kernel/apic/apic.c:1050 (discriminator 35)) Fixes: 8336886f786f ("tcp: TCP Fast Open Server - support TFO listeners") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250915175800.118793-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 71a956fbfc55..ad76556800f2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3327,6 +3327,7 @@ int tcp_disconnect(struct sock *sk, int flags) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int old_state = sk->sk_state; + struct request_sock *req; u32 seq; if (old_state != TCP_CLOSE) @@ -3442,6 +3443,10 @@ int tcp_disconnect(struct sock *sk, int flags) /* Clean up fastopen related fields */ + req = rcu_dereference_protected(tp->fastopen_rsk, + lockdep_sock_is_held(sk)); + if (req) + reqsk_fastopen_remove(sk, req, false); tcp_free_fastopen_req(tp); inet_clear_bit(DEFER_CONNECT, sk); tp->fastopen_client_fail = 0; From 1fd0362262baff9381615a5b346298abbc165ba3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 15 Sep 2025 17:56:47 +0000 Subject: [PATCH 42/51] selftest: packetdrill: Add tcp_fastopen_server_reset-after-disconnect.pkt. The test reproduces the scenario explained in the previous patch. Without the patch, the test triggers the warning and cannot see the last retransmitted packet. # ./ksft_runner.sh tcp_fastopen_server_reset-after-disconnect.pkt TAP version 13 1..2 [ 29.229250] ------------[ cut here ]------------ [ 29.231414] WARNING: CPU: 26 PID: 0 at net/ipv4/tcp_timer.c:542 tcp_retransmit_timer+0x32/0x9f0 ... tcp_fastopen_server_reset-after-disconnect.pkt:26: error handling packet: Timed out waiting for packet not ok 1 ipv4 tcp_fastopen_server_reset-after-disconnect.pkt:26: error handling packet: Timed out waiting for packet not ok 2 ipv6 # Totals: pass:0 fail:2 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250915175800.118793-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- ...fastopen_server_reset-after-disconnect.pkt | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt diff --git a/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt b/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt new file mode 100644 index 000000000000..26794e7ddfd5 --- /dev/null +++ b/tools/testing/selftests/net/packetdrill/tcp_fastopen_server_reset-after-disconnect.pkt @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +`./defaults.sh + ./set_sysctls.py /proc/sys/net/ipv4/tcp_fastopen=0x602 /proc/sys/net/ipv4/tcp_timestamps=0` + + 0 socket(..., SOCK_STREAM|SOCK_NONBLOCK, IPPROTO_TCP) = 3 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 + +0 bind(3, ..., ...) = 0 + +0 listen(3, 1) = 0 + + +0 < S 0:10(10) win 32792 + +0 > S. 0:0(0) ack 11 win 65535 + +// sk->sk_state is TCP_SYN_RECV + +.1 accept(3, ..., ...) = 4 + +// tcp_disconnect() sets sk->sk_state to TCP_CLOSE + +0 connect(4, AF_UNSPEC, ...) = 0 + +0 > R. 1:1(0) ack 11 win 65535 + +// connect() sets sk->sk_state to TCP_SYN_SENT + +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0 connect(4, ..., ...) = -1 EINPROGRESS (Operation is now in progress) + +0 > S 0:0(0) win 65535 + +// tp->fastopen_rsk must be NULL + +1 > S 0:0(0) win 65535 From 0aeb54ac4cd5cf8f60131b4d9ec0b6dc9c27b20d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:28:13 -0700 Subject: [PATCH 43/51] tls: make sure to abort the stream if headers are bogus Normally we wait for the socket to buffer up the whole record before we service it. If the socket has a tiny buffer, however, we read out the data sooner, to prevent connection stalls. Make sure that we abort the connection when we find out late that the record is actually invalid. Retrying the parsing is fine in itself but since we copy some more data each time before we parse we can overflow the allocated skb space. Constructing a scenario in which we're under pressure without enough data in the socket to parse the length upfront is quite hard. syzbot figured out a way to do this by serving us the header in small OOB sends, and then filling in the recvbuf with a large normal send. Make sure that tls_rx_msg_size() aborts strp, if we reach an invalid record there's really no way to recover. Reported-by: Lee Jones Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Reviewed-by: Sabrina Dubroca Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250917002814.1743558-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/tls/tls.h | 1 + net/tls/tls_strp.c | 14 +++++++++----- net/tls/tls_sw.c | 3 +-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/net/tls/tls.h b/net/tls/tls.h index 4e077068e6d9..e4c42731ce39 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -141,6 +141,7 @@ void update_sk_prot(struct sock *sk, struct tls_context *ctx); int wait_on_pending_writer(struct sock *sk, long *timeo); void tls_err_abort(struct sock *sk, int err); +void tls_strp_abort_strp(struct tls_strparser *strp, int err); int init_prot_info(struct tls_prot_info *prot, const struct tls_crypto_info *crypto_info, diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index d71643b494a1..98e12f0ff57e 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -13,7 +13,7 @@ static struct workqueue_struct *tls_strp_wq; -static void tls_strp_abort_strp(struct tls_strparser *strp, int err) +void tls_strp_abort_strp(struct tls_strparser *strp, int err) { if (strp->stopped) return; @@ -211,11 +211,17 @@ static int tls_strp_copyin_frag(struct tls_strparser *strp, struct sk_buff *skb, struct sk_buff *in_skb, unsigned int offset, size_t in_len) { + unsigned int nfrag = skb->len / PAGE_SIZE; size_t len, chunk; skb_frag_t *frag; int sz; - frag = &skb_shinfo(skb)->frags[skb->len / PAGE_SIZE]; + if (unlikely(nfrag >= skb_shinfo(skb)->nr_frags)) { + DEBUG_NET_WARN_ON_ONCE(1); + return -EMSGSIZE; + } + + frag = &skb_shinfo(skb)->frags[nfrag]; len = in_len; /* First make sure we got the header */ @@ -520,10 +526,8 @@ static int tls_strp_read_sock(struct tls_strparser *strp) tls_strp_load_anchor_with_queue(strp, inq); if (!strp->stm.full_len) { sz = tls_rx_msg_size(strp, strp->anchor); - if (sz < 0) { - tls_strp_abort_strp(strp, sz); + if (sz < 0) return sz; - } strp->stm.full_len = sz; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index bac65d0d4e3e..daac9fd4be7e 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2474,8 +2474,7 @@ int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb) return data_len + TLS_HEADER_SIZE; read_failure: - tls_err_abort(strp->sk, ret); - + tls_strp_abort_strp(strp, ret); return ret; } From 4c05c7ed880fb58790731fb53571af67b7632d87 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:28:14 -0700 Subject: [PATCH 44/51] selftests: tls: test skb copy under mem pressure and OOB Add a test which triggers mem pressure via OOB writes. Reviewed-by: Sabrina Dubroca Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250917002814.1743558-2-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tls.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 0f5640d8dc7f..dd093f9df6f1 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -2770,6 +2770,22 @@ TEST_F(tls_err, poll_partial_rec_async) } } +/* Use OOB+large send to trigger copy mode due to memory pressure. + * OOB causes a short read. + */ +TEST_F(tls_err, oob_pressure) +{ + char buf[1<<16]; + int i; + + memrnd(buf, sizeof(buf)); + + EXPECT_EQ(send(self->fd2, buf, 5, MSG_OOB), 5); + EXPECT_EQ(send(self->fd2, buf, sizeof(buf), 0), sizeof(buf)); + for (i = 0; i < 64; i++) + EXPECT_EQ(send(self->fd2, buf, 5, MSG_OOB), 5); +} + TEST(non_established) { struct tls12_crypto_info_aes_gcm_256 tls12; struct sockaddr_in addr; From 3fbfe251cc9f6d391944282cdb9bcf0bd02e01f8 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 17 Sep 2025 16:48:54 +0300 Subject: [PATCH 45/51] Revert "net/mlx5e: Update and set Xon/Xoff upon port speed set" This reverts commit d24341740fe48add8a227a753e68b6eedf4b385a. It causes errors when trying to configure QoS, as well as loss of L2 connectivity (on multi-host devices). Reported-by: Jakub Kicinski Link: https://lore.kernel.org/20250910170011.70528106@kernel.org Fixes: d24341740fe4 ("net/mlx5e: Update and set Xon/Xoff upon port speed set") Signed-off-by: Tariq Toukan Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index e680673ffb72..15eded36b872 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -139,8 +139,6 @@ void mlx5e_update_carrier(struct mlx5e_priv *priv) if (up) { netdev_info(priv->netdev, "Link up\n"); netif_carrier_on(priv->netdev); - mlx5e_port_manual_buffer_config(priv, 0, priv->netdev->mtu, - NULL, NULL, NULL); } else { netdev_info(priv->netdev, "Link down\n"); netif_carrier_off(priv->netdev); From 87ebb628a5acb892eba41ef1d8989beb8f036034 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 17 Sep 2025 13:53:37 +0000 Subject: [PATCH 46/51] net: clear sk->sk_ino in sk_set_socket(sk, NULL) Andrei Vagin reported that blamed commit broke CRIU. Indeed, while we want to keep sk_uid unchanged when a socket is cloned, we want to clear sk->sk_ino. Otherwise, sock_diag might report multiple sockets sharing the same inode number. Move the clearing part from sock_orphan() to sk_set_socket(sk, NULL), called both from sock_orphan() and sk_clone_lock(). Fixes: 5d6b58c932ec ("net: lockless sock_i_ino()") Closes: https://lore.kernel.org/netdev/aMhX-VnXkYDpKd9V@google.com/ Closes: https://github.com/checkpoint-restore/criu/issues/2744 Reported-by: Andrei Vagin Signed-off-by: Eric Dumazet Acked-by: Andrei Vagin Link: https://patch.msgid.link/20250917135337.1736101-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index fb13322a11fc..2e14283c5be1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2061,6 +2061,9 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock) if (sock) { WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid); WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino); + } else { + /* Note: sk_uid is unchanged. */ + WRITE_ONCE(sk->sk_ino, 0); } } @@ -2082,8 +2085,6 @@ static inline void sock_orphan(struct sock *sk) sock_set_flag(sk, SOCK_DEAD); sk_set_socket(sk, NULL); sk->sk_wq = NULL; - /* Note: sk_uid is unchanged. */ - WRITE_ONCE(sk->sk_ino, 0); write_unlock_bh(&sk->sk_callback_lock); } From cca7b1cfd7b8a0eff2a3510c5e0f10efe8fa3758 Mon Sep 17 00:00:00 2001 From: Alexey Nepomnyashih Date: Wed, 17 Sep 2025 15:30:58 +0000 Subject: [PATCH 47/51] net: liquidio: fix overflow in octeon_init_instr_queue() The expression `(conf->instr_type == 64) << iq_no` can overflow because `iq_no` may be as high as 64 (`CN23XX_MAX_RINGS_PER_PF`). Casting the operand to `u64` ensures correct 64-bit arithmetic. Fixes: f21fb3ed364b ("Add support of Cavium Liquidio ethernet adapters") Signed-off-by: Alexey Nepomnyashih Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cavium/liquidio/request_manager.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/liquidio/request_manager.c b/drivers/net/ethernet/cavium/liquidio/request_manager.c index de8a6ce86ad7..12105ffb5dac 100644 --- a/drivers/net/ethernet/cavium/liquidio/request_manager.c +++ b/drivers/net/ethernet/cavium/liquidio/request_manager.c @@ -126,7 +126,7 @@ int octeon_init_instr_queue(struct octeon_device *oct, oct->io_qmask.iq |= BIT_ULL(iq_no); /* Set the 32B/64B mode for each input queue */ - oct->io_qmask.iq64B |= ((conf->instr_type == 64) << iq_no); + oct->io_qmask.iq64B |= ((u64)(conf->instr_type == 64) << iq_no); iq->iqcmd_64B = (conf->instr_type == 64); oct->fn_list.setup_iq_regs(oct, iq_no); From 7736aff4704188d4483b7b36ff06d201c8be4844 Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Thu, 18 Sep 2025 12:15:56 +0300 Subject: [PATCH 48/51] MAINTAINERS: update sundance entry Signed-off-by: Denis Kirjanov Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 086b4a7bd04f..70c9d368c8d2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24255,7 +24255,7 @@ F: Documentation/devicetree/bindings/input/allwinner,sun4i-a10-lradc-keys.yaml F: drivers/input/keyboard/sun4i-lradc-keys.c SUNDANCE NETWORK DRIVER -M: Denis Kirjanov +M: Denis Kirjanov L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/dlink/sundance.c From 3191df0a4882c827cac29925e80ecb1775b904bd Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Thu, 18 Sep 2025 13:15:06 +0300 Subject: [PATCH 49/51] devlink rate: Remove unnecessary 'static' from a couple places devlink_rate_node_get_by_name() and devlink_rate_nodes_destroy() have a couple of unnecessary static variables for iterating over devlink rates. This could lead to races/corruption/unhappiness if two concurrent operations execute the same function. Remove 'static' from both. It's amazing this was missed for 4+ years. While at it, I confirmed there are no more examples of this mistake in net/ with 1, 2 or 3 levels of indentation. Fixes: a8ecb93ef03d ("devlink: Introduce rate nodes") Signed-off-by: Cosmin Ratiu Signed-off-by: Jakub Kicinski --- net/devlink/rate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/devlink/rate.c b/net/devlink/rate.c index 110b3fa8a0b1..264fb82cba19 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -34,7 +34,7 @@ devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) static struct devlink_rate * devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name) { - static struct devlink_rate *devlink_rate; + struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate) && @@ -819,8 +819,8 @@ EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); */ void devl_rate_nodes_destroy(struct devlink *devlink) { - static struct devlink_rate *devlink_rate, *tmp; const struct devlink_ops *ops = devlink->ops; + struct devlink_rate *devlink_rate, *tmp; devl_assert_locked(devlink); From cfa7d9b1e3a8604afc84e9e51d789c29574fb216 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Wed, 17 Sep 2025 13:46:02 +0800 Subject: [PATCH 50/51] cnic: Fix use-after-free bugs in cnic_delete_task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original code uses cancel_delayed_work() in cnic_cm_stop_bnx2x_hw(), which does not guarantee that the delayed work item 'delete_task' has fully completed if it was already running. Additionally, the delayed work item is cyclic, the flush_workqueue() in cnic_cm_stop_bnx2x_hw() only blocks and waits for work items that were already queued to the workqueue prior to its invocation. Any work items submitted after flush_workqueue() is called are not included in the set of tasks that the flush operation awaits. This means that after the cyclic work items have finished executing, a delayed work item may still exist in the workqueue. This leads to use-after-free scenarios where the cnic_dev is deallocated by cnic_free_dev(), while delete_task remains active and attempt to dereference cnic_dev in cnic_delete_task(). A typical race condition is illustrated below: CPU 0 (cleanup) | CPU 1 (delayed work callback) cnic_netdev_event() | cnic_stop_hw() | cnic_delete_task() cnic_cm_stop_bnx2x_hw() | ... cancel_delayed_work() | /* the queue_delayed_work() flush_workqueue() | executes after flush_workqueue()*/ | queue_delayed_work() cnic_free_dev(dev)//free | cnic_delete_task() //new instance | dev = cp->dev; //use Replace cancel_delayed_work() with cancel_delayed_work_sync() to ensure that the cyclic delayed work item is properly canceled and that any ongoing execution of the work item completes before the cnic_dev is deallocated. Furthermore, since cancel_delayed_work_sync() uses __flush_work(work, true) to synchronously wait for any currently executing instance of the work item to finish, the flush_workqueue() becomes redundant and should be removed. This bug was identified through static analysis. To reproduce the issue and validate the fix, I simulated the cnic PCI device in QEMU and introduced intentional delays — such as inserting calls to ssleep() within the cnic_delete_task() function — to increase the likelihood of triggering the bug. Fixes: fdf24086f475 ("cnic: Defer iscsi connection cleanup") Signed-off-by: Duoming Zhou Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/cnic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c index a9040c42d2ff..6e97a5a7daaf 100644 --- a/drivers/net/ethernet/broadcom/cnic.c +++ b/drivers/net/ethernet/broadcom/cnic.c @@ -4230,8 +4230,7 @@ static void cnic_cm_stop_bnx2x_hw(struct cnic_dev *dev) cnic_bnx2x_delete_wait(dev, 0); - cancel_delayed_work(&cp->delete_task); - flush_workqueue(cnic_wq); + cancel_delayed_work_sync(&cp->delete_task); if (atomic_read(&cp->iscsi_conn) != 0) netdev_warn(dev->netdev, "%d iSCSI connections not destroyed\n", From f8b4687151021db61841af983f1cb7be6915d4ef Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Wed, 17 Sep 2025 14:38:53 +0800 Subject: [PATCH 51/51] octeontx2-pf: Fix use-after-free bugs in otx2_sync_tstamp() The original code relies on cancel_delayed_work() in otx2_ptp_destroy(), which does not ensure that the delayed work item synctstamp_work has fully completed if it was already running. This leads to use-after-free scenarios where otx2_ptp is deallocated by otx2_ptp_destroy(), while synctstamp_work remains active and attempts to dereference otx2_ptp in otx2_sync_tstamp(). Furthermore, the synctstamp_work is cyclic, the likelihood of triggering the bug is nonnegligible. A typical race condition is illustrated below: CPU 0 (cleanup) | CPU 1 (delayed work callback) otx2_remove() | otx2_ptp_destroy() | otx2_sync_tstamp() cancel_delayed_work() | kfree(ptp) | | ptp = container_of(...); //UAF | ptp-> //UAF This is confirmed by a KASAN report: BUG: KASAN: slab-use-after-free in __run_timer_base.part.0+0x7d7/0x8c0 Write of size 8 at addr ffff88800aa09a18 by task bash/136 ... Call Trace: dump_stack_lvl+0x55/0x70 print_report+0xcf/0x610 ? __run_timer_base.part.0+0x7d7/0x8c0 kasan_report+0xb8/0xf0 ? __run_timer_base.part.0+0x7d7/0x8c0 __run_timer_base.part.0+0x7d7/0x8c0 ? __pfx___run_timer_base.part.0+0x10/0x10 ? __pfx_read_tsc+0x10/0x10 ? ktime_get+0x60/0x140 ? lapic_next_event+0x11/0x20 ? clockevents_program_event+0x1d4/0x2a0 run_timer_softirq+0xd1/0x190 handle_softirqs+0x16a/0x550 irq_exit_rcu+0xaf/0xe0 sysvec_apic_timer_interrupt+0x70/0x80 ... Allocated by task 1: kasan_save_stack+0x24/0x50 kasan_save_track+0x14/0x30 __kasan_kmalloc+0x7f/0x90 otx2_ptp_init+0xb1/0x860 otx2_probe+0x4eb/0xc30 local_pci_probe+0xdc/0x190 pci_device_probe+0x2fe/0x470 really_probe+0x1ca/0x5c0 __driver_probe_device+0x248/0x310 driver_probe_device+0x44/0x120 __driver_attach+0xd2/0x310 bus_for_each_dev+0xed/0x170 bus_add_driver+0x208/0x500 driver_register+0x132/0x460 do_one_initcall+0x89/0x300 kernel_init_freeable+0x40d/0x720 kernel_init+0x1a/0x150 ret_from_fork+0x10c/0x1a0 ret_from_fork_asm+0x1a/0x30 Freed by task 136: kasan_save_stack+0x24/0x50 kasan_save_track+0x14/0x30 kasan_save_free_info+0x3a/0x60 __kasan_slab_free+0x3f/0x50 kfree+0x137/0x370 otx2_ptp_destroy+0x38/0x80 otx2_remove+0x10d/0x4c0 pci_device_remove+0xa6/0x1d0 device_release_driver_internal+0xf8/0x210 pci_stop_bus_device+0x105/0x150 pci_stop_and_remove_bus_device_locked+0x15/0x30 remove_store+0xcc/0xe0 kernfs_fop_write_iter+0x2c3/0x440 vfs_write+0x871/0xd70 ksys_write+0xee/0x1c0 do_syscall_64+0xac/0x280 entry_SYSCALL_64_after_hwframe+0x77/0x7f ... Replace cancel_delayed_work() with cancel_delayed_work_sync() to ensure that the delayed work item is properly canceled before the otx2_ptp is deallocated. This bug was initially identified through static analysis. To reproduce and test it, I simulated the OcteonTX2 PCI device in QEMU and introduced artificial delays within the otx2_sync_tstamp() function to increase the likelihood of triggering the bug. Fixes: 2958d17a8984 ("octeontx2-pf: Add support for ptp 1-step mode on CN10K silicon") Signed-off-by: Duoming Zhou Reviewed-by: Vadim Fedorenko Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c index e52cc6b1a26c..dedd586ed310 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c @@ -491,7 +491,7 @@ void otx2_ptp_destroy(struct otx2_nic *pfvf) if (!ptp) return; - cancel_delayed_work(&pfvf->ptp->synctstamp_work); + cancel_delayed_work_sync(&pfvf->ptp->synctstamp_work); ptp_clock_unregister(ptp->ptp_clock); kfree(ptp);