From 6e8e8c64110fd8930e9ee2d669e519cc6eecb0e3 Mon Sep 17 00:00:00 2001 From: LoGin Date: Wed, 4 Feb 2026 15:12:36 +0800 Subject: [PATCH] feat(procfs): add /proc/sys/net/ipv4/ip_local_port_range interface (#1749) * feat(procfs): add /proc/sys/net/ipv4/ip_local_port_range interface - Add net module to procfs sys directory with IPv4 support - Implement ip_local_port_range file for reading/writing port range configuration - Update PortManager to support configurable local port range with atomic operations - Extend ephemeral port allocation to use configured range instead of hardcoded values - Add gvisor test cases for IPv4 UDP socket functionality Signed-off-by: longjin --- kernel/src/driver/net/mod.rs | 37 ++++- kernel/src/filesystem/procfs/sys/mod.rs | 15 +++ kernel/src/filesystem/procfs/sys/net/ipv4.rs | 108 +++++++++++++++ kernel/src/filesystem/procfs/sys/net/mod.rs | 52 +++++++ .../src/net/socket/inet/common/multicast.rs | 26 +--- kernel/src/net/socket/inet/common/port.rs | 95 +++++++++---- kernel/src/net/socket/inet/datagram/mod.rs | 20 ++- .../inet/datagram/multicast_loopback.rs | 24 +++- kernel/src/net/socket/inet/datagram/option.rs | 19 +++ .../net/socket/inet/datagram/udp_bindings.rs | 8 +- kernel/src/net/socket/inet/stream/inner.rs | 23 +++- .../src/net/socket/inet/stream/lifecycle.rs | 10 ++ kernel/src/net/socket/posix/ip_option.rs | 1 + kernel/src/net/tcp_close_defer.rs | 11 +- kernel/src/process/namespace/net_namespace.rs | 35 +++++ kernel/src/process/rseq.rs | 2 +- .../test_ip_local_port_range_netns.c | 127 ++++++++++++++++++ .../tests/syscall/gvisor/runner/Cargo.lock | 1 + .../tests/syscall/gvisor/runner/Cargo.toml | 1 + .../syscall/gvisor/runner/src/lib_sync.rs | 14 ++ user/apps/tests/syscall/gvisor/whitelist.txt | 2 + 21 files changed, 555 insertions(+), 76 deletions(-) create mode 100644 kernel/src/filesystem/procfs/sys/net/ipv4.rs create mode 100644 kernel/src/filesystem/procfs/sys/net/mod.rs create mode 100644 user/apps/c_unitest/test_ip_local_port_range_netns.c diff --git a/kernel/src/driver/net/mod.rs b/kernel/src/driver/net/mod.rs index 3747e38f3..5cc941b72 100644 --- a/kernel/src/driver/net/mod.rs +++ b/kernel/src/driver/net/mod.rs @@ -246,6 +246,7 @@ pub struct IfaceCommon { tcp_close_defer: crate::net::tcp_close_defer::TcpCloseDefer, /// TCP listener/backlog 语义辅助(Linux-like 丢 SYN 等)。 tcp_listener_backlog: crate::net::tcp_listener_backlog::TcpListenerBacklog, + ipv4_multicast_refcnt: Mutex>, } impl fmt::Debug for IfaceCommon { @@ -284,6 +285,7 @@ impl IfaceCommon { packet_sockets: RwLock::new(Vec::new()), tcp_close_defer: crate::net::tcp_close_defer::TcpCloseDefer::new(), tcp_listener_backlog: crate::net::tcp_listener_backlog::TcpListenerBacklog::new(), + ipv4_multicast_refcnt: Mutex::new(Vec::new()), } } @@ -298,6 +300,38 @@ impl IfaceCommon { self.tcp_listener_backlog.unregister_tcp_listen_port(port); } + pub fn ipv4_multicast_join_ref( + &self, + group: smoltcp::wire::Ipv4Address, + ) -> Result<(), smoltcp::iface::MulticastError> { + let mut guard = self.ipv4_multicast_refcnt.lock(); + if let Some((_, ref mut cnt)) = guard.iter_mut().find(|(g, _)| *g == group) { + *cnt = cnt.saturating_add(1); + return Ok(()); + } + self.smol_iface + .lock() + .join_multicast_group(smoltcp::wire::IpAddress::Ipv4(group))?; + guard.push((group, 1)); + Ok(()) + } + + pub fn ipv4_multicast_leave_ref(&self, group: smoltcp::wire::Ipv4Address) { + let mut guard = self.ipv4_multicast_refcnt.lock(); + let Some(pos) = guard.iter().position(|(g, _)| *g == group) else { + return; + }; + if guard[pos].1 > 1 { + guard[pos].1 -= 1; + return; + } + guard.swap_remove(pos); + let _ = self + .smol_iface + .lock() + .leave_multicast_group(smoltcp::wire::IpAddress::Ipv4(group)); + } + /// 驱动收包入口使用的通用丢包策略(避免驱动理解 L4 语义)。 #[inline] pub fn should_drop_rx_packet(&self, packet: &[u8]) -> bool { @@ -350,8 +384,7 @@ impl IfaceCommon { // Reclaim TCP sockets that have fully closed. // Lock order: sockets -> tcp_close_defer (matches close path, which may touch sockets then defer close). - self.tcp_close_defer - .reap_closed(&mut sockets, &self.port_manager); + self.tcp_close_defer.reap_closed(&mut sockets); // drop sockets here to avoid deadlock drop(interface); diff --git a/kernel/src/filesystem/procfs/sys/mod.rs b/kernel/src/filesystem/procfs/sys/mod.rs index 58c877c0c..61de85e2b 100644 --- a/kernel/src/filesystem/procfs/sys/mod.rs +++ b/kernel/src/filesystem/procfs/sys/mod.rs @@ -3,6 +3,7 @@ //! 提供类似 Linux 的 /proc/sys 接口,支持动态配置内核参数 mod kernel; +mod net; mod vm; use crate::filesystem::{ @@ -14,6 +15,7 @@ use alloc::{ sync::{Arc, Weak}, }; use kernel::KernelDirOps; +use net::NetDirOps; use system_error::SystemError; use vm::VmDirOps; @@ -58,6 +60,16 @@ impl DirOps for SysDirOps { cached_children.insert(name.to_string(), inode.clone()); return Ok(inode); } + if name == "net" { + let mut cached_children = dir.cached_children().write(); + if let Some(child) = cached_children.get(name) { + return Ok(child.clone()); + } + + let inode = NetDirOps::new_inode(dir.self_ref_weak().clone()); + cached_children.insert(name.to_string(), inode.clone()); + return Ok(inode); + } Err(SystemError::ENOENT) } @@ -70,5 +82,8 @@ impl DirOps for SysDirOps { cached_children .entry("vm".to_string()) .or_insert_with(|| VmDirOps::new_inode(dir.self_ref_weak().clone())); + cached_children + .entry("net".to_string()) + .or_insert_with(|| NetDirOps::new_inode(dir.self_ref_weak().clone())); } } diff --git a/kernel/src/filesystem/procfs/sys/net/ipv4.rs b/kernel/src/filesystem/procfs/sys/net/ipv4.rs new file mode 100644 index 000000000..28e54803d --- /dev/null +++ b/kernel/src/filesystem/procfs/sys/net/ipv4.rs @@ -0,0 +1,108 @@ +use crate::filesystem::{ + procfs::{ + template::{Builder, DirOps, FileOps, ProcDir, ProcDirBuilder, ProcFileBuilder}, + utils::proc_read, + }, + vfs::{FilePrivateData, IndexNode, InodeMode}, +}; +use crate::libs::mutex::MutexGuard; +use crate::net::socket::inet::common::port::PortManager; +use alloc::{ + format, + string::{String, ToString}, + sync::{Arc, Weak}, + vec::Vec, +}; +use system_error::SystemError; + +#[derive(Debug)] +pub struct Ipv4DirOps; + +impl Ipv4DirOps { + pub fn new_inode(parent: Weak) -> Arc { + ProcDirBuilder::new(Self, InodeMode::from_bits_truncate(0o555)) + .parent(parent) + .build() + .unwrap() + } +} + +impl DirOps for Ipv4DirOps { + fn lookup_child( + &self, + dir: &ProcDir, + name: &str, + ) -> Result, SystemError> { + if name == "ip_local_port_range" { + let mut cached_children = dir.cached_children().write(); + if let Some(child) = cached_children.get(name) { + return Ok(child.clone()); + } + + let inode = IpLocalPortRangeFileOps::new_inode(dir.self_ref_weak().clone()); + cached_children.insert(name.to_string(), inode.clone()); + return Ok(inode); + } + + Err(SystemError::ENOENT) + } + + fn populate_children(&self, dir: &ProcDir) { + let mut cached_children = dir.cached_children().write(); + cached_children + .entry("ip_local_port_range".to_string()) + .or_insert_with(|| IpLocalPortRangeFileOps::new_inode(dir.self_ref_weak().clone())); + } +} + +#[derive(Debug)] +pub struct IpLocalPortRangeFileOps; + +impl IpLocalPortRangeFileOps { + pub fn new_inode(parent: Weak) -> Arc { + ProcFileBuilder::new(Self, InodeMode::from_bits_truncate(0o644)) + .parent(parent) + .build() + .unwrap() + } + + fn read_config() -> String { + let (min, max) = PortManager::local_port_range(); + format!("{} {}\n", min, max) + } + + fn write_config(data: &[u8]) -> Result { + let input = core::str::from_utf8(data).map_err(|_| SystemError::EINVAL)?; + let parts: Vec<&str> = input.split_whitespace().collect(); + if parts.len() < 2 { + return Err(SystemError::EINVAL); + } + let min: u16 = parts[0].parse().map_err(|_| SystemError::EINVAL)?; + let max: u16 = parts[1].parse().map_err(|_| SystemError::EINVAL)?; + PortManager::set_local_port_range(min, max)?; + Ok(data.len()) + } +} + +impl FileOps for IpLocalPortRangeFileOps { + fn read_at( + &self, + offset: usize, + len: usize, + buf: &mut [u8], + _data: MutexGuard, + ) -> Result { + let content = Self::read_config(); + proc_read(offset, len, buf, content.as_bytes()) + } + + fn write_at( + &self, + _offset: usize, + _len: usize, + buf: &[u8], + _data: MutexGuard, + ) -> Result { + Self::write_config(buf) + } +} diff --git a/kernel/src/filesystem/procfs/sys/net/mod.rs b/kernel/src/filesystem/procfs/sys/net/mod.rs new file mode 100644 index 000000000..833126d99 --- /dev/null +++ b/kernel/src/filesystem/procfs/sys/net/mod.rs @@ -0,0 +1,52 @@ +mod ipv4; + +use crate::filesystem::{ + procfs::template::{DirOps, ProcDir, ProcDirBuilder}, + vfs::{IndexNode, InodeMode}, +}; +use alloc::string::ToString; +use alloc::sync::{Arc, Weak}; +use ipv4::Ipv4DirOps; +use system_error::SystemError; + +use crate::filesystem::procfs::Builder; + +#[derive(Debug)] +pub struct NetDirOps; + +impl NetDirOps { + pub fn new_inode(parent: Weak) -> Arc { + ProcDirBuilder::new(Self, InodeMode::from_bits_truncate(0o555)) + .parent(parent) + .build() + .unwrap() + } +} + +impl DirOps for NetDirOps { + fn lookup_child( + &self, + dir: &ProcDir, + name: &str, + ) -> Result, SystemError> { + if name == "ipv4" { + let mut cached_children = dir.cached_children().write(); + if let Some(child) = cached_children.get(name) { + return Ok(child.clone()); + } + + let inode = Ipv4DirOps::new_inode(dir.self_ref_weak().clone()); + cached_children.insert(name.to_string(), inode.clone()); + return Ok(inode); + } + + Err(SystemError::ENOENT) + } + + fn populate_children(&self, dir: &ProcDir) { + let mut cached_children = dir.cached_children().write(); + cached_children + .entry("ipv4".to_string()) + .or_insert_with(|| Ipv4DirOps::new_inode(dir.self_ref_weak().clone())); + } +} diff --git a/kernel/src/net/socket/inet/common/multicast.rs b/kernel/src/net/socket/inet/common/multicast.rs index 187c69a51..33d01c603 100644 --- a/kernel/src/net/socket/inet/common/multicast.rs +++ b/kernel/src/net/socket/inet/common/multicast.rs @@ -111,10 +111,7 @@ pub fn drop_ipv4_memberships( if let Some(iface) = find_iface_by_ifindex(netns, entry.ifindex) { let bytes = entry.multiaddr.to_ne_bytes(); let multi = Ipv4Address::new(bytes[0], bytes[1], bytes[2], bytes[3]); - let _ = iface - .smol_iface() - .lock() - .leave_multicast_group(IpAddress::Ipv4(multi)); + iface.common().ipv4_multicast_leave_ref(multi); } } } @@ -247,10 +244,7 @@ pub fn apply_ipv4_membership( }); } - let join_result = { - let mut smol_iface = iface.smol_iface().lock(); - smol_iface.join_multicast_group(IpAddress::Ipv4(multi_ipv4)) - }; + let join_result = iface.common().ipv4_multicast_join_ref(multi_ipv4); if let Err(e) = join_result { { let mut groups = groups.lock(); @@ -265,7 +259,7 @@ pub fn apply_ipv4_membership( Ok(()) } IpOption::DROP_MEMBERSHIP => { - let (did_remove, still_joined) = { + let did_remove = { let mut groups = groups.lock(); let pos = groups.iter().position(|g| { if g.multiaddr != multi { @@ -281,12 +275,9 @@ pub fn apply_ipv4_membership( }); if let Some(idx) = pos { groups.swap_remove(idx); - let still_joined = groups - .iter() - .any(|g| g.multiaddr == multi && g.ifindex == resolved_ifindex); - (true, still_joined) + true } else { - (false, false) + false } }; @@ -294,12 +285,7 @@ pub fn apply_ipv4_membership( return Err(SystemError::EADDRNOTAVAIL); } - if !still_joined { - let _ = iface - .smol_iface() - .lock() - .leave_multicast_group(IpAddress::Ipv4(multi_ipv4)); - } + iface.common().ipv4_multicast_leave_ref(multi_ipv4); Ok(()) } _ => Err(SystemError::ENOPROTOOPT), diff --git a/kernel/src/net/socket/inet/common/port.rs b/kernel/src/net/socket/inet/common/port.rs index 19403773e..5f68e427b 100644 --- a/kernel/src/net/socket/inet/common/port.rs +++ b/kernel/src/net/socket/inet/common/port.rs @@ -1,4 +1,5 @@ use alloc::vec::Vec; +use core::sync::atomic::{AtomicU16, Ordering}; use hashbrown::HashMap; use smoltcp::wire::IpAddress; use system_error::SystemError; @@ -30,33 +31,45 @@ impl Default for PortManager { } } +pub const DEFAULT_LOCAL_PORT_RANGE: u32 = (32768u32 << 16) | 60999u32; + impl PortManager { + pub fn local_port_range() -> (u16, u16) { + ProcessManager::current_netns().local_port_range() + } + + pub fn set_local_port_range(min: u16, max: u16) -> Result<(), SystemError> { + ProcessManager::current_netns().set_local_port_range(min, max) + } + /// @brief 自动分配一个相对应协议中未被使用的PORT,如果动态端口均已被占用,返回错误码 EADDRINUSE pub fn get_ephemeral_port(&self, socket_type: Types) -> Result { // TODO: selects non-conflict high port - static EPHEMERAL_PORT: core::sync::atomic::AtomicU16 = - core::sync::atomic::AtomicU16::new(0); - let initial = (49152 + rand() % (65536 - 49152)) as u16; - let _ = EPHEMERAL_PORT.compare_exchange( - 0, - initial, - core::sync::atomic::Ordering::AcqRel, - core::sync::atomic::Ordering::Relaxed, - ); + static EPHEMERAL_PORT: AtomicU16 = AtomicU16::new(0); + let (min, max) = Self::local_port_range(); + let range = (max - min) as u32 + 1; + if range == 0 { + return Err(SystemError::EINVAL); + } + let current = EPHEMERAL_PORT.load(Ordering::Relaxed); + if current < min || current > max { + let initial = min + (rand() % range as usize) as u16; + EPHEMERAL_PORT.store(initial, Ordering::Relaxed); + } - let mut remaining = 65536 - 49152; // 剩余尝试分配端口次数 + let mut remaining = range; while remaining > 0 { let old = EPHEMERAL_PORT - .fetch_update( - core::sync::atomic::Ordering::AcqRel, - core::sync::atomic::Ordering::Relaxed, - |cur| Some(if cur == 65535 { 49152 } else { cur + 1 }), - ) + .fetch_update(Ordering::AcqRel, Ordering::Relaxed, |cur| { + let cur = if cur < min || cur > max { min } else { cur }; + Some(if cur >= max { min } else { cur + 1 }) + }) .unwrap_or_else(|cur| cur); - if old == 0 { - continue; - } - let port = if old == 65535 { 49152 } else { old + 1 }; + let port = if old < min || old >= max { + min + } else { + old + 1 + }; // 使用 ListenTable 检查端口是否被占用 match socket_type { @@ -83,9 +96,25 @@ impl PortManager { #[inline] pub fn bind_ephemeral_port(&self, socket_type: Types) -> Result { - let port = self.get_ephemeral_port(socket_type)?; - self.bind_port(socket_type, port)?; - return Ok(port); + let (min, max) = Self::local_port_range(); + let range = (max - min) as u32 + 1; + if range == 0 { + return Err(SystemError::EINVAL); + } + let mut remaining = range; + while remaining > 0 { + let port = self.get_ephemeral_port(socket_type)?; + match self.bind_port(socket_type, port) { + Ok(()) => return Ok(port), + Err(SystemError::EADDRINUSE) => { + // Race: another thread grabbed the port after we checked. + remaining -= 1; + continue; + } + Err(e) => return Err(e), + } + } + Err(SystemError::EADDRINUSE) } /// UDP: 绑定随机端口(支持 reuseaddr/reuseport 规则) @@ -96,9 +125,25 @@ impl PortManager { reuseport: bool, bind_id: usize, ) -> Result { - let port = self.get_ephemeral_port(Types::Udp)?; - self.bind_udp_port(port, addr, reuseaddr, reuseport, bind_id)?; - Ok(port) + let (min, max) = Self::local_port_range(); + let range = (max - min) as u32 + 1; + if range == 0 { + return Err(SystemError::EINVAL); + } + let mut remaining = range; + while remaining > 0 { + let port = self.get_ephemeral_port(Types::Udp)?; + match self.bind_udp_port(port, addr, reuseaddr, reuseport, bind_id) { + Ok(()) => return Ok(port), + Err(SystemError::EADDRINUSE) => { + // Race: another thread grabbed the port after we checked. + remaining -= 1; + continue; + } + Err(e) => return Err(e), + } + } + Err(SystemError::EADDRINUSE) } /// @brief 检测给定端口是否已被占用,如果未被占用则在 TCP 对应的表中记录 diff --git a/kernel/src/net/socket/inet/datagram/mod.rs b/kernel/src/net/socket/inet/datagram/mod.rs index 8154c83b6..5e8eefc05 100644 --- a/kernel/src/net/socket/inet/datagram/mod.rs +++ b/kernel/src/net/socket/inet/datagram/mod.rs @@ -15,6 +15,7 @@ use crate::net::socket::unix::utils::CmsgBuffer; use crate::net::socket::{AddressFamily, Socket, PMSG, PSO, PSOL}; use crate::net::socket::{IpOption, PIPV6}; use crate::process::namespace::net_namespace::NetNamespace; +use crate::process::namespace::NamespaceOps; use crate::process::ProcessManager; use crate::{libs::rwsem::RwSem, net::socket::endpoint::Endpoint}; use alloc::collections::VecDeque; @@ -110,6 +111,8 @@ pub struct UdpSocket { ip_multicast_ttl: AtomicI32, /// IP_MULTICAST_LOOP ip_multicast_loop: AtomicBool, + /// IP_MULTICAST_ALL + ip_multicast_all: AtomicBool, /// IP_MULTICAST_IF: interface index ip_multicast_ifindex: AtomicI32, /// IP_MULTICAST_IF: interface address (network byte order) @@ -190,6 +193,7 @@ impl UdpSocket { recv_err_v6: AtomicBool::new(false), ip_multicast_ttl: AtomicI32::new(1), ip_multicast_loop: AtomicBool::new(true), + ip_multicast_all: AtomicBool::new(true), ip_multicast_ifindex: AtomicI32::new(0), ip_multicast_addr: AtomicU32::new(0), ip_multicast_groups: Mutex::new(Vec::new()), @@ -940,9 +944,11 @@ impl UdpSocket { let octets = addr.octets(); let multiaddr = u32::from_ne_bytes(octets); let ifindex = mcast_ifindex.max(ifindex); - if multicast_loopback::multicast_registry() - .has_membership(multiaddr, ifindex) - { + if multicast_loopback::multicast_registry().has_membership( + self.netns.ns_common().nsid.data(), + multiaddr, + ifindex, + ) { udp_bindings::deliver_multicast_all( &self.netns, dest, @@ -1045,9 +1051,11 @@ impl UdpSocket { let multiaddr = u32::from_ne_bytes(octets); let ifindex = self.get_multicast_ifindex(); - if multicast_loopback::multicast_registry() - .has_membership(multiaddr, ifindex) - { + if multicast_loopback::multicast_registry().has_membership( + self.netns.ns_common().nsid.data(), + multiaddr, + ifindex, + ) { udp_bindings::deliver_multicast_all( &self.netns, dest, diff --git a/kernel/src/net/socket/inet/datagram/multicast_loopback.rs b/kernel/src/net/socket/inet/datagram/multicast_loopback.rs index e0b88bca8..b63938d57 100644 --- a/kernel/src/net/socket/inet/datagram/multicast_loopback.rs +++ b/kernel/src/net/socket/inet/datagram/multicast_loopback.rs @@ -8,6 +8,7 @@ use alloc::sync::Weak; use alloc::vec::Vec; use crate::libs::rwsem::RwSem; +use crate::process::namespace::NamespaceOps; use super::UdpSocket; @@ -16,6 +17,8 @@ use super::UdpSocket; struct MulticastMember { /// Weak reference to the socket socket: Weak, + /// Network namespace id + netns_id: usize, /// Multicast group address (in network byte order for IPv4) multiaddr: u32, /// Interface index where the group was joined @@ -36,14 +39,22 @@ impl MulticastLoopbackRegistry { /// Register a socket as a member of a multicast group pub fn register(&self, socket: Weak, multiaddr: u32, ifindex: i32) { + let netns_id = match socket.upgrade() { + Some(sock) => sock.netns().ns_common().nsid.data(), + None => return, + }; let mut members = self.members.write(); // Check if already registered let exists = members.iter().any(|m| { - m.multiaddr == multiaddr && m.ifindex == ifindex && m.socket.as_ptr() == socket.as_ptr() + m.multiaddr == multiaddr + && m.ifindex == ifindex + && m.netns_id == netns_id + && m.socket.as_ptr() == socket.as_ptr() }); if !exists { members.push(MulticastMember { socket, + netns_id, multiaddr, ifindex, }); @@ -66,11 +77,14 @@ impl MulticastLoopbackRegistry { members.retain(|m| m.socket.as_ptr() != socket.as_ptr()); } - pub fn has_membership(&self, multiaddr: u32, ifindex: i32) -> bool { + pub fn has_membership(&self, netns_id: usize, multiaddr: u32, ifindex: i32) -> bool { let members = self.members.read(); - members - .iter() - .any(|m| m.multiaddr == multiaddr && m.ifindex == ifindex) + members.iter().any(|m| { + m.netns_id == netns_id + && m.multiaddr == multiaddr + && m.ifindex == ifindex + && m.socket.strong_count() > 0 + }) } } diff --git a/kernel/src/net/socket/inet/datagram/option.rs b/kernel/src/net/socket/inet/datagram/option.rs index 86932daf3..4ca9bba58 100644 --- a/kernel/src/net/socket/inet/datagram/option.rs +++ b/kernel/src/net/socket/inet/datagram/option.rs @@ -408,6 +408,18 @@ impl UdpSocket { self.ip_multicast_loop.store(on, Ordering::Relaxed); Ok(()) } + IpOption::MULTICAST_ALL => { + let v = if val.len() == 1 { + val[0] as i32 + } else if val.len() >= core::mem::size_of::() { + i32::from_ne_bytes([val[0], val[1], val[2], val[3]]) + } else { + return Err(SystemError::EINVAL); + }; + let on = v != 0; + self.ip_multicast_all.store(on, Ordering::Relaxed); + Ok(()) + } IpOption::MULTICAST_IF => apply_ipv4_multicast_if( &self.netns, val, @@ -512,6 +524,13 @@ impl UdpSocket { 0i32 } } + IpOption::MULTICAST_ALL => { + if self.ip_multicast_all.load(Ordering::Relaxed) { + 1i32 + } else { + 0i32 + } + } IpOption::MULTICAST_IF => self.ip_multicast_addr.load(Ordering::Relaxed) as i32, IpOption::PKTINFO => { if self.recv_pktinfo_v4.load(Ordering::Relaxed) { diff --git a/kernel/src/net/socket/inet/datagram/udp_bindings.rs b/kernel/src/net/socket/inet/datagram/udp_bindings.rs index d2b3df08a..f14a5389d 100644 --- a/kernel/src/net/socket/inet/datagram/udp_bindings.rs +++ b/kernel/src/net/socket/inet/datagram/udp_bindings.rs @@ -109,9 +109,11 @@ pub fn deliver_multicast_all( }; let mut delivered = 0; for cand in candidates { - if !cand - .socket - .has_ipv4_multicast_membership(multiaddr, ifindex) + let multicast_all = cand.socket.ip_multicast_all.load(Ordering::Relaxed); + if !multicast_all + && !cand + .socket + .has_ipv4_multicast_membership(multiaddr, ifindex) { continue; } diff --git a/kernel/src/net/socket/inet/stream/inner.rs b/kernel/src/net/socket/inet/stream/inner.rs index 53ebf1f0e..c029439c5 100644 --- a/kernel/src/net/socket/inet/stream/inner.rs +++ b/kernel/src/net/socket/inet/stream/inner.rs @@ -327,7 +327,10 @@ impl Connecting { Inner::Connecting(self), Err(SystemError::EAGAIN_OR_EWOULDBLOCK), ), - ConnectResult::Connected => (Inner::Established(Established::new(self.inner)), Ok(())), + ConnectResult::Connected => ( + Inner::Established(Established::new(self.inner, true)), + Ok(()), + ), ConnectResult::Refused | ConnectResult::RefusedConsumed => { // unbind port self.inner @@ -362,7 +365,7 @@ impl Connecting { /// that the underlying socket is actually in the ESTABLISHED state. /// The caller must ensure that the socket handshake has completed successfully. pub unsafe fn into_established(self) -> Established { - Established::new(self.inner) + Established::new(self.inner, true) } /// Returns `true` when `conn_result` becomes ready, which indicates that the caller should @@ -585,7 +588,7 @@ impl Listening { // TODO is smoltcp socket swappable? core::mem::swap(&mut new_listen, connected); - return Ok((Established::new(new_listen), remote_endpoint)); + return Ok((Established::new(new_listen, false), remote_endpoint)); } pub fn update_io_events(&self, pollee: &AtomicUsize) { @@ -645,10 +648,11 @@ pub struct Established { inner: socket::inet::BoundInner, local: smoltcp::wire::IpEndpoint, peer: smoltcp::wire::IpEndpoint, + owns_port: bool, } impl Established { - pub fn new(inner: socket::inet::BoundInner) -> Self { + pub fn new(inner: socket::inet::BoundInner, owns_port: bool) -> Self { let local = inner .with::(|socket| socket.local_endpoint()) .unwrap_or(smoltcp::wire::IpEndpoint::new( @@ -661,7 +665,12 @@ impl Established { smoltcp::wire::IpAddress::Ipv4(smoltcp::wire::Ipv4Address::UNSPECIFIED), 0, )); - Self { inner, local, peer } + Self { + inner, + local, + peer, + owns_port, + } } pub fn with_mut) -> R>( @@ -683,6 +692,10 @@ impl Established { self.inner.handle() } + pub fn owns_port(&self) -> bool { + self.owns_port + } + pub fn close(&self) { self.inner .with_mut::(|socket| socket.close()); diff --git a/kernel/src/net/socket/inet/stream/lifecycle.rs b/kernel/src/net/socket/inet/stream/lifecycle.rs index 74f6314cf..1f4d8fd27 100644 --- a/kernel/src/net/socket/inet/stream/lifecycle.rs +++ b/kernel/src/net/socket/inet/stream/lifecycle.rs @@ -1,5 +1,6 @@ use crate::net::socket::common::ShutdownBit; use crate::net::socket::inet::InetSocket; +use crate::net::socket::inet::Types; use alloc::sync::Arc; use system_error::SystemError; @@ -457,6 +458,9 @@ impl TcpSocket { let iface = conn.iface().clone(); let me: alloc::sync::Weak = self.self_ref.clone(); conn.close(); + if conn.owns_port() { + iface.port_manager().unbind_port(Types::Tcp, local_port); + } iface.common().defer_tcp_close(handle, local_port, me); writer.replace(inner::Inner::Established(conn)); } @@ -481,6 +485,9 @@ impl TcpSocket { } else { es.close(); } + if es.owns_port() { + iface.port_manager().unbind_port(Types::Tcp, local_port); + } iface.common().defer_tcp_close(handle, local_port, me); writer.replace(inner::Inner::Established(es)); } @@ -490,7 +497,10 @@ impl TcpSocket { smoltcp::wire::IpAddress::Ipv6(_) => smoltcp::wire::IpVersion::Ipv6, _ => smoltcp::wire::IpVersion::Ipv4, }; + let port = sc.get_name().port; + let iface = sc.iface().clone(); sc.release(); + iface.port_manager().unbind_port(Types::Tcp, port); writer.replace(inner::Inner::Closed(inner::Closed::new(ver))); } inner::Inner::Listening(mut ls) => { diff --git a/kernel/src/net/socket/posix/ip_option.rs b/kernel/src/net/socket/posix/ip_option.rs index 5d5c88ad9..40cdadd03 100644 --- a/kernel/src/net/socket/posix/ip_option.rs +++ b/kernel/src/net/socket/posix/ip_option.rs @@ -20,6 +20,7 @@ pub enum IpOption { MULTICAST_LOOP = 34, ADD_MEMBERSHIP = 35, DROP_MEMBERSHIP = 36, + MULTICAST_ALL = 49, RECVERR_RFC4884 = 26, } diff --git a/kernel/src/net/tcp_close_defer.rs b/kernel/src/net/tcp_close_defer.rs index 73b13246d..ad30e42e8 100644 --- a/kernel/src/net/tcp_close_defer.rs +++ b/kernel/src/net/tcp_close_defer.rs @@ -13,9 +13,7 @@ use alloc::sync::Weak; use alloc::vec::Vec; use crate::libs::mutex::Mutex; -use crate::net::socket::inet::common::PortManager; use crate::net::socket::inet::InetSocket; -use crate::net::socket::inet::Types; #[derive(Debug, Clone)] struct ClosingTcpSocket { @@ -62,11 +60,7 @@ impl TcpCloseDefer { /// /// 重要: /// - 锁顺序必须保持为:`SocketSet` -> `TcpCloseDefer::closing`,避免与 close 路径反转。 - pub fn reap_closed( - &self, - sockets: &mut smoltcp::iface::SocketSet<'static>, - port_manager: &PortManager, - ) { + pub fn reap_closed(&self, sockets: &mut smoltcp::iface::SocketSet<'static>) { let mut closing = self.closing.lock(); if closing.is_empty() { return; @@ -75,7 +69,7 @@ impl TcpCloseDefer { while i < closing.len() { let ClosingTcpSocket { handle, - local_port, + local_port: _local_port, ref sock, } = closing[i]; let state = sockets.get::(handle).state(); @@ -88,7 +82,6 @@ impl TcpCloseDefer { continue; } sockets.remove(handle); - port_manager.unbind_port(Types::Tcp, local_port); closing.swap_remove(i); continue; } diff --git a/kernel/src/process/namespace/net_namespace.rs b/kernel/src/process/namespace/net_namespace.rs index 8347cccf6..6c792b321 100644 --- a/kernel/src/process/namespace/net_namespace.rs +++ b/kernel/src/process/namespace/net_namespace.rs @@ -23,6 +23,7 @@ use alloc::boxed::Box; use alloc::collections::BTreeMap; use alloc::string::{String, ToString}; use alloc::sync::{Arc, Weak}; +use core::sync::atomic::AtomicU32; use core::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use hashbrown::HashMap; use system_error::SystemError; @@ -90,6 +91,8 @@ pub struct NetNamespace { /// AF_UNIX abstract namespace table (scoped to this netns). unix_abstract_table: Arc, + /// Per-netns IPv4 ephemeral port range (ip_local_port_range) + local_port_range: AtomicU32, } #[derive(Debug)] @@ -136,6 +139,9 @@ impl NetNamespace { netlink_socket_table: NetlinkSocketTable::default(), netlink_kernel_socket: RwLock::new(generate_supported_netlink_kernel_sockets()), unix_abstract_table: unix_abstract_table.clone(), + local_port_range: AtomicU32::new( + crate::net::socket::inet::common::port::DEFAULT_LOCAL_PORT_RANGE, + ), }); // Self::create_polling_thread(netns.clone(), "netns_root".to_string()); @@ -168,6 +174,9 @@ impl NetNamespace { netlink_socket_table: NetlinkSocketTable::default(), netlink_kernel_socket: RwLock::new(generate_supported_netlink_kernel_sockets()), unix_abstract_table: unix_abstract_table.clone(), + local_port_range: AtomicU32::new( + crate::net::socket::inet::common::port::DEFAULT_LOCAL_PORT_RANGE, + ), }); // Linux 语义:每个 netns 都需要一个可被唤醒的轮询线程来推进协议栈。 @@ -199,6 +208,32 @@ impl NetNamespace { self.device_list.read() } + #[inline] + pub fn local_port_range(&self) -> (u16, u16) { + let value = self.local_port_range.load(Ordering::Relaxed); + ((value >> 16) as u16, (value & 0xffff) as u16) + } + + pub fn set_local_port_range(&self, min: u16, max: u16) -> Result<(), SystemError> { + if min == 0 || max == 0 || min > max { + return Err(SystemError::EINVAL); + } + let new_value = ((min as u32) << 16) | (max as u32); + loop { + let old_value = self.local_port_range.load(Ordering::Relaxed); + if old_value == new_value { + return Ok(()); + } + if self + .local_port_range + .compare_exchange(old_value, new_value, Ordering::AcqRel, Ordering::Relaxed) + .is_ok() + { + return Ok(()); + } + } + } + pub fn inner(&self) -> RwLockReadGuard<'_, InnerNetNamespace> { self.inner.read() } diff --git a/kernel/src/process/rseq.rs b/kernel/src/process/rseq.rs index 1cbd4b27a..e64e6830c 100644 --- a/kernel/src/process/rseq.rs +++ b/kernel/src/process/rseq.rs @@ -583,7 +583,7 @@ impl Rseq { // 更新 cpu_id 等字段 let cpu_id = current_cpu_id().data() as u32; - if let Err(e) = unsafe { access.update_cpu_node_id(cpu_id, 0, 0) } { + if let Err(_e) = unsafe { access.update_cpu_node_id(cpu_id, 0, 0) } { // log::debug!("rseq update_cpu_node_id failed: {:?}", e); Self::disable_current_rseq_after_fault(&pcb); let _ = crate::ipc::signal::send_kernel_signal_to_current( diff --git a/user/apps/c_unitest/test_ip_local_port_range_netns.c b/user/apps/c_unitest/test_ip_local_port_range_netns.c new file mode 100644 index 000000000..5beaf1080 --- /dev/null +++ b/user/apps/c_unitest/test_ip_local_port_range_netns.c @@ -0,0 +1,127 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const char *kRangeFile = "/proc/sys/net/ipv4/ip_local_port_range"; + +static int read_range(int *min, int *max) { + int fd = open(kRangeFile, O_RDONLY); + if (fd < 0) { + perror("open range file"); + return -1; + } + char buf[64]; + ssize_t n = read(fd, buf, sizeof(buf) - 1); + close(fd); + if (n <= 0) { + perror("read range file"); + return -1; + } + buf[n] = '\0'; + if (sscanf(buf, "%d %d", min, max) != 2) { + fprintf(stderr, "failed to parse range: '%s'\n", buf); + return -1; + } + return 0; +} + +static int write_range(int min, int max) { + int fd = open(kRangeFile, O_WRONLY | O_TRUNC); + if (fd < 0) { + perror("open range file for write"); + return -1; + } + char buf[64]; + int len = snprintf(buf, sizeof(buf), "%d %d", min, max); + if (len <= 0) { + close(fd); + return -1; + } + ssize_t n = write(fd, buf, (size_t)len); + close(fd); + if (n != len) { + perror("write range file"); + return -1; + } + return 0; +} + +int main(void) { + int parent_min = 0, parent_max = 0; + if (read_range(&parent_min, &parent_max) != 0) { + return 1; + } + + if (access(kRangeFile, W_OK) != 0) { + printf("[SKIP] %s not writable\n", kRangeFile); + return 0; + } + + pid_t pid = fork(); + if (pid < 0) { + perror("fork"); + return 1; + } + if (pid == 0) { + if (unshare(CLONE_NEWNET) != 0) { + printf("[SKIP] unshare(CLONE_NEWNET) failed: %s\n", strerror(errno)); + return 0; + } + int child_min = 0, child_max = 0; + if (read_range(&child_min, &child_max) != 0) { + return 1; + } + int new_min = child_min; + int new_max = child_min + 10; + if (write_range(new_min, new_max) != 0) { + return 1; + } + int verify_min = 0, verify_max = 0; + if (read_range(&verify_min, &verify_max) != 0) { + return 1; + } + if (verify_min != new_min || verify_max != new_max) { + fprintf(stderr, "child range verify failed: %d %d (expected %d %d)\n", + verify_min, verify_max, new_min, new_max); + return 1; + } + // Restore original range in this netns to keep it tidy. + if (write_range(child_min, child_max) != 0) { + return 1; + } + return 0; + } + + int status = 0; + if (waitpid(pid, &status, 0) < 0) { + perror("waitpid"); + return 1; + } + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + fprintf(stderr, "child failed\n"); + return 1; + } + + int parent_min_after = 0, parent_max_after = 0; + if (read_range(&parent_min_after, &parent_max_after) != 0) { + return 1; + } + + if (parent_min_after != parent_min || parent_max_after != parent_max) { + fprintf(stderr, + "parent range changed: %d %d (expected %d %d)\n", + parent_min_after, parent_max_after, parent_min, parent_max); + return 1; + } + + printf("[PASS] ip_local_port_range is isolated per netns\n"); + return 0; +} diff --git a/user/apps/tests/syscall/gvisor/runner/Cargo.lock b/user/apps/tests/syscall/gvisor/runner/Cargo.lock index 27e77bcd0..9ceab9af7 100644 --- a/user/apps/tests/syscall/gvisor/runner/Cargo.lock +++ b/user/apps/tests/syscall/gvisor/runner/Cargo.lock @@ -197,6 +197,7 @@ dependencies = [ "chrono", "clap", "env_logger", + "libc", "log", "regex", ] diff --git a/user/apps/tests/syscall/gvisor/runner/Cargo.toml b/user/apps/tests/syscall/gvisor/runner/Cargo.toml index a762c06b8..309ed6f72 100644 --- a/user/apps/tests/syscall/gvisor/runner/Cargo.toml +++ b/user/apps/tests/syscall/gvisor/runner/Cargo.toml @@ -14,6 +14,7 @@ regex = "1.0" chrono = { version = "0.4", features = ["serde"] } log = "0.4" env_logger = "0.10" +libc = "0.2" [profile.release] lto = true diff --git a/user/apps/tests/syscall/gvisor/runner/src/lib_sync.rs b/user/apps/tests/syscall/gvisor/runner/src/lib_sync.rs index 5baa8b259..ff3b6d803 100644 --- a/user/apps/tests/syscall/gvisor/runner/src/lib_sync.rs +++ b/user/apps/tests/syscall/gvisor/runner/src/lib_sync.rs @@ -4,6 +4,7 @@ use std::{ collections::HashSet, fs::{self, File}, io::{BufRead, BufReader, Write}, + os::unix::process::CommandExt, path::{Path, PathBuf}, process::Command, sync::{ @@ -369,6 +370,19 @@ impl TestRunner { if !blocked_subtests.is_empty() { cmd.arg(format!("--gtest_filter=-{}", blocked_subtests.join(":"))); } + // Run each test binary in a fresh network namespace to avoid sysctl leakage. + let name_lc = test_name.to_ascii_lowercase(); + if name_lc.contains("socket") || name_lc.contains("net") { + unsafe { + cmd.pre_exec(|| { + let ret = libc::unshare(libc::CLONE_NEWNET); + if ret != 0 { + return Err(std::io::Error::last_os_error()); + } + Ok(()) + }); + } + } let status = cmd .current_dir(&self.config.tests_dir) diff --git a/user/apps/tests/syscall/gvisor/whitelist.txt b/user/apps/tests/syscall/gvisor/whitelist.txt index 380c056da..dd337e75a 100644 --- a/user/apps/tests/syscall/gvisor/whitelist.txt +++ b/user/apps/tests/syscall/gvisor/whitelist.txt @@ -105,6 +105,8 @@ socket_ip_tcp_loopback_non_blocking_test socket_ip_tcp_loopback_test socket_ip_tcp_udp_generic_loopback_test socket_ipv4_datagram_based_socket_unbound_loopback_test +socket_ipv4_udp_unbound_loopback_test +socket_ipv4_udp_unbound_loopback_nogotsan_test # 信号处理测试 sigaction_test