ndn_faces/l2/
af_packet.rs

1//! Shared AF_PACKET infrastructure for raw Ethernet faces.
2//!
3//! Contains socket helpers and the TPACKET_V2 mmap'd ring buffer used by both
4//! `NamedEtherFace` (unicast) and `MulticastEtherFace`.
5//!
6//! `MacAddr` is re-exported from `ndn-transport` — it is the shared canonical
7//! type used across the whole stack.  Defining a second copy here caused a
8//! type-mismatch on Linux when passing MACs to `NeighborUpdate::AddFace`.
9
10use std::os::unix::io::{AsRawFd, FromRawFd, OwnedFd, RawFd};
11use std::sync::atomic::{AtomicU32, Ordering};
12
13use bytes::Bytes;
14
15pub use ndn_transport::MacAddr;
16
17// ─── AF_PACKET helpers ───────────────────────────────────────────────────────
18
19/// Look up the interface index for `iface` via `SIOCGIFINDEX`.
20pub fn get_ifindex(fd: RawFd, iface: &str) -> std::io::Result<i32> {
21    let mut ifr: libc::ifreq = unsafe { std::mem::zeroed() };
22    let name_bytes = iface.as_bytes();
23    if name_bytes.len() >= libc::IFNAMSIZ {
24        return Err(std::io::Error::new(
25            std::io::ErrorKind::InvalidInput,
26            "interface name too long",
27        ));
28    }
29    unsafe {
30        std::ptr::copy_nonoverlapping(
31            name_bytes.as_ptr(),
32            ifr.ifr_name.as_mut_ptr() as *mut u8,
33            name_bytes.len(),
34        );
35    }
36    if unsafe { libc::ioctl(fd, libc::SIOCGIFINDEX as libc::c_ulong, &mut ifr) } == -1 {
37        return Err(std::io::Error::last_os_error());
38    }
39    Ok(unsafe { ifr.ifr_ifru.ifru_ifindex })
40}
41
42/// Build a `sockaddr_ll` for `bind` or `sendto`.
43pub fn make_sockaddr_ll(ifindex: i32, dst_mac: &MacAddr, protocol: u16) -> libc::sockaddr_ll {
44    let mut addr: libc::sockaddr_ll = unsafe { std::mem::zeroed() };
45    addr.sll_family = libc::AF_PACKET as u16;
46    addr.sll_protocol = protocol.to_be();
47    addr.sll_ifindex = ifindex;
48    addr.sll_halen = 6;
49    addr.sll_addr[..6].copy_from_slice(dst_mac.as_bytes());
50    addr
51}
52
53/// Create an `AF_PACKET + SOCK_DGRAM` socket bound to `ifindex`, filtering
54/// only frames with ethertype `protocol`.  Returns a non-blocking `OwnedFd`.
55pub fn open_packet_socket(ifindex: i32, protocol: u16) -> std::io::Result<OwnedFd> {
56    let fd = unsafe {
57        libc::socket(
58            libc::AF_PACKET,
59            libc::SOCK_DGRAM | libc::SOCK_NONBLOCK | libc::SOCK_CLOEXEC,
60            protocol.to_be() as i32,
61        )
62    };
63    if fd == -1 {
64        return Err(std::io::Error::last_os_error());
65    }
66    let owned = unsafe { OwnedFd::from_raw_fd(fd) };
67
68    // Bind to the specific interface so we only receive frames from it.
69    let bind_addr = make_sockaddr_ll(ifindex, &MacAddr::new([0; 6]), protocol);
70    if unsafe {
71        libc::bind(
72            owned.as_raw_fd(),
73            &bind_addr as *const libc::sockaddr_ll as *const libc::sockaddr,
74            std::mem::size_of::<libc::sockaddr_ll>() as libc::socklen_t,
75        )
76    } == -1
77    {
78        return Err(std::io::Error::last_os_error());
79    }
80
81    Ok(owned)
82}
83
84/// Generic `setsockopt` wrapper for a value of type `T`.
85pub fn setsockopt_val<T>(
86    fd: RawFd,
87    level: libc::c_int,
88    name: libc::c_int,
89    val: &T,
90) -> std::io::Result<()> {
91    if unsafe {
92        libc::setsockopt(
93            fd,
94            level,
95            name,
96            val as *const T as *const libc::c_void,
97            std::mem::size_of::<T>() as libc::socklen_t,
98        )
99    } == -1
100    {
101        return Err(std::io::Error::last_os_error());
102    }
103    Ok(())
104}
105
106// ─── TPACKET_V2 mmap ring buffer ────────────────────────────────────────────
107
// TPACKET_V2 lives inside libc's `tpacket_versions` enum.  It is mirrored here
// as a plain `c_int` because that is the value `setup_packet_ring` hands to
// the `PACKET_VERSION` socket option.
pub(crate) const TPACKET_V2: libc::c_int = 1;
110
/// Alignment, in bytes, applied to structures inside a ring frame.
pub(crate) const TPACKET_ALIGNMENT: usize = 16;

/// Round `x` up to the next multiple of [`TPACKET_ALIGNMENT`].
///
/// Values that are already a multiple are returned unchanged.
pub(crate) const fn tpacket_align(x: usize) -> usize {
    // TPACKET_ALIGNMENT is a power of two, so its low bits form a mask.
    let low_bits = TPACKET_ALIGNMENT - 1;
    (x + low_bits) & !low_bits
}
116
// Ring geometry.  Every ring occupies RING_BLOCK_NR × RING_BLOCK_SIZE bytes.
// A 2048-byte frame has room for tpacket2_hdr (32 B), sockaddr_ll (20 B) and a
// 1500-byte payload.
pub(crate) const RING_FRAME_SIZE: u32 = 2048;
pub(crate) const RING_BLOCK_SIZE: u32 = 4096; // 4 KiB: one page holding two frames
pub(crate) const RING_BLOCK_NR: u32 = 32; // 32 blocks, i.e. 128 KiB per ring
pub(crate) const RING_FRAME_NR: u32 = RING_BLOCK_NR * (RING_BLOCK_SIZE / RING_FRAME_SIZE); // 64
123
/// Byte offset from TX frame start to the packet data payload.
///
/// Computed as the `TPACKET_ALIGNMENT`-aligned size of `tpacket2_hdr` plus the
/// size of `sockaddr_ll`.  `PacketRing::try_push_tx` copies outgoing bytes to
/// this offset inside the frame.
// NOTE(review): the kernel's packet_mmap documentation and selftests place TX
// data at `TPACKET2_HDRLEN - sizeof(struct sockaddr_ll)`, i.e. the aligned
// header size alone, unless `PACKET_TX_HAS_OFF` is enabled.  Confirm that the
// extra `sockaddr_ll` term here is intended; the `tx_data_offset_is_correct`
// unit test pins the current formula and would need to change with it.
pub(crate) const TX_DATA_OFFSET: usize = tpacket_align(std::mem::size_of::<libc::tpacket2_hdr>())
    + std::mem::size_of::<libc::sockaddr_ll>();
127
128// ─── tp_status atomic access ─────────────────────────────────────────────────
129
/// Atomically load the `tp_status` word at the start of a ring frame, using
/// Acquire ordering.
///
/// # Safety
/// `frame` must point to a valid, 16-byte-aligned tpacket2_hdr in the mmap'd ring.
pub(crate) unsafe fn read_tp_status(frame: *mut u8) -> u32 {
    // The status word is read as the first 32 bits of the frame.
    let status = unsafe { AtomicU32::from_ptr(frame.cast::<u32>()) };
    status.load(Ordering::Acquire)
}
137
/// Atomically store `val` into the `tp_status` word at the start of a ring
/// frame, using Release ordering.
///
/// # Safety
/// `frame` must point to a valid, 16-byte-aligned tpacket2_hdr in the mmap'd
/// ring (the same requirements as [`read_tp_status`]).
pub(crate) unsafe fn write_tp_status(frame: *mut u8, val: u32) {
    // The status word is written as the first 32 bits of the frame.
    let status = unsafe { AtomicU32::from_ptr(frame.cast::<u32>()) };
    status.store(val, Ordering::Release);
}
145
146// ─── PacketRing ──────────────────────────────────────────────────────────────
147
/// Mmap'd `PACKET_RX_RING` + `PACKET_TX_RING` for packet I/O through memory
/// shared with the kernel.
///
/// Payloads are copied out of RX frames and into TX frames by the methods on
/// this type; the mapping is released with `munmap` when the value is dropped.
pub struct PacketRing {
    /// Mmap'd region (RX ring at offset 0, TX ring at `tx_offset`).
    map: *mut u8,
    /// Length in bytes of the mapping; this is the length passed to `munmap`.
    map_len: usize,
    /// Size in bytes of one frame, used as the stride between frames in both rings.
    frame_size: usize,
    /// Number of frames in the RX ring; `rx_head` wraps modulo this value.
    rx_frame_nr: u32,
    /// Number of frames in the TX ring; `tx_head` wraps modulo this value.
    tx_frame_nr: u32,
    /// Byte offset where the TX ring starts within the mmap region.
    tx_offset: usize,
    /// Current RX consumer index (single consumer — Face::recv is single-task).
    rx_head: AtomicU32,
    /// Current TX producer index, protected for concurrent Face::send calls.
    tx_head: std::sync::Mutex<u32>,
}

// Safety: the mmap'd region is shared with the kernel via MAP_SHARED.
// Synchronisation is through atomic tp_status reads/writes with
// Acquire/Release ordering.  rx_head is single-consumer; tx_head is
// protected by a Mutex.
// NOTE(review): the single-consumer assumption for the RX side is not enforced
// by this type (the RX methods take `&self`) — confirm that callers uphold it.
unsafe impl Send for PacketRing {}
unsafe impl Sync for PacketRing {}
170
171impl PacketRing {
172    fn rx_frame(&self, idx: u32) -> *mut u8 {
173        unsafe { self.map.add(idx as usize * self.frame_size) }
174    }
175
176    fn tx_frame(&self, idx: u32) -> *mut u8 {
177        unsafe {
178            self.map
179                .add(self.tx_offset + idx as usize * self.frame_size)
180        }
181    }
182
183    /// Try to dequeue one packet from the RX ring.
184    pub fn try_pop_rx(&self) -> Option<Bytes> {
185        self.try_pop_rx_with_source().map(|(bytes, _)| bytes)
186    }
187
188    /// Try to dequeue one packet from the RX ring, also returning the source MAC.
189    ///
190    /// In a TPACKET_V2 frame the kernel embeds a `sockaddr_ll` immediately after
191    /// the aligned `tpacket2_hdr`.  For received frames the kernel fills in
192    /// `sll_addr` / `sll_halen` with the source Ethernet address, giving us the
193    /// peer MAC without any extra syscall.
194    pub fn try_pop_rx_with_source(&self) -> Option<(Bytes, MacAddr)> {
195        let idx = self.rx_head.load(Ordering::Relaxed);
196        let frame = self.rx_frame(idx);
197
198        let status = unsafe { read_tp_status(frame) };
199        if status & libc::TP_STATUS_USER == 0 {
200            return None;
201        }
202
203        let hdr = frame as *const libc::tpacket2_hdr;
204        let tp_mac = unsafe { (*hdr).tp_mac } as usize;
205        let tp_snaplen = unsafe { (*hdr).tp_snaplen } as usize;
206
207        // sockaddr_ll sits immediately after the aligned tpacket2_hdr.
208        let sll_offset = tpacket_align(std::mem::size_of::<libc::tpacket2_hdr>());
209        let sll = unsafe { &*(frame.add(sll_offset) as *const libc::sockaddr_ll) };
210        let src_mac = MacAddr({
211            let mut b = [0u8; 6];
212            b.copy_from_slice(&sll.sll_addr[..6]);
213            b
214        });
215
216        let data = unsafe { std::slice::from_raw_parts(frame.add(tp_mac), tp_snaplen) };
217        let bytes = Bytes::copy_from_slice(data);
218
219        // Release frame back to the kernel.
220        unsafe { write_tp_status(frame, libc::TP_STATUS_KERNEL) };
221        self.rx_head
222            .store((idx + 1) % self.rx_frame_nr, Ordering::Relaxed);
223
224        Some((bytes, src_mac))
225    }
226
227    /// Try to enqueue one packet into the TX ring.
228    pub fn try_push_tx(&self, data: &[u8]) -> bool {
229        let mut head = self.tx_head.lock().unwrap();
230        let frame = self.tx_frame(*head);
231
232        let status = unsafe { read_tp_status(frame) };
233        if status != 0 {
234            return false;
235        }
236
237        unsafe {
238            std::ptr::copy_nonoverlapping(data.as_ptr(), frame.add(TX_DATA_OFFSET), data.len());
239
240            let hdr = frame as *mut libc::tpacket2_hdr;
241            (*hdr).tp_len = data.len() as u32;
242            (*hdr).tp_snaplen = data.len() as u32;
243        }
244
245        unsafe { write_tp_status(frame, libc::TP_STATUS_SEND_REQUEST) };
246
247        *head = (*head + 1) % self.tx_frame_nr;
248        true
249    }
250}
251
252impl Drop for PacketRing {
253    fn drop(&mut self) {
254        if !self.map.is_null() {
255            unsafe {
256                libc::munmap(self.map as *mut libc::c_void, self.map_len);
257            }
258        }
259    }
260}
261
262/// Configure TPACKET_V2, create RX + TX rings, and mmap them.
263pub fn setup_packet_ring(fd: RawFd) -> std::io::Result<PacketRing> {
264    // 1. Select TPACKET_V2.
265    let version: libc::c_int = TPACKET_V2;
266    setsockopt_val(fd, libc::SOL_PACKET, libc::PACKET_VERSION, &version)?;
267
268    let req = libc::tpacket_req {
269        tp_block_size: RING_BLOCK_SIZE,
270        tp_block_nr: RING_BLOCK_NR,
271        tp_frame_size: RING_FRAME_SIZE,
272        tp_frame_nr: RING_FRAME_NR,
273    };
274
275    // 2. Configure RX ring, then TX ring (same geometry).
276    setsockopt_val(fd, libc::SOL_PACKET, libc::PACKET_RX_RING, &req)?;
277    setsockopt_val(fd, libc::SOL_PACKET, libc::PACKET_TX_RING, &req)?;
278
279    // 3. Mmap both rings.
280    let rx_ring_size = (RING_BLOCK_SIZE as usize) * (RING_BLOCK_NR as usize);
281    let tx_ring_size = rx_ring_size;
282    let map_len = rx_ring_size + tx_ring_size;
283
284    let map = unsafe {
285        libc::mmap(
286            std::ptr::null_mut(),
287            map_len,
288            libc::PROT_READ | libc::PROT_WRITE,
289            libc::MAP_SHARED,
290            fd,
291            0,
292        )
293    };
294    if map == libc::MAP_FAILED {
295        return Err(std::io::Error::last_os_error());
296    }
297
298    Ok(PacketRing {
299        map: map as *mut u8,
300        map_len,
301        frame_size: RING_FRAME_SIZE as usize,
302        rx_frame_nr: RING_FRAME_NR,
303        tx_frame_nr: RING_FRAME_NR,
304        tx_offset: rx_ring_size,
305        rx_head: AtomicU32::new(0),
306        tx_head: std::sync::Mutex::new(0),
307    })
308}
309
310/// Query the hardware (MAC) address of `iface` via `SIOCGIFHWADDR`.
311///
312/// Returns an error if the interface does not exist or if the process lacks
313/// the necessary permissions to open a raw socket for the ioctl.
314pub fn get_interface_mac(iface: &str) -> std::io::Result<MacAddr> {
315    let fd = unsafe { libc::socket(libc::AF_PACKET, libc::SOCK_DGRAM | libc::SOCK_CLOEXEC, 0) };
316    if fd == -1 {
317        return Err(std::io::Error::last_os_error());
318    }
319    let fd = unsafe { OwnedFd::from_raw_fd(fd) };
320
321    let mut ifr: libc::ifreq = unsafe { std::mem::zeroed() };
322    let name_bytes = iface.as_bytes();
323    let copy_len = name_bytes.len().min(libc::IFNAMSIZ - 1);
324    // SAFETY: ifr_name is a fixed-size C array, zeroed above.
325    let name_ptr = ifr.ifr_name.as_mut_ptr() as *mut u8;
326    unsafe { std::ptr::copy_nonoverlapping(name_bytes.as_ptr(), name_ptr, copy_len) };
327
328    let ret = unsafe { libc::ioctl(fd.as_raw_fd(), libc::SIOCGIFHWADDR, &mut ifr as *mut _) };
329    if ret == -1 {
330        return Err(std::io::Error::last_os_error());
331    }
332
333    // ifr_hwaddr.sa_data holds the MAC bytes at offset 0.
334    let sa_data = unsafe { ifr.ifr_ifru.ifru_hwaddr.sa_data };
335    let mac = [
336        sa_data[0] as u8,
337        sa_data[1] as u8,
338        sa_data[2] as u8,
339        sa_data[3] as u8,
340        sa_data[4] as u8,
341        sa_data[5] as u8,
342    ];
343    Ok(MacAddr::new(mac))
344}
345
#[cfg(test)]
mod tests {
    use super::*;
    use crate::NDN_ETHERTYPE;

    /// Storage for one ring frame with the 16-byte alignment that ring frames
    /// have, so that casting its start to `tpacket2_hdr` is well-aligned.
    #[repr(C, align(16))]
    struct AlignedFrame([u8; RING_FRAME_SIZE as usize]);

    #[test]
    fn mac_addr_display() {
        let mac = MacAddr::new([0xaa, 0xbb, 0xcc, 0x01, 0x02, 0x03]);
        assert_eq!(format!("{mac}"), "aa:bb:cc:01:02:03");
    }

    #[test]
    fn mac_addr_broadcast() {
        assert_eq!(MacAddr::BROADCAST.as_bytes(), &[0xff; 6]);
    }

    #[test]
    fn sockaddr_ll_layout() {
        let mac = MacAddr::new([0x11, 0x22, 0x33, 0x44, 0x55, 0x66]);
        let addr = make_sockaddr_ll(3, &mac, NDN_ETHERTYPE);
        assert_eq!(addr.sll_family, libc::AF_PACKET as u16);
        assert_eq!(addr.sll_ifindex, 3);
        assert_eq!(addr.sll_halen, 6);
        assert_eq!(&addr.sll_addr[..6], mac.as_bytes());
        assert_eq!(addr.sll_protocol, NDN_ETHERTYPE.to_be());
    }

    #[test]
    fn ring_geometry() {
        assert_eq!(
            RING_FRAME_NR,
            (RING_BLOCK_SIZE / RING_FRAME_SIZE) * RING_BLOCK_NR,
        );
        assert!(RING_FRAME_SIZE as usize >= TX_DATA_OFFSET + 1500);
    }

    #[test]
    fn tx_data_offset_is_correct() {
        let aligned_hdr = tpacket_align(std::mem::size_of::<libc::tpacket2_hdr>());
        let expected = aligned_hdr + std::mem::size_of::<libc::sockaddr_ll>();
        assert_eq!(TX_DATA_OFFSET, expected);
    }

    /// Verify that `try_pop_rx_with_source` extracts the payload and source
    /// MAC from a hand-built TPACKET_V2 frame, hands the frame back to the
    /// kernel, and reports an empty ring afterwards.
    ///
    /// The frame lives in an aligned local buffer, so no AF_PACKET socket is
    /// needed.
    #[test]
    fn rx_source_mac_extraction() {
        let mut storage = AlignedFrame([0u8; RING_FRAME_SIZE as usize]);
        let base: *mut u8 = storage.0.as_mut_ptr();

        let sll_offset = tpacket_align(std::mem::size_of::<libc::tpacket2_hdr>());
        let payload_offset = sll_offset + std::mem::size_of::<libc::sockaddr_ll>();
        let payload = b"NDN";
        let expected_mac = MacAddr::new([0xde, 0xad, 0xbe, 0xef, 0x00, 0x01]);

        // SAFETY: every write below stays inside `storage`, which is 16-byte
        // aligned and RING_FRAME_SIZE bytes long.
        unsafe {
            // Header at offset 0, flagged as ready for user space.
            let hdr = base as *mut libc::tpacket2_hdr;
            (*hdr).tp_status = libc::TP_STATUS_USER;
            (*hdr).tp_mac = payload_offset as _;
            (*hdr).tp_snaplen = payload.len() as u32;

            // Embedded sockaddr_ll carrying a known source MAC.
            let sll = &mut *(base.add(sll_offset) as *mut libc::sockaddr_ll);
            sll.sll_halen = 6;
            sll.sll_addr[..6].copy_from_slice(expected_mac.as_bytes());

            // Payload bytes at the offset advertised in `tp_mac`.
            std::ptr::copy_nonoverlapping(
                payload.as_ptr(),
                base.add(payload_offset),
                payload.len(),
            );
        }

        // A one-frame RX ring backed by the local buffer.  ManuallyDrop keeps
        // `Drop` from calling munmap() on memory that was never mmap'd.
        let ring = std::mem::ManuallyDrop::new(PacketRing {
            map: base,
            map_len: RING_FRAME_SIZE as usize,
            frame_size: RING_FRAME_SIZE as usize,
            rx_frame_nr: 1,
            tx_frame_nr: 1,
            tx_offset: 0,
            rx_head: std::sync::atomic::AtomicU32::new(0),
            tx_head: std::sync::Mutex::new(0),
        });

        let (bytes, got_mac) = ring
            .try_pop_rx_with_source()
            .expect("frame is flagged TP_STATUS_USER");
        assert_eq!(got_mac, expected_mac);
        assert_eq!(&bytes[..], &payload[..]);

        // The frame was released, so the single-slot ring is now empty.
        assert_eq!(unsafe { read_tp_status(base) }, libc::TP_STATUS_KERNEL);
        assert!(ring.try_pop_rx_with_source().is_none());
    }
}