From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [212.224.123.68]) by lore.proxmox.com (Postfix) with ESMTPS id 87A7E1FF146 for ; Tue, 09 Jun 2026 15:25:38 +0200 (CEST) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id 0B3F811C17; Tue, 9 Jun 2026 15:25:35 +0200 (CEST) From: Hannes Laimer To: pve-devel@lists.proxmox.com Subject: [PATCH proxmox-ebpf 02/16] bpf: add bridge subsystem Date: Tue, 9 Jun 2026 15:25:08 +0200 Message-ID: <20260609132522.235917-3-h.laimer@proxmox.com> X-Mailer: git-send-email 2.47.3 In-Reply-To: <20260609132522.235917-1-h.laimer@proxmox.com> References: <20260609132522.235917-1-h.laimer@proxmox.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Bm-Milter-Handled: 55990f41-d878-4baa-be0a-ee34c49e34d2 X-Bm-Transport-Timestamp: 1781011483417 X-SPAM-LEVEL: Spam detection results: 0 AWL -0.918 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment KAM_MAILER 2 Automated Mailer Tag Left in Email SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Message-ID-Hash: TZP2TZZ56ZFTY6RIN7NNJSBWV5CLBST6 X-Message-ID-Hash: TZP2TZZ56ZFTY6RIN7NNJSBWV5CLBST6 X-MailFrom: h.laimer@proxmox.com X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop; banned-address; emergency; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header X-Mailman-Version: 3.3.10 Precedence: list List-Id: Proxmox VE development discussion List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: Carry the per-packet identity across the underlay so the receiving host sees the same group as the sending host and can enforce policy on 
it. SRv6 and VXLAN-GBP are the two carriers that fit, but the kernel already handles the skb->mark <-> md->gbp mapping, so we only implement SRv6 here. Signed-off-by: Hannes Laimer --- src/agent.rs | 38 +++++++++++++++----- src/bridge/bpf/srv6.bpf.c | 76 +++++++++++++++++++++++++++++++++++++++ src/bridge/mod.rs | 75 ++++++++++++++++++++++++++++++++++++++ src/main.rs | 1 + src/state.rs | 25 ++++++++++--- src/subsystem.rs | 3 +- src/tc.rs | 5 +-- 7 files changed, 207 insertions(+), 16 deletions(-) create mode 100644 src/bridge/bpf/srv6.bpf.c create mode 100644 src/bridge/mod.rs diff --git a/src/agent.rs b/src/agent.rs index ae8eb66..34aede5 100644 --- a/src/agent.rs +++ b/src/agent.rs @@ -1,7 +1,7 @@ //! Agent orchestrator. Builds the per-host [`DesiredState`] from the SDN running-config and applies -//! it through the [`policy`](crate::policy) subsystem. A full pass ([`apply`](Agent::apply)) covers -//! the whole host, the tap_plug path -//! ([`apply_guest_iface_policy`](Agent::apply_guest_iface_policy)) programs a single interface. +//! it. A full pass ([`apply`](Agent::apply)) covers both subsystems ([`policy`](crate::policy) and +//! [`bridge`](crate::bridge)). The tap_plug path ([`apply_guest_iface_policy`](Agent::apply_guest_iface_policy)) +//! is policy only and programs a single interface. //! //! The binary runs one-shot per event (boot, an SDN apply, a tap_plug), not as a resident daemon. //! Every invocation first makes sure the programs are loaded (installing them on the first run or a @@ -10,42 +10,61 @@ use anyhow::Context; -use crate::{policy::PolicySubsystem, running_config, state::DesiredState}; +use crate::{ + bridge::BridgeSubsystem, policy::PolicySubsystem, running_config, state::DesiredState, +}; pub struct Agent { policy: PolicySubsystem, + bridge: BridgeSubsystem, } impl Agent { pub fn new() -> Self { Self { policy: PolicySubsystem::new(), + bridge: BridgeSubsystem::new(), } } - /// Full pass over the host. 
+ /// Full pass over the host, both subsystems. pub fn apply(&mut self) -> anyhow::Result<()> { let Some(state) = build_state()? else { return Ok(()); }; log::debug!( - "applying: {} groups, {} rules, {} assignments", + "applying: {} groups, {} rules, {} assignments, {} bridges", state.groups.len(), state.rules.len(), state.assignments.len(), + state.bridges.len(), ); if log::log_enabled!(log::Level::Trace) { for (id, g) in &state.groups { log::trace!("group {id} = '{}'", g.name); } } - if let Err(e) = self.policy.apply(&state) { - log::error!("policy: apply: {e:#}"); + let policy = self.policy.apply(&state); + let bridge = self.bridge.apply(&state); + let mut failed = false; + for (name, result) in [("policy", policy), ("bridge", bridge)] { + if let Err(e) = result { + log::error!("{name} subsystem: {e:#}"); + failed = true; + } + } + if failed { + anyhow::bail!("one or more subsystems failed to apply"); } Ok(()) } - /// Fast path for a guest NIC that just appeared (a tap_plug). Programs just that interface. + /// Policy-only fast path for a guest NIC that just appeared (a tap_plug). Bridge is untouched, + /// a guest NIC is never a bridge-facing interface. + /// + /// Forwards only a genuine enforcement failure for an assigned NIC, so the agent exits non-zero + /// and the caller does not bring the NIC up unenforced. A running config we cannot read or + /// build is logged and treated as success, so one bad config does not block every guest start. pub fn apply_guest_iface_policy(&mut self, iface: &str) -> anyhow::Result<()> { let state = match build_state() { Ok(Some(state)) => state, @@ -61,6 +80,7 @@ impl Agent { /// Detach everything and drop all pinned/run state, for package removal. 
pub fn clear(&self) -> anyhow::Result<()> { self.policy.clear()?; + self.bridge.clear()?; Ok(()) } } diff --git a/src/bridge/bpf/srv6.bpf.c b/src/bridge/bpf/srv6.bpf.c new file mode 100644 index 0000000..8610418 --- /dev/null +++ b/src/bridge/bpf/srv6.bpf.c @@ -0,0 +1,76 @@ +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> +#include "mark.h" +#include "bpf_debug.h" + +char LICENSE[] SEC("license") = "GPL"; + +#define TC_ACT_OK 0 + +#define ETH_HLEN 14 +#define IPV6_HLEN 40 + +#define ETH_P_IPV6 0x86dd +#define IPPROTO_ROUTING 43 +#define SRH_TYPE 4 + +// SRH layout (IPv6 Routing Header type 4), starting from byte 0 of the header: +// 0: next_hdr +// 1: hdr_ext_len +// 2: routing_type (== SRH_TYPE) +// 3: segments_left +// 4: last_entry +// 5: flags +// 6..7: tag <-- our carrier +#define SRH_TAG_OFF (ETH_HLEN + IPV6_HLEN + 6) + +static __always_inline int is_srv6(struct __sk_buff *skb) { + void *data = (void *)(long)skb->data; + void *data_end = (void *)(long)skb->data_end; + + if (data + ETH_HLEN + IPV6_HLEN + 8 > data_end) return 0; + + struct ethhdr *eth = data; + if (eth->h_proto != bpf_htons(ETH_P_IPV6)) return 0; + + struct ipv6hdr *ip6 = (void *)(eth + 1); + if (ip6->nexthdr != IPPROTO_ROUTING) return 0; + + __u8 *rh = (__u8 *)(ip6 + 1); + if (rh[2] != SRH_TYPE) return 0; + + return 1; +} + +SEC("classifier") +int tc_bridge_egress(struct __sk_buff *skb) { + if (!is_srv6(skb)) { + DBG("bridge_out(%u): skip, no SRv6", skb->ifindex); + return TC_ACT_OK; + } + __u16 group = microseg_mark_get(skb); + DBG("bridge_out(%u): putting %u onto SRH tag", skb->ifindex, group); + __u16 tag = bpf_htons(group); + bpf_skb_store_bytes(skb, SRH_TAG_OFF, &tag, sizeof(tag), 0); + return TC_ACT_OK; +} + +SEC("classifier") +int tc_bridge_ingress(struct __sk_buff *skb) { + if (!is_srv6(skb)) { + DBG("bridge_in(%u): skip, no SRv6", skb->ifindex); + return TC_ACT_OK; + } + + __u16 tag = 0; + if (bpf_skb_load_bytes(skb, SRH_TAG_OFF, &tag, sizeof(tag)) < 0) { + DBG("bridge_in(%u): skip, no tag", 
skb->ifindex); + return TC_ACT_OK; + } + + __u16 group = bpf_ntohs(tag); + microseg_mark_set(skb, group); + DBG("bridge_in(%u): pulled %u out of SRH", skb->ifindex, group); + return TC_ACT_OK; +} diff --git a/src/bridge/mod.rs b/src/bridge/mod.rs new file mode 100644 index 0000000..bdc8e2a --- /dev/null +++ b/src/bridge/mod.rs @@ -0,0 +1,75 @@ +//! Bridge-side carrier translation. Attaches BPF programs to bridge-facing interfaces that stamp +//! `skb->mark` onto an on-wire policy carrier on egress (e.g. the SRv6 SRH tag) and lift it back +//! into `skb->mark` on ingress. Driven by the bridge entries in the [`DesiredState`], which the +//! agent has already filtered to interfaces that apply on this host. +//! +//! The program detects the carrier from the packet itself, so the userspace side is +//! protocol-agnostic. It attaches the one program everywhere and lets the data plane decide per +//! packet. +//! +//! Pairs with the [`policy`](crate::policy) subsystem. Policy decides which group `skb->mark` +//! carries on the local tap, bridge moves that value onto and off the wire so it survives across +//! hosts. 
+ +use std::collections::HashSet; + +use aya::include_bytes_aligned; + +use crate::state::{DesiredState, ResolvedBridge}; +use crate::subsystem::TcPrograms; +use crate::tc::Direction; + +const NAME: &str = "bridge"; + +const BRIDGE_OBJ: &[u8] = include_bytes_aligned!(concat!(env!("OUT_DIR"), "/srv6.bpf.o")); +const BRIDGE_FINGERPRINT: u64 = TcPrograms::obj_fingerprint(BRIDGE_OBJ); + +fn program_name(dir: Direction) -> &'static str { + match dir { + Direction::Ingress => "tc_bridge_ingress", + Direction::Egress => "tc_bridge_egress", + } +} + +pub struct BridgeSubsystem { + programs: TcPrograms, +} + +impl BridgeSubsystem { + pub fn new() -> Self { + Self { + programs: TcPrograms::new(NAME, BRIDGE_OBJ, BRIDGE_FINGERPRINT, program_name, None), + } + } + + pub fn apply(&mut self, state: &DesiredState) -> anyhow::Result<()> { + log::trace!("bridge apply starting"); + let _lock = self.programs.lock_exclusive()?; + self.programs.ensure_loaded()?; + self.programs + .reconcile(&resolve_local_bridges(&state.bridges))?; + log::trace!("bridge apply done"); + Ok(()) + } + + /// Detach all bridge programs and drop pinned/run state, for package removal. + pub fn clear(&self) -> anyhow::Result<()> { + self.programs.clear() + } +} + +fn resolve_local_bridges(bridges: &[ResolvedBridge]) -> HashSet<u32> { + let mut out = HashSet::new(); + for b in bridges { + match nix::net::if_::if_nametoindex(b.interface.as_str()) { + Ok(ifidx) => { + log::debug!("resolved bridge iface {} -> ifindex {}", b.interface, ifidx); + out.insert(ifidx); + } + Err(_) => { + log::debug!("bridge: skipping {}, no such interface", b.interface); + } + } + } + out +} diff --git a/src/main.rs b/src/main.rs index 6b3c16c..0334fcb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,5 @@ mod agent; +mod bridge; mod policy; mod running_config; mod state; diff --git a/src/state.rs b/src/state.rs index 879562b..085a747 100644 --- a/src/state.rs +++ b/src/state.rs @@ -1,8 +1,9 @@ //! 
Resolved desired state derived from the SDN running config plus the local hostname. //! -//! [`MicrosegRunningConfig`] is admin-facing, groups are referenced by name, rules and assignments -//! name them. This module does the one-time name -> id resolution so subsystems see a straight-line -//! view of "what should be applied on this host". +//! In [`MicrosegRunningConfig`] groups are referenced by name, rules and assignments name them, +//! bridges carry the cluster-wide `nodes` filter. This module does the one-time name -> id +//! resolution and host filtering so subsystems see a straight-line view of "what should be applied +//! on this host". use std::collections::{HashMap, HashSet}; @@ -15,6 +16,7 @@ pub struct DesiredState { pub groups: HashMap, pub rules: Vec, pub assignments: Vec, + pub bridges: Vec<ResolvedBridge>, } #[derive(Debug)] @@ -37,8 +39,13 @@ pub struct ResolvedAssignment { pub group: u16, } +#[derive(Debug)] +pub struct ResolvedBridge { + pub interface: String, +} + impl DesiredState { - pub fn build(cfg: &MicrosegRunningConfig, _this_node: &str) -> anyhow::Result<Self> { + pub fn build(cfg: &MicrosegRunningConfig, this_node: &str) -> anyhow::Result<Self> { let mut name_to_id: HashMap<&str, u16> = HashMap::new(); let mut parent_of: HashMap<&str, &str> = HashMap::new(); let mut groups: HashMap = HashMap::new(); @@ -124,10 +131,20 @@ impl DesiredState { }); } + let mut bridges = Vec::new(); + for (interface, b) in cfg.bridges() { + if b.applies_to(this_node) { + bridges.push(ResolvedBridge { + interface: interface.to_string(), + }); + } + } + Ok(Self { groups, rules, assignments, + bridges, }) } } diff --git a/src/subsystem.rs b/src/subsystem.rs index 7b944b4..1a1ab71 100644 --- a/src/subsystem.rs +++ b/src/subsystem.rs @@ -2,7 +2,8 @@ //! pinned under `/sys/fs/bpf/proxmox-ebpf//`. The loaded BPF stays in the kernel between //! invocations, so [`TcPrograms::ensure_loaded`] loads and verifies only on the first run and on a //! version change. 
Everything else attaches links and syncs maps against what is already there. -//! The [`policy`](crate::policy) subsystem owns a [`TcPrograms`]. +//! The [`policy`](crate::policy) and [`bridge`](crate::bridge) subsystems each own a +//! [`TcPrograms`]. use std::{collections::HashSet, fs::File, io::ErrorKind, path::PathBuf}; diff --git a/src/tc.rs b/src/tc.rs index d89304d..f8a9e86 100644 --- a/src/tc.rs +++ b/src/tc.rs @@ -1,5 +1,6 @@ -//! TC link plumbing for the [`policy`](crate::policy) subsystem. A `Direction` enum, -//! attach/swap/detach free functions, and a uniform pin-filename layout `{ifindex}-{direction}`. +//! TC link plumbing shared by the [`policy`](crate::policy) and [`bridge`](crate::bridge) +//! subsystems. A `Direction` enum, attach/swap/detach free functions, and a uniform pin-filename +//! layout `{ifindex}-{direction}`. use std::{io::ErrorKind, path::Path, str::FromStr}; -- 2.47.3