From: Hannes Laimer <h.laimer@proxmox.com>
To: pve-devel@lists.proxmox.com
Subject: [PATCH proxmox-ebpf 02/16] bpf: add bridge subsystem
Date: Tue, 9 Jun 2026 15:25:08 +0200 [thread overview]
Message-ID: <20260609132522.235917-3-h.laimer@proxmox.com> (raw)
In-Reply-To: <20260609132522.235917-1-h.laimer@proxmox.com>
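Carry the per-packet identity (the group held in skb->mark) across the
underlay so the receiving host sees the same group as the sending host
and can enforce policy on it. SRv6 and VXLAN-GBP are the two carriers
that fit. For VXLAN-GBP the kernel already handles the
skb->mark <-> md->gbp mapping, so only SRv6 needs a BPF program here:
on egress it writes the group into the 16-bit SRH tag, on ingress it
reads the tag back into skb->mark.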
Signed-off-by: Hannes Laimer <h.laimer@proxmox.com>
---
src/agent.rs | 38 +++++++++++++++-----
src/bridge/bpf/srv6.bpf.c | 76 +++++++++++++++++++++++++++++++++++++++
src/bridge/mod.rs | 75 ++++++++++++++++++++++++++++++++++++++
src/main.rs | 1 +
src/state.rs | 25 ++++++++++---
src/subsystem.rs | 3 +-
src/tc.rs | 5 +--
7 files changed, 207 insertions(+), 16 deletions(-)
create mode 100644 src/bridge/bpf/srv6.bpf.c
create mode 100644 src/bridge/mod.rs
diff --git a/src/agent.rs b/src/agent.rs
index ae8eb66..34aede5 100644
--- a/src/agent.rs
+++ b/src/agent.rs
@@ -1,7 +1,7 @@
//! Agent orchestrator. Builds the per-host [`DesiredState`] from the SDN running-config and applies
-//! it through the [`policy`](crate::policy) subsystem. A full pass ([`apply`](Agent::apply)) covers
-//! the whole host, the tap_plug path
-//! ([`apply_guest_iface_policy`](Agent::apply_guest_iface_policy)) programs a single interface.
+//! it. A full pass ([`apply`](Agent::apply)) covers both subsystems ([`policy`](crate::policy) and
+//! [`bridge`](crate::bridge)). The tap_plug path ([`apply_guest_iface_policy`](Agent::apply_guest_iface_policy))
+//! is policy only and programs a single interface.
//!
//! The binary runs one-shot per event (boot, an SDN apply, a tap_plug), not as a resident daemon.
//! Every invocation first makes sure the programs are loaded (installing them on the first run or a
@@ -10,42 +10,61 @@
use anyhow::Context;
-use crate::{policy::PolicySubsystem, running_config, state::DesiredState};
+use crate::{
+ bridge::BridgeSubsystem, policy::PolicySubsystem, running_config, state::DesiredState,
+};
pub struct Agent {
policy: PolicySubsystem,
+ bridge: BridgeSubsystem,
}
impl Agent {
pub fn new() -> Self {
Self {
policy: PolicySubsystem::new(),
+ bridge: BridgeSubsystem::new(),
}
}
- /// Full pass over the host.
+ /// Full pass over the host, both subsystems.
pub fn apply(&mut self) -> anyhow::Result<()> {
let Some(state) = build_state()? else {
return Ok(());
};
log::debug!(
- "applying: {} groups, {} rules, {} assignments",
+ "applying: {} groups, {} rules, {} assignments, {} bridges",
state.groups.len(),
state.rules.len(),
state.assignments.len(),
+ state.bridges.len(),
);
if log::log_enabled!(log::Level::Trace) {
for (id, g) in &state.groups {
log::trace!("group {id} = '{}'", g.name);
}
}
- if let Err(e) = self.policy.apply(&state) {
- log::error!("policy: apply: {e:#}");
+ let policy = self.policy.apply(&state);
+ let bridge = self.bridge.apply(&state);
+ let mut failed = false;
+ for (name, result) in [("policy", policy), ("bridge", bridge)] {
+ if let Err(e) = result {
+ log::error!("{name} subsystem: {e:#}");
+ failed = true;
+ }
+ }
+ if failed {
+ anyhow::bail!("one or more subsystems failed to apply");
}
Ok(())
}
- /// Fast path for a guest NIC that just appeared (a tap_plug). Programs just that interface.
+ /// Policy-only fast path for a guest NIC that just appeared (a tap_plug). Bridge is untouched,
+ /// a guest NIC is never a bridge-facing interface.
+ ///
+ /// Forwards only a genuine enforcement failure for an assigned NIC, so the agent exits non-zero
+ /// and the caller does not bring the NIC up unenforced. A running config we cannot read or
+ /// build is logged and treated as success, so one bad config does not block every guest start.
pub fn apply_guest_iface_policy(&mut self, iface: &str) -> anyhow::Result<()> {
let state = match build_state() {
Ok(Some(state)) => state,
@@ -61,6 +80,7 @@ impl Agent {
/// Detach everything and drop all pinned/run state, for package removal.
pub fn clear(&self) -> anyhow::Result<()> {
- self.policy.clear()?;
+        // Run both teardowns unconditionally. With `?` on the first call a policy failure would
+        // return early and leave the bridge programs attached on package removal.
+        let policy = self.policy.clear();
+        let bridge = self.bridge.clear();
+        // Only one error can be returned. When both fail, the policy error is returned below, so
+        // log the bridge one here instead of losing it.
+        if let (Err(_), Err(e)) = (&policy, &bridge) {
+            log::error!("bridge subsystem: clear: {e:#}");
+        }
+        policy.and(bridge)
- Ok(())
}
}
diff --git a/src/bridge/bpf/srv6.bpf.c b/src/bridge/bpf/srv6.bpf.c
new file mode 100644
index 0000000..8610418
--- /dev/null
+++ b/src/bridge/bpf/srv6.bpf.c
@@ -0,0 +1,76 @@
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include "mark.h"
+#include "bpf_debug.h"
+
+char LICENSE[] SEC("license") = "GPL";
+
+#define TC_ACT_OK 0
+
+#define ETH_HLEN 14
+#define IPV6_HLEN 40
+
+#define ETH_P_IPV6 0x86dd
+#define IPPROTO_ROUTING 43
+#define SRH_TYPE 4
+
+// SRH layout (IPv6 Routing Header type 4), starting from byte 0 of the header:
+// 0: next_hdr
+// 1: hdr_ext_len
+// 2: routing_type (== SRH_TYPE)
+// 3: segments_left
+// 4: last_entry
+// 5: flags
+// 6..7: tag <-- our carrier
+#define SRH_TAG_OFF (ETH_HLEN + IPV6_HLEN + 6)
+
+// Returns 1 if the packet is untagged Ethernet + IPv6 whose first extension header is a
+// routing header of type SRH_TYPE, and 0 otherwise. Only the directly accessible packet data
+// (skb->data .. skb->data_end) is inspected, and only the first 8 bytes of the routing header
+// are required to be present.
+static __always_inline int is_srv6(struct __sk_buff *skb) {
+    void *start = (void *)(long)skb->data;
+    void *end = (void *)(long)skb->data_end;
+
+    // One bounds check up front covers every direct read below: Ethernet header, fixed IPv6
+    // header, and the 8 fixed bytes of the routing header (which include the tag).
+    if (start + ETH_HLEN + IPV6_HLEN + 8 > end)
+        return 0;
+
+    struct ethhdr *eth = start;
+    struct ipv6hdr *ip6 = (void *)(eth + 1);
+    __u8 *srh = (__u8 *)(ip6 + 1);
+
+    // srh[2] is the routing_type byte, see the SRH layout above.
+    return eth->h_proto == bpf_htons(ETH_P_IPV6)
+        && ip6->nexthdr == IPPROTO_ROUTING
+        && srh[2] == SRH_TYPE;
+}
+
+// TC egress program: writes the group obtained from microseg_mark_get() into the SRH tag of
+// outgoing SRv6 packets, in network byte order. Packets that are not SRv6 pass through
+// untouched. The verdict is always TC_ACT_OK, this program never drops.
+SEC("classifier")
+int tc_bridge_egress(struct __sk_buff *skb) {
+    if (!is_srv6(skb)) {
+        DBG("bridge_out(%u): skip, no SRv6", skb->ifindex);
+        return TC_ACT_OK;
+    }
+    __u16 group = microseg_mark_get(skb);
+    DBG("bridge_out(%u): putting %u onto SRH tag", skb->ifindex, group);
+    __u16 tag = bpf_htons(group);
+    // The helper returns a negative error on failure. Do not swallow it: a failed store means
+    // the packet leaves with whatever tag it already carried. The packet is still passed on,
+    // matching how the ingress side treats a failed load.
+    if (bpf_skb_store_bytes(skb, SRH_TAG_OFF, &tag, sizeof(tag), 0) < 0) {
+        DBG("bridge_out(%u): storing SRH tag failed", skb->ifindex);
+    }
+    return TC_ACT_OK;
+}
+
+// TC ingress program: reads the 16-bit SRH tag of incoming SRv6 packets, converts it from
+// network byte order and hands it to microseg_mark_set(). Packets that are not SRv6, or whose
+// tag cannot be loaded, are left as they are. The verdict is always TC_ACT_OK.
+SEC("classifier")
+int tc_bridge_ingress(struct __sk_buff *skb) {
+    __u16 wire_tag = 0;
+
+    if (!is_srv6(skb)) {
+        DBG("bridge_in(%u): skip, no SRv6", skb->ifindex);
+    } else if (bpf_skb_load_bytes(skb, SRH_TAG_OFF, &wire_tag, sizeof(wire_tag)) < 0) {
+        DBG("bridge_in(%u): skip, no tag", skb->ifindex);
+    } else {
+        __u16 group = bpf_ntohs(wire_tag);
+        microseg_mark_set(skb, group);
+        DBG("bridge_in(%u): pulled %u out of SRH", skb->ifindex, group);
+    }
+
+    return TC_ACT_OK;
+}
diff --git a/src/bridge/mod.rs b/src/bridge/mod.rs
new file mode 100644
index 0000000..bdc8e2a
--- /dev/null
+++ b/src/bridge/mod.rs
@@ -0,0 +1,75 @@
+//! Bridge-side carrier translation. Attaches BPF programs to bridge-facing interfaces that stamp
+//! `skb->mark` onto an on-wire policy carrier on egress (e.g. the SRv6 SRH tag) and lift it back
+//! into `skb->mark` on ingress. Driven by the bridge entries in the [`DesiredState`], which the
+//! agent has already filtered to interfaces that apply on this host.
+//!
+//! The program detects the carrier from the packet itself, so the userspace side is
+//! protocol-agnostic. It attaches the one program everywhere and lets the data plane decide per
+//! packet.
+//!
+//! Pairs with the [`policy`](crate::policy) subsystem. Policy decides which group `skb->mark`
+//! carries on the local tap, bridge moves that value onto and off the wire so it survives across
+//! hosts.
+
+use std::collections::HashSet;
+
+use aya::include_bytes_aligned;
+
+use crate::state::{DesiredState, ResolvedBridge};
+use crate::subsystem::TcPrograms;
+use crate::tc::Direction;
+
+/// Subsystem name handed to [`TcPrograms::new`]. NOTE(review): presumably the `<name>` component
+/// of this subsystem's pin directory, confirm against `subsystem.rs`.
+const NAME: &str = "bridge";
+
+/// The compiled `srv6.bpf.o` from the build's `OUT_DIR`, embedded into the binary.
+const BRIDGE_OBJ: &[u8] = include_bytes_aligned!(concat!(env!("OUT_DIR"), "/srv6.bpf.o"));
+/// Fingerprint of the embedded object, computed at compile time. NOTE(review): presumably what
+/// lets [`TcPrograms`] notice that the object changed between invocations, confirm.
+const BRIDGE_FINGERPRINT: u64 = TcPrograms::obj_fingerprint(BRIDGE_OBJ);
+
+/// Name of the program in the bridge BPF object that serves the given TC direction. The names
+/// must match the function names in `srv6.bpf.c`.
+fn program_name(dir: Direction) -> &'static str {
+    match dir {
+        Direction::Egress => "tc_bridge_egress",
+        Direction::Ingress => "tc_bridge_ingress",
+    }
+}
+
+/// The bridge subsystem: a [`TcPrograms`] instance set up with the embedded bridge object and
+/// the [`program_name`] mapping.
+pub struct BridgeSubsystem {
+    programs: TcPrograms,
+}
+
+impl BridgeSubsystem {
+    /// Creates the subsystem around a [`TcPrograms`] for the bridge object.
+    pub fn new() -> Self {
+        let programs = TcPrograms::new(NAME, BRIDGE_OBJ, BRIDGE_FINGERPRINT, program_name, None);
+        Self { programs }
+    }
+
+    /// Brings the attached bridge programs in line with `state.bridges`.
+    ///
+    /// Takes the exclusive lock and keeps it until the function returns, ensures the programs
+    /// are loaded, then reconciles against the set of ifindexes the configured bridge interfaces
+    /// currently resolve to.
+    ///
+    /// # Errors
+    ///
+    /// Returns the first error from taking the lock, loading, or reconciling.
+    pub fn apply(&mut self, state: &DesiredState) -> anyhow::Result<()> {
+        log::trace!("bridge apply starting");
+        let _guard = self.programs.lock_exclusive()?;
+        self.programs.ensure_loaded()?;
+        // Resolve names only after the programs are known to be loaded, right before reconciling.
+        let wanted = resolve_local_bridges(&state.bridges);
+        self.programs.reconcile(&wanted)?;
+        log::trace!("bridge apply done");
+        Ok(())
+    }
+
+    /// Detach all bridge programs and drop pinned/run state, for package removal.
+    ///
+    /// NOTE(review): unlike [`apply`](Self::apply) this does not call `lock_exclusive` here.
+    /// Confirm that `TcPrograms::clear` serializes against a concurrent apply on its own.
+    pub fn clear(&self) -> anyhow::Result<()> {
+        self.programs.clear()
+    }
+}
+
+/// Resolves the configured bridge interface names to their current ifindexes.
+///
+/// An interface that cannot be resolved is skipped instead of failing the whole pass. The skip
+/// is logged together with the actual lookup error, so a missing interface can be told apart
+/// from any other failure. Duplicate names collapse into one entry.
+fn resolve_local_bridges(bridges: &[ResolvedBridge]) -> HashSet<u32> {
+    bridges
+        .iter()
+        .filter_map(
+            |b| match nix::net::if_::if_nametoindex(b.interface.as_str()) {
+                Ok(ifidx) => {
+                    log::debug!("resolved bridge iface {} -> ifindex {}", b.interface, ifidx);
+                    Some(ifidx)
+                }
+                Err(e) => {
+                    log::debug!(
+                        "bridge: skipping {}, cannot resolve ifindex: {e}",
+                        b.interface
+                    );
+                    None
+                }
+            },
+        )
+        .collect()
+}
diff --git a/src/main.rs b/src/main.rs
index 6b3c16c..0334fcb 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,4 +1,5 @@
mod agent;
+mod bridge;
mod policy;
mod running_config;
mod state;
diff --git a/src/state.rs b/src/state.rs
index 879562b..085a747 100644
--- a/src/state.rs
+++ b/src/state.rs
@@ -1,8 +1,9 @@
//! Resolved desired state derived from the SDN running config plus the local hostname.
//!
-//! [`MicrosegRunningConfig`] is admin-facing, groups are referenced by name, rules and assignments
-//! name them. This module does the one-time name -> id resolution so subsystems see a straight-line
-//! view of "what should be applied on this host".
+//! In [`MicrosegRunningConfig`] groups are referenced by name, rules and assignments name them,
+//! bridges carry the cluster-wide `nodes` filter. This module does the one-time name -> id
+//! resolution and host filtering so subsystems see a straight-line view of "what should be applied
+//! on this host".
use std::collections::{HashMap, HashSet};
@@ -15,6 +16,7 @@ pub struct DesiredState {
pub groups: HashMap<u16, GroupInfo>,
pub rules: Vec<ResolvedRule>,
pub assignments: Vec<ResolvedAssignment>,
+ pub bridges: Vec<ResolvedBridge>,
}
#[derive(Debug)]
@@ -37,8 +39,13 @@ pub struct ResolvedAssignment {
pub group: u16,
}
+/// A bridge entry from the running config that applies to this node.
+#[derive(Debug)]
+pub struct ResolvedBridge {
+ /// Interface name, taken from the key of the bridge entry in the running config.
+ pub interface: String,
+}
+
impl DesiredState {
- pub fn build(cfg: &MicrosegRunningConfig, _this_node: &str) -> anyhow::Result<Self> {
+ pub fn build(cfg: &MicrosegRunningConfig, this_node: &str) -> anyhow::Result<Self> {
let mut name_to_id: HashMap<&str, u16> = HashMap::new();
let mut parent_of: HashMap<&str, &str> = HashMap::new();
let mut groups: HashMap<u16, GroupInfo> = HashMap::new();
@@ -124,10 +131,20 @@ impl DesiredState {
});
}
+ let mut bridges = Vec::new();
+ for (interface, b) in cfg.bridges() {
+ if b.applies_to(this_node) {
+ bridges.push(ResolvedBridge {
+ interface: interface.to_string(),
+ });
+ }
+ }
+
Ok(Self {
groups,
rules,
assignments,
+ bridges,
})
}
}
diff --git a/src/subsystem.rs b/src/subsystem.rs
index 7b944b4..1a1ab71 100644
--- a/src/subsystem.rs
+++ b/src/subsystem.rs
@@ -2,7 +2,8 @@
//! pinned under `/sys/fs/bpf/proxmox-ebpf/<name>/`. The loaded BPF stays in the kernel between
//! invocations, so [`TcPrograms::ensure_loaded`] loads and verifies only on the first run and on a
//! version change. Everything else attaches links and syncs maps against what is already there.
-//! The [`policy`](crate::policy) subsystem owns a [`TcPrograms`].
+//! The [`policy`](crate::policy) and [`bridge`](crate::bridge) subsystems each own a
+//! [`TcPrograms`].
use std::{collections::HashSet, fs::File, io::ErrorKind, path::PathBuf};
diff --git a/src/tc.rs b/src/tc.rs
index d89304d..f8a9e86 100644
--- a/src/tc.rs
+++ b/src/tc.rs
@@ -1,5 +1,6 @@
-//! TC link plumbing for the [`policy`](crate::policy) subsystem. A `Direction` enum,
-//! attach/swap/detach free functions, and a uniform pin-filename layout `{ifindex}-{direction}`.
+//! TC link plumbing shared by the [`policy`](crate::policy) and [`bridge`](crate::bridge)
+//! subsystems. A `Direction` enum, attach/swap/detach free functions, and a uniform pin-filename
+//! layout `{ifindex}-{direction}`.
use std::{io::ErrorKind, path::Path, str::FromStr};
--
2.47.3
next prev parent reply other threads:[~2026-06-09 13:25 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-09 13:25 [RFC cluster/docs/ifupdown2/manager/network/proxmox{-ebpf,-ve-rs,-perl-rs} 00/16] sdn: add microsegmentation support Hannes Laimer
2026-06-09 13:25 ` [PATCH proxmox-ebpf 01/16] agent: add userspace coordinator and stateless policy subsystem Hannes Laimer
2026-06-09 13:25 ` Hannes Laimer [this message]
2026-06-09 13:25 ` [PATCH proxmox-ebpf 03/16] debian: add packaging and boot-time oneshot unit Hannes Laimer
2026-06-09 13:25 ` [PATCH proxmox-ve-rs 04/16] ve-config: sdn: add microseg config types Hannes Laimer
2026-06-09 13:25 ` [PATCH proxmox-perl-rs 05/16] sdn: add microseg config binding Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-cluster 06/16] cfs: add 'sdn/microseg.cfg' to observed files Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-network 07/16] sdn: microseg: add config and API Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-network 08/16] sdn: zones: trigger microseg apply on tap_plug Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-network 09/16] sdn: zones: add vxlan-gbp option to vxlan and evpn zones Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-network 10/16] evpn: disable vxlan-learning on create if GBP is enabled Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-manager 11/16] ui: sdn: add microsegmentation Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-manager 12/16] network: apply microseg state on reload Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-manager 13/16] ui: sdn: zones: add vxlan-gbp checkbox to vxlan and evpn Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-docs 14/16] sdn: add microsegmentation section Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-docs 15/16] sdn: add VXLAN-GBP flag to evpn/vxlan zone sections Hannes Laimer
2026-06-09 13:25 ` [PATCH ifupdown2 16/16] d/patches: add support for VXLAN-GBP flag Hannes Laimer
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260609132522.235917-3-h.laimer@proxmox.com \
--to=h.laimer@proxmox.com \
--cc=pve-devel@lists.proxmox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox