From: Hannes Laimer <h.laimer@proxmox.com>
To: pve-devel@lists.proxmox.com
Subject: [PATCH proxmox-ebpf 02/16] bpf: add bridge subsystem
Date: Tue, 9 Jun 2026 15:25:08 +0200 [thread overview]
Message-ID: <20260609132522.235917-3-h.laimer@proxmox.com> (raw)
In-Reply-To: <20260609132522.235917-1-h.laimer@proxmox.com>
Carry the per-packet identity across the underlay so the receiving host
sees the same group as the sending host and can enforce policy on it.
SRv6 and VXLAN-GBP are the two carriers that fit. For VXLAN-GBP the kernel
already handles the skb->mark <-> md->gbp mapping, so only SRv6 is
implemented here.
Signed-off-by: Hannes Laimer <h.laimer@proxmox.com>
---
src/agent.rs | 38 +++++++++++++++-----
src/bridge/bpf/srv6.bpf.c | 76 +++++++++++++++++++++++++++++++++++++++
src/bridge/mod.rs | 75 ++++++++++++++++++++++++++++++++++++++
src/main.rs | 1 +
src/state.rs | 25 ++++++++++---
src/subsystem.rs | 3 +-
src/tc.rs | 5 +--
7 files changed, 207 insertions(+), 16 deletions(-)
create mode 100644 src/bridge/bpf/srv6.bpf.c
create mode 100644 src/bridge/mod.rs
diff --git a/src/agent.rs b/src/agent.rs
index ae8eb66..34aede5 100644
--- a/src/agent.rs
+++ b/src/agent.rs
@@ -1,7 +1,7 @@
//! Agent orchestrator. Builds the per-host [`DesiredState`] from the SDN running-config and applies
-//! it through the [`policy`](crate::policy) subsystem. A full pass ([`apply`](Agent::apply)) covers
-//! the whole host, the tap_plug path
-//! ([`apply_guest_iface_policy`](Agent::apply_guest_iface_policy)) programs a single interface.
+//! it. A full pass ([`apply`](Agent::apply)) covers both subsystems ([`policy`](crate::policy) and
+//! [`bridge`](crate::bridge)). The tap_plug path ([`apply_guest_iface_policy`](Agent::apply_guest_iface_policy))
+//! is policy only and programs a single interface.
//!
//! The binary runs one-shot per event (boot, an SDN apply, a tap_plug), not as a resident daemon.
//! Every invocation first makes sure the programs are loaded (installing them on the first run or a
@@ -10,42 +10,61 @@
use anyhow::Context;
-use crate::{policy::PolicySubsystem, running_config, state::DesiredState};
+use crate::{
+ bridge::BridgeSubsystem, policy::PolicySubsystem, running_config, state::DesiredState,
+};
pub struct Agent {
policy: PolicySubsystem,
+ bridge: BridgeSubsystem,
}
impl Agent {
pub fn new() -> Self {
Self {
policy: PolicySubsystem::new(),
+ bridge: BridgeSubsystem::new(),
}
}
- /// Full pass over the host.
+ /// Full pass over the host, both subsystems.
pub fn apply(&mut self) -> anyhow::Result<()> {
let Some(state) = build_state()? else {
return Ok(());
};
log::debug!(
- "applying: {} groups, {} rules, {} assignments",
+ "applying: {} groups, {} rules, {} assignments, {} bridges",
state.groups.len(),
state.rules.len(),
state.assignments.len(),
+ state.bridges.len(),
);
if log::log_enabled!(log::Level::Trace) {
for (id, g) in &state.groups {
log::trace!("group {id} = '{}'", g.name);
}
}
- if let Err(e) = self.policy.apply(&state) {
- log::error!("policy: apply: {e:#}");
+ let policy = self.policy.apply(&state);
+ let bridge = self.bridge.apply(&state);
+ let mut failed = false;
+ for (name, result) in [("policy", policy), ("bridge", bridge)] {
+ if let Err(e) = result {
+ log::error!("{name} subsystem: {e:#}");
+ failed = true;
+ }
+ }
+ if failed {
+ anyhow::bail!("one or more subsystems failed to apply");
}
Ok(())
}
- /// Fast path for a guest NIC that just appeared (a tap_plug). Programs just that interface.
+ /// Policy-only fast path for a guest NIC that just appeared (a tap_plug). Bridge is untouched,
+ /// a guest NIC is never a bridge-facing interface.
+ ///
+ /// Forwards only a genuine enforcement failure for an assigned NIC, so the agent exits non-zero
+ /// and the caller does not bring the NIC up unenforced. A running config we cannot read or
+ /// build is logged and treated as success, so one bad config does not block every guest start.
pub fn apply_guest_iface_policy(&mut self, iface: &str) -> anyhow::Result<()> {
let state = match build_state() {
Ok(Some(state)) => state,
@@ -61,6 +80,7 @@ impl Agent {
/// Detach everything and drop all pinned/run state, for package removal.
pub fn clear(&self) -> anyhow::Result<()> {
self.policy.clear()?;
+ self.bridge.clear()?;
Ok(())
}
}
diff --git a/src/bridge/bpf/srv6.bpf.c b/src/bridge/bpf/srv6.bpf.c
new file mode 100644
index 0000000..8610418
--- /dev/null
+++ b/src/bridge/bpf/srv6.bpf.c
@@ -0,0 +1,76 @@
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include "mark.h"
+#include "bpf_debug.h"
+
+char LICENSE[] SEC("license") = "GPL";
+
+#define TC_ACT_OK 0
+
+#define ETH_HLEN 14
+#define IPV6_HLEN 40
+
+#define ETH_P_IPV6 0x86dd
+#define IPPROTO_ROUTING 43
+#define SRH_TYPE 4
+
+// SRH layout (IPv6 Routing Header type 4), starting from byte 0 of the header:
+// 0: next_hdr
+// 1: hdr_ext_len
+// 2: routing_type (== SRH_TYPE)
+// 3: segments_left
+// 4: last_entry
+// 5: flags
+// 6..7: tag <-- our carrier
+#define SRH_TAG_OFF (ETH_HLEN + IPV6_HLEN + 6)
+
+// Returns 1 when the frame is Ethernet + IPv6 with a type-4 routing header (SRH) directly after
+// the fixed IPv6 header, and 0 otherwise. Only the IPv6 header's own next-header field is
+// inspected, so an SRH that follows another extension header is not matched.
+static __always_inline int is_srv6(struct __sk_buff *skb) {
+ void *data = (void *)(long)skb->data;
+ void *data_end = (void *)(long)skb->data_end;
+
+ // Single bounds check up front: Ethernet header, fixed IPv6 header and the first 8 bytes of
+ // the routing header (which contain the 16-bit tag) must lie before data_end.
+ if (data + ETH_HLEN + IPV6_HLEN + 8 > data_end) return 0;
+
+ struct ethhdr *eth = data;
+ if (eth->h_proto != bpf_htons(ETH_P_IPV6)) return 0;
+
+ struct ipv6hdr *ip6 = (void *)(eth + 1);
+ if (ip6->nexthdr != IPPROTO_ROUTING) return 0;
+
+ // Byte 2 of the routing header is routing_type (see the SRH layout above).
+ __u8 *rh = (__u8 *)(ip6 + 1);
+ if (rh[2] != SRH_TYPE) return 0;
+
+ return 1;
+}
+
+// TC egress program for bridge-facing interfaces. For frames accepted by is_srv6() it writes the
+// group returned by microseg_mark_get() into the SRH tag field in network byte order. Every path
+// returns TC_ACT_OK.
+SEC("classifier")
+int tc_bridge_egress(struct __sk_buff *skb) {
+    if (!is_srv6(skb)) {
+        DBG("bridge_out(%u): skip, no SRv6", skb->ifindex);
+        return TC_ACT_OK;
+    }
+
+    __u16 group = microseg_mark_get(skb);
+    __u16 tag = bpf_htons(group);
+    // The store can fail; check it the same way the ingress program checks its load, so the
+    // debug output never claims a tag was written when it was not.
+    if (bpf_skb_store_bytes(skb, SRH_TAG_OFF, &tag, sizeof(tag), 0) < 0) {
+        DBG("bridge_out(%u): failed to put %u onto SRH tag", skb->ifindex, group);
+        return TC_ACT_OK;
+    }
+    DBG("bridge_out(%u): putting %u onto SRH tag", skb->ifindex, group);
+    return TC_ACT_OK;
+}
+
+// TC ingress program for bridge-facing interfaces. For frames accepted by is_srv6() it reads the
+// 16-bit SRH tag, converts it from network byte order and passes it to microseg_mark_set().
+// Every path returns TC_ACT_OK.
+SEC("classifier")
+int tc_bridge_ingress(struct __sk_buff *skb) {
+ if (!is_srv6(skb)) {
+ DBG("bridge_in(%u): skip, no SRv6", skb->ifindex);
+ return TC_ACT_OK;
+ }
+
+ __u16 tag = 0;
+ // A failed load leaves the packet's mark untouched.
+ if (bpf_skb_load_bytes(skb, SRH_TAG_OFF, &tag, sizeof(tag)) < 0) {
+ DBG("bridge_in(%u): skip, no tag", skb->ifindex);
+ return TC_ACT_OK;
+ }
+
+ __u16 group = bpf_ntohs(tag);
+ microseg_mark_set(skb, group);
+ DBG("bridge_in(%u): pulled %u out of SRH", skb->ifindex, group);
+ return TC_ACT_OK;
+}
diff --git a/src/bridge/mod.rs b/src/bridge/mod.rs
new file mode 100644
index 0000000..bdc8e2a
--- /dev/null
+++ b/src/bridge/mod.rs
@@ -0,0 +1,75 @@
+//! Bridge-side carrier translation. Attaches BPF programs to bridge-facing interfaces that stamp
+//! `skb->mark` onto an on-wire policy carrier on egress (e.g. the SRv6 SRH tag) and lift it back
+//! into `skb->mark` on ingress. Driven by the bridge entries in the [`DesiredState`], which the
+//! agent has already filtered to interfaces that apply on this host.
+//!
+//! The programs detect the carrier from the packet itself, so the userspace side is
+//! protocol-agnostic. It attaches the same ingress/egress program pair to every bridge-facing
+//! interface and lets the data plane decide per packet.
+//!
+//! Pairs with the [`policy`](crate::policy) subsystem. Policy decides which group `skb->mark`
+//! carries on the local tap, bridge moves that value onto and off the wire so it survives across
+//! hosts.
+
+use std::collections::HashSet;
+
+use aya::include_bytes_aligned;
+
+use crate::state::{DesiredState, ResolvedBridge};
+use crate::subsystem::TcPrograms;
+use crate::tc::Direction;
+
+const NAME: &str = "bridge";
+
+const BRIDGE_OBJ: &[u8] = include_bytes_aligned!(concat!(env!("OUT_DIR"), "/srv6.bpf.o"));
+const BRIDGE_FINGERPRINT: u64 = TcPrograms::obj_fingerprint(BRIDGE_OBJ);
+
+/// Name of the BPF program in `srv6.bpf.c` that serves the given TC direction.
+fn program_name(dir: Direction) -> &'static str {
+    const INGRESS_PROG: &str = "tc_bridge_ingress";
+    const EGRESS_PROG: &str = "tc_bridge_egress";
+
+    // Exhaustive on purpose: a new `Direction` variant must fail to compile here.
+    match dir {
+        Direction::Egress => EGRESS_PROG,
+        Direction::Ingress => INGRESS_PROG,
+    }
+}
+
+/// Bridge subsystem: wraps the [`TcPrograms`] built from the embedded `srv6.bpf.o` object and
+/// keeps them reconciled against the bridge interfaces listed in the [`DesiredState`].
+pub struct BridgeSubsystem {
+    programs: TcPrograms,
+}
+
+impl BridgeSubsystem {
+    /// Creates the subsystem handle for the embedded bridge BPF object.
+    pub fn new() -> Self {
+        let programs = TcPrograms::new(NAME, BRIDGE_OBJ, BRIDGE_FINGERPRINT, program_name, None);
+        Self { programs }
+    }
+
+    /// Brings the attached bridge programs in line with `state.bridges`.
+    ///
+    /// Takes the subsystem's exclusive lock, makes sure the programs are loaded, then reconciles
+    /// against the ifindexes of the configured bridge interfaces that resolve on this host. The
+    /// first failing step ends the pass and its error is returned.
+    pub fn apply(&mut self, state: &DesiredState) -> anyhow::Result<()> {
+        log::trace!("bridge apply starting");
+
+        // Bound to a name so the value stays alive until this function returns.
+        let _lock = self.programs.lock_exclusive()?;
+        self.programs.ensure_loaded()?;
+
+        let wanted = resolve_local_bridges(&state.bridges);
+        self.programs.reconcile(&wanted)?;
+
+        log::trace!("bridge apply done");
+        Ok(())
+    }
+
+    /// Removes every bridge program and the pinned/run state; used on package removal.
+    pub fn clear(&self) -> anyhow::Result<()> {
+        self.programs.clear()
+    }
+}
+
+/// Resolves the configured bridge interface names to the ifindexes that exist on this host.
+///
+/// Names that cannot be resolved are skipped rather than treated as an error. The underlying
+/// error is included in the debug log so a failure other than a missing device is not
+/// mislabeled.
+fn resolve_local_bridges(bridges: &[ResolvedBridge]) -> HashSet<u32> {
+    bridges
+        .iter()
+        .filter_map(|b| match nix::net::if_::if_nametoindex(b.interface.as_str()) {
+            Ok(ifidx) => {
+                log::debug!("resolved bridge iface {} -> ifindex {}", b.interface, ifidx);
+                Some(ifidx)
+            }
+            Err(e) => {
+                log::debug!("bridge: skipping {}: {e}", b.interface);
+                None
+            }
+        })
+        .collect()
+}
diff --git a/src/main.rs b/src/main.rs
index 6b3c16c..0334fcb 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,4 +1,5 @@
mod agent;
+mod bridge;
mod policy;
mod running_config;
mod state;
diff --git a/src/state.rs b/src/state.rs
index 879562b..085a747 100644
--- a/src/state.rs
+++ b/src/state.rs
@@ -1,8 +1,9 @@
//! Resolved desired state derived from the SDN running config plus the local hostname.
//!
-//! [`MicrosegRunningConfig`] is admin-facing, groups are referenced by name, rules and assignments
-//! name them. This module does the one-time name -> id resolution so subsystems see a straight-line
-//! view of "what should be applied on this host".
+//! In [`MicrosegRunningConfig`] groups are referenced by name, rules and assignments name them,
+//! bridges carry the cluster-wide `nodes` filter. This module does the one-time name -> id
+//! resolution and host filtering so subsystems see a straight-line view of "what should be applied
+//! on this host".
use std::collections::{HashMap, HashSet};
@@ -15,6 +16,7 @@ pub struct DesiredState {
pub groups: HashMap<u16, GroupInfo>,
pub rules: Vec<ResolvedRule>,
pub assignments: Vec<ResolvedAssignment>,
+ pub bridges: Vec<ResolvedBridge>,
}
#[derive(Debug)]
@@ -37,8 +39,13 @@ pub struct ResolvedAssignment {
pub group: u16,
}
+/// A bridge entry from the running config that applies to this node.
+#[derive(Debug)]
+pub struct ResolvedBridge {
+ /// Interface name taken from the running config; the bridge subsystem resolves it to an
+ /// ifindex when it applies the state.
+ pub interface: String,
+}
+
impl DesiredState {
- pub fn build(cfg: &MicrosegRunningConfig, _this_node: &str) -> anyhow::Result<Self> {
+ pub fn build(cfg: &MicrosegRunningConfig, this_node: &str) -> anyhow::Result<Self> {
let mut name_to_id: HashMap<&str, u16> = HashMap::new();
let mut parent_of: HashMap<&str, &str> = HashMap::new();
let mut groups: HashMap<u16, GroupInfo> = HashMap::new();
@@ -124,10 +131,20 @@ impl DesiredState {
});
}
+ let mut bridges = Vec::new();
+ for (interface, b) in cfg.bridges() {
+ if b.applies_to(this_node) {
+ bridges.push(ResolvedBridge {
+ interface: interface.to_string(),
+ });
+ }
+ }
+
Ok(Self {
groups,
rules,
assignments,
+ bridges,
})
}
}
diff --git a/src/subsystem.rs b/src/subsystem.rs
index 7b944b4..1a1ab71 100644
--- a/src/subsystem.rs
+++ b/src/subsystem.rs
@@ -2,7 +2,8 @@
//! pinned under `/sys/fs/bpf/proxmox-ebpf/<name>/`. The loaded BPF stays in the kernel between
//! invocations, so [`TcPrograms::ensure_loaded`] loads and verifies only on the first run and on a
//! version change. Everything else attaches links and syncs maps against what is already there.
-//! The [`policy`](crate::policy) subsystem owns a [`TcPrograms`].
+//! The [`policy`](crate::policy) and [`bridge`](crate::bridge) subsystems each own a
+//! [`TcPrograms`].
use std::{collections::HashSet, fs::File, io::ErrorKind, path::PathBuf};
diff --git a/src/tc.rs b/src/tc.rs
index d89304d..f8a9e86 100644
--- a/src/tc.rs
+++ b/src/tc.rs
@@ -1,5 +1,6 @@
-//! TC link plumbing for the [`policy`](crate::policy) subsystem. A `Direction` enum,
-//! attach/swap/detach free functions, and a uniform pin-filename layout `{ifindex}-{direction}`.
+//! TC link plumbing shared by the [`policy`](crate::policy) and [`bridge`](crate::bridge)
+//! subsystems. A `Direction` enum, attach/swap/detach free functions, and a uniform pin-filename
+//! layout `{ifindex}-{direction}`.
use std::{io::ErrorKind, path::Path, str::FromStr};
--
2.47.3
next prev parent reply other threads:[~2026-06-09 13:25 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-09 13:25 [RFC cluster/docs/ifupdown2/manager/network/proxmox{-ebpf,-ve-rs,-perl-rs} 00/16] sdn: add microsegmentation support Hannes Laimer
2026-06-09 13:25 ` [PATCH proxmox-ebpf 01/16] agent: add userspace coordinator and stateless policy subsystem Hannes Laimer
2026-06-09 13:25 ` Hannes Laimer [this message]
2026-06-09 13:25 ` [PATCH proxmox-ebpf 03/16] debian: add packaging and boot-time oneshot unit Hannes Laimer
2026-06-09 13:25 ` [PATCH proxmox-ve-rs 04/16] ve-config: sdn: add microseg config types Hannes Laimer
2026-06-09 13:25 ` [PATCH proxmox-perl-rs 05/16] sdn: add microseg config binding Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-cluster 06/16] cfs: add 'sdn/microseg.cfg' to observed files Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-network 07/16] sdn: microseg: add config and API Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-network 08/16] sdn: zones: trigger microseg apply on tap_plug Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-network 09/16] sdn: zones: add vxlan-gbp option to vxlan and evpn zones Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-network 10/16] evpn: disable vxlan-learning on create if GBP is enabled Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-manager 11/16] ui: sdn: add microsegmentation Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-manager 12/16] network: apply microseg state on reload Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-manager 13/16] ui: sdn: zones: add vxlan-gbp checkbox to vxlan and evpn Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-docs 14/16] sdn: add microsegmentation section Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-docs 15/16] sdn: add VXLAN-GBP flag to evpn/vxlan zone sections Hannes Laimer
2026-06-09 13:25 ` [PATCH ifupdown2 16/16] d/patches: add support for VXLAN-GBP flag Hannes Laimer
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260609132522.235917-3-h.laimer@proxmox.com \
--to=h.laimer@proxmox.com \
--cc=pve-devel@lists.proxmox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.