public inbox for pve-devel@lists.proxmox.com
 help / color / mirror / Atom feed
From: Hannes Laimer <h.laimer@proxmox.com>
To: pve-devel@lists.proxmox.com
Subject: [PATCH proxmox-ebpf 02/16] bpf: add bridge subsystem
Date: Tue,  9 Jun 2026 15:25:08 +0200	[thread overview]
Message-ID: <20260609132522.235917-3-h.laimer@proxmox.com> (raw)
In-Reply-To: <20260609132522.235917-1-h.laimer@proxmox.com>

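Add a bridge subsystem that attaches TC programs to bridge-facing
interfaces. It carries the per-packet group identity (held in skb->mark)
across the underlay, so the receiving host sees the same group as the
sending host and can enforce policy on it.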

SRv6 and VXLAN-GBP are the two carriers that fit. For VXLAN-GBP the kernel
already handles the skb->mark <-> md->gbp mapping, so only SRv6 needs a BPF
program and is implemented here, using the SRH tag field as the carrier.

Signed-off-by: Hannes Laimer <h.laimer@proxmox.com>
---
 src/agent.rs              | 38 +++++++++++++++-----
 src/bridge/bpf/srv6.bpf.c | 76 +++++++++++++++++++++++++++++++++++++++
 src/bridge/mod.rs         | 75 ++++++++++++++++++++++++++++++++++++++
 src/main.rs               |  1 +
 src/state.rs              | 25 ++++++++++---
 src/subsystem.rs          |  3 +-
 src/tc.rs                 |  5 +--
 7 files changed, 207 insertions(+), 16 deletions(-)
 create mode 100644 src/bridge/bpf/srv6.bpf.c
 create mode 100644 src/bridge/mod.rs

diff --git a/src/agent.rs b/src/agent.rs
index ae8eb66..34aede5 100644
--- a/src/agent.rs
+++ b/src/agent.rs
@@ -1,7 +1,7 @@
 //! Agent orchestrator. Builds the per-host [`DesiredState`] from the SDN running-config and applies
-//! it through the [`policy`](crate::policy) subsystem. A full pass ([`apply`](Agent::apply)) covers
-//! the whole host, the tap_plug path
-//! ([`apply_guest_iface_policy`](Agent::apply_guest_iface_policy)) programs a single interface.
+//! it. A full pass ([`apply`](Agent::apply)) covers both subsystems ([`policy`](crate::policy) and
+//! [`bridge`](crate::bridge)). The tap_plug path ([`apply_guest_iface_policy`](Agent::apply_guest_iface_policy))
+//! is policy only and programs a single interface.
 //!
 //! The binary runs one-shot per event (boot, an SDN apply, a tap_plug), not as a resident daemon.
 //! Every invocation first makes sure the programs are loaded (installing them on the first run or a
@@ -10,42 +10,61 @@
 
 use anyhow::Context;
 
-use crate::{policy::PolicySubsystem, running_config, state::DesiredState};
+use crate::{
+    bridge::BridgeSubsystem, policy::PolicySubsystem, running_config, state::DesiredState,
+};
 
 pub struct Agent {
     policy: PolicySubsystem,
+    bridge: BridgeSubsystem,
 }
 
 impl Agent {
     pub fn new() -> Self {
         Self {
             policy: PolicySubsystem::new(),
+            bridge: BridgeSubsystem::new(),
         }
     }
 
-    /// Full pass over the host.
+    /// Full pass over the host, both subsystems.
     pub fn apply(&mut self) -> anyhow::Result<()> {
         let Some(state) = build_state()? else {
             return Ok(());
         };
         log::debug!(
-            "applying: {} groups, {} rules, {} assignments",
+            "applying: {} groups, {} rules, {} assignments, {} bridges",
             state.groups.len(),
             state.rules.len(),
             state.assignments.len(),
+            state.bridges.len(),
         );
         if log::log_enabled!(log::Level::Trace) {
             for (id, g) in &state.groups {
                 log::trace!("group {id} = '{}'", g.name);
             }
         }
-        if let Err(e) = self.policy.apply(&state) {
-            log::error!("policy: apply: {e:#}");
+        let policy = self.policy.apply(&state);
+        let bridge = self.bridge.apply(&state);
+        let mut failed = false;
+        for (name, result) in [("policy", policy), ("bridge", bridge)] {
+            if let Err(e) = result {
+                log::error!("{name} subsystem: {e:#}");
+                failed = true;
+            }
+        }
+        if failed {
+            anyhow::bail!("one or more subsystems failed to apply");
         }
         Ok(())
     }
 
-    /// Fast path for a guest NIC that just appeared (a tap_plug). Programs just that interface.
+    /// Policy-only fast path for a guest NIC that just appeared (a tap_plug). Bridge is untouched,
+    /// a guest NIC is never a bridge-facing interface.
+    ///
+    /// Forwards only a genuine enforcement failure for an assigned NIC, so the agent exits non-zero
+    /// and the caller does not bring the NIC up unenforced. A running config we cannot read or
+    /// build is logged and treated as success, so one bad config does not block every guest start.
     pub fn apply_guest_iface_policy(&mut self, iface: &str) -> anyhow::Result<()> {
         let state = match build_state() {
             Ok(Some(state)) => state,
@@ -61,6 +80,7 @@ impl Agent {
     /// Detach everything and drop all pinned/run state, for package removal.
     pub fn clear(&self) -> anyhow::Result<()> {
         self.policy.clear()?;
+        self.bridge.clear()?;
         Ok(())
     }
 }
diff --git a/src/bridge/bpf/srv6.bpf.c b/src/bridge/bpf/srv6.bpf.c
new file mode 100644
index 0000000..8610418
--- /dev/null
+++ b/src/bridge/bpf/srv6.bpf.c
@@ -0,0 +1,76 @@
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include "mark.h"
+#include "bpf_debug.h"
+
+char LICENSE[] SEC("license") = "GPL";
+
+#define TC_ACT_OK 0
+
+#define ETH_HLEN  14
+#define IPV6_HLEN 40
+
+#define ETH_P_IPV6      0x86dd
+#define IPPROTO_ROUTING 43
+#define SRH_TYPE        4
+
+// SRH layout (IPv6 Routing Header type 4), starting from byte 0 of the header:
+//   0: next_hdr
+//   1: hdr_ext_len
+//   2: routing_type   (== SRH_TYPE)
+//   3: segments_left
+//   4: last_entry
+//   5: flags
+//   6..7: tag         <-- our carrier
+#define SRH_TAG_OFF (ETH_HLEN + IPV6_HLEN + 6)
+
+static __always_inline int is_srv6(struct __sk_buff *skb) {
+    void *data = (void *)(long)skb->data;
+    void *data_end = (void *)(long)skb->data_end;
+
+    if (data + ETH_HLEN + IPV6_HLEN + 8 > data_end) return 0;
+
+    struct ethhdr *eth = data;
+    if (eth->h_proto != bpf_htons(ETH_P_IPV6)) return 0;
+
+    struct ipv6hdr *ip6 = (void *)(eth + 1);
+    if (ip6->nexthdr != IPPROTO_ROUTING) return 0;
+
+    __u8 *rh = (__u8 *)(ip6 + 1);
+    if (rh[2] != SRH_TYPE) return 0;
+
+    return 1;
+}
+
+SEC("classifier")
+int tc_bridge_egress(struct __sk_buff *skb) {
+    if (!is_srv6(skb)) {
+        DBG("bridge_out(%u): skip, no SRv6", skb->ifindex);
+        return TC_ACT_OK;
+    }
+    __u16 group = microseg_mark_get(skb);
+    DBG("bridge_out(%u): putting %u onto SRH tag", skb->ifindex, group);
+    __u16 tag = bpf_htons(group);
+    bpf_skb_store_bytes(skb, SRH_TAG_OFF, &tag, sizeof(tag), 0);
+    return TC_ACT_OK;
+}
+
+SEC("classifier")
+int tc_bridge_ingress(struct __sk_buff *skb) {
+    if (!is_srv6(skb)) {
+        DBG("bridge_in(%u): skip, no SRv6", skb->ifindex);
+        return TC_ACT_OK;
+    }
+
+    __u16 tag = 0;
+    if (bpf_skb_load_bytes(skb, SRH_TAG_OFF, &tag, sizeof(tag)) < 0) {
+        DBG("bridge_in(%u): skip, no tag", skb->ifindex);
+        return TC_ACT_OK;
+    }
+
+    __u16 group = bpf_ntohs(tag);
+    microseg_mark_set(skb, group);
+    DBG("bridge_in(%u): pulled %u out of SRH", skb->ifindex, group);
+    return TC_ACT_OK;
+}
diff --git a/src/bridge/mod.rs b/src/bridge/mod.rs
new file mode 100644
index 0000000..bdc8e2a
--- /dev/null
+++ b/src/bridge/mod.rs
@@ -0,0 +1,75 @@
+//! Bridge-side carrier translation. Attaches BPF programs to bridge-facing interfaces that stamp
+//! `skb->mark` onto an on-wire policy carrier on egress (e.g. the SRv6 SRH tag) and lift it back
+//! into `skb->mark` on ingress. Driven by the bridge entries in the [`DesiredState`], which the
+//! agent has already filtered to interfaces that apply on this host.
+//!
+//! The program detects the carrier from the packet itself, so the userspace side is
+//! protocol-agnostic. It attaches the one program everywhere and lets the data plane decide per
+//! packet.
+//!
+//! Pairs with the [`policy`](crate::policy) subsystem. Policy decides which group `skb->mark`
+//! carries on the local tap, bridge moves that value onto and off the wire so it survives across
+//! hosts.
+
+use std::collections::HashSet;
+
+use aya::include_bytes_aligned;
+
+use crate::state::{DesiredState, ResolvedBridge};
+use crate::subsystem::TcPrograms;
+use crate::tc::Direction;
+
+const NAME: &str = "bridge";
+
+const BRIDGE_OBJ: &[u8] = include_bytes_aligned!(concat!(env!("OUT_DIR"), "/srv6.bpf.o"));
+const BRIDGE_FINGERPRINT: u64 = TcPrograms::obj_fingerprint(BRIDGE_OBJ);
+
+fn program_name(dir: Direction) -> &'static str {
+    match dir {
+        Direction::Ingress => "tc_bridge_ingress",
+        Direction::Egress => "tc_bridge_egress",
+    }
+}
+
+pub struct BridgeSubsystem {
+    programs: TcPrograms,
+}
+
+impl BridgeSubsystem {
+    pub fn new() -> Self {
+        Self {
+            programs: TcPrograms::new(NAME, BRIDGE_OBJ, BRIDGE_FINGERPRINT, program_name, None),
+        }
+    }
+
+    pub fn apply(&mut self, state: &DesiredState) -> anyhow::Result<()> {
+        log::trace!("bridge apply starting");
+        let _lock = self.programs.lock_exclusive()?;
+        self.programs.ensure_loaded()?;
+        self.programs
+            .reconcile(&resolve_local_bridges(&state.bridges))?;
+        log::trace!("bridge apply done");
+        Ok(())
+    }
+
+    /// Detach all bridge programs and drop pinned/run state, for package removal.
+    pub fn clear(&self) -> anyhow::Result<()> {
+        self.programs.clear()
+    }
+}
+
+fn resolve_local_bridges(bridges: &[ResolvedBridge]) -> HashSet<u32> {
+    let mut out = HashSet::new();
+    for b in bridges {
+        match nix::net::if_::if_nametoindex(b.interface.as_str()) {
+            Ok(ifidx) => {
+                log::debug!("resolved bridge iface {} -> ifindex {}", b.interface, ifidx);
+                out.insert(ifidx);
+            }
+            Err(_) => {
+                log::debug!("bridge: skipping {}, no such interface", b.interface);
+            }
+        }
+    }
+    out
+}
diff --git a/src/main.rs b/src/main.rs
index 6b3c16c..0334fcb 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,4 +1,5 @@
 mod agent;
+mod bridge;
 mod policy;
 mod running_config;
 mod state;
diff --git a/src/state.rs b/src/state.rs
index 879562b..085a747 100644
--- a/src/state.rs
+++ b/src/state.rs
@@ -1,8 +1,9 @@
 //! Resolved desired state derived from the SDN running config plus the local hostname.
 //!
-//! [`MicrosegRunningConfig`] is admin-facing, groups are referenced by name, rules and assignments
-//! name them. This module does the one-time name -> id resolution so subsystems see a straight-line
-//! view of "what should be applied on this host".
+//! In [`MicrosegRunningConfig`] groups are referenced by name, rules and assignments name them,
+//! bridges carry the cluster-wide `nodes` filter. This module does the one-time name -> id
+//! resolution and host filtering so subsystems see a straight-line view of "what should be applied
+//! on this host".
 
 use std::collections::{HashMap, HashSet};
 
@@ -15,6 +16,7 @@ pub struct DesiredState {
     pub groups: HashMap<u16, GroupInfo>,
     pub rules: Vec<ResolvedRule>,
     pub assignments: Vec<ResolvedAssignment>,
+    pub bridges: Vec<ResolvedBridge>,
 }
 
 #[derive(Debug)]
@@ -37,8 +39,13 @@ pub struct ResolvedAssignment {
     pub group: u16,
 }
 
+#[derive(Debug)]
+pub struct ResolvedBridge {
+    pub interface: String,
+}
+
 impl DesiredState {
-    pub fn build(cfg: &MicrosegRunningConfig, _this_node: &str) -> anyhow::Result<Self> {
+    pub fn build(cfg: &MicrosegRunningConfig, this_node: &str) -> anyhow::Result<Self> {
         let mut name_to_id: HashMap<&str, u16> = HashMap::new();
         let mut parent_of: HashMap<&str, &str> = HashMap::new();
         let mut groups: HashMap<u16, GroupInfo> = HashMap::new();
@@ -124,10 +131,20 @@ impl DesiredState {
             });
         }
 
+        let mut bridges = Vec::new();
+        for (interface, b) in cfg.bridges() {
+            if b.applies_to(this_node) {
+                bridges.push(ResolvedBridge {
+                    interface: interface.to_string(),
+                });
+            }
+        }
+
         Ok(Self {
             groups,
             rules,
             assignments,
+            bridges,
         })
     }
 }
diff --git a/src/subsystem.rs b/src/subsystem.rs
index 7b944b4..1a1ab71 100644
--- a/src/subsystem.rs
+++ b/src/subsystem.rs
@@ -2,7 +2,8 @@
 //! pinned under `/sys/fs/bpf/proxmox-ebpf/<name>/`. The loaded BPF stays in the kernel between
 //! invocations, so [`TcPrograms::ensure_loaded`] loads and verifies only on the first run and on a
 //! version change. Everything else attaches links and syncs maps against what is already there.
-//! The [`policy`](crate::policy) subsystem owns a [`TcPrograms`].
+//! The [`policy`](crate::policy) and [`bridge`](crate::bridge) subsystems each own a
+//! [`TcPrograms`].
 
 use std::{collections::HashSet, fs::File, io::ErrorKind, path::PathBuf};
 
diff --git a/src/tc.rs b/src/tc.rs
index d89304d..f8a9e86 100644
--- a/src/tc.rs
+++ b/src/tc.rs
@@ -1,5 +1,6 @@
-//! TC link plumbing for the [`policy`](crate::policy) subsystem. A `Direction` enum,
-//! attach/swap/detach free functions, and a uniform pin-filename layout `{ifindex}-{direction}`.
+//! TC link plumbing shared by the [`policy`](crate::policy) and [`bridge`](crate::bridge)
+//! subsystems. A `Direction` enum, attach/swap/detach free functions, and a uniform pin-filename
+//! layout `{ifindex}-{direction}`.
 
 use std::{io::ErrorKind, path::Path, str::FromStr};
 
-- 
2.47.3





  parent reply	other threads:[~2026-06-09 13:25 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-09 13:25 [RFC cluster/docs/ifupdown2/manager/network/proxmox{-ebpf,-ve-rs,-perl-rs} 00/16] sdn: add microsegmentation support Hannes Laimer
2026-06-09 13:25 ` [PATCH proxmox-ebpf 01/16] agent: add userspace coordinator and stateless policy subsystem Hannes Laimer
2026-06-09 13:25 ` Hannes Laimer [this message]
2026-06-09 13:25 ` [PATCH proxmox-ebpf 03/16] debian: add packaging and boot-time oneshot unit Hannes Laimer
2026-06-09 13:25 ` [PATCH proxmox-ve-rs 04/16] ve-config: sdn: add microseg config types Hannes Laimer
2026-06-09 13:25 ` [PATCH proxmox-perl-rs 05/16] sdn: add microseg config binding Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-cluster 06/16] cfs: add 'sdn/microseg.cfg' to observed files Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-network 07/16] sdn: microseg: add config and API Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-network 08/16] sdn: zones: trigger microseg apply on tap_plug Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-network 09/16] sdn: zones: add vxlan-gbp option to vxlan and evpn zones Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-network 10/16] evpn: disable vxlan-learning on create if GBP is enabled Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-manager 11/16] ui: sdn: add microsegmentation Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-manager 12/16] network: apply microseg state on reload Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-manager 13/16] ui: sdn: zones: add vxlan-gbp checkbox to vxlan and evpn Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-docs 14/16] sdn: add microsegmentation section Hannes Laimer
2026-06-09 13:25 ` [PATCH pve-docs 15/16] sdn: add VXLAN-GBP flag to evpn/vxlan zone sections Hannes Laimer
2026-06-09 13:25 ` [PATCH ifupdown2 16/16] d/patches: add support for VXLAN-GBP flag Hannes Laimer

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260609132522.235917-3-h.laimer@proxmox.com \
    --to=h.laimer@proxmox.com \
    --cc=pve-devel@lists.proxmox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal