all lists on lists.proxmox.com
 help / color / mirror / Atom feed
From: Thomas Lamprecht <t.lamprecht@proxmox.com>
To: pve-devel@lists.proxmox.com
Subject: [PATCH storage 07/13] api: multipath: add cluster-wide health status endpoint
Date: Fri, 26 Jun 2026 14:07:37 +0200	[thread overview]
Message-ID: <20260626121000.2095591-8-t.lamprecht@proxmox.com> (raw)
In-Reply-To: <20260626121000.2095591-1-t.lamprecht@proxmox.com>

A per-node view cannot tell whether a LUN is healthy across the whole
cluster. Add an endpoint that collects the per-node broadcasts and
combines them into a per-WWID by per-node matrix, rolled up to one
cluster-state per LUN.

The broadcasts are cross-checked against live membership, so a stale
value from an offline node reads as 'unknown' rather than as healthy.
The roll-up is taken over the nodes that are actively multipathing, so
a LUN that is optimal on three nodes but degraded on a fourth shows up
as degraded instead of hiding behind the healthy majority. A node where
a multipath storage is enabled but that broadcasts nothing is surfaced
as missing rather than vanishing from the matrix. Consuming storages
are labeled from the cluster storage config.

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
---
 src/PVE/API2/Multipath.pm       | 116 +++++++++++++++++++++++++++
 src/PVE/Multipath.pm            | 111 ++++++++++++++++++++++++++
 src/test/run_multipath_tests.pl | 135 ++++++++++++++++++++++++++++++++
 3 files changed, 362 insertions(+)

diff --git a/src/PVE/API2/Multipath.pm b/src/PVE/API2/Multipath.pm
index 6a165d5..5336d71 100644
--- a/src/PVE/API2/Multipath.pm
+++ b/src/PVE/API2/Multipath.pm
@@ -3,10 +3,14 @@ package PVE::API2::Multipath;
 use strict;
 use warnings;
 
+use JSON qw(decode_json);
+
+use PVE::Cluster;
 use PVE::Exception qw(raise_param_exc);
 use PVE::Storage;
 use PVE::Tools qw(extract_param);
 
+use PVE::Multipath;
 use PVE::Multipath::Config;
 use PVE::Multipath::ClusterConfig;
 
@@ -43,6 +47,33 @@ my sub multipath_consumers {
     return $consumers;
 }
 
+# The nodes where an allow-listed LUN is supposed to be assembled: those where a multipath storage
+# is enabled (its node restriction, or every cluster node when unrestricted). Read from the cluster
+# storage config so it is node-invariant.
+my sub multipath_expected_nodes {
+    my $expected = {};
+
+    my $cfg = eval { PVE::Storage::config() };
+    return $expected if !$cfg;
+
+    my $all_nodes;
+    my $ids = $cfg->{ids} // {};
+    for my $storeid (sort keys %$ids) {
+        my $scfg = $ids->{$storeid};
+        next if ($scfg->{type} // '') ne 'multipath';
+        next if $scfg->{disable};
+
+        if ($scfg->{nodes}) {
+            $expected->{$_} = 1 for keys $scfg->{nodes}->%*;
+        } else {
+            $all_nodes //= PVE::Cluster::get_nodelist();
+            $expected->{$_} = 1 for $all_nodes->@*;
+        }
+    }
+
+    return $expected;
+}
+
 # multipathd resolves an alias to a map name, so two WWIDs sharing one alias makes it drop a map
 # (the loser is order-dependent and only logged at level 1). Reject a collision up front.
 my sub assert_alias_free {
@@ -122,6 +153,91 @@ __PACKAGE__->register_method({
     },
 });
 
+__PACKAGE__->register_method({
+    name => 'status',
+    path => 'status',
+    method => 'GET',
+    protected => 1,
+    description => "Cluster-wide multipath health: a per-WWID by per-node matrix"
+        . " rolled up over the nodes that are actively multipathing.",
+    permissions => {
+        check => ['perm', '/', ['Sys.Audit']],
+    },
+    parameters => {
+        additionalProperties => 0,
+        properties => {},
+    },
+    returns => {
+        type => 'array',
+        items => {
+            type => 'object',
+            additionalProperties => 1,
+            properties => {
+                wwid => { type => 'string', description => 'The LUN WWID.' },
+                alias => {
+                    type => 'string',
+                    description => 'The configured alias, if any.',
+                    optional => 1,
+                },
+                'used-by' => {
+                    type => 'string',
+                    description => 'The storage consuming this LUN, if any.',
+                    optional => 1,
+                },
+                size => {
+                    type => 'integer',
+                    description => 'LUN size in bytes, as reported by a node.',
+                    optional => 1,
+                },
+                'cluster-state' => {
+                    type => 'string',
+                    description => "Worst map state across the actively multipathing nodes:"
+                        . " 'optimal', 'degraded' (some paths down on a node), 'missing' (an"
+                        . " active node has not assembled it), 'failed' (no active path), or"
+                        . " 'unknown' (no active node reports it).",
+                    enum => ['optimal', 'degraded', 'missing', 'failed', 'unknown'],
+                },
+                nodes => {
+                    type => 'object',
+                    description => 'Per-node map state, keyed by node name.',
+                    additionalProperties => 1,
+                },
+            },
+        },
+    },
+    code => sub {
+        my $cfg = PVE::Multipath::ClusterConfig::read_config();
+
+        my $raw_kv = PVE::Cluster::get_node_kv('multipath');
+        my $node_kv = {};
+        for my $node (keys %$raw_kv) {
+            my $decoded = eval { decode_json($raw_kv->{$node}) };
+            $node_kv->{$node} = $decoded if $decoded;
+        }
+
+        my $expected = multipath_expected_nodes();
+
+        # resolve liveness for every node we might place in the matrix: those that broadcast and
+        # those a multipath storage expects (and that may be silent)
+        my $members = PVE::Cluster::get_members() // {};
+        my $online = {};
+        for my $node (keys %$node_kv, keys %$expected) {
+            # a standalone node has no cluster member info; treat every listed node as live
+            $online->{$node} =
+                (!%$members || ($members->{$node} && $members->{$node}->{online})) ? 1 : 0;
+        }
+
+        return PVE::Multipath::aggregate_cluster_status(
+            PVE::Multipath::Config::wwid_list($cfg),
+            PVE::Multipath::Config::aliases($cfg),
+            multipath_consumers(),
+            $node_kv,
+            $online,
+            $expected,
+        );
+    },
+});
+
 __PACKAGE__->register_method({
     name => 'set_overrides',
     path => '',
diff --git a/src/PVE/Multipath.pm b/src/PVE/Multipath.pm
index 5647189..2b93d57 100644
--- a/src/PVE/Multipath.pm
+++ b/src/PVE/Multipath.pm
@@ -333,4 +333,115 @@ sub broadcast_health {
     warn "multipath: health broadcast failed - $@" if $@;
 }
 
+# Severity ordering for rolling per-node states up into a cluster state; a higher number is worse.
+# 'unknown' marks a stale or offline node and never drives the roll-up, so it sits below 'optimal'.
+my $STATE_RANK = {
+    unknown => -1,
+    optimal => 0,
+    degraded => 1,
+    missing => 2,
+    failed => 3,
+};
+
+# Pure: fold the per-node health summaries (already JSON-decoded) into a per-WWID cluster matrix.
+# Inputs:
+#   $allow_wwids  arrayref, the cluster WWID allow-list
+#   $aliases      { wwid => name }
+#   $used_by      { wwid => storage-id } of consuming LVM storages
+#   $node_kv      { node => summary } as broadcast by broadcast_health()
+#   $online       { node => bool }; a node absent here counts as offline
+#   $expected     { node => 1 } nodes where multipath storage is enabled, so an
+#                 allow-listed LUN is supposed to be present there
+#
+# The cluster-state is rolled up over the nodes that should carry each LUN. A node that reports a
+# summary but lacks the LUN, or an expected node that reports nothing at all (it lost every path and
+# cleared its broadcast), is 'missing'; without the $expected set such a node would silently drop
+# out of the view instead of going red. A node that carries a stale broadcast while offline, or an
+# expected node that is offline, shows as 'unknown' and does not drive the roll-up.
+sub aggregate_cluster_status {
+    my ($allow_wwids, $aliases, $used_by, $node_kv, $online, $expected) = @_;
+
+    $allow_wwids //= [];
+    $aliases //= {};
+    $used_by //= {};
+    $node_kv //= {};
+    $online //= {};
+    $expected //= {};
+
+    my %allow = map { $_ => 1 } $allow_wwids->@*;
+
+    # report the allow-list plus any WWID a node actually sees
+    my %wwids = %allow;
+    for my $node (keys %$node_kv) {
+        $wwids{$_} = 1 for keys $node_kv->{$node}->%*;
+    }
+
+    my $res = [];
+    for my $wwid (sort keys %wwids) {
+        my $nodes = {};
+        my $worst = 'optimal';
+        my $have_active = 0;
+        my $size;
+
+        my $rank = sub {
+            my ($state) = @_;
+            $worst = $state if $STATE_RANK->{$state} > $STATE_RANK->{$worst};
+        };
+
+        for my $node (sort keys %$node_kv) {
+            my $entry = $node_kv->{$node}->{$wwid};
+
+            if (!$online->{$node}) {
+                $nodes->{$node} = { state => 'unknown' } if $entry;
+                next;
+            }
+
+            $have_active = 1;
+            if ($entry) {
+                $nodes->{$node} = {
+                    state => $entry->{state},
+                    'paths-active' => $entry->{'paths-active'},
+                    'paths-total' => $entry->{'paths-total'},
+                    defined($entry->{transport}) ? (transport => $entry->{transport}) : (),
+                };
+                $size //= $entry->{size};
+                $rank->($entry->{state});
+            } else {
+                # node is actively multipathing but has not assembled this LUN
+                $nodes->{$node} = { state => 'missing' };
+                $rank->('missing');
+            }
+        }
+
+        # A LUN on the allow-list should assemble on every node where a multipath storage is
+        # enabled. An expected node with no broadcast at all is missing the map (online) or
+        # unreachable (offline); fold it in so a node that lost all its paths surfaces instead of
+        # vanishing.
+        if ($allow{$wwid}) {
+            for my $node (sort keys %$expected) {
+                next if exists $nodes->{$node};
+                if ($online->{$node}) {
+                    $have_active = 1;
+                    $nodes->{$node} = { state => 'missing' };
+                    $rank->('missing');
+                } else {
+                    $nodes->{$node} = { state => 'unknown' };
+                }
+            }
+        }
+
+        push $res->@*,
+            {
+                wwid => $wwid,
+                defined($aliases->{$wwid}) ? (alias => $aliases->{$wwid}) : (),
+                defined($used_by->{$wwid}) ? ('used-by' => $used_by->{$wwid}) : (),
+                defined($size) ? (size => $size) : (),
+                'cluster-state' => $have_active ? $worst : 'unknown',
+                nodes => $nodes,
+            };
+    }
+
+    return $res;
+}
+
 1;
diff --git a/src/test/run_multipath_tests.pl b/src/test/run_multipath_tests.pl
index affec23..9e7e1db 100755
--- a/src/test/run_multipath_tests.pl
+++ b/src/test/run_multipath_tests.pl
@@ -285,4 +285,139 @@ my $many = [
 my $big = JSON::encode_json(PVE::Multipath::summarize_maps_for_broadcast($many));
 ok(length($big) < 32 * 1024, "100-map summary (" . length($big) . " B) fits the KV size limit");
 
+# --- cluster status aggregation ---
+my $node_kv = {
+    nodeA => {
+        wA =>
+            { state => 'optimal', 'paths-active' => 2, 'paths-total' => 2, transport => 'iscsi' },
+        wB => {
+            state => 'optimal',
+            'paths-active' => 2,
+            'paths-total' => 2,
+            transport => 'iscsi',
+            size => 42,
+        },
+    },
+    nodeB => {
+        wA =>
+            { state => 'degraded', 'paths-active' => 1, 'paths-total' => 2, transport => 'iscsi' },
+        # nodeB is active but does not see wB
+    },
+    nodeC => {
+        # stale broadcast from an offline node
+        wA => { state => 'optimal', 'paths-active' => 2, 'paths-total' => 2 },
+    },
+};
+my $agg = PVE::Multipath::aggregate_cluster_status(
+    ['wA', 'wB', 'wZ'], # allow-list incl. an unseen WWID
+    { wA => 'lun-a' }, # alias
+    { wB => 'mptank' }, # used-by
+    $node_kv,
+    { nodeA => 1, nodeB => 1, nodeC => 0 }, # nodeC offline
+);
+my %by_wwid = map { $_->{wwid} => $_ } $agg->@*;
+
+is_deeply([sort keys %by_wwid], ['wA', 'wB', 'wZ'], 'matrix covers allow-list and seen WWIDs');
+
+is($by_wwid{wA}->{alias}, 'lun-a', 'alias surfaced on the WWID row');
+is($by_wwid{wA}->{'cluster-state'}, 'degraded', 'degraded on one active node rolls up to degraded');
+is($by_wwid{wA}->{nodes}->{nodeA}->{state}, 'optimal', 'per-node optimal cell kept');
+is($by_wwid{wA}->{nodes}->{nodeB}->{state}, 'degraded', 'per-node degraded cell kept');
+is($by_wwid{wA}->{nodes}->{nodeC}->{state}, 'unknown',
+    'offline node with stale data shows unknown');
+
+is($by_wwid{wB}->{'used-by'}, 'mptank', 'consuming storage surfaced as used-by');
+is($by_wwid{wB}->{size}, 42, 'LUN size surfaced from a reporting node');
+is(
+    $by_wwid{wB}->{'cluster-state'},
+    'missing',
+    'active node not assembling the LUN rolls up to missing',
+);
+is(
+    $by_wwid{wB}->{nodes}->{nodeB}->{state},
+    'missing',
+    'missing marked on the active node lacking it',
+);
+
+is(
+    $by_wwid{wZ}->{'cluster-state'},
+    'missing',
+    'allow-listed WWID no active node assembled is missing everywhere',
+);
+is($by_wwid{wZ}->{nodes}->{nodeA}->{state}, 'missing', 'active node missing the allow-listed WWID');
+ok(!exists $by_wwid{wZ}->{nodes}->{nodeC}, 'offline node contributes no cell for an unseen WWID');
+
+# a WWID only an offline node ever reported, with no online active node, is unknown
+my $agg_off = PVE::Multipath::aggregate_cluster_status(
+    ['wA'],
+    {},
+    {},
+    { dead => { wA => { state => 'optimal', 'paths-active' => 2, 'paths-total' => 2 } } },
+    { dead => 0 },
+);
+is(
+    $agg_off->[0]->{'cluster-state'},
+    'unknown',
+    'no online active node leaves the cluster-state unknown',
+);
+is($agg_off->[0]->{nodes}->{dead}->{state}, 'unknown', 'stale offline node shown as unknown');
+
+# failure outranks degraded in the roll-up
+my $agg2 = PVE::Multipath::aggregate_cluster_status(
+    ['wA'],
+    {},
+    {},
+    {
+        n1 => { wA => { state => 'degraded', 'paths-active' => 1, 'paths-total' => 2 } },
+        n2 => { wA => { state => 'failed', 'paths-active' => 0, 'paths-total' => 2 } },
+    },
+    { n1 => 1, n2 => 1 },
+);
+is($agg2->[0]->{'cluster-state'}, 'failed', 'failed outranks degraded in the cluster roll-up');
+
+# --- expected-node set: a node that lost all paths (silent) must not vanish ---
+# nodeS is expected (a multipath storage is enabled there) and online, but
+# broadcasts nothing - e.g. every path to the SAN is down so it cleared its KV.
+my $exp_kv = {
+    nodeA => { wA => { state => 'optimal', 'paths-active' => 2, 'paths-total' => 2 } },
+};
+my $online = { nodeA => 1, nodeS => 1, nodeOff => 0 };
+my $expected = { nodeA => 1, nodeS => 1, nodeOff => 1 };
+my $eagg = PVE::Multipath::aggregate_cluster_status(
+    ['wA'], {}, {}, $exp_kv, $online, $expected,
+);
+my $row = $eagg->[0];
+is($row->{nodes}->{nodeA}->{state}, 'optimal', 'reporting node keeps its real state');
+is(
+    $row->{nodes}->{nodeS}->{state},
+    'missing',
+    'expected online but silent node shows missing instead of vanishing',
+);
+is($row->{nodes}->{nodeOff}->{state}, 'unknown', 'expected offline node shows unknown');
+is($row->{'cluster-state'}, 'missing', 'a silent expected node drags the cluster-state to missing');
+
+# without $expected the silent node would have been invisible (regression guard
+# for the old behavior, proving the new param is what surfaces it)
+my $noexp = PVE::Multipath::aggregate_cluster_status(['wA'], {}, {}, $exp_kv, $online);
+ok(
+    !exists $noexp->[0]->{nodes}->{nodeS},
+    'without the expected set the silent node is absent (the gap the param closes)',
+);
+is($noexp->[0]->{'cluster-state'}, 'optimal', 'and the cluster-state would falsely read optimal');
+
+# expected augmentation applies only to allow-listed WWIDs, not to a LUN that a
+# node merely happens to report off-list
+my $offlist = PVE::Multipath::aggregate_cluster_status(
+    [],
+    {},
+    {},
+    { nodeA => { wX => { state => 'optimal', 'paths-active' => 1, 'paths-total' => 1 } } },
+    { nodeA => 1, nodeS => 1 },
+    { nodeA => 1, nodeS => 1 },
+);
+ok(
+    !exists $offlist->[0]->{nodes}->{nodeS},
+    'non-allow-listed WWID does not synthesize missing cells for expected nodes',
+);
+
 done_testing();
-- 
2.47.3





  parent reply	other threads:[~2026-06-26 12:11 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-26 12:07 [PATCH storage,cluster,manager 0/13] multipath: cluster-wide config, storage and health overview Thomas Lamprecht
2026-06-26 12:07 ` [PATCH storage 01/13] multipath: add helper library and managed configuration Thomas Lamprecht
2026-06-26 14:43   ` Maximiliano Sandoval
2026-06-26 12:07 ` [PATCH storage 02/13] api: disks: add read-only multipath status endpoint Thomas Lamprecht
2026-06-26 12:07 ` [PATCH storage 03/13] api: multipath: add cluster-wide configuration endpoints Thomas Lamprecht
2026-06-26 12:07 ` [PATCH storage 04/13] multipath: add storage plugin for multipath LUNs Thomas Lamprecht
2026-06-26 12:07 ` [PATCH storage 05/13] lvm: allow a multipath storage as the base device Thomas Lamprecht
2026-06-26 12:07 ` [PATCH storage 06/13] multipath: broadcast per-node map health to the cluster KV store Thomas Lamprecht
2026-06-26 12:07 ` Thomas Lamprecht [this message]
2026-06-26 12:07 ` [PATCH cluster 08/13] pmxcfs: track cluster-wide multipath configuration Thomas Lamprecht
2026-06-26 12:07 ` [PATCH manager 09/13] pvestatd: apply the cluster-wide multipath config on each node Thomas Lamprecht
2026-06-26 12:07 ` [PATCH manager 10/13] api: cluster: mount the multipath configuration endpoint Thomas Lamprecht
2026-06-26 12:07 ` [PATCH manager 11/13] pvestatd: broadcast multipath map health to the cluster Thomas Lamprecht
2026-06-26 12:07 ` [PATCH manager 12/13] ui: dc: add multipath health matrix and config editor Thomas Lamprecht
2026-06-26 14:05   ` Maximiliano Sandoval
2026-06-26 12:07 ` [PATCH manager 13/13] ui: node: show multipath maps and their paths under Disks Thomas Lamprecht

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260626121000.2095591-8-t.lamprecht@proxmox.com \
    --to=t.lamprecht@proxmox.com \
    --cc=pve-devel@lists.proxmox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal