From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from firstgate.proxmox.com (firstgate.proxmox.com [IPv6:2a01:7e0:0:424::9]) by lore.proxmox.com (Postfix) with ESMTPS id 489461FF14C for ; Fri, 26 Jun 2026 14:11:51 +0200 (CEST) Received: from firstgate.proxmox.com (localhost [127.0.0.1]) by firstgate.proxmox.com (Proxmox) with ESMTP id F047AFC1D; Fri, 26 Jun 2026 14:10:58 +0200 (CEST) From: Thomas Lamprecht To: pve-devel@lists.proxmox.com Subject: [PATCH storage 07/13] api: multipath: add cluster-wide health status endpoint Date: Fri, 26 Jun 2026 14:07:37 +0200 Message-ID: <20260626121000.2095591-8-t.lamprecht@proxmox.com> X-Mailer: git-send-email 2.47.3 In-Reply-To: <20260626121000.2095591-1-t.lamprecht@proxmox.com> References: <20260626121000.2095591-1-t.lamprecht@proxmox.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Bm-Milter-Handled: 55990f41-d878-4baa-be0a-ee34c49e34d2 X-Bm-Transport-Timestamp: 1782475801666 X-SPAM-LEVEL: Spam detection results: 0 AWL 0.005 Adjusted score from AWL reputation of From: address BAYES_00 -1.9 Bayes spam probability is 0 to 1% DMARC_MISSING 0.1 Missing DMARC policy KAM_DMARC_STATUS 0.01 Test Rule for DKIM or SPF Failure with Strict Alignment SPF_HELO_NONE 0.001 SPF: HELO does not publish an SPF Record SPF_PASS -0.001 SPF: sender matches SPF record Message-ID-Hash: QAZTSEF64GO7O2LBZXJAL6UUWIP2PH76 X-Message-ID-Hash: QAZTSEF64GO7O2LBZXJAL6UUWIP2PH76 X-MailFrom: t.lamprecht@proxmox.com X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; loop; banned-address; emergency; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header X-Mailman-Version: 3.3.10 Precedence: list List-Id: Proxmox VE development discussion List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: A per-node view cannot tell whether a LUN is healthy across the whole cluster. Add an endpoint that collects the per-node broadcasts and combines them into a per-WWID by per-node matrix, rolled up to one cluster-state per LUN. The broadcasts are cross-checked against live membership, so a stale value from an offline node reads as 'unknown' rather than as healthy. The roll-up is taken over the nodes that are actively multipathing, so a LUN that is optimal on three nodes but degraded on a fourth shows up as degraded instead of hiding behind the healthy majority. A node where a multipath storage is enabled but that broadcasts nothing is surfaced as missing rather than vanishing from the matrix. Consuming storages are labeled from the cluster storage config. Signed-off-by: Thomas Lamprecht --- src/PVE/API2/Multipath.pm | 116 +++++++++++++++++++++++++++ src/PVE/Multipath.pm | 111 ++++++++++++++++++++++++++ src/test/run_multipath_tests.pl | 135 ++++++++++++++++++++++++++++++++ 3 files changed, 362 insertions(+) diff --git a/src/PVE/API2/Multipath.pm b/src/PVE/API2/Multipath.pm index 6a165d5..5336d71 100644 --- a/src/PVE/API2/Multipath.pm +++ b/src/PVE/API2/Multipath.pm @@ -3,10 +3,14 @@ package PVE::API2::Multipath; use strict; use warnings; +use JSON qw(decode_json); + +use PVE::Cluster; use PVE::Exception qw(raise_param_exc); use PVE::Storage; use PVE::Tools qw(extract_param); +use PVE::Multipath; use PVE::Multipath::Config; use PVE::Multipath::ClusterConfig; @@ -43,6 +47,33 @@ my sub multipath_consumers { return $consumers; } +# The nodes where an allow-listed LUN is supposed to be assembled: those where a multipath storage +# is enabled (its node restriction, or every cluster node when unrestricted). Read from the cluster +# storage config so it is node-invariant. +my sub multipath_expected_nodes { + my $expected = {}; + + my $cfg = eval { PVE::Storage::config() }; + return $expected if !$cfg; + + my $all_nodes; + my $ids = $cfg->{ids} // {}; + for my $storeid (sort keys %$ids) { + my $scfg = $ids->{$storeid}; + next if ($scfg->{type} // '') ne 'multipath'; + next if $scfg->{disable}; + + if ($scfg->{nodes}) { + $expected->{$_} = 1 for keys $scfg->{nodes}->%*; + } else { + $all_nodes //= PVE::Cluster::get_nodelist(); + $expected->{$_} = 1 for $all_nodes->@*; + } + } + + return $expected; +} + # multipathd resolves an alias to a map name, so two WWIDs sharing one alias makes it drop a map # (the loser is order-dependent and only logged at level 1). Reject a collision up front. my sub assert_alias_free { @@ -122,6 +153,91 @@ __PACKAGE__->register_method({ }, }); +__PACKAGE__->register_method({ + name => 'status', + path => 'status', + method => 'GET', + protected => 1, + description => "Cluster-wide multipath health: a per-WWID by per-node matrix" + . " rolled up over the nodes that are actively multipathing.", + permissions => { + check => ['perm', '/', ['Sys.Audit']], + }, + parameters => { + additionalProperties => 0, + properties => {}, + }, + returns => { + type => 'array', + items => { + type => 'object', + additionalProperties => 1, + properties => { + wwid => { type => 'string', description => 'The LUN WWID.' }, + alias => { + type => 'string', + description => 'The configured alias, if any.', + optional => 1, + }, + 'used-by' => { + type => 'string', + description => 'The storage consuming this LUN, if any.', + optional => 1, + }, + size => { + type => 'integer', + description => 'LUN size in bytes, as reported by a node.', + optional => 1, + }, + 'cluster-state' => { + type => 'string', + description => "Worst map state across the actively multipathing nodes:" + . " 'optimal', 'degraded' (some paths down on a node), 'missing' (an" + . " active node has not assembled it), 'failed' (no active path), or" + . " 'unknown' (no active node reports it).", + enum => ['optimal', 'degraded', 'missing', 'failed', 'unknown'], + }, + nodes => { + type => 'object', + description => 'Per-node map state, keyed by node name.', + additionalProperties => 1, + }, + }, + }, + }, + code => sub { + my $cfg = PVE::Multipath::ClusterConfig::read_config(); + + my $raw_kv = PVE::Cluster::get_node_kv('multipath'); + my $node_kv = {}; + for my $node (keys %$raw_kv) { + my $decoded = eval { decode_json($raw_kv->{$node}) }; + $node_kv->{$node} = $decoded if $decoded; + } + + my $expected = multipath_expected_nodes(); + + # resolve liveness for every node we might place in the matrix: those that broadcast and + # those a multipath storage expects (and that may be silent) + my $members = PVE::Cluster::get_members() // {}; + my $online = {}; + for my $node (keys %$node_kv, keys %$expected) { + # standalone clusters carry no member info; treat the reporter as live + $online->{$node} = + (!%$members || ($members->{$node} && $members->{$node}->{online})) ? 1 : 0; + } + + return PVE::Multipath::aggregate_cluster_status( + PVE::Multipath::Config::wwid_list($cfg), + PVE::Multipath::Config::aliases($cfg), + multipath_consumers(), + $node_kv, + $online, + $expected, + ); + }, +}); + __PACKAGE__->register_method({ name => 'set_overrides', path => '', diff --git a/src/PVE/Multipath.pm b/src/PVE/Multipath.pm index 5647189..2b93d57 100644 --- a/src/PVE/Multipath.pm +++ b/src/PVE/Multipath.pm @@ -333,4 +333,115 @@ sub broadcast_health { warn "multipath: health broadcast failed - $@" if $@; } +# Severity ordering for rolling per-node states up into a cluster state; a higher number is worse. +# 'unknown' is a stale or offline node and never drives the roll-up, so it sits below 'optimal'. +my $STATE_RANK = { + unknown => -1, + optimal => 0, + degraded => 1, + missing => 2, + failed => 3, +}; + +# Pure: fold the per-node health summaries (already JSON-decoded) into a per-WWID cluster matrix. +# Inputs: +# $allow_wwids arrayref, the cluster WWID allow-list +# $aliases { wwid => name } +# $used_by { wwid => storage-id } of consuming LVM storages +# $node_kv { node => summary } as broadcast by broadcast_health() +# $online { node => bool }; a node absent here counts as offline +# $expected { node => 1 } nodes where multipath storage is enabled, so an +# allow-listed LUN is supposed to be present there +# +# The cluster-state is rolled up over the nodes that should carry each LUN. A node that reports a +# summary but lacks the LUN, or an expected node that reports nothing at all (it lost every path and +# cleared its broadcast), is 'missing'; without the $expected set such a node would silently drop +# out of the view instead of going red. A node that carries a stale broadcast while offline, or an +# expected node that is offline, shows as 'unknown' and does not drive the roll-up. +sub aggregate_cluster_status { + my ($allow_wwids, $aliases, $used_by, $node_kv, $online, $expected) = @_; + + $allow_wwids //= []; + $aliases //= {}; + $used_by //= {}; + $node_kv //= {}; + $online //= {}; + $expected //= {}; + + my %allow = map { $_ => 1 } $allow_wwids->@*; + + # report the allow-list plus any WWID a node actually sees + my %wwids = %allow; + for my $node (keys %$node_kv) { + $wwids{$_} = 1 for keys $node_kv->{$node}->%*; + } + + my $res = []; + for my $wwid (sort keys %wwids) { + my $nodes = {}; + my $worst = 'optimal'; + my $have_active = 0; + my $size; + + my $rank = sub { + my ($state) = @_; + $worst = $state if $STATE_RANK->{$state} > $STATE_RANK->{$worst}; + }; + + for my $node (sort keys %$node_kv) { + my $entry = $node_kv->{$node}->{$wwid}; + + if (!$online->{$node}) { + $nodes->{$node} = { state => 'unknown' } if $entry; + next; + } + + $have_active = 1; + if ($entry) { + $nodes->{$node} = { + state => $entry->{state}, + 'paths-active' => $entry->{'paths-active'}, + 'paths-total' => $entry->{'paths-total'}, + defined($entry->{transport}) ? (transport => $entry->{transport}) : (), + }; + $size //= $entry->{size}; + $rank->($entry->{state}); + } else { + # node is actively multipathing but has not assembled this LUN + $nodes->{$node} = { state => 'missing' }; + $rank->('missing'); + } + } + + # A LUN on the allow-list should assemble on every node where a multipath storage is + # enabled. An expected node with no broadcast at all is missing the map (online) or + # unreachable (offline); fold it in so a node that lost all its paths surfaces instead of + # vanishing. + if ($allow{$wwid}) { + for my $node (sort keys %$expected) { + next if exists $nodes->{$node}; + if ($online->{$node}) { + $have_active = 1; + $nodes->{$node} = { state => 'missing' }; + $rank->('missing'); + } else { + $nodes->{$node} = { state => 'unknown' }; + } + } + } + + push $res->@*, + { + wwid => $wwid, + defined($aliases->{$wwid}) ? (alias => $aliases->{$wwid}) : (), + defined($used_by->{$wwid}) ? ('used-by' => $used_by->{$wwid}) : (), + defined($size) ? (size => $size) : (), + 'cluster-state' => $have_active ? $worst : 'unknown', + nodes => $nodes, + }; + } + + return $res; +} + 1; diff --git a/src/test/run_multipath_tests.pl b/src/test/run_multipath_tests.pl index affec23..9e7e1db 100755 --- a/src/test/run_multipath_tests.pl +++ b/src/test/run_multipath_tests.pl @@ -285,4 +285,139 @@ my $many = [ my $big = JSON::encode_json(PVE::Multipath::summarize_maps_for_broadcast($many)); ok(length($big) < 32 * 1024, "100-map summary (" . length($big) . " B) fits the KV size limit"); +# --- cluster status aggregation --- +my $node_kv = { + nodeA => { + wA => + { state => 'optimal', 'paths-active' => 2, 'paths-total' => 2, transport => 'iscsi' }, + wB => { + state => 'optimal', + 'paths-active' => 2, + 'paths-total' => 2, + transport => 'iscsi', + size => 42, + }, + }, + nodeB => { + wA => + { state => 'degraded', 'paths-active' => 1, 'paths-total' => 2, transport => 'iscsi' }, + # nodeB is active but does not see wB + }, + nodeC => { + # stale broadcast from an offline node + wA => { state => 'optimal', 'paths-active' => 2, 'paths-total' => 2 }, + }, +}; +my $agg = PVE::Multipath::aggregate_cluster_status( + ['wA', 'wB', 'wZ'], # allow-list incl. an unseen WWID + { wA => 'lun-a' }, # alias + { wB => 'mptank' }, # used-by + $node_kv, + { nodeA => 1, nodeB => 1, nodeC => 0 }, # nodeC offline +); +my %by_wwid = map { $_->{wwid} => $_ } $agg->@*; + +is_deeply([sort keys %by_wwid], ['wA', 'wB', 'wZ'], 'matrix covers allow-list and seen WWIDs'); + +is($by_wwid{wA}->{alias}, 'lun-a', 'alias surfaced on the WWID row'); +is($by_wwid{wA}->{'cluster-state'}, 'degraded', 'degraded on one active node rolls up to degraded'); +is($by_wwid{wA}->{nodes}->{nodeA}->{state}, 'optimal', 'per-node optimal cell kept'); +is($by_wwid{wA}->{nodes}->{nodeB}->{state}, 'degraded', 'per-node degraded cell kept'); +is($by_wwid{wA}->{nodes}->{nodeC}->{state}, 'unknown', + 'offline node with stale data shows unknown'); + +is($by_wwid{wB}->{'used-by'}, 'mptank', 'consuming storage surfaced as used-by'); +is($by_wwid{wB}->{size}, 42, 'LUN size surfaced from a reporting node'); +is( + $by_wwid{wB}->{'cluster-state'}, + 'missing', + 'active node not assembling the LUN rolls up to missing', +); +is( + $by_wwid{wB}->{nodes}->{nodeB}->{state}, + 'missing', + 'missing marked on the active node lacking it', +); + +is( + $by_wwid{wZ}->{'cluster-state'}, + 'missing', + 'allow-listed WWID no active node assembled is missing everywhere', +); +is($by_wwid{wZ}->{nodes}->{nodeA}->{state}, 'missing', 'active node missing the allow-listed WWID'); +ok(!exists $by_wwid{wZ}->{nodes}->{nodeC}, 'offline node contributes no cell for an unseen WWID'); + +# a WWID only an offline node ever reported, with no online active node, is unknown +my $agg_off = PVE::Multipath::aggregate_cluster_status( + ['wA'], + {}, + {}, + { dead => { wA => { state => 'optimal', 'paths-active' => 2, 'paths-total' => 2 } } }, + { dead => 0 }, +); +is( + $agg_off->[0]->{'cluster-state'}, + 'unknown', + 'no online active node leaves the cluster-state unknown', +); +is($agg_off->[0]->{nodes}->{dead}->{state}, 'unknown', 'stale offline node shown as unknown'); + +# failure outranks degraded in the roll-up +my $agg2 = PVE::Multipath::aggregate_cluster_status( + ['wA'], + {}, + {}, + { + n1 => { wA => { state => 'degraded', 'paths-active' => 1, 'paths-total' => 2 } }, + n2 => { wA => { state => 'failed', 'paths-active' => 0, 'paths-total' => 2 } }, + }, + { n1 => 1, n2 => 1 }, +); +is($agg2->[0]->{'cluster-state'}, 'failed', 'failed outranks degraded in the cluster roll-up'); + +# --- expected-node set: a node that lost all paths (silent) must not vanish --- +# nodeS is expected (a multipath storage is enabled there) and online, but +# broadcasts nothing - e.g. every path to the SAN is down so it cleared its KV. +my $exp_kv = { + nodeA => { wA => { state => 'optimal', 'paths-active' => 2, 'paths-total' => 2 } }, +}; +my $online = { nodeA => 1, nodeS => 1, nodeOff => 0 }; +my $expected = { nodeA => 1, nodeS => 1, nodeOff => 1 }; +my $eagg = PVE::Multipath::aggregate_cluster_status( + ['wA'], {}, {}, $exp_kv, $online, $expected, +); +my $row = $eagg->[0]; +is($row->{nodes}->{nodeA}->{state}, 'optimal', 'reporting node keeps its real state'); +is( + $row->{nodes}->{nodeS}->{state}, + 'missing', + 'expected online but silent node shows missing instead of vanishing', +); +is($row->{nodes}->{nodeOff}->{state}, 'unknown', 'expected offline node shows unknown'); +is($row->{'cluster-state'}, 'missing', 'a silent expected node drags the cluster-state to missing'); + +# without $expected the silent node would have been invisible (regression guard +# for the old behavior, proving the new param is what surfaces it) +my $noexp = PVE::Multipath::aggregate_cluster_status(['wA'], {}, {}, $exp_kv, $online); +ok( + !exists $noexp->[0]->{nodes}->{nodeS}, + 'without the expected set the silent node is absent (the gap the param closes)', +); +is($noexp->[0]->{'cluster-state'}, 'optimal', 'and the cluster-state would falsely read optimal'); + +# expected augmentation applies only to allow-listed WWIDs, not to a LUN that a +# node merely happens to report off-list +my $offlist = PVE::Multipath::aggregate_cluster_status( + [], + {}, + {}, + { nodeA => { wX => { state => 'optimal', 'paths-active' => 1, 'paths-total' => 1 } } }, + { nodeA => 1, nodeS => 1 }, + { nodeA => 1, nodeS => 1 }, +); +ok( + !exists $offlist->[0]->{nodes}->{nodeS}, + 'non-allow-listed WWID does not synthesize missing cells for expected nodes', +); + done_testing(); -- 2.47.3