* [PATCH proxmox-backup v5 1/3] api: move statefile loading into compute_schedule_status
2026-04-13 13:19 [PATCH proxmox-backup v5 0/3] fix #7400: improve handling of corrupted job statefiles Michael Köppl
@ 2026-04-13 13:19 ` Michael Köppl
2026-04-13 13:19 ` [PATCH proxmox-backup v5 2/3] fix #7400: api: gracefully handle corrupted job statefiles Michael Köppl
2026-04-13 13:20 ` [PATCH proxmox-backup v5 3/3] fix #7400: proxy: self-heal " Michael Köppl
2 siblings, 0 replies; 4+ messages in thread
From: Michael Köppl @ 2026-04-13 13:19 UTC (permalink / raw)
To: pbs-devel
Centralize loading of the job statefiles in compute_schedule_status,
reducing code duplication across the job management API endpoints.
This also changes the error handling for UPID parsing errors in
garbage collection statefiles, aligning it with the behavior of the
other API handlers.
Signed-off-by: Michael Köppl <m.koeppl@proxmox.com>
---
src/api2/admin/datastore.rs | 15 ++++++---------
src/api2/admin/prune.rs | 9 +++------
src/api2/admin/sync.rs | 9 +++------
src/api2/admin/verify.rs | 9 +++------
src/api2/tape/backup.rs | 9 +++------
src/server/jobstate.rs | 8 ++++++--
6 files changed, 24 insertions(+), 35 deletions(-)
diff --git a/src/api2/admin/datastore.rs b/src/api2/admin/datastore.rs
index fcb81ec5b..757b31145 100644
--- a/src/api2/admin/datastore.rs
+++ b/src/api2/admin/datastore.rs
@@ -1172,19 +1172,14 @@ pub fn garbage_collection_status(
let datastore = DataStore::lookup_datastore(lookup_with(&store, Operation::Read))?;
let status_in_memory = datastore.last_gc_status();
- let state_file = JobState::load("garbage_collection", &store)
- .map_err(|err| log::error!("could not open GC statefile for {store}: {err}"))
- .ok();
let mut last = proxmox_time::epoch_i64();
+ let jobtype = "garbage_collection";
+
if let Some(ref upid) = status_in_memory.upid {
- let mut computed_schedule: JobScheduleStatus = JobScheduleStatus::default();
- if let Some(state) = state_file {
- if let Ok(cs) = compute_schedule_status(&state, Some(upid)) {
- computed_schedule = cs;
- }
- }
+ let computed_schedule: JobScheduleStatus =
+ compute_schedule_status(jobtype, &store, Some(upid))?;
if let Some(endtime) = computed_schedule.last_run_endtime {
last = endtime;
@@ -1196,6 +1191,8 @@ pub fn garbage_collection_status(
info.next_run = computed_schedule.next_run;
info.last_run_endtime = computed_schedule.last_run_endtime;
info.last_run_state = computed_schedule.last_run_state;
+ } else if let Err(err) = JobState::load(jobtype, &store) {
+ log::error!("could not open statefile for {store}: {err}");
}
info.next_run = info
diff --git a/src/api2/admin/prune.rs b/src/api2/admin/prune.rs
index a5ebf2975..1b1d2f1ba 100644
--- a/src/api2/admin/prune.rs
+++ b/src/api2/admin/prune.rs
@@ -1,6 +1,6 @@
//! Datastore Prune Job Management
-use anyhow::{format_err, Error};
+use anyhow::Error;
use serde_json::Value;
use proxmox_router::{
@@ -18,7 +18,7 @@ use pbs_config::CachedUserInfo;
use crate::server::{
do_prune_job,
- jobstate::{compute_schedule_status, Job, JobState},
+ jobstate::{compute_schedule_status, Job},
};
#[api(
@@ -73,10 +73,7 @@ pub fn list_prune_jobs(
let mut list = Vec::new();
for job in job_config_iter {
- let last_state = JobState::load("prunejob", &job.id)
- .map_err(|err| format_err!("could not open statefile for {}: {}", &job.id, err))?;
-
- let mut status = compute_schedule_status(&last_state, Some(&job.schedule))?;
+ let mut status = compute_schedule_status("prunejob", &job.id, Some(&job.schedule))?;
if job.disable {
status.next_run = None;
}
diff --git a/src/api2/admin/sync.rs b/src/api2/admin/sync.rs
index 6722ebea0..2384ede75 100644
--- a/src/api2/admin/sync.rs
+++ b/src/api2/admin/sync.rs
@@ -1,6 +1,6 @@
//! Datastore Synchronization Job Management
-use anyhow::{bail, format_err, Error};
+use anyhow::{bail, Error};
use serde::{Deserialize, Serialize};
use serde_json::Value;
@@ -19,7 +19,7 @@ use pbs_config::CachedUserInfo;
use crate::{
api2::config::sync::{check_sync_job_modify_access, check_sync_job_read_access},
- server::jobstate::{compute_schedule_status, Job, JobState},
+ server::jobstate::{compute_schedule_status, Job},
server::sync::do_sync_job,
};
@@ -112,10 +112,7 @@ pub fn list_config_sync_jobs(
continue;
}
- let last_state = JobState::load("syncjob", &job.id)
- .map_err(|err| format_err!("could not open statefile for {}: {}", &job.id, err))?;
-
- let status = compute_schedule_status(&last_state, job.schedule.as_deref())?;
+ let status = compute_schedule_status("syncjob", &job.id, job.schedule.as_deref())?;
list.push(SyncJobStatus {
config: job,
diff --git a/src/api2/admin/verify.rs b/src/api2/admin/verify.rs
index 66695236c..af5b7fff4 100644
--- a/src/api2/admin/verify.rs
+++ b/src/api2/admin/verify.rs
@@ -1,6 +1,6 @@
//! Datastore Verify Job Management
-use anyhow::{format_err, Error};
+use anyhow::Error;
use serde_json::Value;
use proxmox_router::{
@@ -19,7 +19,7 @@ use pbs_config::CachedUserInfo;
use crate::server::{
do_verification_job,
- jobstate::{compute_schedule_status, Job, JobState},
+ jobstate::{compute_schedule_status, Job},
};
#[api(
@@ -73,10 +73,7 @@ pub fn list_verification_jobs(
let mut list = Vec::new();
for job in job_config_iter {
- let last_state = JobState::load("verificationjob", &job.id)
- .map_err(|err| format_err!("could not open statefile for {}: {}", &job.id, err))?;
-
- let status = compute_schedule_status(&last_state, job.schedule.as_deref())?;
+ let status = compute_schedule_status("verificationjob", &job.id, job.schedule.as_deref())?;
list.push(VerificationJobStatus {
config: job,
diff --git a/src/api2/tape/backup.rs b/src/api2/tape/backup.rs
index c254c6d8b..cd68fe279 100644
--- a/src/api2/tape/backup.rs
+++ b/src/api2/tape/backup.rs
@@ -1,6 +1,6 @@
use std::sync::{Arc, Mutex};
-use anyhow::{bail, format_err, Error};
+use anyhow::{bail, Error};
use serde_json::Value;
use tracing::{info, warn};
@@ -23,7 +23,7 @@ use pbs_datastore::{DataStore, StoreProgress};
use crate::tape::{assert_datastore_type, TapeNotificationMode};
use crate::{
server::{
- jobstate::{compute_schedule_status, Job, JobState},
+ jobstate::{compute_schedule_status, Job},
TapeBackupJobSummary,
},
tape::{
@@ -97,10 +97,7 @@ pub fn list_tape_backup_jobs(
continue;
}
- let last_state = JobState::load("tape-backup-job", &job.id)
- .map_err(|err| format_err!("could not open statefile for {}: {}", &job.id, err))?;
-
- let status = compute_schedule_status(&last_state, job.schedule.as_deref())?;
+ let status = compute_schedule_status("tape-backup-job", &job.id, job.schedule.as_deref())?;
let next_run = status.next_run.unwrap_or(current_time);
diff --git a/src/server/jobstate.rs b/src/server/jobstate.rs
index dc9f6c90d..ceac8dde8 100644
--- a/src/server/jobstate.rs
+++ b/src/server/jobstate.rs
@@ -301,11 +301,15 @@ impl Job {
}
pub fn compute_schedule_status(
- job_state: &JobState,
+ jobtype: &str,
+ jobname: &str,
schedule: Option<&str>,
) -> Result<JobScheduleStatus, Error> {
+ let job_state = JobState::load(jobtype, jobname)
+ .map_err(|err| format_err!("could not open statefile for {jobname}: {err}"))?;
+
let (upid, endtime, state, last) = match job_state {
- JobState::Created { time } => (None, None, None, *time),
+ JobState::Created { time } => (None, None, None, time),
JobState::Started { upid } => {
let parsed_upid: UPID = upid.parse()?;
(Some(upid), None, None, parsed_upid.starttime)
--
2.47.3
^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH proxmox-backup v5 2/3] fix #7400: api: gracefully handle corrupted job statefiles
2026-04-13 13:19 [PATCH proxmox-backup v5 0/3] fix #7400: improve handling of corrupted job statefiles Michael Köppl
2026-04-13 13:19 ` [PATCH proxmox-backup v5 1/3] api: move statefile loading into compute_schedule_status Michael Köppl
@ 2026-04-13 13:19 ` Michael Köppl
2026-04-13 13:20 ` [PATCH proxmox-backup v5 3/3] fix #7400: proxy: self-heal " Michael Köppl
2 siblings, 0 replies; 4+ messages in thread
From: Michael Köppl @ 2026-04-13 13:19 UTC (permalink / raw)
To: pbs-devel
Introduce a new JobState::Unknown variant to more explicitly represent
cases where the state could not be determined, e.g. if the statefile was
corrupted or missing. Update JobState::load to handle parsing errors
(both for the statefiles themselves and for UPIDs) and return the
Unknown state if such an error occurs. Update compute_schedule_status to
also handle the new Unknown state by returning a default
JobScheduleStatus, so that API endpoints no longer return an error that
would stop users from viewing their jobs.
Signed-off-by: Michael Köppl <m.koeppl@proxmox.com>
---
src/server/jobstate.rs | 63 +++++++++++++++++++++++++++++++++++++-----
1 file changed, 56 insertions(+), 7 deletions(-)
diff --git a/src/server/jobstate.rs b/src/server/jobstate.rs
index ceac8dde8..94f3a5e5d 100644
--- a/src/server/jobstate.rs
+++ b/src/server/jobstate.rs
@@ -66,6 +66,9 @@ pub enum JobState {
state: TaskState,
updated: Option<i64>,
},
+ /// The job's state could not be determined (e.g. because the state file was corrupted, does not
+ /// exist)
+ Unknown,
}
/// Represents a Job and holds the correct lock
@@ -77,6 +80,9 @@ pub struct Job {
_lock: BackupLockGuard,
}
+/// Fallback offset (in seconds) used for job schedules when the last run time is unknown
+pub const SCHEDULE_FALLBACK_OFFSET: i64 = 30;
+
const JOB_STATE_BASEDIR: &str = concat!(PROXMOX_BACKUP_STATE_DIR_M!(), "/jobstates");
/// Create jobstate stat dir with correct permission
@@ -155,6 +161,7 @@ pub fn update_job_last_run_time(jobtype: &str, jobname: &str) -> Result<(), Erro
state,
updated: Some(time),
},
+ JobState::Unknown => bail!("cannot update last run time for unknown job state"),
};
job.write_state()
}
@@ -179,6 +186,7 @@ pub fn last_run_time(jobtype: &str, jobname: &str) -> Result<i64, Error> {
.map_err(|err| format_err!("could not parse upid from state: {err}"))?;
Ok(upid.starttime)
}
+ JobState::Unknown => bail!("statefile could not be parsed or was empty"),
}
}
@@ -191,11 +199,23 @@ impl JobState {
/// This does not update the state in the file.
pub fn load(jobtype: &str, jobname: &str) -> Result<Self, Error> {
if let Some(state) = file_read_optional_string(get_path(jobtype, jobname))? {
- match serde_json::from_str(&state)? {
+ let job_state = match serde_json::from_str(&state) {
+ Ok(parsed_state) => parsed_state,
+ Err(err) => {
+ log::error!("could not parse statefile for {jobname}: {err}");
+ return Ok(JobState::Unknown);
+ }
+ };
+
+ match job_state {
JobState::Started { upid } => {
- let parsed: UPID = upid
- .parse()
- .map_err(|err| format_err!("error parsing upid: {err}"))?;
+ let parsed: UPID = match upid.parse() {
+ Ok(parsed) => parsed,
+ Err(err) => {
+ log::error!("error parsing upid for {jobname}: {err}");
+ return Ok(JobState::Unknown);
+ }
+ };
if !worker_is_active_local(&parsed) {
let state = upid_read_status(&parsed).unwrap_or(TaskState::Unknown {
@@ -211,11 +231,26 @@ impl JobState {
Ok(JobState::Started { upid })
}
}
+ JobState::Finished {
+ upid,
+ state,
+ updated,
+ } => {
+ if let Err(err) = upid.parse::<UPID>() {
+ log::error!("error parsing upid for {jobname}: {err}");
+ return Ok(JobState::Unknown);
+ }
+ Ok(JobState::Finished {
+ upid,
+ state,
+ updated,
+ })
+ }
other => Ok(other),
}
} else {
Ok(JobState::Created {
- time: proxmox_time::epoch_i64() - 30,
+ time: proxmox_time::epoch_i64() - SCHEDULE_FALLBACK_OFFSET,
})
}
}
@@ -263,6 +298,7 @@ impl Job {
JobState::Created { .. } => bail!("cannot finish when not started"),
JobState::Started { upid } => upid,
JobState::Finished { upid, .. } => upid,
+ JobState::Unknown => bail!("cannot finish job with unknown status"),
}
.to_string();
@@ -305,8 +341,15 @@ pub fn compute_schedule_status(
jobname: &str,
schedule: Option<&str>,
) -> Result<JobScheduleStatus, Error> {
- let job_state = JobState::load(jobtype, jobname)
- .map_err(|err| format_err!("could not open statefile for {jobname}: {err}"))?;
+ let job_state = match JobState::load(jobtype, jobname) {
+ Ok(job_state) => job_state,
+ Err(err) => {
+ log::error!(
+ "could not open statefile for {jobname}: {err} - falling back to default job schedule status",
+ );
+ return Ok(JobScheduleStatus::default());
+ }
+ };
let (upid, endtime, state, last) = match job_state {
JobState::Created { time } => (None, None, None, time),
@@ -327,6 +370,12 @@ pub fn compute_schedule_status(
last,
)
}
+ JobState::Unknown => (
+ None,
+ None,
+ None,
+ proxmox_time::epoch_i64() - SCHEDULE_FALLBACK_OFFSET,
+ ),
};
let mut status = JobScheduleStatus {
--
2.47.3
^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH proxmox-backup v5 3/3] fix #7400: proxy: self-heal corrupted job statefiles
2026-04-13 13:19 [PATCH proxmox-backup v5 0/3] fix #7400: improve handling of corrupted job statefiles Michael Köppl
2026-04-13 13:19 ` [PATCH proxmox-backup v5 1/3] api: move statefile loading into compute_schedule_status Michael Köppl
2026-04-13 13:19 ` [PATCH proxmox-backup v5 2/3] fix #7400: api: gracefully handle corrupted job statefiles Michael Köppl
@ 2026-04-13 13:20 ` Michael Köppl
2 siblings, 0 replies; 4+ messages in thread
From: Michael Köppl @ 2026-04-13 13:20 UTC (permalink / raw)
To: pbs-devel
Update update_job_last_run_time to transition JobState::Unknown into
JobState::Created so the corrupted statefile is overwritten. In
addition, update the scheduling loops to actively overwrite corrupted
statefiles and return the time for the next scheduled run of the
affected job.
Signed-off-by: Michael Köppl <m.koeppl@proxmox.com>
---
src/bin/proxmox-backup-proxy.rs | 6 ++++--
src/server/jobstate.rs | 2 +-
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/src/bin/proxmox-backup-proxy.rs b/src/bin/proxmox-backup-proxy.rs
index 2932e4d7b..b18550420 100644
--- a/src/bin/proxmox-backup-proxy.rs
+++ b/src/bin/proxmox-backup-proxy.rs
@@ -33,7 +33,7 @@ use pbs_datastore::DataStore;
use proxmox_backup::{
server::{
auth::check_pbs_auth,
- jobstate::{self, Job},
+ jobstate::{self, Job, SCHEDULE_FALLBACK_OFFSET},
},
traffic_control_cache::{SharedRateLimit, TRAFFIC_CONTROL_CACHE},
};
@@ -567,6 +567,7 @@ async fn schedule_datastore_garbage_collection() {
Ok(time) => time,
Err(err) => {
eprintln!("could not get last run time of {worker_type} {store}: {err}");
+ let _ = jobstate::update_job_last_run_time(worker_type, &store);
continue;
}
};
@@ -1026,7 +1027,8 @@ fn check_schedule(worker_type: &str, event_str: &str, id: &str) -> bool {
Ok(time) => time,
Err(err) => {
eprintln!("could not get last run time of {worker_type} {id}: {err}");
- return false;
+ let _ = jobstate::update_job_last_run_time(worker_type, id);
+ proxmox_time::epoch_i64() - SCHEDULE_FALLBACK_OFFSET
}
};
diff --git a/src/server/jobstate.rs b/src/server/jobstate.rs
index 94f3a5e5d..fb96ea1da 100644
--- a/src/server/jobstate.rs
+++ b/src/server/jobstate.rs
@@ -161,7 +161,7 @@ pub fn update_job_last_run_time(jobtype: &str, jobname: &str) -> Result<(), Erro
state,
updated: Some(time),
},
- JobState::Unknown => bail!("cannot update last run time for unknown job state"),
+ JobState::Unknown => JobState::Created { time },
};
job.write_state()
}
--
2.47.3
^ permalink raw reply [flat|nested] 4+ messages in thread