From: Stefan Reiter <s.reiter@proxmox.com>
To: pve-devel@lists.proxmox.com
Subject: [pve-devel] [PATCH v2 qemu 5/6] PVE: fix and clean up error handling for create_backup_jobs
Date: Thu, 29 Oct 2020 14:10:35 +0100 [thread overview]
Message-ID: <20201029131036.11786-6-s.reiter@proxmox.com> (raw)
In-Reply-To: <20201029131036.11786-1-s.reiter@proxmox.com>
No more weird bool returns, just the standard "errp" format used
everywhere else too. With this, if backup_job_create fails, the error
message is actually returned over QMP and can be shown to the user.
To facilitate correct cleanup on such an error, we call
create_backup_jobs as a bottom half directly from pvebackup_co_prepare.
This additionally allows us to actually hold the backup_mutex during
operation.
Also add a job_cancel_sync before job_unref, since a job must be in
STATUS_NULL to be deleted by unref, which could trigger an assert
before.
Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
---
v2: new
Error handling in create_backup_jobs was pretty much nonexistant, lots of weird
things would happen. Clean it up and fix those issues at once.
pve-backup.c | 79 +++++++++++++++++++++++++++++++++++-----------------
1 file changed, 54 insertions(+), 25 deletions(-)
diff --git a/pve-backup.c b/pve-backup.c
index 0466145bec..1a2647e7a5 100644
--- a/pve-backup.c
+++ b/pve-backup.c
@@ -50,6 +50,7 @@ static struct PVEBackupState {
size_t zero_bytes;
GList *bitmap_list;
bool finishing;
+ bool starting;
} stat;
int64_t speed;
VmaWriter *vmaw;
@@ -277,6 +278,16 @@ static void coroutine_fn pvebackup_co_complete_stream(void *opaque)
PVEBackupDevInfo *di = opaque;
int ret = di->completed_ret;
+ qemu_mutex_lock(&backup_state.stat.lock);
+ bool starting = backup_state.stat.starting;
+ qemu_mutex_unlock(&backup_state.stat.lock);
+ if (starting) {
+ /* in 'starting' state, no tasks have been run yet, meaning we can (and
+ * must) skip all cleanup, as we don't know what has and hasn't been
+ * initialized yet. */
+ return;
+ }
+
qemu_co_mutex_lock(&backup_state.backup_mutex);
if (ret < 0) {
@@ -441,15 +452,15 @@ static int coroutine_fn pvebackup_co_add_config(
/*
* backup_job_create can *not* be run from a coroutine (and requires an
* acquired AioContext), so this can't either.
- * This does imply that this function cannot run with backup_mutex acquired.
- * That is ok because it is only ever called between setting up the backup_state
- * struct and starting the jobs, and from within a QMP call. This means that no
- * other QMP call can interrupt, and no background job is running yet.
+ * The caller is responsible that backup_mutex is held nonetheless.
*/
-static bool create_backup_jobs(void) {
+static void create_backup_jobs_bh(void *opaque) {
assert(!qemu_in_coroutine());
+ CoCtxData *data = (CoCtxData*)opaque;
+ Error **errp = (Error**)data->data;
+
Error *local_err = NULL;
/* create job transaction to synchronize bitmap commit and cancel all
@@ -483,24 +494,19 @@ static bool create_backup_jobs(void) {
aio_context_release(aio_context);
- if (!job || local_err != NULL) {
- Error *create_job_err = NULL;
- error_setg(&create_job_err, "backup_job_create failed: %s",
- local_err ? error_get_pretty(local_err) : "null");
+ di->job = job;
- pvebackup_propagate_error(create_job_err);
+ if (!job || local_err) {
+ error_setg(errp, "backup_job_create failed: %s",
+ local_err ? error_get_pretty(local_err) : "null");
break;
}
- di->job = job;
-
bdrv_unref(di->target);
di->target = NULL;
}
- bool errors = pvebackup_error_or_canceled();
-
- if (errors) {
+ if (*errp) {
l = backup_state.di_list;
while (l) {
PVEBackupDevInfo *di = (PVEBackupDevInfo *)l->data;
@@ -512,12 +518,17 @@ static bool create_backup_jobs(void) {
}
if (di->job) {
+ AioContext *ctx = di->job->job.aio_context;
+ aio_context_acquire(ctx);
+ job_cancel_sync(&di->job->job);
job_unref(&di->job->job);
+ aio_context_release(ctx);
}
}
}
- return errors;
+ /* return */
+ aio_co_enter(data->ctx, data->co);
}
typedef struct QmpBackupTask {
@@ -883,6 +894,7 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque)
backup_state.stat.transferred = 0;
backup_state.stat.zero_bytes = 0;
backup_state.stat.finishing = false;
+ backup_state.stat.starting = true;
qemu_mutex_unlock(&backup_state.stat.lock);
@@ -898,7 +910,32 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque)
task->result = uuid_info;
+ /* Run create_backup_jobs_bh outside of coroutine (in BH) but keep
+ * backup_mutex locked. This is fine, a CoMutex can be held across yield
+ * points, and we'll release it as soon as the BH reschedules us.
+ */
+ CoCtxData waker = {
+ .co = qemu_coroutine_self(),
+ .ctx = qemu_get_current_aio_context(),
+ .data = &local_err,
+ };
+ aio_bh_schedule_oneshot(waker.ctx, create_backup_jobs_bh, &waker);
+ qemu_coroutine_yield();
+
+ if (local_err) {
+ error_propagate(task->errp, local_err);
+ goto err;
+ }
+
qemu_co_mutex_unlock(&backup_state.backup_mutex);
+
+ qemu_mutex_lock(&backup_state.stat.lock);
+ backup_state.stat.starting = false;
+ qemu_mutex_unlock(&backup_state.stat.lock);
+
+ /* start the first job in the transaction */
+ job_txn_start_seq(backup_state.txn);
+
return;
err_mutex:
@@ -921,6 +958,7 @@ err:
g_free(di);
}
g_list_free(di_list);
+ backup_state.di_list = NULL;
if (devs) {
g_strfreev(devs);
@@ -998,15 +1036,6 @@ UuidInfo *qmp_backup(
block_on_coroutine_fn(pvebackup_co_prepare, &task);
- if (*errp == NULL) {
- bool errors = create_backup_jobs();
-
- if (!errors) {
- // start the first job in the transaction
- job_txn_start_seq(backup_state.txn);
- }
- }
-
return task.result;
}
--
2.20.1
next prev parent reply other threads:[~2020-10-29 13:11 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-10-29 13:10 [pve-devel] [PATCH v2 0/6] QEMU backup cancellation fixes Stefan Reiter
2020-10-29 13:10 ` [pve-devel] [PATCH v2 qemu 1/6] PVE: fixup: drop CoMutex on error return Stefan Reiter
2020-10-29 13:10 ` [pve-devel] [PATCH v2 qemu 2/6] PVE: Introduce generic CoCtxData struct Stefan Reiter
2020-10-29 13:10 ` [pve-devel] [PATCH v2 qemu 3/6] PVE: Don't expect complete_cb to be called outside coroutine Stefan Reiter
2020-10-29 13:10 ` [pve-devel] [PATCH v2 qemu 4/6] PVE: Don't call job_cancel in coroutines Stefan Reiter
2020-10-29 13:10 ` Stefan Reiter [this message]
2020-10-29 13:10 ` [pve-devel] [PATCH v2 qemu 6/6] Several fixes for backup abort and error reporting Stefan Reiter
2020-10-29 17:27 ` [pve-devel] applied: " Thomas Lamprecht
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20201029131036.11786-6-s.reiter@proxmox.com \
--to=s.reiter@proxmox.com \
--cc=pve-devel@lists.proxmox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal