From: Stefan Reiter <s.reiter@proxmox.com>
To: pve-devel@lists.proxmox.com
Date: Thu, 29 Oct 2020 14:10:35 +0100
Message-Id: <20201029131036.11786-6-s.reiter@proxmox.com>
In-Reply-To: <20201029131036.11786-1-s.reiter@proxmox.com>
References: <20201029131036.11786-1-s.reiter@proxmox.com>
List-Id: Proxmox VE development discussion
Subject: [pve-devel] [PATCH v2 qemu 5/6] PVE: fix and clean up error handling for create_backup_jobs

No more weird bool returns, just the standard "errp" format used
everywhere else. With this, if backup_job_create fails, the error
message is actually returned over QMP and can be shown to the user.

To facilitate correct cleanup on such an error, we call
create_backup_jobs as a bottom half directly from pvebackup_co_prepare.
This additionally allows us to actually hold the backup_mutex during
the operation.

Also add a job_cancel_sync before job_unref, since a job must be in
STATUS_NULL to be deleted by unref; previously this could trigger an
assertion failure.

Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
---

v2: new

Error handling in create_backup_jobs was pretty much nonexistent; lots
of weird things could happen. Clean it up and fix those issues at once.
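For context, the coroutine-to-BH round trip this patch relies on looks
roughly like the sketch below. This is an illustration only, not code from
the patch: Waker, do_work_bh and run_in_bh_and_wait are made-up names,
while aio_bh_schedule_oneshot(), qemu_coroutine_yield() and aio_co_enter()
are the actual QEMU primitives (the snippet assumes it is compiled inside
the QEMU tree so the includes resolve).

    #include "qemu/osdep.h"
    #include "qemu/coroutine.h"
    #include "qapi/error.h"
    #include "block/aio.h"

    /* illustrative stand-in for CoCtxData */
    typedef struct Waker {
        Coroutine *co;    /* coroutine to resume once the BH is done */
        AioContext *ctx;  /* context that coroutine is running in */
        Error **errp;     /* out-parameter the BH may fill */
    } Waker;

    static void do_work_bh(void *opaque)
    {
        Waker *w = opaque;
        assert(!qemu_in_coroutine()); /* BHs run outside coroutine context */
        /* ... non-coroutine work goes here, possibly setting *w->errp ... */
        aio_co_enter(w->ctx, w->co);  /* reschedule the yielded coroutine */
    }

    static void coroutine_fn run_in_bh_and_wait(Error **errp)
    {
        Waker w = {
            .co = qemu_coroutine_self(),
            .ctx = qemu_get_current_aio_context(),
            .errp = errp,
        };
        aio_bh_schedule_oneshot(w.ctx, do_work_bh, &w);
        qemu_coroutine_yield(); /* woken by aio_co_enter() in the BH */
        /* once we get here, *errp reflects the BH's outcome */
    }

A CoMutex, unlike a plain QemuMutex, may legitimately stay locked across
the yield, which is what allows pvebackup_co_prepare to keep backup_mutex
held while the BH creates the jobs.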
 pve-backup.c | 79 +++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 54 insertions(+), 25 deletions(-)

diff --git a/pve-backup.c b/pve-backup.c
index 0466145bec..1a2647e7a5 100644
--- a/pve-backup.c
+++ b/pve-backup.c
@@ -50,6 +50,7 @@ static struct PVEBackupState {
         size_t zero_bytes;
         GList *bitmap_list;
         bool finishing;
+        bool starting;
     } stat;
     int64_t speed;
     VmaWriter *vmaw;
@@ -277,6 +278,16 @@ static void coroutine_fn pvebackup_co_complete_stream(void *opaque)
     PVEBackupDevInfo *di = opaque;
     int ret = di->completed_ret;
 
+    qemu_mutex_lock(&backup_state.stat.lock);
+    bool starting = backup_state.stat.starting;
+    qemu_mutex_unlock(&backup_state.stat.lock);
+    if (starting) {
+        /* in 'starting' state, no tasks have been run yet, meaning we can (and
+         * must) skip all cleanup, as we don't know what has and hasn't been
+         * initialized yet. */
+        return;
+    }
+
     qemu_co_mutex_lock(&backup_state.backup_mutex);
 
     if (ret < 0) {
@@ -441,15 +452,15 @@ static int coroutine_fn pvebackup_co_add_config(
 /*
  * backup_job_create can *not* be run from a coroutine (and requires an
  * acquired AioContext), so this can't either.
- * This does imply that this function cannot run with backup_mutex acquired.
- * That is ok because it is only ever called between setting up the backup_state
- * struct and starting the jobs, and from within a QMP call. This means that no
- * other QMP call can interrupt, and no background job is running yet.
+ * The caller is responsible that backup_mutex is held nonetheless.
  */
-static bool create_backup_jobs(void) {
+static void create_backup_jobs_bh(void *opaque) {
 
     assert(!qemu_in_coroutine());
 
+    CoCtxData *data = (CoCtxData*)opaque;
+    Error **errp = (Error**)data->data;
+
     Error *local_err = NULL;
 
     /* create job transaction to synchronize bitmap commit and cancel all
@@ -483,24 +494,19 @@ static bool create_backup_jobs(void) {
 
         aio_context_release(aio_context);
 
-        if (!job || local_err != NULL) {
-            Error *create_job_err = NULL;
-            error_setg(&create_job_err, "backup_job_create failed: %s",
-                       local_err ? error_get_pretty(local_err) : "null");
+        di->job = job;
 
-            pvebackup_propagate_error(create_job_err);
+        if (!job || local_err) {
+            error_setg(errp, "backup_job_create failed: %s",
+                       local_err ? error_get_pretty(local_err) : "null");
             break;
         }
 
-        di->job = job;
-
         bdrv_unref(di->target);
         di->target = NULL;
     }
 
-    bool errors = pvebackup_error_or_canceled();
-
-    if (errors) {
+    if (*errp) {
         l = backup_state.di_list;
         while (l) {
             PVEBackupDevInfo *di = (PVEBackupDevInfo *)l->data;
@@ -512,12 +518,17 @@ static bool create_backup_jobs(void) {
             }
 
             if (di->job) {
+                AioContext *ctx = di->job->job.aio_context;
+                aio_context_acquire(ctx);
+                job_cancel_sync(&di->job->job);
                 job_unref(&di->job->job);
+                aio_context_release(ctx);
             }
         }
     }
 
-    return errors;
+    /* return */
+    aio_co_enter(data->ctx, data->co);
 }
 
 typedef struct QmpBackupTask {
@@ -883,6 +894,7 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque)
     backup_state.stat.transferred = 0;
     backup_state.stat.zero_bytes = 0;
     backup_state.stat.finishing = false;
+    backup_state.stat.starting = true;
 
     qemu_mutex_unlock(&backup_state.stat.lock);
 
@@ -898,7 +910,32 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque)
 
     task->result = uuid_info;
 
+    /* Run create_backup_jobs_bh outside of coroutine (in BH) but keep
+     * backup_mutex locked. This is fine, a CoMutex can be held across yield
+     * points, and we'll release it as soon as the BH reschedules us.
+     */
+    CoCtxData waker = {
+        .co = qemu_coroutine_self(),
+        .ctx = qemu_get_current_aio_context(),
+        .data = &local_err,
+    };
+    aio_bh_schedule_oneshot(waker.ctx, create_backup_jobs_bh, &waker);
+    qemu_coroutine_yield();
+
+    if (local_err) {
+        error_propagate(task->errp, local_err);
+        goto err;
+    }
+
     qemu_co_mutex_unlock(&backup_state.backup_mutex);
+
+    qemu_mutex_lock(&backup_state.stat.lock);
+    backup_state.stat.starting = false;
+    qemu_mutex_unlock(&backup_state.stat.lock);
+
+    /* start the first job in the transaction */
+    job_txn_start_seq(backup_state.txn);
+
     return;
 
 err_mutex:
@@ -921,6 +958,7 @@ err:
         g_free(di);
     }
     g_list_free(di_list);
+    backup_state.di_list = NULL;
 
     if (devs) {
         g_strfreev(devs);
@@ -998,15 +1036,6 @@ UuidInfo *qmp_backup(
 
     block_on_coroutine_fn(pvebackup_co_prepare, &task);
 
-    if (*errp == NULL) {
-        bool errors = create_backup_jobs();
-
-        if (!errors) {
-            // start the first job in the transaction
-            job_txn_start_seq(backup_state.txn);
-        }
-    }
-
     return task.result;
 }
-- 
2.20.1