public inbox for pve-devel@lists.proxmox.com
 help / color / mirror / Atom feed
From: Fiona Ebner <f.ebner@proxmox.com>
To: pve-devel@lists.proxmox.com
Subject: [pve-devel] [PATCH v2 qemu 1/2] PVE Backup: fixup error handling for fleecing
Date: Tue, 25 Jun 2024 15:35:50 +0200	[thread overview]
Message-ID: <20240625133551.210636-1-f.ebner@proxmox.com> (raw)

The drained section needs to be terminated before breaking out of the
loop in the error scenarios. Otherwise, guest IO on the drive would
become stuck.

If the job is created successfully, then the job completion callback
will clean up the snapshot access block nodes. But if a failure
happens before the job is created, there is no cleanup for the
snapshot access block nodes yet. Add it.

Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
---

Changes in v2:
    * also clean up block nodes

 .../0050-PVE-backup-add-fleecing-option.patch | 59 +++++++++++++------
 ...ve-error-when-copy-before-write-fail.patch | 23 ++++----
 2 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/debian/patches/pve/0050-PVE-backup-add-fleecing-option.patch b/debian/patches/pve/0050-PVE-backup-add-fleecing-option.patch
index dbb2883..0ba6235 100644
--- a/debian/patches/pve/0050-PVE-backup-add-fleecing-option.patch
+++ b/debian/patches/pve/0050-PVE-backup-add-fleecing-option.patch
@@ -63,9 +63,9 @@ Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
 ---
  block/monitor/block-hmp-cmds.c |   1 +
- pve-backup.c                   | 135 ++++++++++++++++++++++++++++++++-
+ pve-backup.c                   | 144 ++++++++++++++++++++++++++++++++-
  qapi/block-core.json           |  10 ++-
- 3 files changed, 142 insertions(+), 4 deletions(-)
+ 3 files changed, 151 insertions(+), 4 deletions(-)
 
 diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
 index 5000c084c5..70b3de4c7e 100644
@@ -80,7 +80,7 @@ index 5000c084c5..70b3de4c7e 100644
  
      hmp_handle_error(mon, error);
 diff --git a/pve-backup.c b/pve-backup.c
-index 5ebb6a3947..a747d12d3d 100644
+index 5ebb6a3947..2d3ca78eac 100644
 --- a/pve-backup.c
 +++ b/pve-backup.c
 @@ -7,9 +7,11 @@
@@ -111,7 +111,24 @@ index 5ebb6a3947..a747d12d3d 100644
      size_t size;
      uint64_t block_size;
      uint8_t dev_id;
-@@ -353,6 +362,22 @@ static void pvebackup_complete_cb(void *opaque, int ret)
+@@ -348,11 +357,32 @@ static void coroutine_fn pvebackup_co_complete_stream(void *opaque)
+     qemu_co_mutex_unlock(&backup_state.backup_mutex);
+ }
+ 
++static void cleanup_snapshot_access(PVEBackupDevInfo *di)
++{
++    if (di->fleecing.snapshot_access) {
++        bdrv_unref(di->fleecing.snapshot_access);
++        di->fleecing.snapshot_access = NULL;
++    }
++    if (di->fleecing.cbw) {
++        bdrv_cbw_drop(di->fleecing.cbw);
++        di->fleecing.cbw = NULL;
++    }
++}
++
+ static void pvebackup_complete_cb(void *opaque, int ret)
+ {
      PVEBackupDevInfo *di = opaque;
      di->completed_ret = ret;
  
@@ -122,19 +139,12 @@ index 5ebb6a3947..a747d12d3d 100644
 +     *   just spawn a BH calling bdrv_unref().
 +     * - For cbw, draining would need to spawn a BH.
 +     */
-+    if (di->fleecing.snapshot_access) {
-+        bdrv_unref(di->fleecing.snapshot_access);
-+        di->fleecing.snapshot_access = NULL;
-+    }
-+    if (di->fleecing.cbw) {
-+        bdrv_cbw_drop(di->fleecing.cbw);
-+        di->fleecing.cbw = NULL;
-+    }
++    cleanup_snapshot_access(di);
 +
      /*
       * Needs to happen outside of coroutine, because it takes the graph write lock.
       */
-@@ -519,9 +544,77 @@ static void create_backup_jobs_bh(void *opaque) {
+@@ -519,9 +549,80 @@ static void create_backup_jobs_bh(void *opaque) {
          }
          bdrv_drained_begin(di->bs);
  
@@ -172,6 +182,7 @@ index 5ebb6a3947..a747d12d3d 100644
 +            if (!di->fleecing.cbw) {
 +                error_setg(errp, "appending cbw node for fleecing failed: %s",
 +                           local_err ? error_get_pretty(local_err) : "unknown error");
++                bdrv_drained_end(di->bs);
 +                break;
 +            }
 +
@@ -184,6 +195,8 @@ index 5ebb6a3947..a747d12d3d 100644
 +            if (!di->fleecing.snapshot_access) {
 +                error_setg(errp, "setting up snapshot access for fleecing failed: %s",
 +                           local_err ? error_get_pretty(local_err) : "unknown error");
++                cleanup_snapshot_access(di);
++                bdrv_drained_end(di->bs);
 +                break;
 +            }
 +            source_bs = di->fleecing.snapshot_access;
@@ -214,7 +227,15 @@ index 5ebb6a3947..a747d12d3d 100644
              BLOCKDEV_ON_ERROR_REPORT, JOB_DEFAULT, pvebackup_complete_cb, di, backup_state.txn,
              &local_err);
  
-@@ -577,6 +670,14 @@ static void create_backup_jobs_bh(void *opaque) {
+@@ -535,6 +636,7 @@ static void create_backup_jobs_bh(void *opaque) {
+         }
+ 
+         if (!job || local_err) {
++            cleanup_snapshot_access(di);
+             error_setg(errp, "backup_job_create failed: %s",
+                        local_err ? error_get_pretty(local_err) : "null");
+             break;
+@@ -577,6 +679,14 @@ static void create_backup_jobs_bh(void *opaque) {
      aio_co_enter(data->ctx, data->co);
  }
  
@@ -229,7 +250,7 @@ index 5ebb6a3947..a747d12d3d 100644
  /*
   * Returns a list of device infos, which needs to be freed by the caller. In
   * case of an error, errp will be set, but the returned value might still be a
-@@ -584,6 +685,7 @@ static void create_backup_jobs_bh(void *opaque) {
+@@ -584,6 +694,7 @@ static void create_backup_jobs_bh(void *opaque) {
   */
  static GList coroutine_fn GRAPH_RDLOCK *get_device_info(
      const char *devlist,
@@ -237,7 +258,7 @@ index 5ebb6a3947..a747d12d3d 100644
      Error **errp)
  {
      gchar **devs = NULL;
-@@ -607,6 +709,31 @@ static GList coroutine_fn GRAPH_RDLOCK *get_device_info(
+@@ -607,6 +718,31 @@ static GList coroutine_fn GRAPH_RDLOCK *get_device_info(
              }
              PVEBackupDevInfo *di = g_new0(PVEBackupDevInfo, 1);
              di->bs = bs;
@@ -269,7 +290,7 @@ index 5ebb6a3947..a747d12d3d 100644
              di_list = g_list_append(di_list, di);
              d++;
          }
-@@ -656,6 +783,7 @@ UuidInfo coroutine_fn *qmp_backup(
+@@ -656,6 +792,7 @@ UuidInfo coroutine_fn *qmp_backup(
      const char *devlist,
      bool has_speed, int64_t speed,
      bool has_max_workers, int64_t max_workers,
@@ -277,7 +298,7 @@ index 5ebb6a3947..a747d12d3d 100644
      Error **errp)
  {
      assert(qemu_in_coroutine());
-@@ -684,7 +812,7 @@ UuidInfo coroutine_fn *qmp_backup(
+@@ -684,7 +821,7 @@ UuidInfo coroutine_fn *qmp_backup(
      format = has_format ? format : BACKUP_FORMAT_VMA;
  
      bdrv_graph_co_rdlock();
@@ -286,7 +307,7 @@ index 5ebb6a3947..a747d12d3d 100644
      bdrv_graph_co_rdunlock();
      if (local_err) {
          error_propagate(errp, local_err);
-@@ -1089,5 +1217,6 @@ ProxmoxSupportStatus *qmp_query_proxmox_support(Error **errp)
+@@ -1089,5 +1226,6 @@ ProxmoxSupportStatus *qmp_query_proxmox_support(Error **errp)
      ret->query_bitmap_info = true;
      ret->pbs_masterkey = true;
      ret->backup_max_workers = true;
diff --git a/debian/patches/pve/0051-PVE-backup-improve-error-when-copy-before-write-fail.patch b/debian/patches/pve/0051-PVE-backup-improve-error-when-copy-before-write-fail.patch
index 4522d37..66fc9fa 100644
--- a/debian/patches/pve/0051-PVE-backup-improve-error-when-copy-before-write-fail.patch
+++ b/debian/patches/pve/0051-PVE-backup-improve-error-when-copy-before-write-fail.patch
@@ -17,8 +17,8 @@ Tested-by: Friedrich Weber <f.weber@proxmox.com>
 ---
  block/copy-before-write.c | 18 ++++++++++++------
  block/copy-before-write.h |  1 +
- pve-backup.c              |  9 +++++++++
- 3 files changed, 22 insertions(+), 6 deletions(-)
+ pve-backup.c              | 12 ++++++++++++
+ 3 files changed, 25 insertions(+), 6 deletions(-)
 
 diff --git a/block/copy-before-write.c b/block/copy-before-write.c
 index bba58326d7..50cc4c7aae 100644
@@ -96,13 +96,14 @@ index dc6cafe7fa..a27d2d7d9f 100644
  
  #endif /* COPY_BEFORE_WRITE_H */
 diff --git a/pve-backup.c b/pve-backup.c
-index a747d12d3d..4e730aa3da 100644
+index 2d3ca78eac..c4178758b3 100644
 --- a/pve-backup.c
 +++ b/pve-backup.c
-@@ -374,6 +374,15 @@ static void pvebackup_complete_cb(void *opaque, int ret)
-         di->fleecing.snapshot_access = NULL;
-     }
-     if (di->fleecing.cbw) {
+@@ -374,6 +374,18 @@ static void pvebackup_complete_cb(void *opaque, int ret)
+     PVEBackupDevInfo *di = opaque;
+     di->completed_ret = ret;
+ 
++    if (di->fleecing.cbw) {
 +        /*
 +         * With fleecing, failure for cbw does not fail the guest write, but only sets the snapshot
 +         * error, making further requests to the snapshot fail with EACCES, which then also fail the
@@ -112,6 +113,8 @@ index a747d12d3d..4e730aa3da 100644
 +        if (di->completed_ret == -EACCES && snapshot_error) {
 +            di->completed_ret = snapshot_error;
 +        }
-         bdrv_cbw_drop(di->fleecing.cbw);
-         di->fleecing.cbw = NULL;
-     }
++    }
++
+     /*
+      * Handle block-graph specific cleanup (for fleecing) outside of the coroutine, because the work
+      * won't be done as a coroutine anyways:
-- 
2.39.2



_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel


             reply	other threads:[~2024-06-25 13:35 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-06-25 13:35 Fiona Ebner [this message]
2024-06-25 13:35 ` [pve-devel] [PATCH v2 qemu 2/2] PVE backup: factor out setting up snapshot access " Fiona Ebner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240625133551.210636-1-f.ebner@proxmox.com \
    --to=f.ebner@proxmox.com \
    --cc=pve-devel@lists.proxmox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox
Service provided by Proxmox Server Solutions GmbH | Privacy | Legal