From: Stoiko Ivanov <s.ivanov@proxmox.com>
To: pve-devel@lists.proxmox.com
Subject: [pve-devel] [PATCH zfsonlinux] add patch with backport of a7a144e65 ("enforce arc_dnode_limit")
Date: Mon, 28 Jul 2025 18:30:41 +0200
Message-ID: <20250728163041.1287899-1-s.ivanov@proxmox.com>

as requested and argued in:
https://lore.proxmox.com/pve-devel/5f3e46ed-bf99-45e2-b497-fc81dc50d9b3@proxmox.com/

Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
If accepted, we'd try to get the backport included upstream for 2.2.9.
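
Not strictly part of the patch, but as a rough way to sanity-check the limit
on a running system, the current dnode usage can be compared against the
enforced limit via the arcstats kstat (field names as exported by
module/zfs/arc.c), e.g.:

    grep -E '^(dnode_size|arc_dnode_limit) ' /proc/spl/kstat/zfs/arcstats
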
...kport-enforce-arc_dnode_limit-to-2.2.patch | 207 ++++++++++++++++++
 debian/patches/series                         |   1 +
 2 files changed, 208 insertions(+)
 create mode 100644 debian/patches/0012-backport-enforce-arc_dnode_limit-to-2.2.patch

diff --git a/debian/patches/0012-backport-enforce-arc_dnode_limit-to-2.2.patch b/debian/patches/0012-backport-enforce-arc_dnode_limit-to-2.2.patch
new file mode 100644
index 000000000..26c0dface
--- /dev/null
+++ b/debian/patches/0012-backport-enforce-arc_dnode_limit-to-2.2.patch
@@ -0,0 +1,207 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Stoiko Ivanov <s.ivanov@proxmox.com>
+Date: Mon, 28 Jul 2025 15:16:46 +0200
+Subject: [PATCH] backport enforce arc_dnode_limit to 2.2
+
+This patch is a backport of a7a144e65 ("enforce arc_dnode_limit")
+for the 2.2 branch.
+
+back-ported from commit a7a144e655850b4160943e4ba315eb9a5dc2b2fe
+working around changes from:
+55427add3 ("Several improvements to ARC shrinking (#16197)")
+5b9f3b766 ("Soften pruning threshold on not evictable metadata")
+which are present in the 2.3 branch, but not in 2.2.8
+
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+---
+ include/sys/arc_impl.h           |  2 +-
+ module/os/linux/zfs/zfs_vfsops.c | 65 ++++++++++++++++++++++++++++++++
+ module/zfs/arc.c                 | 27 ++++++++-----
+ 3 files changed, 83 insertions(+), 11 deletions(-)
+
+diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h
+index defebe3b2fbbdc8b1c901108f19bde8f12ea2175..36cd83e83358e123980909a903854d573531d4b6 100644
+--- a/include/sys/arc_impl.h
++++ b/include/sys/arc_impl.h
+@@ -952,7 +952,7 @@ typedef struct arc_sums {
+ 	wmsum_t arcstat_data_size;
+ 	wmsum_t arcstat_metadata_size;
+ 	wmsum_t arcstat_dbuf_size;
+-	wmsum_t arcstat_dnode_size;
++	aggsum_t arcstat_dnode_size;
+ 	wmsum_t arcstat_bonus_size;
+ 	wmsum_t arcstat_l2_hits;
+ 	wmsum_t arcstat_l2_misses;
+diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c
+index 1f72cce07dd1830e2f5fdff50ef298e05be3013d..da0cda03985e93acfa111efb7d6e9d6637f729cf 100644
+--- a/module/os/linux/zfs/zfs_vfsops.c
++++ b/module/os/linux/zfs/zfs_vfsops.c
+@@ -1179,6 +1179,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
+ 	return (error);
+ }
+ 
++/*
++ * Dentry and inode caches referenced by a task in non-root memcg are
++ * not going to be scanned by the kernel-provided shrinker. So, if
++ * kernel prunes nothing, fall back to this manual walk to free dnodes.
++ * To avoid scanning the same znodes multiple times they are always rotated
++ * to the end of the z_all_znodes list. New znodes are inserted at the
++ * end of the list so we're always scanning the oldest znodes first.
++ */
++static int
++zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
++{
++	znode_t **zp_array, *zp;
++	int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *));
++	int objects = 0;
++	int i = 0, j = 0;
++
++	zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
++
++	mutex_enter(&zfsvfs->z_znodes_lock);
++	while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
++
++		if ((i++ > nr_to_scan) || (j >= max_array))
++			break;
++
++		ASSERT(list_link_active(&zp->z_link_node));
++		list_remove(&zfsvfs->z_all_znodes, zp);
++		list_insert_tail(&zfsvfs->z_all_znodes, zp);
++
++		/* Skip active znodes and .zfs entries */
++		if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir)
++			continue;
++
++		if (igrab(ZTOI(zp)) == NULL)
++			continue;
++
++		zp_array[j] = zp;
++		j++;
++	}
++	mutex_exit(&zfsvfs->z_znodes_lock);
++
++	for (i = 0; i < j; i++) {
++		zp = zp_array[i];
++
++		ASSERT3P(zp, !=, NULL);
++		d_prune_aliases(ZTOI(zp));
++
++		if (atomic_read(&ZTOI(zp)->i_count) == 1)
++			objects++;
++
++		zrele(zp);
++	}
++
++	vmem_free(zp_array, max_array * sizeof (znode_t *));
++
++	return (objects);
++}
++
+ /*
+  * The ARC has requested that the filesystem drop entries from the dentry
+  * and inode caches.  This can occur when the ARC needs to free meta data
+@@ -1222,6 +1279,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
+ 	*objects = (*shrinker->scan_objects)(shrinker, &sc);
+ #endif
+ 
++	/*
++	 * Fall back to zfs_prune_aliases if kernel's shrinker did nothing
++	 * due to dentry and inode caches being referenced by a task running
++	 * in non-root memcg.
++	 */
++	if (*objects == 0)
++		*objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
++
+ 	zfs_exit(zfsvfs, FTAG);
+ 
+ 	dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
+diff --git a/module/zfs/arc.c b/module/zfs/arc.c
+index 5c6e92f0f8b31dbcd569c92e645afb2e180b2deb..383aca2808d2c0aa8d09a9cdc8cfbfde4f6a6fc9 100644
+--- a/module/zfs/arc.c
++++ b/module/zfs/arc.c
+@@ -2597,7 +2597,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
+ 		ARCSTAT_INCR(arcstat_bonus_size, space);
+ 		break;
+ 	case ARC_SPACE_DNODE:
+-		ARCSTAT_INCR(arcstat_dnode_size, space);
++		aggsum_add(&arc_sums.arcstat_dnode_size, space);
+ 		break;
+ 	case ARC_SPACE_DBUF:
+ 		ARCSTAT_INCR(arcstat_dbuf_size, space);
+@@ -2643,7 +2643,7 @@ arc_space_return(uint64_t space, arc_space_type_t type)
+ 		ARCSTAT_INCR(arcstat_bonus_size, -space);
+ 		break;
+ 	case ARC_SPACE_DNODE:
+-		ARCSTAT_INCR(arcstat_dnode_size, -space);
++		aggsum_add(&arc_sums.arcstat_dnode_size, -space);
+ 		break;
+ 	case ARC_SPACE_DBUF:
+ 		ARCSTAT_INCR(arcstat_dbuf_size, -space);
+@@ -4292,7 +4292,7 @@ arc_evict(void)
+ 	 * target is not evictable or if they go over arc_dnode_limit.
+ 	 */
+ 	int64_t prune = 0;
+-	int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size);
++	int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size);
+ 	w = wt * (int64_t)(arc_meta >> 16) >> 16;
+ 	if (zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) +
+ 	    zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) -
+@@ -4775,12 +4775,19 @@ arc_is_overflowing(boolean_t use_reserve)
+ 	 * in the ARC. In practice, that's in the tens of MB, which is low
+ 	 * enough to be safe.
+ 	 */
+-	int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) -
++	int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) -
+ 	    arc_c - overflow / 2;
+ 	if (!use_reserve)
+ 		overflow /= 2;
+-	return (over < 0 ? ARC_OVF_NONE :
+-	    over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
++
++	int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) -
++	    arc_dnode_limit;
++
++	/* Always allow at least one block of overflow. */
++	if (arc_over < 0 && dn_over <= 0)
++		return (ARC_OVF_NONE);
++
++	return (arc_over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
+ }
+ 
+ static abd_t *
+@@ -6938,7 +6945,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
+ #if defined(COMPAT_FREEBSD11)
+ 	as->arcstat_other_size.value.ui64 =
+ 	    wmsum_value(&arc_sums.arcstat_bonus_size) +
+-	    wmsum_value(&arc_sums.arcstat_dnode_size) +
++	    aggsum_value(&arc_sums.arcstat_dnode_size) +
+ 	    wmsum_value(&arc_sums.arcstat_dbuf_size);
+ #endif
+ 
+@@ -6980,7 +6987,7 @@ arc_kstat_update(kstat_t *ksp, int rw)
+ 	    &as->arcstat_uncached_evictable_metadata);
+ 
+ 	as->arcstat_dnode_size.value.ui64 =
+-	    wmsum_value(&arc_sums.arcstat_dnode_size);
++	    aggsum_value(&arc_sums.arcstat_dnode_size);
+ 	as->arcstat_bonus_size.value.ui64 =
+ 	    wmsum_value(&arc_sums.arcstat_bonus_size);
+ 	as->arcstat_l2_hits.value.ui64 =
+@@ -7349,7 +7356,7 @@ arc_state_init(void)
+ 	wmsum_init(&arc_sums.arcstat_data_size, 0);
+ 	wmsum_init(&arc_sums.arcstat_metadata_size, 0);
+ 	wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
+-	wmsum_init(&arc_sums.arcstat_dnode_size, 0);
++	aggsum_init(&arc_sums.arcstat_dnode_size, 0);
+ 	wmsum_init(&arc_sums.arcstat_bonus_size, 0);
+ 	wmsum_init(&arc_sums.arcstat_l2_hits, 0);
+ 	wmsum_init(&arc_sums.arcstat_l2_misses, 0);
+@@ -7507,7 +7514,7 @@ arc_state_fini(void)
+ 	wmsum_fini(&arc_sums.arcstat_data_size);
+ 	wmsum_fini(&arc_sums.arcstat_metadata_size);
+ 	wmsum_fini(&arc_sums.arcstat_dbuf_size);
+-	wmsum_fini(&arc_sums.arcstat_dnode_size);
++	aggsum_fini(&arc_sums.arcstat_dnode_size);
+ 	wmsum_fini(&arc_sums.arcstat_bonus_size);
+ 	wmsum_fini(&arc_sums.arcstat_l2_hits);
+ 	wmsum_fini(&arc_sums.arcstat_l2_misses);
diff --git a/debian/patches/series b/debian/patches/series
index 229027ff9..11a97debd 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -9,3 +9,4 @@
 0009-arc-stat-summary-guard-access-to-freshly-introduced-.patch
 0010-Fix-nfs_truncate_shares-without-etc-exports.d.patch
 0011-zpool-status-tighten-bounds-for-noalloc-stat-availab.patch
+0012-backport-enforce-arc_dnode_limit-to-2.2.patch
-- 
2.39.5



_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel


