aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/libxfs/xfs_metafile.c
diff options
context:
space:
mode:
authorDarrick J. Wong <[email protected]>2024-11-21 00:20:19 +0000
committerDarrick J. Wong <[email protected]>2024-12-23 21:06:03 +0000
commit05290bd5c6236b8ad659157edb36bd2d38f46d3e (patch)
treefe4443fc5abfbfcfab0d50c1edf675729ae58080 /fs/xfs/libxfs/xfs_metafile.c
parentxfs: prepare to reuse the dquot pointer space in struct xfs_inode (diff)
downloadkernel-05290bd5c6236b8ad659157edb36bd2d38f46d3e.tar.gz
kernel-05290bd5c6236b8ad659157edb36bd2d38f46d3e.zip
xfs: allow inode-based btrees to reserve space in the data device
Create a new space reservation scheme so that btree metadata for the realtime volume can reserve space in the data device to avoid space underruns. Back when we were testing the rmap and refcount btrees for the data device, people observed occasional shutdowns when xfs_btree_split was called for either of those two btrees. This happened when certain operations (mostly writeback ioends) created new rmap or refcount records, which would expand the size of the btree. If there were no free blocks available the allocation would fail and the split would shut down the filesystem. I considered pre-reserving blocks for btree expansion at the time of a write() call, but there wasn't any good way to attach the reservations to an inode and keep them there all the way to ioend processing. Unlike delalloc reservations which have that indlen mechanism, there's no way to do that for mapped extents; and indlen blocks are given back during the delalloc -> unwritten transition. The solution was to reserve sufficient blocks for rmap/refcount btree expansion at mount time. This is what the XFS_AG_RESV_* flags provide; any expansion of those two btrees can come from the pre-reserved space. This patch brings that pre-reservation ability to inode-rooted btrees so that the rt rmap and refcount btrees can also save room for future expansion. Signed-off-by: "Darrick J. Wong" <[email protected]> Reviewed-by: Christoph Hellwig <[email protected]>
Diffstat (limited to 'fs/xfs/libxfs/xfs_metafile.c')
-rw-r--r--fs/xfs/libxfs/xfs_metafile.c205
1 files changed, 205 insertions, 0 deletions
diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c
index adeb25d1a444..e151663cc9ef 100644
--- a/fs/xfs/libxfs/xfs_metafile.c
+++ b/fs/xfs/libxfs/xfs_metafile.c
@@ -17,6 +17,10 @@
#include "xfs_metafile.h"
#include "xfs_trace.h"
#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_alloc.h"
/* Set up an inode to be recognized as a metadata directory inode. */
void
@@ -50,3 +54,204 @@ xfs_metafile_clear_iflag(
ip->i_diflags2 &= ~XFS_DIFLAG2_METADATA;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
+
+/*
+ * Is the amount of space that could be allocated towards a given metadata
+ * file at or beneath a certain threshold?
+ */
+static inline bool
+xfs_metafile_resv_can_cover(
+ struct xfs_inode *ip,
+ int64_t rhs)
+{
+ /*
+ * The amount of space that can be allocated to this metadata file is
+ * the remaining reservation for the particular metadata file + the
+ * global free block count. Take care of the first case to avoid
+ * touching the per-cpu counter.
+ */
+ if (ip->i_delayed_blks >= rhs)
+ return true;
+
+ /*
+ * There aren't enough blocks left in the inode's reservation, but it
+ * isn't critical unless there also isn't enough free space.
+ */
+ return __percpu_counter_compare(&ip->i_mount->m_fdblocks,
+ rhs - ip->i_delayed_blks, 2048) >= 0;
+}
+
+/*
+ * Is this metadata file critically low on blocks? For now we'll define that
+ * as the number of blocks we can get our hands on being less than 10% of what
+ * we reserved or less than some arbitrary number (maximum btree height).
+ */
+bool
+xfs_metafile_resv_critical(
+ struct xfs_inode *ip)
+{
+ uint64_t asked_low_water;
+
+ if (!ip)
+ return false;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+ trace_xfs_metafile_resv_critical(ip, 0);
+
+ if (!xfs_metafile_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels))
+ return true;
+
+ asked_low_water = div_u64(ip->i_meta_resv_asked, 10);
+ if (!xfs_metafile_resv_can_cover(ip, asked_low_water))
+ return true;
+
+ return XFS_TEST_ERROR(false, ip->i_mount,
+ XFS_ERRTAG_METAFILE_RESV_CRITICAL);
+}
+
+/* Allocate a block from the metadata file's reservation. */
+void
+xfs_metafile_resv_alloc_space(
+ struct xfs_inode *ip,
+ struct xfs_alloc_arg *args)
+{
+ int64_t len = args->len;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+ ASSERT(args->resv == XFS_AG_RESV_METAFILE);
+
+ trace_xfs_metafile_resv_alloc_space(ip, args->len);
+
+ /*
+ * Allocate the blocks from the metadata inode's block reservation
+ * and update the ondisk sb counter.
+ */
+ if (ip->i_delayed_blks > 0) {
+ int64_t from_resv;
+
+ from_resv = min_t(int64_t, len, ip->i_delayed_blks);
+ ip->i_delayed_blks -= from_resv;
+ xfs_mod_delalloc(ip, 0, -from_resv);
+ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS,
+ -from_resv);
+ len -= from_resv;
+ }
+
+ /*
+ * Any allocation in excess of the reservation requires in-core and
+ * on-disk fdblocks updates. If we can grab @len blocks from the
+ * in-core fdblocks then all we need to do is update the on-disk
+ * superblock; if not, then try to steal some from the transaction's
+ * block reservation. Overruns are only expected for rmap btrees.
+ */
+ if (len) {
+ unsigned int field;
+ int error;
+
+ error = xfs_dec_fdblocks(ip->i_mount, len, true);
+ if (error)
+ field = XFS_TRANS_SB_FDBLOCKS;
+ else
+ field = XFS_TRANS_SB_RES_FDBLOCKS;
+
+ xfs_trans_mod_sb(args->tp, field, -len);
+ }
+
+ ip->i_nblocks += args->len;
+ xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE);
+}
+
+/* Free a block to the metadata file's reservation. */
+void
+xfs_metafile_resv_free_space(
+ struct xfs_inode *ip,
+ struct xfs_trans *tp,
+ xfs_filblks_t len)
+{
+ int64_t to_resv;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+ trace_xfs_metafile_resv_free_space(ip, len);
+
+ ip->i_nblocks -= len;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ /*
+ * Add the freed blocks back into the inode's delalloc reservation
+ * until it reaches the maximum size. Update the ondisk fdblocks only.
+ */
+ to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks);
+ if (to_resv > 0) {
+ to_resv = min_t(int64_t, to_resv, len);
+ ip->i_delayed_blks += to_resv;
+ xfs_mod_delalloc(ip, 0, to_resv);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv);
+ len -= to_resv;
+ }
+
+ /*
+ * Everything else goes back to the filesystem, so update the in-core
+ * and on-disk counters.
+ */
+ if (len)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len);
+}
+
+/* Release a metadata file's space reservation. */
+void
+xfs_metafile_resv_free(
+ struct xfs_inode *ip)
+{
+ /* Non-btree metadata inodes don't need space reservations. */
+ if (!ip || !ip->i_meta_resv_asked)
+ return;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+ trace_xfs_metafile_resv_free(ip, 0);
+
+ if (ip->i_delayed_blks) {
+ xfs_mod_delalloc(ip, 0, -ip->i_delayed_blks);
+ xfs_add_fdblocks(ip->i_mount, ip->i_delayed_blks);
+ ip->i_delayed_blks = 0;
+ }
+ ip->i_meta_resv_asked = 0;
+}
+
+/* Set up a metadata file's space reservation. */
+int
+xfs_metafile_resv_init(
+ struct xfs_inode *ip,
+ xfs_filblks_t ask)
+{
+ xfs_filblks_t hidden_space;
+ xfs_filblks_t used;
+ int error;
+
+ if (!ip || ip->i_meta_resv_asked > 0)
+ return 0;
+
+ ASSERT(xfs_is_metadir_inode(ip));
+
+ /*
+ * Space taken by all other metadata btrees are accounted on-disk as
+ * used space. We therefore only hide the space that is reserved but
+ * not used by the trees.
+ */
+ used = ip->i_nblocks;
+ if (used > ask)
+ ask = used;
+ hidden_space = ask - used;
+
+ error = xfs_dec_fdblocks(ip->i_mount, hidden_space, true);
+ if (error) {
+ trace_xfs_metafile_resv_init_error(ip, error, _RET_IP_);
+ return error;
+ }
+
+ xfs_mod_delalloc(ip, 0, hidden_space);
+ ip->i_delayed_blks = hidden_space;
+ ip->i_meta_resv_asked = ask;
+
+ trace_xfs_metafile_resv_init(ip, ask);
+ return 0;
+}