author    Dave Chinner <dchinner@redhat.com>    2014-07-14 17:08:24 -0400
committer Dave Chinner <david@fromorbit.com>    2014-07-14 17:08:24 -0400
commit    cf11da9c5d374962913ca5ba0ce0886b58286224 (patch)
tree      88480a47229aa9a3244beca6cae49e0ae00df37b /fs/xfs/xfs_btree.c
parent    aa182e64f16fc29a4984c2d79191b161888bbd9b (diff)
xfs: refine the allocation stack switch
The allocation stack switch at xfs_bmapi_allocate() has served its purpose, but is no longer a sufficient solution to the stack usage problem we have in the XFS allocation path.

Whilst the kernel stack size is now 16k, that is not a valid reason for undoing all our "keep stack usage down" modifications. What it does allow us to do is have the freedom to refine and perfect the modifications knowing that if we get it wrong it won't blow up in our faces - we have a safety net now.

This is important because we still have the issue that older kernels have smaller stacks, are still supported, and are demonstrating a wide range of different stack overflows. Red Hat has several open bugs for allocation-based stack overflows from directory modifications and direct IO block allocation, and these problems still need to be solved. If we can solve them upstream, then distros won't need to bake their own unique solutions.

To that end, I've observed that every allocation-based stack overflow report has had a specific characteristic - it has happened during or directly after a bmap btree block split. That event requires a new block to be allocated to the tree, and so we effectively stack one allocation stack on top of another, and that's when we get into trouble.

A further observation is that bmap btree block splits are much rarer than writeback allocation - over a range of different workloads I've observed the ratio of bmap btree inserts to splits ranges from 100:1 (xfstests run) to 10000:1 (local VM image server with sparse files that range in the hundreds of thousands to millions of extents). Either way, bmap btree split events are much, much rarer than allocation events.

Finally, we have to move the kswapd state to the allocation workqueue work when allocation is done on behalf of kswapd. This is proving to cause significant perturbation in performance under memory pressure and appears to be generating allocation deadlock warnings under some workloads, so avoiding the use of a workqueue for the majority of kswapd writeback allocation will minimise the impact of such behaviour.

Hence it makes sense to move the stack switch to xfs_btree_split() and only do it for bmap btree splits. Stack switches during allocation will be much rarer, so there won't be significant performance overhead caused by switching stacks. The worst-case stack from all allocation paths will be the split path, not just writeback. And the majority of memory allocations will be done in the correct context (e.g. kswapd) without causing additional latency, and so we simplify the memory reclaim interactions between processes, workqueues and kswapd.

The worst stack I've been able to generate with this patch in place is 5600 bytes deep. It's very revealing because we exit XFS at:

  37)     1768      64   kmem_cache_alloc+0x13b/0x170

with about 1800 bytes of stack consumed, and the remaining 3800 bytes (and 36 functions) is memory reclaim, swap and the IO stack. And this occurs in the inode allocation from an open(O_CREAT) syscall, not writeback.

The amount of stack being used is much less than I've previously been able to generate - fs_mark testing has been able to generate stack usage of around 7k without too much trouble; with this patch it's only just getting to 5.5k. This is primarily because the metadata allocation paths (e.g. directory blocks) are no longer causing double splits on the same stack, and hence stack tracing now shows swapping to be the worst stack consumer rather than XFS.

Performance of fs_mark inode create workloads is unchanged. Performance of fs_mark async fsync workloads is consistently good, with context switches reduced by around 150,000/s (30%). Performance of dbench, streaming IO and postmark is unchanged. Allocation deadlock warnings have not been seen on the workloads that generated them since adding this patch.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
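To make the hand-off in the diff below easier to follow, here is a minimal userspace sketch of the same pattern (not XFS code, and not part of this patch): the caller packs its arguments into a struct, runs the deep operation on a separate thread that has its own fresh stack, and blocks until the worker has filled in the result. The names split_args, do_split and split are invented for illustration; the real patch uses a kernel workqueue, INIT_WORK_ONSTACK() and wait_for_completion() rather than a raw pthread, and also propagates kswapd-related task flags into the worker.

/* Illustrative userspace analogue only; build with: cc -pthread sketch.c */
#include <pthread.h>
#include <stdio.h>

/* Argument/result package, analogous to struct xfs_btree_split_args. */
struct split_args {
        int     level;          /* input */
        int     result;         /* output, filled in by the worker */
};

/* Stand-in for __xfs_btree_split(): the deep, stack-hungry operation. */
static int do_split(int level)
{
        return level + 1;       /* pretend the tree gained a level */
}

/* Runs on the worker's own fresh stack, like xfs_btree_split_worker(). */
static void *split_worker(void *arg)
{
        struct split_args *args = arg;

        args->result = do_split(args->level);
        return NULL;
}

/* Caller-side wrapper, like the new xfs_btree_split(). */
static int split(int level)
{
        struct split_args args = { .level = level };
        pthread_t worker;

        pthread_create(&worker, NULL, split_worker, &args);
        pthread_join(worker, NULL);     /* plays the role of wait_for_completion() */
        return args.result;
}

int main(void)
{
        printf("split result: %d\n", split(3));
        return 0;
}

In the kernel version the wrapper also bypasses the hand-off entirely for non-bmap btrees, since only BMBT splits arrive with a deep caller stack and the context switch is pure overhead everywhere else.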
Diffstat (limited to 'fs/xfs/xfs_btree.c')
-rw-r--r--   fs/xfs/xfs_btree.c   82
1 file changed, 81 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index bf810c6baf2b..cf893bc1e373 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -33,6 +33,7 @@
 #include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_cksum.h"
+#include "xfs_alloc.h"
 
 /*
  * Cursor allocation zone.
@@ -2323,7 +2324,7 @@ error1:
  * record (to be inserted into parent).
  */
 STATIC int                                     /* error */
-xfs_btree_split(
+__xfs_btree_split(
         struct xfs_btree_cur    *cur,
         int                     level,
         union xfs_btree_ptr     *ptrp,
@@ -2503,6 +2504,85 @@ error0:
         return error;
 }
 
+struct xfs_btree_split_args {
+        struct xfs_btree_cur    *cur;
+        int                     level;
+        union xfs_btree_ptr     *ptrp;
+        union xfs_btree_key     *key;
+        struct xfs_btree_cur    **curp;
+        int                     *stat;          /* success/failure */
+        int                     result;
+        bool                    kswapd; /* allocation in kswapd context */
+        struct completion       *done;
+        struct work_struct      work;
+};
+
+/*
+ * Stack switching interfaces for allocation
+ */
+static void
+xfs_btree_split_worker(
+        struct work_struct      *work)
+{
+        struct xfs_btree_split_args     *args = container_of(work,
+                                                struct xfs_btree_split_args, work);
+        unsigned long           pflags;
+        unsigned long           new_pflags = PF_FSTRANS;
+
+        /*
+         * we are in a transaction context here, but may also be doing work
+         * in kswapd context, and hence we may need to inherit that state
+         * temporarily to ensure that we don't block waiting for memory reclaim
+         * in any way.
+         */
+        if (args->kswapd)
+                new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+
+        current_set_flags_nested(&pflags, new_pflags);
+
+        args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
+                                         args->key, args->curp, args->stat);
+        complete(args->done);
+
+        current_restore_flags_nested(&pflags, new_pflags);
+}
+
+/*
+ * BMBT split requests often come in with little stack to work on. Push
+ * them off to a worker thread so there is lots of stack to use. For the other
+ * btree types, just call directly to avoid the context switch overhead here.
+ */
+STATIC int                                      /* error */
+xfs_btree_split(
+        struct xfs_btree_cur    *cur,
+        int                     level,
+        union xfs_btree_ptr     *ptrp,
+        union xfs_btree_key     *key,
+        struct xfs_btree_cur    **curp,
+        int                     *stat)          /* success/failure */
+{
+        struct xfs_btree_split_args     args;
+        DECLARE_COMPLETION_ONSTACK(done);
+
+        if (cur->bc_btnum != XFS_BTNUM_BMAP)
+                return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
+
+        args.cur = cur;
+        args.level = level;
+        args.ptrp = ptrp;
+        args.key = key;
+        args.curp = curp;
+        args.stat = stat;
+        args.done = &done;
+        args.kswapd = current_is_kswapd();
+        INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
+        queue_work(xfs_alloc_wq, &args.work);
+        wait_for_completion(&done);
+        destroy_work_on_stack(&args.work);
+        return args.result;
+}
+
+
 /*
  * Copy the old inode root contents into a real block and make the
  * broot point to it.