author	David Howells <dhowells@redhat.com>	2009-11-19 13:11:35 -0500
committer	David Howells <dhowells@redhat.com>	2009-11-19 13:11:35 -0500
commit	201a15428bd54f83eccec8b7c64a04b8f9431204 (patch)
tree	326fcce64ce96657253fd141a3f4a767ac95418a /fs/fscache
parent	e3d4d28b1c8cc7c26536a50b43d86ccd39878550 (diff)
FS-Cache: Handle pages pending storage that get evicted under OOM conditions
Handle netfs pages that the vmscan algorithm wants to evict from the pagecache
under OOM conditions, but that are waiting for write to the cache.  Under these
conditions, vmscan calls the releasepage() function of the netfs, asking if a
page can be discarded.

The problem is typified by the following trace of a stuck process:

kslowd005 D 0000000000000000 0 4253 2 0x00000080
 ffff88001b14f370 0000000000000046 ffff880020d0d000 0000000000000007
 0000000000000006 0000000000000001 ffff88001b14ffd8 ffff880020d0d2a8
 000000000000ddf0 00000000000118c0 00000000000118c0 ffff880020d0d2a8
Call Trace:
 [<ffffffffa00782d8>] __fscache_wait_on_page_write+0x8b/0xa7 [fscache]
 [<ffffffff8104c0f1>] ? autoremove_wake_function+0x0/0x34
 [<ffffffffa0078240>] ? __fscache_check_page_write+0x63/0x70 [fscache]
 [<ffffffffa00b671d>] nfs_fscache_release_page+0x4e/0xc4 [nfs]
 [<ffffffffa00927f0>] nfs_release_page+0x3c/0x41 [nfs]
 [<ffffffff810885d3>] try_to_release_page+0x32/0x3b
 [<ffffffff81093203>] shrink_page_list+0x316/0x4ac
 [<ffffffff8109372b>] shrink_inactive_list+0x392/0x67c
 [<ffffffff813532fa>] ? __mutex_unlock_slowpath+0x100/0x10b
 [<ffffffff81058df0>] ? trace_hardirqs_on_caller+0x10c/0x130
 [<ffffffff8135330e>] ? mutex_unlock+0x9/0xb
 [<ffffffff81093aa2>] shrink_list+0x8d/0x8f
 [<ffffffff81093d1c>] shrink_zone+0x278/0x33c
 [<ffffffff81052d6c>] ? ktime_get_ts+0xad/0xba
 [<ffffffff81094b13>] try_to_free_pages+0x22e/0x392
 [<ffffffff81091e24>] ? isolate_pages_global+0x0/0x212
 [<ffffffff8108e743>] __alloc_pages_nodemask+0x3dc/0x5cf
 [<ffffffff81089529>] grab_cache_page_write_begin+0x65/0xaa
 [<ffffffff8110f8c0>] ext3_write_begin+0x78/0x1eb
 [<ffffffff81089ec5>] generic_file_buffered_write+0x109/0x28c
 [<ffffffff8103cb69>] ? current_fs_time+0x22/0x29
 [<ffffffff8108a509>] __generic_file_aio_write+0x350/0x385
 [<ffffffff8108a588>] ? generic_file_aio_write+0x4a/0xae
 [<ffffffff8108a59e>] generic_file_aio_write+0x60/0xae
 [<ffffffff810b2e82>] do_sync_write+0xe3/0x120
 [<ffffffff8104c0f1>] ? autoremove_wake_function+0x0/0x34
 [<ffffffff810b18e1>] ? __dentry_open+0x1a5/0x2b8
 [<ffffffff810b1a76>] ? dentry_open+0x82/0x89
 [<ffffffffa00e693c>] cachefiles_write_page+0x298/0x335 [cachefiles]
 [<ffffffffa0077147>] fscache_write_op+0x178/0x2c2 [fscache]
 [<ffffffffa0075656>] fscache_op_execute+0x7a/0xd1 [fscache]
 [<ffffffff81082093>] slow_work_execute+0x18f/0x2d1
 [<ffffffff8108239a>] slow_work_thread+0x1c5/0x308
 [<ffffffff8104c0f1>] ? autoremove_wake_function+0x0/0x34
 [<ffffffff810821d5>] ? slow_work_thread+0x0/0x308
 [<ffffffff8104be91>] kthread+0x7a/0x82
 [<ffffffff8100beda>] child_rip+0xa/0x20
 [<ffffffff8100b87c>] ? restore_args+0x0/0x30
 [<ffffffff8102ef83>] ? tg_shares_up+0x171/0x227
 [<ffffffff8104be17>] ? kthread+0x0/0x82
 [<ffffffff8100bed0>] ? child_rip+0x0/0x20

In the above backtrace, the following is happening:

 (1) A page storage operation is being executed by a slow-work thread
     (fscache_write_op()).

 (2) FS-Cache farms the operation out to the cache to perform
     (cachefiles_write_page()).

 (3) CacheFiles is then calling Ext3 to perform the actual write, using Ext3's
     standard write (do_sync_write()) under KERNEL_DS directly from the netfs
     page.

 (4) However, for Ext3 to perform the write, it must allocate some memory, in
     particular, it must allocate at least one page cache page into which it
     can copy the data from the netfs page.

 (5) Under OOM conditions, the memory allocator can't immediately come up with
     a page, so it uses vmscan to find something to discard
     (try_to_free_pages()).

 (6) vmscan finds a clean netfs page it might be able to discard (possibly the
     one it's trying to write out).

 (7) The netfs is called to throw the page away (nfs_release_page()) - but
     it's called with __GFP_WAIT, so the netfs decides to wait for the store
     to complete (__fscache_wait_on_page_write()).

 (8) This blocks a slow-work processing thread - possibly against itself.

The system ends up stuck because it can't write out any netfs pages to the
cache without allocating more memory.

To avoid this, we make FS-Cache cancel some writes that aren't in the middle
of actually being performed.  This means that some data won't make it into the
cache this time.  To support this, a new FS-Cache function,
fscache_maybe_release_page(), is added to replace what the netfs releasepage()
functions used to do with respect to the cache.

The decisions fscache_maybe_release_page() makes are counted and displayed
through /proc/fs/fscache/stats on a line labelled "VmScan".  There are four
counters provided:

 "nos=N" - pages that weren't pending storage;
 "gon=N" - pages that were pending storage when we first looked, but weren't
           by the time we got the object lock;
 "bsy=N" - pages that we ignored as they were actively being written when we
           looked; and
 "can=N" - pages that we cancelled the storage of.

What I'd really like to do is alter the behaviour of the cancellation
heuristics, depending on how necessary it is to expel pages.  If there are
plenty of other pages that aren't waiting to be written to the cache that
could be ejected first, then it would be nice to hold up on immediate
cancellation of cache writes - but I don't see a way of doing that.

Signed-off-by: David Howells <dhowells@redhat.com>
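As a usage illustration (not part of the diff below): rather than waiting on
the write as in step (7) above, a netfs releasepage() would now defer the
decision to fscache_maybe_release_page().  A minimal sketch, assuming a
hypothetical netfs whose example_page_cookie() helper returns the FS-Cache
cookie for a page; nfs_fscache_release_page() is the real consumer, updated
outside the fs/fscache subtree shown in this view:

/* Sketch of a netfs releasepage() using the new helper.  The helper
 * returns false if the page is busy being written to the cache and so
 * cannot be released yet; it cancels a merely-pending store itself.
 * example_releasepage() and example_page_cookie() are hypothetical. */
static int example_releasepage(struct page *page, gfp_t gfp)
{
	/* PG_fscache marks pages the cache knows about */
	if (PageFsCache(page)) {
		struct fscache_cookie *cookie = example_page_cookie(page);

		if (!fscache_maybe_release_page(cookie, page, gfp))
			return 0;	/* cache still holds the page */
	}
	return 1;	/* page may be discarded by vmscan */
}

Note that on the busy path this returns immediately instead of sleeping, which
is the point of the change: a slow-work thread in the cache write path can no
longer end up blocked against itself.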
Diffstat (limited to 'fs/fscache')
-rw-r--r--	fs/fscache/internal.h	5
-rw-r--r--	fs/fscache/page.c	79
-rw-r--r--	fs/fscache/stats.c	11
3 files changed, 93 insertions, 2 deletions
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index a0769872b19c..e5046519b153 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -180,6 +180,11 @@ extern atomic_t fscache_n_store_pages;
 extern atomic_t fscache_n_store_radix_deletes;
 extern atomic_t fscache_n_store_pages_over_limit;
 
+extern atomic_t fscache_n_store_vmscan_not_storing;
+extern atomic_t fscache_n_store_vmscan_gone;
+extern atomic_t fscache_n_store_vmscan_busy;
+extern atomic_t fscache_n_store_vmscan_cancelled;
+
 extern atomic_t fscache_n_marks;
 extern atomic_t fscache_n_uncaches;
 
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 022a5da8e130..fc76798bd968 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -43,6 +43,75 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa
 EXPORT_SYMBOL(__fscache_wait_on_page_write);
 
 /*
+ * decide whether a page can be released, possibly by cancelling a store to it
+ * - we're allowed to sleep if __GFP_WAIT is flagged
+ */
+bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
+				  struct page *page,
+				  gfp_t gfp)
+{
+	struct page *xpage;
+	void *val;
+
+	_enter("%p,%p,%x", cookie, page, gfp);
+
+	rcu_read_lock();
+	val = radix_tree_lookup(&cookie->stores, page->index);
+	if (!val) {
+		rcu_read_unlock();
+		fscache_stat(&fscache_n_store_vmscan_not_storing);
+		__fscache_uncache_page(cookie, page);
+		return true;
+	}
+
+	/* see if the page is actually undergoing storage - if so we can't get
+	 * rid of it till the cache has finished with it */
+	if (radix_tree_tag_get(&cookie->stores, page->index,
+			       FSCACHE_COOKIE_STORING_TAG)) {
+		rcu_read_unlock();
+		goto page_busy;
+	}
+
+	/* the page is pending storage, so we attempt to cancel the store and
+	 * discard the store request so that the page can be reclaimed */
+	spin_lock(&cookie->stores_lock);
+	rcu_read_unlock();
+
+	if (radix_tree_tag_get(&cookie->stores, page->index,
+			       FSCACHE_COOKIE_STORING_TAG)) {
+		/* the page started to undergo storage whilst we were looking,
+		 * so now we can only wait or return */
+		spin_unlock(&cookie->stores_lock);
+		goto page_busy;
+	}
+
+	xpage = radix_tree_delete(&cookie->stores, page->index);
+	spin_unlock(&cookie->stores_lock);
+
+	if (xpage) {
+		fscache_stat(&fscache_n_store_vmscan_cancelled);
+		fscache_stat(&fscache_n_store_radix_deletes);
+		ASSERTCMP(xpage, ==, page);
+	} else {
+		fscache_stat(&fscache_n_store_vmscan_gone);
+	}
+
+	wake_up_bit(&cookie->flags, 0);
+	if (xpage)
+		page_cache_release(xpage);
+	__fscache_uncache_page(cookie, page);
+	return true;
+
+page_busy:
+	/* we might want to wait here, but that could deadlock the allocator as
+	 * the slow-work threads writing to the cache may all end up sleeping
+	 * on memory allocation */
+	fscache_stat(&fscache_n_store_vmscan_busy);
+	return false;
+}
+EXPORT_SYMBOL(__fscache_maybe_release_page);
+
+/*
  * note that a page has finished being written to the cache
  */
 static void fscache_end_page_write(struct fscache_object *object,
@@ -57,6 +126,8 @@ static void fscache_end_page_write(struct fscache_object *object,
 	/* delete the page from the tree if it is now no longer
 	 * pending */
 	spin_lock(&cookie->stores_lock);
+	radix_tree_tag_clear(&cookie->stores, page->index,
+			     FSCACHE_COOKIE_STORING_TAG);
 	if (!radix_tree_tag_get(&cookie->stores, page->index,
 				FSCACHE_COOKIE_PENDING_TAG)) {
 		fscache_stat(&fscache_n_store_radix_deletes);
@@ -640,8 +711,12 @@ static void fscache_write_op(struct fscache_operation *_op)
 		goto superseded;
 	}
 
-	radix_tree_tag_clear(&cookie->stores, page->index,
-			     FSCACHE_COOKIE_PENDING_TAG);
+	if (page) {
+		radix_tree_tag_set(&cookie->stores, page->index,
+				   FSCACHE_COOKIE_STORING_TAG);
+		radix_tree_tag_clear(&cookie->stores, page->index,
+				     FSCACHE_COOKIE_PENDING_TAG);
+	}
 
 	spin_unlock(&cookie->stores_lock);
 	spin_unlock(&object->lock);
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 045ba396dbf2..cda69994e06d 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -63,6 +63,11 @@ atomic_t fscache_n_store_pages;
 atomic_t fscache_n_store_radix_deletes;
 atomic_t fscache_n_store_pages_over_limit;
 
+atomic_t fscache_n_store_vmscan_not_storing;
+atomic_t fscache_n_store_vmscan_gone;
+atomic_t fscache_n_store_vmscan_busy;
+atomic_t fscache_n_store_vmscan_cancelled;
+
 atomic_t fscache_n_marks;
 atomic_t fscache_n_uncaches;
 
@@ -211,6 +216,12 @@ static int fscache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&fscache_n_store_radix_deletes),
 		   atomic_read(&fscache_n_store_pages_over_limit));
 
+	seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u\n",
+		   atomic_read(&fscache_n_store_vmscan_not_storing),
+		   atomic_read(&fscache_n_store_vmscan_gone),
+		   atomic_read(&fscache_n_store_vmscan_busy),
+		   atomic_read(&fscache_n_store_vmscan_cancelled));
+
 	seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n",
 		   atomic_read(&fscache_n_op_pend),
 		   atomic_read(&fscache_n_op_run),
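With the above in place, the new statistics line appears in the
/proc/fs/fscache/stats output alongside the existing ones.  Purely to
illustrate the format (the counters are zero-initialised atomics, so a system
that has not yet taken this reclaim path reads all zeros):

	# cat /proc/fs/fscache/stats
	...
	VmScan : nos=0 gon=0 bsy=0 can=0
	Ops : pend=0 run=0 enq=0 can=0 rej=0
	...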