aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPeter Zijlstra <a.p.zijlstra@chello.nl>2007-03-16 17:38:26 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-03-16 22:25:05 -0400
commit89a09141df6ac1c3821fbe44ca8384eb37692965 (patch)
treeccb21055fca86ac2657b3262ac37eb3e5c44bea0
parentb74a2f0913694556a027795d2954d30523fac4c5 (diff)
[PATCH] nfs: fix congestion control
The current NFS client congestion logic is severely broken: it marks the backing device congested during each nfs_writepages() call but doesn't mirror this in nfs_writepage() which makes for deadlocks. Also it implements its own waitqueue. Replace this by a more regular congestion implementation that puts a cap on the number of active writeback pages and uses the bdi congestion waitqueue. Also always use an interruptible wait since it makes sense to be able to SIGKILL the process even for mounts without 'intr'. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Acked-by: Trond Myklebust <trond.myklebust@fys.uio.no> Cc: Christoph Lameter <clameter@engr.sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--fs/nfs/super.c4
-rw-r--r--fs/nfs/sysctl.c8
-rw-r--r--fs/nfs/write.c116
-rw-r--r--include/linux/backing-dev.h1
-rw-r--r--include/linux/nfs_fs.h1
-rw-r--r--include/linux/nfs_fs_sb.h1
-rw-r--r--mm/backing-dev.c16
7 files changed, 103 insertions, 44 deletions
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index bb516a2cfbaf..f1eae44b9a1a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -151,10 +151,10 @@ int __init register_nfs_fs(void)
151 if (ret < 0) 151 if (ret < 0)
152 goto error_0; 152 goto error_0;
153 153
154#ifdef CONFIG_NFS_V4
155 ret = nfs_register_sysctl(); 154 ret = nfs_register_sysctl();
156 if (ret < 0) 155 if (ret < 0)
157 goto error_1; 156 goto error_1;
157#ifdef CONFIG_NFS_V4
158 ret = register_filesystem(&nfs4_fs_type); 158 ret = register_filesystem(&nfs4_fs_type);
159 if (ret < 0) 159 if (ret < 0)
160 goto error_2; 160 goto error_2;
@@ -165,9 +165,9 @@ int __init register_nfs_fs(void)
165#ifdef CONFIG_NFS_V4 165#ifdef CONFIG_NFS_V4
166error_2: 166error_2:
167 nfs_unregister_sysctl(); 167 nfs_unregister_sysctl();
168#endif
168error_1: 169error_1:
169 unregister_filesystem(&nfs_fs_type); 170 unregister_filesystem(&nfs_fs_type);
170#endif
171error_0: 171error_0:
172 return ret; 172 return ret;
173} 173}
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index fcdcafbb3293..b62481dabae9 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -50,6 +50,14 @@ static ctl_table nfs_cb_sysctls[] = {
50 .proc_handler = &proc_dointvec_jiffies, 50 .proc_handler = &proc_dointvec_jiffies,
51 .strategy = &sysctl_jiffies, 51 .strategy = &sysctl_jiffies,
52 }, 52 },
53 {
54 .ctl_name = CTL_UNNUMBERED,
55 .procname = "nfs_congestion_kb",
56 .data = &nfs_congestion_kb,
57 .maxlen = sizeof(nfs_congestion_kb),
58 .mode = 0644,
59 .proc_handler = &proc_dointvec,
60 },
53 { .ctl_name = 0 } 61 { .ctl_name = 0 }
54}; 62};
55 63
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index febdade91670..2867e6b7096f 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -12,6 +12,7 @@
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/writeback.h> 14#include <linux/writeback.h>
15#include <linux/swap.h>
15 16
16#include <linux/sunrpc/clnt.h> 17#include <linux/sunrpc/clnt.h>
17#include <linux/nfs_fs.h> 18#include <linux/nfs_fs.h>
@@ -38,7 +39,6 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context*,
38 struct page *, 39 struct page *,
39 unsigned int, unsigned int); 40 unsigned int, unsigned int);
40static void nfs_mark_request_dirty(struct nfs_page *req); 41static void nfs_mark_request_dirty(struct nfs_page *req);
41static int nfs_wait_on_write_congestion(struct address_space *, int);
42static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how); 42static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how);
43static const struct rpc_call_ops nfs_write_partial_ops; 43static const struct rpc_call_ops nfs_write_partial_ops;
44static const struct rpc_call_ops nfs_write_full_ops; 44static const struct rpc_call_ops nfs_write_full_ops;
@@ -48,8 +48,6 @@ static struct kmem_cache *nfs_wdata_cachep;
48static mempool_t *nfs_wdata_mempool; 48static mempool_t *nfs_wdata_mempool;
49static mempool_t *nfs_commit_mempool; 49static mempool_t *nfs_commit_mempool;
50 50
51static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
52
53struct nfs_write_data *nfs_commit_alloc(void) 51struct nfs_write_data *nfs_commit_alloc(void)
54{ 52{
55 struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS); 53 struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
@@ -211,6 +209,40 @@ static int wb_priority(struct writeback_control *wbc)
211} 209}
212 210
213/* 211/*
212 * NFS congestion control
213 */
214
215int nfs_congestion_kb;
216
217#define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10))
218#define NFS_CONGESTION_OFF_THRESH \
219 (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
220
221static void nfs_set_page_writeback(struct page *page)
222{
223 if (!test_set_page_writeback(page)) {
224 struct inode *inode = page->mapping->host;
225 struct nfs_server *nfss = NFS_SERVER(inode);
226
227 if (atomic_inc_return(&nfss->writeback) >
228 NFS_CONGESTION_ON_THRESH)
229 set_bdi_congested(&nfss->backing_dev_info, WRITE);
230 }
231}
232
233static void nfs_end_page_writeback(struct page *page)
234{
235 struct inode *inode = page->mapping->host;
236 struct nfs_server *nfss = NFS_SERVER(inode);
237
238 end_page_writeback(page);
239 if (atomic_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) {
240 clear_bdi_congested(&nfss->backing_dev_info, WRITE);
241 congestion_end(WRITE);
242 }
243}
244
245/*
214 * Find an associated nfs write request, and prepare to flush it out 246 * Find an associated nfs write request, and prepare to flush it out
215 * Returns 1 if there was no write request, or if the request was 247 * Returns 1 if there was no write request, or if the request was
216 * already tagged by nfs_set_page_dirty.Returns 0 if the request 248 * already tagged by nfs_set_page_dirty.Returns 0 if the request
@@ -247,7 +279,7 @@ static int nfs_page_mark_flush(struct page *page)
247 spin_unlock(req_lock); 279 spin_unlock(req_lock);
248 if (test_and_set_bit(PG_FLUSHING, &req->wb_flags) == 0) { 280 if (test_and_set_bit(PG_FLUSHING, &req->wb_flags) == 0) {
249 nfs_mark_request_dirty(req); 281 nfs_mark_request_dirty(req);
250 set_page_writeback(page); 282 nfs_set_page_writeback(page);
251 } 283 }
252 ret = test_bit(PG_NEED_FLUSH, &req->wb_flags); 284 ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
253 nfs_unlock_request(req); 285 nfs_unlock_request(req);
@@ -302,13 +334,8 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
302 return err; 334 return err;
303} 335}
304 336
305/*
306 * Note: causes nfs_update_request() to block on the assumption
307 * that the writeback is generated due to memory pressure.
308 */
309int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 337int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
310{ 338{
311 struct backing_dev_info *bdi = mapping->backing_dev_info;
312 struct inode *inode = mapping->host; 339 struct inode *inode = mapping->host;
313 int err; 340 int err;
314 341
@@ -317,20 +344,12 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
317 err = generic_writepages(mapping, wbc); 344 err = generic_writepages(mapping, wbc);
318 if (err) 345 if (err)
319 return err; 346 return err;
320 while (test_and_set_bit(BDI_write_congested, &bdi->state) != 0) {
321 if (wbc->nonblocking)
322 return 0;
323 nfs_wait_on_write_congestion(mapping, 0);
324 }
325 err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc)); 347 err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc));
326 if (err < 0) 348 if (err < 0)
327 goto out; 349 goto out;
328 nfs_add_stats(inode, NFSIOS_WRITEPAGES, err); 350 nfs_add_stats(inode, NFSIOS_WRITEPAGES, err);
329 err = 0; 351 err = 0;
330out: 352out:
331 clear_bit(BDI_write_congested, &bdi->state);
332 wake_up_all(&nfs_write_congestion);
333 congestion_end(WRITE);
334 return err; 353 return err;
335} 354}
336 355
@@ -360,7 +379,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
360} 379}
361 380
362/* 381/*
363 * Insert a write request into an inode 382 * Remove a write request from an inode
364 */ 383 */
365static void nfs_inode_remove_request(struct nfs_page *req) 384static void nfs_inode_remove_request(struct nfs_page *req)
366{ 385{
@@ -531,10 +550,10 @@ static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, un
531} 550}
532#endif 551#endif
533 552
534static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr) 553static int nfs_wait_on_write_congestion(struct address_space *mapping)
535{ 554{
555 struct inode *inode = mapping->host;
536 struct backing_dev_info *bdi = mapping->backing_dev_info; 556 struct backing_dev_info *bdi = mapping->backing_dev_info;
537 DEFINE_WAIT(wait);
538 int ret = 0; 557 int ret = 0;
539 558
540 might_sleep(); 559 might_sleep();
@@ -542,31 +561,23 @@ static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr)
542 if (!bdi_write_congested(bdi)) 561 if (!bdi_write_congested(bdi))
543 return 0; 562 return 0;
544 563
545 nfs_inc_stats(mapping->host, NFSIOS_CONGESTIONWAIT); 564 nfs_inc_stats(inode, NFSIOS_CONGESTIONWAIT);
546 565
547 if (intr) { 566 do {
548 struct rpc_clnt *clnt = NFS_CLIENT(mapping->host); 567 struct rpc_clnt *clnt = NFS_CLIENT(inode);
549 sigset_t oldset; 568 sigset_t oldset;
550 569
551 rpc_clnt_sigmask(clnt, &oldset); 570 rpc_clnt_sigmask(clnt, &oldset);
552 prepare_to_wait(&nfs_write_congestion, &wait, TASK_INTERRUPTIBLE); 571 ret = congestion_wait_interruptible(WRITE, HZ/10);
553 if (bdi_write_congested(bdi)) {
554 if (signalled())
555 ret = -ERESTARTSYS;
556 else
557 schedule();
558 }
559 rpc_clnt_sigunmask(clnt, &oldset); 572 rpc_clnt_sigunmask(clnt, &oldset);
560 } else { 573 if (ret == -ERESTARTSYS)
561 prepare_to_wait(&nfs_write_congestion, &wait, TASK_UNINTERRUPTIBLE); 574 break;
562 if (bdi_write_congested(bdi)) 575 ret = 0;
563 schedule(); 576 } while (bdi_write_congested(bdi));
564 } 577
565 finish_wait(&nfs_write_congestion, &wait);
566 return ret; 578 return ret;
567} 579}
568 580
569
570/* 581/*
571 * Try to update any existing write request, or create one if there is none. 582 * Try to update any existing write request, or create one if there is none.
572 * In order to match, the request's credentials must match those of 583 * In order to match, the request's credentials must match those of
@@ -577,14 +588,15 @@ static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr)
577static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, 588static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
578 struct page *page, unsigned int offset, unsigned int bytes) 589 struct page *page, unsigned int offset, unsigned int bytes)
579{ 590{
580 struct inode *inode = page->mapping->host; 591 struct address_space *mapping = page->mapping;
592 struct inode *inode = mapping->host;
581 struct nfs_inode *nfsi = NFS_I(inode); 593 struct nfs_inode *nfsi = NFS_I(inode);
582 struct nfs_page *req, *new = NULL; 594 struct nfs_page *req, *new = NULL;
583 unsigned long rqend, end; 595 unsigned long rqend, end;
584 596
585 end = offset + bytes; 597 end = offset + bytes;
586 598
587 if (nfs_wait_on_write_congestion(page->mapping, NFS_SERVER(inode)->flags & NFS_MOUNT_INTR)) 599 if (nfs_wait_on_write_congestion(mapping))
588 return ERR_PTR(-ERESTARTSYS); 600 return ERR_PTR(-ERESTARTSYS);
589 for (;;) { 601 for (;;) {
590 /* Loop over all inode entries and see if we find 602 /* Loop over all inode entries and see if we find
@@ -727,7 +739,7 @@ int nfs_updatepage(struct file *file, struct page *page,
727 739
728static void nfs_writepage_release(struct nfs_page *req) 740static void nfs_writepage_release(struct nfs_page *req)
729{ 741{
730 end_page_writeback(req->wb_page); 742 nfs_end_page_writeback(req->wb_page);
731 743
732#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 744#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
733 if (!PageError(req->wb_page)) { 745 if (!PageError(req->wb_page)) {
@@ -1042,12 +1054,12 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
1042 if (task->tk_status < 0) { 1054 if (task->tk_status < 0) {
1043 nfs_set_pageerror(page); 1055 nfs_set_pageerror(page);
1044 req->wb_context->error = task->tk_status; 1056 req->wb_context->error = task->tk_status;
1045 end_page_writeback(page); 1057 nfs_end_page_writeback(page);
1046 nfs_inode_remove_request(req); 1058 nfs_inode_remove_request(req);
1047 dprintk(", error = %d\n", task->tk_status); 1059 dprintk(", error = %d\n", task->tk_status);
1048 goto next; 1060 goto next;
1049 } 1061 }
1050 end_page_writeback(page); 1062 nfs_end_page_writeback(page);
1051 1063
1052#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1064#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
1053 if (data->args.stable != NFS_UNSTABLE || data->verf.committed == NFS_FILE_SYNC) { 1065 if (data->args.stable != NFS_UNSTABLE || data->verf.committed == NFS_FILE_SYNC) {
@@ -1514,6 +1526,26 @@ int __init nfs_init_writepagecache(void)
1514 if (nfs_commit_mempool == NULL) 1526 if (nfs_commit_mempool == NULL)
1515 return -ENOMEM; 1527 return -ENOMEM;
1516 1528
1529 /*
1530 * NFS congestion size, scale with available memory.
1531 *
1532 * 64MB: 8192k
1533 * 128MB: 11585k
1534 * 256MB: 16384k
1535 * 512MB: 23170k
1536 * 1GB: 32768k
1537 * 2GB: 46340k
1538 * 4GB: 65536k
1539 * 8GB: 92681k
1540 * 16GB: 131072k
1541 *
1542 * This allows larger machines to have larger/more transfers.
1543 * Limit the default to 256M
1544 */
1545 nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
1546 if (nfs_congestion_kb > 256*1024)
1547 nfs_congestion_kb = 256*1024;
1548
1517 return 0; 1549 return 0;
1518} 1550}
1519 1551
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 7011d6255593..f2542c24b328 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -93,6 +93,7 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi)
93void clear_bdi_congested(struct backing_dev_info *bdi, int rw); 93void clear_bdi_congested(struct backing_dev_info *bdi, int rw);
94void set_bdi_congested(struct backing_dev_info *bdi, int rw); 94void set_bdi_congested(struct backing_dev_info *bdi, int rw);
95long congestion_wait(int rw, long timeout); 95long congestion_wait(int rw, long timeout);
96long congestion_wait_interruptible(int rw, long timeout);
96void congestion_end(int rw); 97void congestion_end(int rw);
97 98
98#define bdi_cap_writeback_dirty(bdi) \ 99#define bdi_cap_writeback_dirty(bdi) \
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 47aaa2c66738..e9ae0c6e2c62 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -415,6 +415,7 @@ extern void nfs_complete_unlink(struct dentry *);
415/* 415/*
416 * linux/fs/nfs/write.c 416 * linux/fs/nfs/write.c
417 */ 417 */
418extern int nfs_congestion_kb;
418extern int nfs_writepage(struct page *page, struct writeback_control *wbc); 419extern int nfs_writepage(struct page *page, struct writeback_control *wbc);
419extern int nfs_writepages(struct address_space *, struct writeback_control *); 420extern int nfs_writepages(struct address_space *, struct writeback_control *);
420extern int nfs_flush_incompatible(struct file *file, struct page *page); 421extern int nfs_flush_incompatible(struct file *file, struct page *page);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 95796e6924f1..c95d5e642548 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -82,6 +82,7 @@ struct nfs_server {
82 struct rpc_clnt * client_acl; /* ACL RPC client handle */ 82 struct rpc_clnt * client_acl; /* ACL RPC client handle */
83 struct nfs_iostats * io_stats; /* I/O statistics */ 83 struct nfs_iostats * io_stats; /* I/O statistics */
84 struct backing_dev_info backing_dev_info; 84 struct backing_dev_info backing_dev_info;
85 atomic_t writeback; /* number of writeback pages */
85 int flags; /* various flags */ 86 int flags; /* various flags */
86 unsigned int caps; /* server capabilities */ 87 unsigned int caps; /* server capabilities */
87 unsigned int rsize; /* read size */ 88 unsigned int rsize; /* read size */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f50a2811f9dc..e5de3781d3fe 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -55,6 +55,22 @@ long congestion_wait(int rw, long timeout)
55} 55}
56EXPORT_SYMBOL(congestion_wait); 56EXPORT_SYMBOL(congestion_wait);
57 57
58long congestion_wait_interruptible(int rw, long timeout)
59{
60 long ret;
61 DEFINE_WAIT(wait);
62 wait_queue_head_t *wqh = &congestion_wqh[rw];
63
64 prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
65 if (signal_pending(current))
66 ret = -ERESTARTSYS;
67 else
68 ret = io_schedule_timeout(timeout);
69 finish_wait(wqh, &wait);
70 return ret;
71}
72EXPORT_SYMBOL(congestion_wait_interruptible);
73
58/** 74/**
59 * congestion_end - wake up sleepers on a congested backing_dev_info 75 * congestion_end - wake up sleepers on a congested backing_dev_info
60 * @rw: READ or WRITE 76 * @rw: READ or WRITE