aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYehuda Sadeh <yehuda@hq.newdream.net>2009-12-18 16:51:57 -0500
committerSage Weil <sage@newdream.net>2009-12-21 19:39:56 -0500
commit2baba25019ec564cd247af74013873d69a0b8190 (patch)
treec0995b8087cff771dd51aaf1194fd238f4490f01
parentdbd646a851713bec5bfff40ecf624b2e78518fe5 (diff)
ceph: writeback congestion control
Set bdi congestion bit when amount of write data in flight exceeds adjustable threshold. Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net> Signed-off-by: Sage Weil <sage@newdream.net>
-rw-r--r--fs/ceph/addr.c35
-rw-r--r--fs/ceph/debugfs.c33
-rw-r--r--fs/ceph/super.c36
-rw-r--r--fs/ceph/super.h3
4 files changed, 105 insertions, 2 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index d0cdceb0b90b..a6850a14038e 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -47,6 +47,12 @@
47 * accounting is preserved. 47 * accounting is preserved.
48 */ 48 */
49 49
/*
 * Writeback congestion thresholds, expressed in pages.
 *
 * congestion_kb is a size in KB; shifting it right by (PAGE_SHIFT - 10)
 * converts it to a page count.  The "off" threshold is the "on" threshold
 * minus a quarter of it, so the two differ and the congested state is only
 * cleared once the in-flight count has dropped well below the level that
 * set it.
 *
 * The argument is parenthesized so that an expression argument is
 * converted as a whole rather than being split by operator precedence.
 */
#define CONGESTION_ON_THRESH(congestion_kb) \
	((congestion_kb) >> (PAGE_SHIFT - 10))
#define CONGESTION_OFF_THRESH(congestion_kb) \
	(CONGESTION_ON_THRESH(congestion_kb) - \
	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
54
55
50 56
51/* 57/*
52 * Dirty a page. Optimistically adjust accounting, on the assumption 58 * Dirty a page. Optimistically adjust accounting, on the assumption
@@ -377,6 +383,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
377{ 383{
378 struct inode *inode; 384 struct inode *inode;
379 struct ceph_inode_info *ci; 385 struct ceph_inode_info *ci;
386 struct ceph_client *client;
380 struct ceph_osd_client *osdc; 387 struct ceph_osd_client *osdc;
381 loff_t page_off = page->index << PAGE_CACHE_SHIFT; 388 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
382 int len = PAGE_CACHE_SIZE; 389 int len = PAGE_CACHE_SIZE;
@@ -384,6 +391,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
384 int err = 0; 391 int err = 0;
385 struct ceph_snap_context *snapc; 392 struct ceph_snap_context *snapc;
386 u64 snap_size = 0; 393 u64 snap_size = 0;
394 long writeback_stat;
387 395
388 dout("writepage %p idx %lu\n", page, page->index); 396 dout("writepage %p idx %lu\n", page, page->index);
389 397
@@ -393,7 +401,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
393 } 401 }
394 inode = page->mapping->host; 402 inode = page->mapping->host;
395 ci = ceph_inode(inode); 403 ci = ceph_inode(inode);
396 osdc = &ceph_inode_to_client(inode)->osdc; 404 client = ceph_inode_to_client(inode);
405 osdc = &client->osdc;
397 406
398 /* verify this is a writeable snap context */ 407 /* verify this is a writeable snap context */
399 snapc = (void *)page->private; 408 snapc = (void *)page->private;
@@ -420,6 +429,11 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
420 dout("writepage %p page %p index %lu on %llu~%u\n", 429 dout("writepage %p page %p index %lu on %llu~%u\n",
421 inode, page, page->index, page_off, len); 430 inode, page, page->index, page_off, len);
422 431
432 writeback_stat = atomic_long_inc_return(&client->writeback_count);
433 if (writeback_stat >
434 CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
435 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
436
423 set_page_writeback(page); 437 set_page_writeback(page);
424 err = ceph_osdc_writepages(osdc, ceph_vino(inode), 438 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
425 &ci->i_layout, snapc, 439 &ci->i_layout, snapc,
@@ -499,6 +513,8 @@ static void writepages_finish(struct ceph_osd_request *req,
499 struct writeback_control *wbc = req->r_wbc; 513 struct writeback_control *wbc = req->r_wbc;
500 __s32 rc = -EIO; 514 __s32 rc = -EIO;
501 u64 bytes = 0; 515 u64 bytes = 0;
516 struct ceph_client *client = ceph_inode_to_client(inode);
517 long writeback_stat;
502 518
503 /* parse reply */ 519 /* parse reply */
504 replyhead = msg->front.iov_base; 520 replyhead = msg->front.iov_base;
@@ -524,6 +540,13 @@ static void writepages_finish(struct ceph_osd_request *req,
524 BUG_ON(!page); 540 BUG_ON(!page);
525 WARN_ON(!PageUptodate(page)); 541 WARN_ON(!PageUptodate(page));
526 542
543 writeback_stat =
544 atomic_long_dec_return(&client->writeback_count);
545 if (writeback_stat <
546 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
547 clear_bdi_congested(&client->backing_dev_info,
548 BLK_RW_ASYNC);
549
527 if (i >= wrote) { 550 if (i >= wrote) {
528 dout("inode %p skipping page %p\n", inode, page); 551 dout("inode %p skipping page %p\n", inode, page);
529 wbc->pages_skipped++; 552 wbc->pages_skipped++;
@@ -666,6 +689,7 @@ retry:
666 u64 offset, len; 689 u64 offset, len;
667 struct ceph_osd_request_head *reqhead; 690 struct ceph_osd_request_head *reqhead;
668 struct ceph_osd_op *op; 691 struct ceph_osd_op *op;
692 long writeback_stat;
669 693
670 next = 0; 694 next = 0;
671 locked_pages = 0; 695 locked_pages = 0;
@@ -773,6 +797,12 @@ get_more_pages:
773 first = i; 797 first = i;
774 dout("%p will write page %p idx %lu\n", 798 dout("%p will write page %p idx %lu\n",
775 inode, page, page->index); 799 inode, page, page->index);
800
801 writeback_stat = atomic_long_inc_return(&client->writeback_count);
802 if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
803 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
804 }
805
776 set_page_writeback(page); 806 set_page_writeback(page);
777 req->r_pages[locked_pages] = page; 807 req->r_pages[locked_pages] = page;
778 locked_pages++; 808 locked_pages++;
@@ -998,7 +1028,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
998 struct page *page, void *fsdata) 1028 struct page *page, void *fsdata)
999{ 1029{
1000 struct inode *inode = file->f_dentry->d_inode; 1030 struct inode *inode = file->f_dentry->d_inode;
1001 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 1031 struct ceph_client *client = ceph_inode_to_client(inode);
1032 struct ceph_mds_client *mdsc = &client->mdsc;
1002 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 1033 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1003 int check_cap = 0; 1034 int check_cap = 0;
1004 1035
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 441484ab7e94..22d3b47fb1be 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -320,6 +320,30 @@ DEFINE_SHOW_FUNC(osdc_show)
320DEFINE_SHOW_FUNC(dentry_lru_show) 320DEFINE_SHOW_FUNC(dentry_lru_show)
321DEFINE_SHOW_FUNC(caps_show) 321DEFINE_SHOW_FUNC(caps_show)
322 322
323static int congestion_kb_set(void *data, u64 val)
324{
325 struct ceph_client *client = (struct ceph_client *)data;
326
327 if (client)
328 client->mount_args->congestion_kb = (int)val;
329
330 return 0;
331}
332
333static int congestion_kb_get(void *data, u64 *val)
334{
335 struct ceph_client *client = (struct ceph_client *)data;
336
337 if (client)
338 *val = (u64)client->mount_args->congestion_kb;
339
340 return 0;
341}
342
343
344DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
345 congestion_kb_set, "%llu\n");
346
323int __init ceph_debugfs_init(void) 347int __init ceph_debugfs_init(void)
324{ 348{
325 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); 349 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
@@ -409,6 +433,14 @@ int ceph_debugfs_client_init(struct ceph_client *client)
409 if (!client->debugfs_caps) 433 if (!client->debugfs_caps)
410 goto out; 434 goto out;
411 435
436 client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
437 0600,
438 client->debugfs_dir,
439 client,
440 &congestion_kb_fops);
441 if (!client->debugfs_congestion_kb)
442 goto out;
443
412 sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev)); 444 sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
413 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir, 445 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
414 name); 446 name);
@@ -431,6 +463,7 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)
431 debugfs_remove(client->osdc.debugfs_file); 463 debugfs_remove(client->osdc.debugfs_file);
432 debugfs_remove(client->mdsc.debugfs_file); 464 debugfs_remove(client->mdsc.debugfs_file);
433 debugfs_remove(client->monc.debugfs_file); 465 debugfs_remove(client->monc.debugfs_file);
466 debugfs_remove(client->debugfs_congestion_kb);
434 debugfs_remove(client->debugfs_dir); 467 debugfs_remove(client->debugfs_dir);
435} 468}
436 469
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 6d02a166f8ff..b9cb8cebcdc1 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -150,6 +150,35 @@ static void ceph_inode_init_once(void *foo)
150 inode_init_once(&ci->vfs_inode); 150 inode_init_once(&ci->vfs_inode);
151} 151}
152 152
153static int default_congestion_kb(void)
154{
155 int congestion_kb;
156
157 /*
158 * Copied from NFS
159 *
160 * congestion size, scale with available memory.
161 *
162 * 64MB: 8192k
163 * 128MB: 11585k
164 * 256MB: 16384k
165 * 512MB: 23170k
166 * 1GB: 32768k
167 * 2GB: 46340k
168 * 4GB: 65536k
169 * 8GB: 92681k
170 * 16GB: 131072k
171 *
172 * This allows larger machines to have larger/more transfers.
173 * Limit the default to 256M
174 */
175 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
176 if (congestion_kb > 256*1024)
177 congestion_kb = 256*1024;
178
179 return congestion_kb;
180}
181
153static int __init init_caches(void) 182static int __init init_caches(void)
154{ 183{
155 ceph_inode_cachep = kmem_cache_create("ceph_inode_info", 184 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
@@ -267,6 +296,7 @@ enum {
267 Opt_caps_wanted_delay_min, 296 Opt_caps_wanted_delay_min,
268 Opt_caps_wanted_delay_max, 297 Opt_caps_wanted_delay_max,
269 Opt_readdir_max_entries, 298 Opt_readdir_max_entries,
299 Opt_congestion_kb,
270 Opt_last_int, 300 Opt_last_int,
271 /* int args above */ 301 /* int args above */
272 Opt_snapdirname, 302 Opt_snapdirname,
@@ -295,6 +325,7 @@ static match_table_t arg_tokens = {
295 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 325 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
296 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 326 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
297 {Opt_readdir_max_entries, "readdir_max_entries=%d"}, 327 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
328 {Opt_congestion_kb, "write_congestion_kb=%d"},
298 /* int args above */ 329 /* int args above */
299 {Opt_snapdirname, "snapdirname=%s"}, 330 {Opt_snapdirname, "snapdirname=%s"},
300 {Opt_name, "name=%s"}, 331 {Opt_name, "name=%s"},
@@ -342,6 +373,7 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
342 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 373 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
343 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; 374 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
344 args->max_readdir = 1024; 375 args->max_readdir = 1024;
376 args->congestion_kb = default_congestion_kb();
345 377
346 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ 378 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
347 err = -EINVAL; 379 err = -EINVAL;
@@ -445,6 +477,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
445 case Opt_readdir_max_entries: 477 case Opt_readdir_max_entries:
446 args->max_readdir = intval; 478 args->max_readdir = intval;
447 break; 479 break;
480 case Opt_congestion_kb:
481 args->congestion_kb = intval;
482 break;
448 483
449 case Opt_noshare: 484 case Opt_noshare:
450 args->flags |= CEPH_OPT_NOSHARE; 485 args->flags |= CEPH_OPT_NOSHARE;
@@ -516,6 +551,7 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
516 client->msgr = NULL; 551 client->msgr = NULL;
517 552
518 client->mount_err = 0; 553 client->mount_err = 0;
554 atomic_long_set(&client->writeback_count, 0);
519 555
520 err = bdi_init(&client->backing_dev_info); 556 err = bdi_init(&client->backing_dev_info);
521 if (err < 0) 557 if (err < 0)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 2304bd2844a4..62d9ae482d72 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -59,6 +59,7 @@ struct ceph_mount_args {
59 int wsize; 59 int wsize;
60 int rsize; /* max readahead */ 60 int rsize; /* max readahead */
61 int max_readdir; /* max readdir size */ 61 int max_readdir; /* max readdir size */
62 int congestion_kb; /* writeback congestion threshold, in KB */
62 int osd_timeout; 63 int osd_timeout;
63 char *snapdir_name; /* default ".snap" */ 64 char *snapdir_name; /* default ".snap" */
64 char *name; 65 char *name;
@@ -136,6 +137,7 @@ struct ceph_client {
136 struct workqueue_struct *wb_wq; 137 struct workqueue_struct *wb_wq;
137 struct workqueue_struct *pg_inv_wq; 138 struct workqueue_struct *pg_inv_wq;
138 struct workqueue_struct *trunc_wq; 139 struct workqueue_struct *trunc_wq;
140 atomic_long_t writeback_count;
139 141
140 struct backing_dev_info backing_dev_info; 142 struct backing_dev_info backing_dev_info;
141 143
@@ -143,6 +145,7 @@ struct ceph_client {
143 struct dentry *debugfs_monmap; 145 struct dentry *debugfs_monmap;
144 struct dentry *debugfs_mdsmap, *debugfs_osdmap; 146 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
145 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps; 147 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
148 struct dentry *debugfs_congestion_kb;
146 struct dentry *debugfs_bdi; 149 struct dentry *debugfs_bdi;
147#endif 150#endif
148}; 151};