diff options
author | Yehuda Sadeh <yehuda@hq.newdream.net> | 2009-12-18 16:51:57 -0500 |
---|---|---|
committer | Sage Weil <sage@newdream.net> | 2009-12-21 19:39:56 -0500 |
commit | 2baba25019ec564cd247af74013873d69a0b8190 (patch) | |
tree | c0995b8087cff771dd51aaf1194fd238f4490f01 | |
parent | dbd646a851713bec5bfff40ecf624b2e78518fe5 (diff) |
ceph: writeback congestion control
Set the bdi congestion bit when the amount of write data in flight exceeds an
adjustable threshold.
Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
-rw-r--r-- | fs/ceph/addr.c | 35 | ||||
-rw-r--r-- | fs/ceph/debugfs.c | 33 | ||||
-rw-r--r-- | fs/ceph/super.c | 36 | ||||
-rw-r--r-- | fs/ceph/super.h | 3 |
4 files changed, 105 insertions, 2 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d0cdceb0b90b..a6850a14038e 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c | |||
@@ -47,6 +47,12 @@ | |||
47 | * accounting is preserved. | 47 | * accounting is preserved. |
48 | */ | 48 | */ |
49 | 49 | ||
50 | #define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10)) | ||
51 | #define CONGESTION_OFF_THRESH(congestion_kb) \ | ||
52 | (CONGESTION_ON_THRESH(congestion_kb) - \ | ||
53 | (CONGESTION_ON_THRESH(congestion_kb) >> 2)) | ||
54 | |||
55 | |||
50 | 56 | ||
51 | /* | 57 | /* |
52 | * Dirty a page. Optimistically adjust accounting, on the assumption | 58 | * Dirty a page. Optimistically adjust accounting, on the assumption |
@@ -377,6 +383,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) | |||
377 | { | 383 | { |
378 | struct inode *inode; | 384 | struct inode *inode; |
379 | struct ceph_inode_info *ci; | 385 | struct ceph_inode_info *ci; |
386 | struct ceph_client *client; | ||
380 | struct ceph_osd_client *osdc; | 387 | struct ceph_osd_client *osdc; |
381 | loff_t page_off = page->index << PAGE_CACHE_SHIFT; | 388 | loff_t page_off = page->index << PAGE_CACHE_SHIFT; |
382 | int len = PAGE_CACHE_SIZE; | 389 | int len = PAGE_CACHE_SIZE; |
@@ -384,6 +391,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) | |||
384 | int err = 0; | 391 | int err = 0; |
385 | struct ceph_snap_context *snapc; | 392 | struct ceph_snap_context *snapc; |
386 | u64 snap_size = 0; | 393 | u64 snap_size = 0; |
394 | long writeback_stat; | ||
387 | 395 | ||
388 | dout("writepage %p idx %lu\n", page, page->index); | 396 | dout("writepage %p idx %lu\n", page, page->index); |
389 | 397 | ||
@@ -393,7 +401,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) | |||
393 | } | 401 | } |
394 | inode = page->mapping->host; | 402 | inode = page->mapping->host; |
395 | ci = ceph_inode(inode); | 403 | ci = ceph_inode(inode); |
396 | osdc = &ceph_inode_to_client(inode)->osdc; | 404 | client = ceph_inode_to_client(inode); |
405 | osdc = &client->osdc; | ||
397 | 406 | ||
398 | /* verify this is a writeable snap context */ | 407 | /* verify this is a writeable snap context */ |
399 | snapc = (void *)page->private; | 408 | snapc = (void *)page->private; |
@@ -420,6 +429,11 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) | |||
420 | dout("writepage %p page %p index %lu on %llu~%u\n", | 429 | dout("writepage %p page %p index %lu on %llu~%u\n", |
421 | inode, page, page->index, page_off, len); | 430 | inode, page, page->index, page_off, len); |
422 | 431 | ||
432 | writeback_stat = atomic_long_inc_return(&client->writeback_count); | ||
433 | if (writeback_stat > | ||
434 | CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) | ||
435 | set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); | ||
436 | |||
423 | set_page_writeback(page); | 437 | set_page_writeback(page); |
424 | err = ceph_osdc_writepages(osdc, ceph_vino(inode), | 438 | err = ceph_osdc_writepages(osdc, ceph_vino(inode), |
425 | &ci->i_layout, snapc, | 439 | &ci->i_layout, snapc, |
@@ -499,6 +513,8 @@ static void writepages_finish(struct ceph_osd_request *req, | |||
499 | struct writeback_control *wbc = req->r_wbc; | 513 | struct writeback_control *wbc = req->r_wbc; |
500 | __s32 rc = -EIO; | 514 | __s32 rc = -EIO; |
501 | u64 bytes = 0; | 515 | u64 bytes = 0; |
516 | struct ceph_client *client = ceph_inode_to_client(inode); | ||
517 | long writeback_stat; | ||
502 | 518 | ||
503 | /* parse reply */ | 519 | /* parse reply */ |
504 | replyhead = msg->front.iov_base; | 520 | replyhead = msg->front.iov_base; |
@@ -524,6 +540,13 @@ static void writepages_finish(struct ceph_osd_request *req, | |||
524 | BUG_ON(!page); | 540 | BUG_ON(!page); |
525 | WARN_ON(!PageUptodate(page)); | 541 | WARN_ON(!PageUptodate(page)); |
526 | 542 | ||
543 | writeback_stat = | ||
544 | atomic_long_dec_return(&client->writeback_count); | ||
545 | if (writeback_stat < | ||
546 | CONGESTION_OFF_THRESH(client->mount_args->congestion_kb)) | ||
547 | clear_bdi_congested(&client->backing_dev_info, | ||
548 | BLK_RW_ASYNC); | ||
549 | |||
527 | if (i >= wrote) { | 550 | if (i >= wrote) { |
528 | dout("inode %p skipping page %p\n", inode, page); | 551 | dout("inode %p skipping page %p\n", inode, page); |
529 | wbc->pages_skipped++; | 552 | wbc->pages_skipped++; |
@@ -666,6 +689,7 @@ retry: | |||
666 | u64 offset, len; | 689 | u64 offset, len; |
667 | struct ceph_osd_request_head *reqhead; | 690 | struct ceph_osd_request_head *reqhead; |
668 | struct ceph_osd_op *op; | 691 | struct ceph_osd_op *op; |
692 | long writeback_stat; | ||
669 | 693 | ||
670 | next = 0; | 694 | next = 0; |
671 | locked_pages = 0; | 695 | locked_pages = 0; |
@@ -773,6 +797,12 @@ get_more_pages: | |||
773 | first = i; | 797 | first = i; |
774 | dout("%p will write page %p idx %lu\n", | 798 | dout("%p will write page %p idx %lu\n", |
775 | inode, page, page->index); | 799 | inode, page, page->index); |
800 | |||
801 | writeback_stat = atomic_long_inc_return(&client->writeback_count); | ||
802 | if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) { | ||
803 | set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); | ||
804 | } | ||
805 | |||
776 | set_page_writeback(page); | 806 | set_page_writeback(page); |
777 | req->r_pages[locked_pages] = page; | 807 | req->r_pages[locked_pages] = page; |
778 | locked_pages++; | 808 | locked_pages++; |
@@ -998,7 +1028,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, | |||
998 | struct page *page, void *fsdata) | 1028 | struct page *page, void *fsdata) |
999 | { | 1029 | { |
1000 | struct inode *inode = file->f_dentry->d_inode; | 1030 | struct inode *inode = file->f_dentry->d_inode; |
1001 | struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; | 1031 | struct ceph_client *client = ceph_inode_to_client(inode); |
1032 | struct ceph_mds_client *mdsc = &client->mdsc; | ||
1002 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); | 1033 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); |
1003 | int check_cap = 0; | 1034 | int check_cap = 0; |
1004 | 1035 | ||
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 441484ab7e94..22d3b47fb1be 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c | |||
@@ -320,6 +320,30 @@ DEFINE_SHOW_FUNC(osdc_show) | |||
320 | DEFINE_SHOW_FUNC(dentry_lru_show) | 320 | DEFINE_SHOW_FUNC(dentry_lru_show) |
321 | DEFINE_SHOW_FUNC(caps_show) | 321 | DEFINE_SHOW_FUNC(caps_show) |
322 | 322 | ||
323 | static int congestion_kb_set(void *data, u64 val) | ||
324 | { | ||
325 | struct ceph_client *client = (struct ceph_client *)data; | ||
326 | |||
327 | if (client) | ||
328 | client->mount_args->congestion_kb = (int)val; | ||
329 | |||
330 | return 0; | ||
331 | } | ||
332 | |||
333 | static int congestion_kb_get(void *data, u64 *val) | ||
334 | { | ||
335 | struct ceph_client *client = (struct ceph_client *)data; | ||
336 | |||
337 | if (client) | ||
338 | *val = (u64)client->mount_args->congestion_kb; | ||
339 | |||
340 | return 0; | ||
341 | } | ||
342 | |||
343 | |||
344 | DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, | ||
345 | congestion_kb_set, "%llu\n"); | ||
346 | |||
323 | int __init ceph_debugfs_init(void) | 347 | int __init ceph_debugfs_init(void) |
324 | { | 348 | { |
325 | ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); | 349 | ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); |
@@ -409,6 +433,14 @@ int ceph_debugfs_client_init(struct ceph_client *client) | |||
409 | if (!client->debugfs_caps) | 433 | if (!client->debugfs_caps) |
410 | goto out; | 434 | goto out; |
411 | 435 | ||
436 | client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb", | ||
437 | 0600, | ||
438 | client->debugfs_dir, | ||
439 | client, | ||
440 | &congestion_kb_fops); | ||
441 | if (!client->debugfs_congestion_kb) | ||
442 | goto out; | ||
443 | |||
412 | sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev)); | 444 | sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev)); |
413 | client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir, | 445 | client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir, |
414 | name); | 446 | name); |
@@ -431,6 +463,7 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client) | |||
431 | debugfs_remove(client->osdc.debugfs_file); | 463 | debugfs_remove(client->osdc.debugfs_file); |
432 | debugfs_remove(client->mdsc.debugfs_file); | 464 | debugfs_remove(client->mdsc.debugfs_file); |
433 | debugfs_remove(client->monc.debugfs_file); | 465 | debugfs_remove(client->monc.debugfs_file); |
466 | debugfs_remove(client->debugfs_congestion_kb); | ||
434 | debugfs_remove(client->debugfs_dir); | 467 | debugfs_remove(client->debugfs_dir); |
435 | } | 468 | } |
436 | 469 | ||
diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6d02a166f8ff..b9cb8cebcdc1 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c | |||
@@ -150,6 +150,35 @@ static void ceph_inode_init_once(void *foo) | |||
150 | inode_init_once(&ci->vfs_inode); | 150 | inode_init_once(&ci->vfs_inode); |
151 | } | 151 | } |
152 | 152 | ||
153 | static int default_congestion_kb(void) | ||
154 | { | ||
155 | int congestion_kb; | ||
156 | |||
157 | /* | ||
158 | * Copied from NFS | ||
159 | * | ||
160 | * congestion size, scale with available memory. | ||
161 | * | ||
162 | * 64MB: 8192k | ||
163 | * 128MB: 11585k | ||
164 | * 256MB: 16384k | ||
165 | * 512MB: 23170k | ||
166 | * 1GB: 32768k | ||
167 | * 2GB: 46340k | ||
168 | * 4GB: 65536k | ||
169 | * 8GB: 92681k | ||
170 | * 16GB: 131072k | ||
171 | * | ||
172 | * This allows larger machines to have larger/more transfers. | ||
173 | * Limit the default to 256M | ||
174 | */ | ||
175 | congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); | ||
176 | if (congestion_kb > 256*1024) | ||
177 | congestion_kb = 256*1024; | ||
178 | |||
179 | return congestion_kb; | ||
180 | } | ||
181 | |||
153 | static int __init init_caches(void) | 182 | static int __init init_caches(void) |
154 | { | 183 | { |
155 | ceph_inode_cachep = kmem_cache_create("ceph_inode_info", | 184 | ceph_inode_cachep = kmem_cache_create("ceph_inode_info", |
@@ -267,6 +296,7 @@ enum { | |||
267 | Opt_caps_wanted_delay_min, | 296 | Opt_caps_wanted_delay_min, |
268 | Opt_caps_wanted_delay_max, | 297 | Opt_caps_wanted_delay_max, |
269 | Opt_readdir_max_entries, | 298 | Opt_readdir_max_entries, |
299 | Opt_congestion_kb, | ||
270 | Opt_last_int, | 300 | Opt_last_int, |
271 | /* int args above */ | 301 | /* int args above */ |
272 | Opt_snapdirname, | 302 | Opt_snapdirname, |
@@ -295,6 +325,7 @@ static match_table_t arg_tokens = { | |||
295 | {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, | 325 | {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, |
296 | {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, | 326 | {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, |
297 | {Opt_readdir_max_entries, "readdir_max_entries=%d"}, | 327 | {Opt_readdir_max_entries, "readdir_max_entries=%d"}, |
328 | {Opt_congestion_kb, "write_congestion_kb=%d"}, | ||
298 | /* int args above */ | 329 | /* int args above */ |
299 | {Opt_snapdirname, "snapdirname=%s"}, | 330 | {Opt_snapdirname, "snapdirname=%s"}, |
300 | {Opt_name, "name=%s"}, | 331 | {Opt_name, "name=%s"}, |
@@ -342,6 +373,7 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, | |||
342 | args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); | 373 | args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); |
343 | args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; | 374 | args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; |
344 | args->max_readdir = 1024; | 375 | args->max_readdir = 1024; |
376 | args->congestion_kb = default_congestion_kb(); | ||
345 | 377 | ||
346 | /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ | 378 | /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ |
347 | err = -EINVAL; | 379 | err = -EINVAL; |
@@ -445,6 +477,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, | |||
445 | case Opt_readdir_max_entries: | 477 | case Opt_readdir_max_entries: |
446 | args->max_readdir = intval; | 478 | args->max_readdir = intval; |
447 | break; | 479 | break; |
480 | case Opt_congestion_kb: | ||
481 | args->congestion_kb = intval; | ||
482 | break; | ||
448 | 483 | ||
449 | case Opt_noshare: | 484 | case Opt_noshare: |
450 | args->flags |= CEPH_OPT_NOSHARE; | 485 | args->flags |= CEPH_OPT_NOSHARE; |
@@ -516,6 +551,7 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) | |||
516 | client->msgr = NULL; | 551 | client->msgr = NULL; |
517 | 552 | ||
518 | client->mount_err = 0; | 553 | client->mount_err = 0; |
554 | atomic_long_set(&client->writeback_count, 0); | ||
519 | 555 | ||
520 | err = bdi_init(&client->backing_dev_info); | 556 | err = bdi_init(&client->backing_dev_info); |
521 | if (err < 0) | 557 | if (err < 0) |
diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 2304bd2844a4..62d9ae482d72 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h | |||
@@ -59,6 +59,7 @@ struct ceph_mount_args { | |||
59 | int wsize; | 59 | int wsize; |
60 | int rsize; /* max readahead */ | 60 | int rsize; /* max readahead */ |
61 | int max_readdir; /* max readdir size */ | 61 | int max_readdir; /* max readdir size */ |
62 | int congestion_kb; /* writeback congestion threshold, in KB */ | ||
62 | int osd_timeout; | 63 | int osd_timeout; |
63 | char *snapdir_name; /* default ".snap" */ | 64 | char *snapdir_name; /* default ".snap" */ |
64 | char *name; | 65 | char *name; |
@@ -136,6 +137,7 @@ struct ceph_client { | |||
136 | struct workqueue_struct *wb_wq; | 137 | struct workqueue_struct *wb_wq; |
137 | struct workqueue_struct *pg_inv_wq; | 138 | struct workqueue_struct *pg_inv_wq; |
138 | struct workqueue_struct *trunc_wq; | 139 | struct workqueue_struct *trunc_wq; |
140 | atomic_long_t writeback_count; | ||
139 | 141 | ||
140 | struct backing_dev_info backing_dev_info; | 142 | struct backing_dev_info backing_dev_info; |
141 | 143 | ||
@@ -143,6 +145,7 @@ struct ceph_client { | |||
143 | struct dentry *debugfs_monmap; | 145 | struct dentry *debugfs_monmap; |
144 | struct dentry *debugfs_mdsmap, *debugfs_osdmap; | 146 | struct dentry *debugfs_mdsmap, *debugfs_osdmap; |
145 | struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps; | 147 | struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps; |
148 | struct dentry *debugfs_congestion_kb; | ||
146 | struct dentry *debugfs_bdi; | 149 | struct dentry *debugfs_bdi; |
147 | #endif | 150 | #endif |
148 | }; | 151 | }; |