author     Tejun Heo <tj@kernel.org>    2015-05-22 18:23:21 -0400
committer  Jens Axboe <axboe@fb.com>    2015-06-02 10:38:12 -0400
commit     380c27ca33ebecc9da35aa90c8b3a9154f90aac2 (patch)
tree       9216a1f6382db31f61bc7ccf3983d0b83553dbe8 /mm/page-writeback.c
parent     8a73179956e649df0d4b3250db17734f272d8266 (diff)
writeback: implement wb_domain
Dirtyable memory is distributed to a wb (bdi_writeback) according to the relative bandwidth the wb is writing out in the whole system. This distribution is global - each wb is measured against all other wb's and gets the proportionately sized portion of the memory in the whole system.

For cgroup writeback, the amount of dirtyable memory is scoped by memcg and thus each wb would need to be measured and controlled in its memcg. IOW, a wb will belong to two writeback domains - the global and memcg domains.

Currently, what constitutes the global writeback domain are scattered across a number of global states. This patch starts collecting them into struct wb_domain.

* fprop_global, which serves as the basis for proportional bandwidth measurement, and its period timer are moved into struct wb_domain.

* global_wb_domain hosts the states for the global domain.

* While at it, flatten wb_writeout_fraction() into its callers. This thin wrapper doesn't provide any actual benefits while getting in the way.

This is pure reorganization and doesn't introduce any behavioral changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
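The struct wb_domain layout itself is not visible in this file's diff (the definition lands in a writeback header), but its members can be read off from how the patched code below uses it: dom->completions feeds the fprop_* helpers, while dom->period_timer and dom->period_time replace the old writeout_period_timer/writeout_period_time pair. A sketch, inferred from that usage rather than copied from the patch:

struct wb_domain {
	/* proportional completion counting; replaces writeout_completions */
	struct fprop_global completions;

	/*
	 * deferrable timer aging the completion counts; replaces
	 * writeout_period_timer and writeout_period_time
	 */
	struct timer_list period_timer;
	unsigned long period_time;
};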
Diffstat (limited to 'mm/page-writeback.c')
-rw-r--r--  mm/page-writeback.c   72
1 file changed, 27 insertions(+), 45 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bebdd41b8d8e..08e1737edb39 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -124,29 +124,7 @@ EXPORT_SYMBOL(laptop_mode);
 
 unsigned long global_dirty_limit;
 
-/*
- * Scale the writeback cache size proportional to the relative writeout speeds.
- *
- * We do this by keeping a floating proportion between BDIs, based on page
- * writeback completions [end_page_writeback()]. Those devices that write out
- * pages fastest will get the larger share, while the slower will get a smaller
- * share.
- *
- * We use page writeout completions because we are interested in getting rid of
- * dirty pages. Having them written out is the primary goal.
- *
- * We introduce a concept of time, a period over which we measure these events,
- * because demand can/will vary over time. The length of this period itself is
- * measured in page writeback completions.
- *
- */
-static struct fprop_global writeout_completions;
-
-static void writeout_period(unsigned long t);
-/* Timer for aging of writeout_completions */
-static struct timer_list writeout_period_timer =
-		TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
-static unsigned long writeout_period_time = 0;
+static struct wb_domain global_wb_domain;
 
 /*
  * Length of period for aging writeout fractions of bdis. This is an
@@ -433,24 +411,26 @@ static unsigned long wp_next_time(unsigned long cur_time)
 }
 
 /*
- * Increment the BDI's writeout completion count and the global writeout
+ * Increment the wb's writeout completion count and the global writeout
  * completion count. Called from test_clear_page_writeback().
  */
 static inline void __wb_writeout_inc(struct bdi_writeback *wb)
 {
+	struct wb_domain *dom = &global_wb_domain;
+
 	__inc_wb_stat(wb, WB_WRITTEN);
-	__fprop_inc_percpu_max(&writeout_completions, &wb->completions,
+	__fprop_inc_percpu_max(&dom->completions, &wb->completions,
 			       wb->bdi->max_prop_frac);
 	/* First event after period switching was turned off? */
-	if (!unlikely(writeout_period_time)) {
+	if (!unlikely(dom->period_time)) {
 		/*
 		 * We can race with other __bdi_writeout_inc calls here but
 		 * it does not cause any harm since the resulting time when
 		 * timer will fire and what is in writeout_period_time will be
 		 * roughly the same.
 		 */
-		writeout_period_time = wp_next_time(jiffies);
-		mod_timer(&writeout_period_timer, writeout_period_time);
+		dom->period_time = wp_next_time(jiffies);
+		mod_timer(&dom->period_timer, dom->period_time);
 	}
 }
 
@@ -465,37 +445,37 @@ void wb_writeout_inc(struct bdi_writeback *wb)
 EXPORT_SYMBOL_GPL(wb_writeout_inc);
 
 /*
- * Obtain an accurate fraction of the BDI's portion.
- */
-static void wb_writeout_fraction(struct bdi_writeback *wb,
-				 long *numerator, long *denominator)
-{
-	fprop_fraction_percpu(&writeout_completions, &wb->completions,
-				numerator, denominator);
-}
-
-/*
  * On idle system, we can be called long after we scheduled because we use
  * deferred timers so count with missed periods.
  */
 static void writeout_period(unsigned long t)
 {
-	int miss_periods = (jiffies - writeout_period_time) /
+	struct wb_domain *dom = (void *)t;
+	int miss_periods = (jiffies - dom->period_time) /
 					 VM_COMPLETIONS_PERIOD_LEN;
 
-	if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
-		writeout_period_time = wp_next_time(writeout_period_time +
+	if (fprop_new_period(&dom->completions, miss_periods + 1)) {
+		dom->period_time = wp_next_time(dom->period_time +
 				miss_periods * VM_COMPLETIONS_PERIOD_LEN);
-		mod_timer(&writeout_period_timer, writeout_period_time);
+		mod_timer(&dom->period_timer, dom->period_time);
 	} else {
 		/*
 		 * Aging has zeroed all fractions. Stop wasting CPU on period
 		 * updates.
 		 */
-		writeout_period_time = 0;
+		dom->period_time = 0;
 	}
 }
 
+int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
+{
+	memset(dom, 0, sizeof(*dom));
+	init_timer_deferrable(&dom->period_timer);
+	dom->period_timer.function = writeout_period;
+	dom->period_timer.data = (unsigned long)dom;
+	return fprop_global_init(&dom->completions, gfp);
+}
+
 /*
  * bdi_min_ratio keeps the sum of the minimum dirty shares of all
  * registered backing devices, which, for obvious reasons, can not
@@ -579,6 +559,7 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
  */
 unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
 {
+	struct wb_domain *dom = &global_wb_domain;
 	u64 wb_thresh;
 	long numerator, denominator;
 	unsigned long wb_min_ratio, wb_max_ratio;
@@ -586,7 +567,8 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
 	/*
 	 * Calculate this BDI's share of the thresh ratio.
 	 */
-	wb_writeout_fraction(wb, &numerator, &denominator);
+	fprop_fraction_percpu(&dom->completions, &wb->completions,
+			      &numerator, &denominator);
 
 	wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
 	wb_thresh *= numerator;
@@ -1831,7 +1813,7 @@ void __init page_writeback_init(void)
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
 
-	fprop_global_init(&writeout_completions, GFP_KERNEL);
+	BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
 }
 
 /**
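The new wb_domain_init() is what will let additional writeback domains be set up the same way the global one is in page_writeback_init() above. A minimal usage sketch; the per-memcg domain here is a hypothetical illustration of the direction described in the commit message, not something this patch adds:

/* Global domain: initialized once at boot, exactly as in the hunk above. */
BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));

/*
 * Hypothetical per-memcg domain (future work, illustrative names only):
 * each memcg would carry its own wb_domain so a wb can be measured in
 * both the global domain and the memcg domain it belongs to.
 */
static struct wb_domain example_memcg_domain;

static int example_memcg_domain_init(gfp_t gfp)
{
	return wb_domain_init(&example_memcg_domain, gfp);
}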