about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author	Jan Kara <jack@suse.cz>	2012-05-24 12:59:11 -0400
committer	Fengguang Wu <fengguang.wu@intel.com>	2012-06-08 19:37:56 -0400
commit	eb608e3a344b3af21300360fcf868f8b4e808a8e (patch)
tree	34e36bc81c8b7bf459301e1eda96062c40bccbb3
parent	e78d4833c03e28205b3d983f0c4e586ee34785fd (diff)
block: Convert BDI proportion calculations to flexible proportions
Convert calculations of proportion of writeback each bdi does to the new
flexible proportion code. That allows us to use an aging period of fixed
wallclock time, which gives better proportion estimates given the hugely
varying throughput of different devices.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
-rw-r--r--include/linux/backing-dev.h4
-rw-r--r--mm/backing-dev.c6
-rw-r--r--mm/page-writeback.c103
3 files changed, 69 insertions(+), 44 deletions(-)
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index b1038bd686ac..489de625cd25 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -10,7 +10,7 @@
10 10
11#include <linux/percpu_counter.h> 11#include <linux/percpu_counter.h>
12#include <linux/log2.h> 12#include <linux/log2.h>
13#include <linux/proportions.h> 13#include <linux/flex_proportions.h>
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
@@ -89,7 +89,7 @@ struct backing_dev_info {
89 unsigned long dirty_ratelimit; 89 unsigned long dirty_ratelimit;
90 unsigned long balanced_dirty_ratelimit; 90 unsigned long balanced_dirty_ratelimit;
91 91
92 struct prop_local_percpu completions; 92 struct fprop_local_percpu completions;
93 int dirty_exceeded; 93 int dirty_exceeded;
94 94
95 unsigned int min_ratio; 95 unsigned int min_ratio;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index dd8e2aafb07e..3387aea11209 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -677,7 +677,7 @@ int bdi_init(struct backing_dev_info *bdi)
677 677
678 bdi->min_ratio = 0; 678 bdi->min_ratio = 0;
679 bdi->max_ratio = 100; 679 bdi->max_ratio = 100;
680 bdi->max_prop_frac = PROP_FRAC_BASE; 680 bdi->max_prop_frac = FPROP_FRAC_BASE;
681 spin_lock_init(&bdi->wb_lock); 681 spin_lock_init(&bdi->wb_lock);
682 INIT_LIST_HEAD(&bdi->bdi_list); 682 INIT_LIST_HEAD(&bdi->bdi_list);
683 INIT_LIST_HEAD(&bdi->work_list); 683 INIT_LIST_HEAD(&bdi->work_list);
@@ -700,7 +700,7 @@ int bdi_init(struct backing_dev_info *bdi)
700 bdi->write_bandwidth = INIT_BW; 700 bdi->write_bandwidth = INIT_BW;
701 bdi->avg_write_bandwidth = INIT_BW; 701 bdi->avg_write_bandwidth = INIT_BW;
702 702
703 err = prop_local_init_percpu(&bdi->completions); 703 err = fprop_local_init_percpu(&bdi->completions);
704 704
705 if (err) { 705 if (err) {
706err: 706err:
@@ -744,7 +744,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
744 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 744 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
745 percpu_counter_destroy(&bdi->bdi_stat[i]); 745 percpu_counter_destroy(&bdi->bdi_stat[i]);
746 746
747 prop_local_destroy_percpu(&bdi->completions); 747 fprop_local_destroy_percpu(&bdi->completions);
748} 748}
749EXPORT_SYMBOL(bdi_destroy); 749EXPORT_SYMBOL(bdi_destroy);
750 750
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 93d8d2f7108c..ec14419e53b5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */ 35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <linux/timer.h>
37#include <trace/events/writeback.h> 38#include <trace/events/writeback.h>
38 39
39/* 40/*
@@ -135,7 +136,20 @@ unsigned long global_dirty_limit;
135 * measured in page writeback completions. 136 * measured in page writeback completions.
136 * 137 *
137 */ 138 */
138static struct prop_descriptor vm_completions; 139static struct fprop_global writeout_completions;
140
141static void writeout_period(unsigned long t);
142/* Timer for aging of writeout_completions */
143static struct timer_list writeout_period_timer =
144 TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
145static unsigned long writeout_period_time = 0;
146
147/*
148 * Length of period for aging writeout fractions of bdis. This is an
149 * arbitrarily chosen number. The longer the period, the slower fractions will
150 * reflect changes in current writeout rate.
151 */
152#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
139 153
140/* 154/*
141 * Work out the current dirty-memory clamping and background writeout 155 * Work out the current dirty-memory clamping and background writeout
@@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone)
322 zone_page_state(zone, NR_WRITEBACK) <= limit; 336 zone_page_state(zone, NR_WRITEBACK) <= limit;
323} 337}
324 338
325/*
326 * couple the period to the dirty_ratio:
327 *
328 * period/2 ~ roundup_pow_of_two(dirty limit)
329 */
330static int calc_period_shift(void)
331{
332 unsigned long dirty_total;
333
334 if (vm_dirty_bytes)
335 dirty_total = vm_dirty_bytes / PAGE_SIZE;
336 else
337 dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
338 100;
339 return 2 + ilog2(dirty_total - 1);
340}
341
342/*
343 * update the period when the dirty threshold changes.
344 */
345static void update_completion_period(void)
346{
347 int shift = calc_period_shift();
348 prop_change_shift(&vm_completions, shift);
349
350 writeback_set_ratelimit();
351}
352
353int dirty_background_ratio_handler(struct ctl_table *table, int write, 339int dirty_background_ratio_handler(struct ctl_table *table, int write,
354 void __user *buffer, size_t *lenp, 340 void __user *buffer, size_t *lenp,
355 loff_t *ppos) 341 loff_t *ppos)
@@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
383 369
384 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 370 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
385 if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 371 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
386 update_completion_period(); 372 writeback_set_ratelimit();
387 vm_dirty_bytes = 0; 373 vm_dirty_bytes = 0;
388 } 374 }
389 return ret; 375 return ret;
@@ -398,12 +384,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
398 384
399 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 385 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
400 if (ret == 0 && write && vm_dirty_bytes != old_bytes) { 386 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
401 update_completion_period(); 387 writeback_set_ratelimit();
402 vm_dirty_ratio = 0; 388 vm_dirty_ratio = 0;
403 } 389 }
404 return ret; 390 return ret;
405} 391}
406 392
393static unsigned long wp_next_time(unsigned long cur_time)
394{
395 cur_time += VM_COMPLETIONS_PERIOD_LEN;
396 /* 0 has a special meaning... */
397 if (!cur_time)
398 return 1;
399 return cur_time;
400}
401
407/* 402/*
408 * Increment the BDI's writeout completion count and the global writeout 403 * Increment the BDI's writeout completion count and the global writeout
409 * completion count. Called from test_clear_page_writeback(). 404 * completion count. Called from test_clear_page_writeback().
@@ -411,8 +406,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
411static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) 406static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
412{ 407{
413 __inc_bdi_stat(bdi, BDI_WRITTEN); 408 __inc_bdi_stat(bdi, BDI_WRITTEN);
414 __prop_inc_percpu_max(&vm_completions, &bdi->completions, 409 __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
415 bdi->max_prop_frac); 410 bdi->max_prop_frac);
411 /* First event after period switching was turned off? */
412 if (!unlikely(writeout_period_time)) {
413 /*
414 * We can race with other __bdi_writeout_inc calls here but
415 * it does not cause any harm since the resulting time when
416 * timer will fire and what is in writeout_period_time will be
417 * roughly the same.
418 */
419 writeout_period_time = wp_next_time(jiffies);
420 mod_timer(&writeout_period_timer, writeout_period_time);
421 }
416} 422}
417 423
418void bdi_writeout_inc(struct backing_dev_info *bdi) 424void bdi_writeout_inc(struct backing_dev_info *bdi)
@@ -431,11 +437,33 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc);
431static void bdi_writeout_fraction(struct backing_dev_info *bdi, 437static void bdi_writeout_fraction(struct backing_dev_info *bdi,
432 long *numerator, long *denominator) 438 long *numerator, long *denominator)
433{ 439{
434 prop_fraction_percpu(&vm_completions, &bdi->completions, 440 fprop_fraction_percpu(&writeout_completions, &bdi->completions,
435 numerator, denominator); 441 numerator, denominator);
436} 442}
437 443
438/* 444/*
445 * On idle system, we can be called long after we scheduled because we use
446 * deferred timers so count with missed periods.
447 */
448static void writeout_period(unsigned long t)
449{
450 int miss_periods = (jiffies - writeout_period_time) /
451 VM_COMPLETIONS_PERIOD_LEN;
452
453 if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
454 writeout_period_time = wp_next_time(writeout_period_time +
455 miss_periods * VM_COMPLETIONS_PERIOD_LEN);
456 mod_timer(&writeout_period_timer, writeout_period_time);
457 } else {
458 /*
459 * Aging has zeroed all fractions. Stop wasting CPU on period
460 * updates.
461 */
462 writeout_period_time = 0;
463 }
464}
465
466/*
439 * bdi_min_ratio keeps the sum of the minimum dirty shares of all 467 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
440 * registered backing devices, which, for obvious reasons, can not 468 * registered backing devices, which, for obvious reasons, can not
441 * exceed 100%. 469 * exceed 100%.
@@ -475,7 +503,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
475 ret = -EINVAL; 503 ret = -EINVAL;
476 } else { 504 } else {
477 bdi->max_ratio = max_ratio; 505 bdi->max_ratio = max_ratio;
478 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; 506 bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
479 } 507 }
480 spin_unlock_bh(&bdi_lock); 508 spin_unlock_bh(&bdi_lock);
481 509
@@ -1606,13 +1634,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
1606 */ 1634 */
1607void __init page_writeback_init(void) 1635void __init page_writeback_init(void)
1608{ 1636{
1609 int shift;
1610
1611 writeback_set_ratelimit(); 1637 writeback_set_ratelimit();
1612 register_cpu_notifier(&ratelimit_nb); 1638 register_cpu_notifier(&ratelimit_nb);
1613 1639
1614 shift = calc_period_shift(); 1640 fprop_global_init(&writeout_completions);
1615 prop_descriptor_init(&vm_completions, shift);
1616} 1641}
1617 1642
1618/** 1643/**