Diffstat (limited to 'mm/page-writeback.c')

-rw-r--r--	mm/page-writeback.c	107
1 file changed, 66 insertions(+), 41 deletions(-)
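The patch converts the writeout-completion accounting from lib/proportions (struct prop_descriptor), whose event-based aging period was derived from the current dirty threshold, to lib/flex_proportions (struct fprop_global), which ages completions over a fixed wall-clock period driven by a deferrable timer.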
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 93d8d2f7108c..e5363f34e025 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h>	/* __set_page_dirty_buffers */
 #include <linux/pagevec.h>
+#include <linux/timer.h>
 #include <trace/events/writeback.h>
 
 /*
@@ -135,7 +136,20 @@ unsigned long global_dirty_limit;
  * measured in page writeback completions.
  *
  */
-static struct prop_descriptor vm_completions;
+static struct fprop_global writeout_completions;
+
+static void writeout_period(unsigned long t);
+/* Timer for aging of writeout_completions */
+static struct timer_list writeout_period_timer =
+		TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
+static unsigned long writeout_period_time = 0;
+
+/*
+ * Length of period for aging writeout fractions of bdis. This is an
+ * arbitrarily chosen number. The longer the period, the slower fractions will
+ * reflect changes in current writeout rate.
+ */
+#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
 
 /*
  * Work out the current dirty-memory clamping and background writeout
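The comment in this hunk states the idea: completions are aged each period, so a bdi's fraction tracks its recent writeout share rather than its all-time total. Below is a minimal userspace model of that aging step; the names loosely mirror the patch, but this is an illustration, not the lib/flex_proportions implementation (which uses percpu counters and a fractional base):

#include <stdio.h>

#define NR_BDI 2

static unsigned long events[NR_BDI];	/* per-bdi writeout completions */
static unsigned long global_events;	/* sum over all bdis */

static void writeout_inc(int bdi)	/* cf. __bdi_writeout_inc() */
{
	events[bdi]++;
	global_events++;
}

static void age_one_period(void)	/* cf. one fprop_new_period() step */
{
	int i;

	for (i = 0; i < NR_BDI; i++)	/* halve everything each period */
		events[i] /= 2;
	global_events /= 2;
}

int main(void)
{
	int i;

	for (i = 0; i < 100; i++)
		writeout_inc(0);	/* bdi 0 busy in period 1 */
	age_one_period();
	for (i = 0; i < 100; i++)
		writeout_inc(1);	/* bdi 1 busy in period 2 */
	/* recent activity dominates: bdi0 50/150, bdi1 100/150 */
	printf("bdi0 %lu/%lu, bdi1 %lu/%lu\n",
	       events[0], global_events, events[1], global_events);
	return 0;
}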
@@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone)
 		zone_page_state(zone, NR_WRITEBACK) <= limit;
 }
 
-/*
- * couple the period to the dirty_ratio:
- *
- *   period/2 ~ roundup_pow_of_two(dirty limit)
- */
-static int calc_period_shift(void)
-{
-	unsigned long dirty_total;
-
-	if (vm_dirty_bytes)
-		dirty_total = vm_dirty_bytes / PAGE_SIZE;
-	else
-		dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
-				100;
-	return 2 + ilog2(dirty_total - 1);
-}
-
-/*
- * update the period when the dirty threshold changes.
- */
-static void update_completion_period(void)
-{
-	int shift = calc_period_shift();
-	prop_change_shift(&vm_completions, shift);
-
-	writeback_set_ratelimit();
-}
-
 int dirty_background_ratio_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
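For reference, the deleted calc_period_shift() tied the aging period to the dirty threshold: with a dirty limit of 262144 pages (1 GiB of 4 KiB pages), shift = 2 + ilog2(262143) = 2 + 17 = 19, giving a period of 2^19 events whose half, 2^18 = 262144, is indeed roundup_pow_of_two(dirty limit) as its comment promised. Because that period was event-based and threshold-dependent, every change to the dirty limits had to rescale it; the replacement ages on a fixed 3*HZ timer, so both helpers can go.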
@@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
 
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
-		update_completion_period();
+		writeback_set_ratelimit();
 		vm_dirty_bytes = 0;
 	}
 	return ret;
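With the period no longer derived from the threshold there is nothing to rescale when the sysctl changes; only writeback_set_ratelimit() still depends on the dirty limits, so this handler and dirty_bytes_handler() below call it directly.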
@@ -398,12 +384,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
 
 	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
-		update_completion_period();
+		writeback_set_ratelimit();
 		vm_dirty_ratio = 0;
 	}
 	return ret;
 }
 
+static unsigned long wp_next_time(unsigned long cur_time)
+{
+	cur_time += VM_COMPLETIONS_PERIOD_LEN;
+	/* 0 has a special meaning... */
+	if (!cur_time)
+		return 1;
+	return cur_time;
+}
+
 /*
  * Increment the BDI's writeout completion count and the global writeout
  * completion count. Called from test_clear_page_writeback().
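wp_next_time() exists because writeout_period_time == 0 is the sentinel for "aging timer not armed" (see __bdi_writeout_inc() below) and jiffies arithmetic wraps. A self-contained userspace illustration of the corner case, assuming HZ == 250 (the kernel value is config-dependent):

#include <assert.h>

#define VM_COMPLETIONS_PERIOD_LEN (3 * 250)	/* 3*HZ with HZ == 250 assumed */

/* 0 means "timer off", so never hand it back as a next expiry time */
static unsigned long wp_next_time(unsigned long cur_time)
{
	cur_time += VM_COMPLETIONS_PERIOD_LEN;
	if (!cur_time)			/* addition wrapped exactly to 0 */
		return 1;		/* nudge off the sentinel */
	return cur_time;
}

int main(void)
{
	/* a jiffies value that wraps to exactly 0 one period later */
	unsigned long about_to_wrap = 0UL - VM_COMPLETIONS_PERIOD_LEN;

	assert(wp_next_time(about_to_wrap) == 1);
	assert(wp_next_time(1000) == 1000 + VM_COMPLETIONS_PERIOD_LEN);
	return 0;
}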
@@ -411,8 +406,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
 static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
 {
 	__inc_bdi_stat(bdi, BDI_WRITTEN);
-	__prop_inc_percpu_max(&vm_completions, &bdi->completions,
-			      bdi->max_prop_frac);
+	__fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
+			       bdi->max_prop_frac);
+	/* First event after period switching was turned off? */
+	if (unlikely(!writeout_period_time)) {
+		/*
+		 * We can race with other __bdi_writeout_inc calls here but
+		 * it does not cause any harm since the resulting time when
+		 * timer will fire and what is in writeout_period_time will be
+		 * roughly the same.
+		 */
+		writeout_period_time = wp_next_time(jiffies);
+		mod_timer(&writeout_period_timer, writeout_period_time);
+	}
 }
 
 void bdi_writeout_inc(struct backing_dev_info *bdi)
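The lazy arming here means the timer only runs while writeback is active: the first completion after the timer has stopped re-arms it. The race the comment describes is benign because concurrent callers compute next-expiry times at most a few jiffies apart and mod_timer() is safe against concurrent use on the same timer, so the worst case is the period boundary shifting slightly.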
@@ -431,11 +437,33 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc);
 static void bdi_writeout_fraction(struct backing_dev_info *bdi,
 				long *numerator, long *denominator)
 {
-	prop_fraction_percpu(&vm_completions, &bdi->completions,
+	fprop_fraction_percpu(&writeout_completions, &bdi->completions,
 				numerator, denominator);
 }
 
 /*
+ * On idle system, we can be called long after we scheduled because we use
+ * deferred timers so count with missed periods.
+ */
+static void writeout_period(unsigned long t)
+{
+	int miss_periods = (jiffies - writeout_period_time) /
+				 VM_COMPLETIONS_PERIOD_LEN;
+
+	if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
+		writeout_period_time = wp_next_time(writeout_period_time +
+				miss_periods * VM_COMPLETIONS_PERIOD_LEN);
+		mod_timer(&writeout_period_timer, writeout_period_time);
+	} else {
+		/*
+		 * Aging has zeroed all fractions. Stop wasting CPU on period
+		 * updates.
+		 */
+		writeout_period_time = 0;
+	}
+}
+
+/*
  * bdi_min_ratio keeps the sum of the minimum dirty shares of all
  * registered backing devices, which, for obvious reasons, can not
  * exceed 100%.
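Because the timer is deferrable, an idle system may run the handler long after the scheduled expiry, so it ages all missed periods in one fprop_new_period() call and re-arms on the original period grid rather than relative to the late firing time. A small worked example, assuming HZ == 1000 and illustrative values:

#include <stdio.h>

#define PERIOD 3000UL		/* stands in for VM_COMPLETIONS_PERIOD_LEN */

int main(void)
{
	unsigned long scheduled = 50000;	/* when the timer was due */
	unsigned long now = scheduled + 10000;	/* handler ran 10 s late */
	unsigned long miss = (now - scheduled) / PERIOD;	/* 3 */

	/* Age miss + 1 = 4 periods in one go, then rearm on the grid. */
	unsigned long next = scheduled + miss * PERIOD + PERIOD;

	printf("missed %lu periods, ageing %lu, next expiry %lu\n",
	       miss, miss + 1, next);	/* missed 3, ageing 4, next 62000 */
	return 0;
}

Keeping the next expiry on the grid (62000, still in the future at now = 60000) avoids the period boundaries drifting every time the handler runs late.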
@@ -475,7 +503,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
 		ret = -EINVAL;
 	} else {
 		bdi->max_ratio = max_ratio;
-		bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
+		bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
 	}
 	spin_unlock_bh(&bdi_lock);
 
@@ -918,7 +946,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
 	 *	bdi->dirty_ratelimit = balanced_dirty_ratelimit;
 	 *
 	 * However to get a more stable dirty_ratelimit, the below elaborated
-	 * code makes use of task_ratelimit to filter out sigular points and
+	 * code makes use of task_ratelimit to filter out singular points and
 	 * limit the step size.
 	 *
 	 * The below code essentially only uses the relative value of
@@ -941,7 +969,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
 	 * feel and care are stable dirty rate and small position error.
 	 *
 	 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
-	 * and filter out the sigular points of balanced_dirty_ratelimit. Which
+	 * and filter out the singular points of balanced_dirty_ratelimit. Which
 	 * keeps jumping around randomly and can even leap far away at times
 	 * due to the small 200ms estimation period of dirty_rate (we want to
 	 * keep that period small to reduce time lags).
@@ -1606,13 +1634,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
  */
 void __init page_writeback_init(void)
 {
-	int shift;
-
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
 
-	shift = calc_period_shift();
-	prop_descriptor_init(&vm_completions, shift);
+	fprop_global_init(&writeout_completions);
 }
 
 /**
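Initialization is correspondingly simpler: fprop_global_init() needs no threshold-derived shift, and per the __bdi_writeout_inc() hunk above the aging timer is armed lazily on the first writeout completion rather than at boot.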
