author    Jens Axboe <jens.axboe@oracle.com>  2009-09-09 03:08:54 -0400
committer Jens Axboe <jens.axboe@oracle.com>  2009-09-11 03:20:25 -0400
commit    03ba3782e8dcc5b0e1efe440d33084f066e38cae (patch)
tree      e5a6513b411de16a46199530ec98ef9b7f1efc50 /mm
parent    66f3b8e2e103a0b93b945764d98e9ba46cb926dd (diff)
writeback: switch to per-bdi threads for flushing data
This gets rid of pdflush for bdi writeout and kupdated style cleaning.
pdflush writeout suffers from lack of locality and also requires more
threads to handle the same workload, since it has to work in a
non-blocking fashion against each queue. This also introduces lumpy
behaviour and potential request starvation, since pdflush can be starved
for queue access if others are accessing it.

A sample ffsb workload that does random writes to files is about 8%
faster here on a simple SATA drive during the benchmark phase. File
layout also seems a LOT more smooth in vmstat:

 r  b   swpd   free   buff  cache   si   so   bi     bo   in   cs  us sy id wa
 0  1      0 608848   2652 375372    0    0    0  71024  604   24   1 10 48 42
 0  1      0 549644   2712 433736    0    0    0  60692  505   27   1  8 48 44
 1  0      0 476928   2784 505192    0    0    4  29540  553   24   0  9 53 37
 0  1      0 457972   2808 524008    0    0    0  54876  331   16   0  4 38 58
 0  1      0 366128   2928 614284    0    0    4  92168  710   58   0 13 53 34
 0  1      0 295092   3000 684140    0    0    0  62924  572   23   0  9 53 37
 0  1      0 236592   3064 741704    0    0    4  58256  523   17   0  8 48 44
 0  1      0 165608   3132 811464    0    0    0  57460  560   21   0  8 54 38
 0  1      0 102952   3200 873164    0    0    4  74748  540   29   1 10 48 41
 0  1      0  48604   3252 926472    0    0    0  53248  469   29   0  7 47 45

where vanilla tends to fluctuate a lot in the creation phase:

 r  b   swpd   free   buff  cache   si   so   bi     bo   in   cs  us sy id wa
 1  1      0 678716   5792 303380    0    0    0  74064  565   50   1 11 52 36
 1  0      0 662488   5864 319396    0    0    4    352  302  329   0  2 47 51
 0  1      0 599312   5924 381468    0    0    0  78164  516   55   0  9 51 40
 0  1      0 519952   6008 459516    0    0    4  78156  622   56   1 11 52 37
 1  1      0 436640   6092 541632    0    0    0  82244  622   54   0 11 48 41
 0  1      0 436640   6092 541660    0    0    0      8  152   39   0  0 51 49
 0  1      0 332224   6200 644252    0    0    4 102800  728   46   1 13 49 36
 1  0      0 274492   6260 701056    0    0    4  12328  459   49   0  7 50 43
 0  1      0 211220   6324 763356    0    0    0 106940  515   37   1 10 51 39
 1  0      0 160412   6376 813468    0    0    0   8224  415   43   0  6 49 45
 1  1      0  85980   6452 886556    0    0    4 113516  575   39   1 11 54 34
 0  2      0  85968   6452 886620    0    0    0   1640  158  211   0  0 46 54

A 10 disk test with btrfs performs 26% faster with per-bdi flushing. An
SSD based writeback test on XFS performs over 20% better as well, with
the throughput being very stable around 1GB/sec, where pdflush only
manages 750MB/sec and fluctuates wildly while doing so. Random buffered
writes to many files behave a lot better as well, as do random mmap'ed
writes.

A separate thread is added to sync the super blocks. In the long term,
adding sync_supers_bdi() functionality could get rid of this thread
again.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
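The division of labour described above - one long-lived forker watching
every device and spawning a dedicated flusher thread for any device with
dirty data - is easy to model outside the kernel. Below is a minimal
userspace sketch in plain C with pthreads; every name (fake_bdi,
flusher_fn, forker_fn) is a hypothetical stand-in that only mirrors the
shape of bdi_forker_task() spawning bdi_start_fn() threads in the diff
that follows, not kernel API:

/*
 * Minimal userspace model of the forker/flusher split; hypothetical
 * demo code, not kernel code. Compile with: cc -pthread model.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

struct fake_bdi {
        const char *name;
        atomic_bool has_dirty_io;  /* stands in for bdi_has_dirty_io() */
        atomic_bool has_thread;    /* stands in for bdi->wb.task != NULL */
};

/* Per-device flusher: drain this device's dirty data, then exit. */
static void *flusher_fn(void *arg)
{
        struct fake_bdi *bdi = arg;

        printf("flush-%s: writing back\n", bdi->name);
        atomic_store(&bdi->has_dirty_io, false);
        atomic_store(&bdi->has_thread, false);
        return NULL;
}

/*
 * Forker: periodically scan all devices and spawn a flusher for any
 * dirty one that has no thread yet (cf. bdi_forker_task() below).
 */
static void *forker_fn(void *arg)
{
        struct fake_bdi *bdis = arg;

        for (int round = 0; round < 5; round++) {
                for (int i = 0; i < 2; i++) {
                        struct fake_bdi *bdi = &bdis[i];
                        pthread_t t;

                        if (atomic_load(&bdi->has_thread) ||
                            !atomic_load(&bdi->has_dirty_io))
                                continue;
                        atomic_store(&bdi->has_thread, true);
                        pthread_create(&t, NULL, flusher_fn, bdi);
                        pthread_detach(t);
                }
                usleep(100 * 1000);  /* periodic wakeup, like the kernel's interval */
        }
        return NULL;
}

int main(void)
{
        struct fake_bdi bdis[2] = {
                { "sda", true, false },
                { "sdb", true, false },
        };
        pthread_t forker;

        pthread_create(&forker, NULL, forker_fn, bdis);
        pthread_join(forker, NULL);
        return 0;
}

The point of the pattern is that a flusher thread exists only while its
device has work: an idle system carries a single watcher instead of a
fixed pdflush pool, and each flusher works against exactly one queue,
which is where the locality win in the numbers above comes from.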
Diffstat (limited to 'mm')
 -rw-r--r--  mm/backing-dev.c     | 341
 -rw-r--r--  mm/page-writeback.c  | 179
 -rw-r--r--  mm/vmscan.c          |   2
 3 files changed, 352 insertions(+), 170 deletions(-)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 6f163e0f0509..7f3fa79f25c0 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,8 +1,11 @@
 
 #include <linux/wait.h>
 #include <linux/backing-dev.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/writeback.h>
@@ -22,8 +25,18 @@ struct backing_dev_info default_backing_dev_info = {
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
 static struct class *bdi_class;
-DEFINE_MUTEX(bdi_lock);
+DEFINE_SPINLOCK(bdi_lock);
 LIST_HEAD(bdi_list);
+LIST_HEAD(bdi_pending_list);
+
+static struct task_struct *sync_supers_tsk;
+static struct timer_list sync_supers_timer;
+
+static int bdi_sync_supers(void *);
+static void sync_supers_timer_fn(unsigned long);
+static void arm_supers_timer(void);
+
+static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
 
 #ifdef CONFIG_DEBUG_FS
 #include <linux/debugfs.h>
@@ -187,6 +200,13 @@ static int __init default_bdi_init(void)
 {
         int err;
 
+        sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
+        BUG_ON(IS_ERR(sync_supers_tsk));
+
+        init_timer(&sync_supers_timer);
+        setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
+        arm_supers_timer();
+
         err = bdi_init(&default_backing_dev_info);
         if (!err)
                 bdi_register(&default_backing_dev_info, NULL, "default");
@@ -195,6 +215,242 @@ static int __init default_bdi_init(void)
 }
 subsys_initcall(default_bdi_init);
 
+static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+{
+        memset(wb, 0, sizeof(*wb));
+
+        wb->bdi = bdi;
+        wb->last_old_flush = jiffies;
+        INIT_LIST_HEAD(&wb->b_dirty);
+        INIT_LIST_HEAD(&wb->b_io);
+        INIT_LIST_HEAD(&wb->b_more_io);
+}
+
+static void bdi_task_init(struct backing_dev_info *bdi,
+                          struct bdi_writeback *wb)
+{
+        struct task_struct *tsk = current;
+
+        spin_lock(&bdi->wb_lock);
+        list_add_tail_rcu(&wb->list, &bdi->wb_list);
+        spin_unlock(&bdi->wb_lock);
+
+        tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
+        set_freezable();
+
+        /*
+         * Our parent may run at a different priority, just set us to normal
+         */
+        set_user_nice(tsk, 0);
+}
+
+static int bdi_start_fn(void *ptr)
+{
+        struct bdi_writeback *wb = ptr;
+        struct backing_dev_info *bdi = wb->bdi;
+        int ret;
+
+        /*
+         * Add us to the active bdi_list
+         */
+        spin_lock(&bdi_lock);
+        list_add(&bdi->bdi_list, &bdi_list);
+        spin_unlock(&bdi_lock);
+
+        bdi_task_init(bdi, wb);
+
+        /*
+         * Clear pending bit and wakeup anybody waiting to tear us down
+         */
+        clear_bit(BDI_pending, &bdi->state);
+        smp_mb__after_clear_bit();
+        wake_up_bit(&bdi->state, BDI_pending);
+
+        ret = bdi_writeback_task(wb);
+
+        /*
+         * Remove us from the list
+         */
+        spin_lock(&bdi->wb_lock);
+        list_del_rcu(&wb->list);
+        spin_unlock(&bdi->wb_lock);
+
+        /*
+         * Flush any work that raced with us exiting. No new work
+         * will be added, since this bdi isn't discoverable anymore.
+         */
+        if (!list_empty(&bdi->work_list))
+                wb_do_writeback(wb, 1);
+
+        wb->task = NULL;
+        return ret;
+}
+
+int bdi_has_dirty_io(struct backing_dev_info *bdi)
+{
+        return wb_has_dirty_io(&bdi->wb);
+}
+
+static void bdi_flush_io(struct backing_dev_info *bdi)
+{
+        struct writeback_control wbc = {
+                .bdi                    = bdi,
+                .sync_mode              = WB_SYNC_NONE,
+                .older_than_this        = NULL,
+                .range_cyclic           = 1,
+                .nr_to_write            = 1024,
+        };
+
+        writeback_inodes_wbc(&wbc);
+}
+
+/*
+ * kupdated() used to do this. We cannot do it from the bdi_forker_task()
+ * or we risk deadlocking on ->s_umount. The longer term solution would be
+ * to implement sync_supers_bdi() or similar and simply do it from the
+ * bdi writeback tasks individually.
+ */
+static int bdi_sync_supers(void *unused)
+{
+        set_user_nice(current, 0);
+
+        while (!kthread_should_stop()) {
+                set_current_state(TASK_INTERRUPTIBLE);
+                schedule();
+
+                /*
+                 * Do this periodically, like kupdated() did before.
+                 */
+                sync_supers();
+        }
+
+        return 0;
+}
+
+static void arm_supers_timer(void)
+{
+        unsigned long next;
+
+        next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
+        mod_timer(&sync_supers_timer, round_jiffies_up(next));
+}
+
+static void sync_supers_timer_fn(unsigned long unused)
+{
+        wake_up_process(sync_supers_tsk);
+        arm_supers_timer();
+}
+
+static int bdi_forker_task(void *ptr)
+{
+        struct bdi_writeback *me = ptr;
+
+        bdi_task_init(me->bdi, me);
+
+        for (;;) {
+                struct backing_dev_info *bdi, *tmp;
+                struct bdi_writeback *wb;
+
+                /*
+                 * Temporary measure, we want to make sure we don't see
+                 * dirty data on the default backing_dev_info
+                 */
+                if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
+                        wb_do_writeback(me, 0);
+
+                spin_lock(&bdi_lock);
+
+                /*
+                 * Check if any existing bdi's have dirty data without
+                 * a thread registered. If so, set that up.
+                 */
+                list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
+                        if (bdi->wb.task)
+                                continue;
+                        if (list_empty(&bdi->work_list) &&
+                            !bdi_has_dirty_io(bdi))
+                                continue;
+
+                        bdi_add_default_flusher_task(bdi);
+                }
+
+                set_current_state(TASK_INTERRUPTIBLE);
+
+                if (list_empty(&bdi_pending_list)) {
+                        unsigned long wait;
+
+                        spin_unlock(&bdi_lock);
+                        wait = msecs_to_jiffies(dirty_writeback_interval * 10);
+                        schedule_timeout(wait);
+                        try_to_freeze();
+                        continue;
+                }
+
+                __set_current_state(TASK_RUNNING);
+
+                /*
+                 * This is our real job - check for pending entries in
+                 * bdi_pending_list, and create the tasks that got added
+                 */
+                bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
+                                 bdi_list);
+                list_del_init(&bdi->bdi_list);
+                spin_unlock(&bdi_lock);
+
+                wb = &bdi->wb;
+                wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
+                                        dev_name(bdi->dev));
+                /*
+                 * If task creation fails, then readd the bdi to
+                 * the pending list and force writeout of the bdi
+                 * from this forker thread. That will free some memory
+                 * and we can try again.
+                 */
+                if (IS_ERR(wb->task)) {
+                        wb->task = NULL;
+
+                        /*
+                         * Add this 'bdi' to the back, so we get
+                         * a chance to flush other bdi's to free
+                         * memory.
+                         */
+                        spin_lock(&bdi_lock);
+                        list_add_tail(&bdi->bdi_list, &bdi_pending_list);
+                        spin_unlock(&bdi_lock);
+
+                        bdi_flush_io(bdi);
+                }
+        }
+
+        return 0;
+}
+
+/*
+ * Add the default flusher task that gets created for any bdi
+ * that has dirty data pending writeout
+ */
+void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
+{
+        if (!bdi_cap_writeback_dirty(bdi))
+                return;
+
+        /*
+         * Check with the helper whether to proceed adding a task. Will only
+         * abort if two or more simultaneous calls to
+         * bdi_add_default_flusher_task() occur; further additions will block
+         * waiting for previous additions to finish.
+         */
+        if (!test_and_set_bit(BDI_pending, &bdi->state)) {
+                list_move_tail(&bdi->bdi_list, &bdi_pending_list);
+
+                /*
+                 * We are now on the pending list, wake up bdi_forker_task()
+                 * to finish the job and add us back to the active bdi_list
+                 */
+                wake_up_process(default_backing_dev_info.wb.task);
+        }
+}
+
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                 const char *fmt, ...)
 {
@@ -213,13 +469,34 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                 goto exit;
         }
 
-        mutex_lock(&bdi_lock);
+        spin_lock(&bdi_lock);
         list_add_tail(&bdi->bdi_list, &bdi_list);
-        mutex_unlock(&bdi_lock);
+        spin_unlock(&bdi_lock);
 
         bdi->dev = dev;
-        bdi_debug_register(bdi, dev_name(dev));
 
+        /*
+         * Just start the forker thread for our default backing_dev_info,
+         * and add other bdi's to the list. They will get a thread created
+         * on-demand when they need it.
+         */
+        if (bdi_cap_flush_forker(bdi)) {
+                struct bdi_writeback *wb = &bdi->wb;
+
+                wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
+                                        dev_name(dev));
+                if (IS_ERR(wb->task)) {
+                        wb->task = NULL;
+                        ret = -ENOMEM;
+
+                        spin_lock(&bdi_lock);
+                        list_del(&bdi->bdi_list);
+                        spin_unlock(&bdi_lock);
+                        goto exit;
+                }
+        }
+
+        bdi_debug_register(bdi, dev_name(dev));
 exit:
         return ret;
 }
@@ -231,17 +508,42 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
 }
 EXPORT_SYMBOL(bdi_register_dev);
 
-static void bdi_remove_from_list(struct backing_dev_info *bdi)
+/*
+ * Remove bdi from the global list and shutdown any threads we have running
+ */
+static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 {
-        mutex_lock(&bdi_lock);
+        struct bdi_writeback *wb;
+
+        if (!bdi_cap_writeback_dirty(bdi))
+                return;
+
+        /*
+         * If setup is pending, wait for that to complete first
+         */
+        wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
+                        TASK_UNINTERRUPTIBLE);
+
+        /*
+         * Make sure nobody finds us on the bdi_list anymore
+         */
+        spin_lock(&bdi_lock);
         list_del(&bdi->bdi_list);
-        mutex_unlock(&bdi_lock);
+        spin_unlock(&bdi_lock);
+
+        /*
+         * Finally, kill the kernel threads. We don't need to be RCU
+         * safe anymore, since the bdi is gone from visibility.
+         */
+        list_for_each_entry(wb, &bdi->wb_list, list)
+                kthread_stop(wb->task);
 }
 
 void bdi_unregister(struct backing_dev_info *bdi)
 {
         if (bdi->dev) {
-                bdi_remove_from_list(bdi);
+                if (!bdi_cap_flush_forker(bdi))
+                        bdi_wb_shutdown(bdi);
                 bdi_debug_unregister(bdi);
                 device_unregister(bdi->dev);
                 bdi->dev = NULL;
@@ -251,18 +553,25 @@ EXPORT_SYMBOL(bdi_unregister);
 
 int bdi_init(struct backing_dev_info *bdi)
 {
-        int i;
-        int err;
+        int i, err;
 
         bdi->dev = NULL;
 
         bdi->min_ratio = 0;
         bdi->max_ratio = 100;
         bdi->max_prop_frac = PROP_FRAC_BASE;
+        spin_lock_init(&bdi->wb_lock);
         INIT_LIST_HEAD(&bdi->bdi_list);
-        INIT_LIST_HEAD(&bdi->b_io);
-        INIT_LIST_HEAD(&bdi->b_dirty);
-        INIT_LIST_HEAD(&bdi->b_more_io);
+        INIT_LIST_HEAD(&bdi->wb_list);
+        INIT_LIST_HEAD(&bdi->work_list);
+
+        bdi_wb_init(&bdi->wb, bdi);
+
+        /*
+         * Just one thread support for now, hard code mask and count
+         */
+        bdi->wb_mask = 1;
+        bdi->wb_cnt = 1;
 
         for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
                 err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -277,8 +586,6 @@ int bdi_init(struct backing_dev_info *bdi)
 err:
         while (i--)
                 percpu_counter_destroy(&bdi->bdi_stat[i]);
-
-                bdi_remove_from_list(bdi);
         }
 
         return err;
@@ -289,9 +596,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
 {
         int i;
 
-        WARN_ON(!list_empty(&bdi->b_dirty));
-        WARN_ON(!list_empty(&bdi->b_io));
-        WARN_ON(!list_empty(&bdi->b_more_io));
+        WARN_ON(bdi_has_dirty_io(bdi));
 
         bdi_unregister(bdi);
 
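The subtlest piece of the new backing-dev.c code above is the BDI_pending
handshake: bdi_add_default_flusher_task() claims a bdi with
test_and_set_bit() so racing callers queue it only once, bdi_start_fn()
clears the bit and calls wake_up_bit() once the thread is running, and
bdi_wb_shutdown() waits on the bit so teardown never races with setup. A
rough userspace analog follows, with C11 atomics and a condition variable
standing in for the kernel's bit-wait machinery; all names are
hypothetical:

/*
 * Hypothetical userspace analog of the BDI_pending handshake;
 * not kernel code. Compile with: cc -pthread handshake.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool pending;     /* stands in for the BDI_pending bit */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cleared = PTHREAD_COND_INITIALIZER;

/*
 * Caller side: claim the device; only the first of several racing
 * callers queues it (cf. bdi_add_default_flusher_task()).
 */
static bool try_claim(void)
{
        return !atomic_exchange(&pending, true);
}

/*
 * Forker side: setup finished, clear the flag and wake any waiter
 * (cf. clear_bit() + wake_up_bit() in bdi_start_fn()).
 */
static void setup_done(void)
{
        pthread_mutex_lock(&lock);
        atomic_store(&pending, false);
        pthread_cond_broadcast(&cleared);
        pthread_mutex_unlock(&lock);
}

/*
 * Teardown side: wait until any in-flight setup has finished
 * (cf. wait_on_bit(BDI_pending) in bdi_wb_shutdown()).
 */
static void wait_for_setup(void)
{
        pthread_mutex_lock(&lock);
        while (atomic_load(&pending))
                pthread_cond_wait(&cleared, &lock);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        if (try_claim()) {
                /* ... would spawn the flusher thread here ... */
                setup_done();
        }
        wait_for_setup();       /* returns once setup_done() has run */
        return 0;
}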
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f8341b6019bf..25e7770309b8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,15 +36,6 @@
 #include <linux/pagevec.h>
 
 /*
- * The maximum number of pages to writeout in a single bdflush/kupdate
- * operation. We do this so we don't hold I_SYNC against an inode for
- * enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode. Also, the code reevaluates
- * the dirty each time it has written this many pages.
- */
-#define MAX_WRITEBACK_PAGES        1024
-
-/*
  * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
  * will look to see if it needs to force writeback or throttling.
  */
@@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode);
 /* End of sysctl-exported parameters */
 
 
-static void background_writeout(unsigned long _min_pages);
-
 /*
  * Scale the writeback cache size proportional to the relative writeout speeds.
  *
@@ -326,7 +315,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
 {
         int ret = 0;
 
-        mutex_lock(&bdi_lock);
+        spin_lock(&bdi_lock);
         if (min_ratio > bdi->max_ratio) {
                 ret = -EINVAL;
         } else {
@@ -338,7 +327,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
                         ret = -EINVAL;
                 }
         }
-        mutex_unlock(&bdi_lock);
+        spin_unlock(&bdi_lock);
 
         return ret;
 }
@@ -350,14 +339,14 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
         if (max_ratio > 100)
                 return -EINVAL;
 
-        mutex_lock(&bdi_lock);
+        spin_lock(&bdi_lock);
         if (bdi->min_ratio > max_ratio) {
                 ret = -EINVAL;
         } else {
                 bdi->max_ratio = max_ratio;
                 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
         }
-        mutex_unlock(&bdi_lock);
+        spin_unlock(&bdi_lock);
 
         return ret;
 }
@@ -543,7 +532,7 @@ static void balance_dirty_pages(struct address_space *mapping)
                  * up.
                  */
                 if (bdi_nr_reclaimable > bdi_thresh) {
-                        writeback_inodes(&wbc);
+                        writeback_inodes_wbc(&wbc);
                         pages_written += write_chunk - wbc.nr_to_write;
                         get_dirty_limits(&background_thresh, &dirty_thresh,
                                        &bdi_thresh, bdi);
@@ -572,7 +561,7 @@ static void balance_dirty_pages(struct address_space *mapping)
                 if (pages_written >= write_chunk)
                         break;          /* We've done our duty */
 
-                congestion_wait(BLK_RW_ASYNC, HZ/10);
+                schedule_timeout(1);
         }
 
         if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -591,10 +580,18 @@ static void balance_dirty_pages(struct address_space *mapping)
          * background_thresh, to keep the amount of dirty memory low.
          */
         if ((laptop_mode && pages_written) ||
-            (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
-                              + global_page_state(NR_UNSTABLE_NFS)
-                              > background_thresh)))
-                pdflush_operation(background_writeout, 0);
+            (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
+                               + global_page_state(NR_UNSTABLE_NFS))
+                               > background_thresh))) {
+                struct writeback_control wbc = {
+                        .bdi            = bdi,
+                        .sync_mode      = WB_SYNC_NONE,
+                        .nr_to_write    = nr_writeback,
+                };
+
+
+                bdi_start_writeback(&wbc);
+        }
 }
 
 void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -678,153 +675,35 @@ void throttle_vm_writeout(gfp_t gfp_mask)
         }
 }
 
-/*
- * writeback at least _min_pages, and keep writing until the amount of dirty
- * memory is less than the background threshold, or until we're all clean.
- */
-static void background_writeout(unsigned long _min_pages)
-{
-        long min_pages = _min_pages;
-        struct writeback_control wbc = {
-                .bdi            = NULL,
-                .sync_mode      = WB_SYNC_NONE,
-                .older_than_this = NULL,
-                .nr_to_write    = 0,
-                .nonblocking    = 1,
-                .range_cyclic   = 1,
-        };
-
-        for ( ; ; ) {
-                unsigned long background_thresh;
-                unsigned long dirty_thresh;
-
-                get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
-                if (global_page_state(NR_FILE_DIRTY) +
-                        global_page_state(NR_UNSTABLE_NFS) < background_thresh
-                                && min_pages <= 0)
-                        break;
-                wbc.more_io = 0;
-                wbc.encountered_congestion = 0;
-                wbc.nr_to_write = MAX_WRITEBACK_PAGES;
-                wbc.pages_skipped = 0;
-                writeback_inodes(&wbc);
-                min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-                if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
-                        /* Wrote less than expected */
-                        if (wbc.encountered_congestion || wbc.more_io)
-                                congestion_wait(BLK_RW_ASYNC, HZ/10);
-                        else
-                                break;
-                }
-        }
-}
-
-/*
- * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
- * -1 if all pdflush threads were busy.
- */
-int wakeup_pdflush(long nr_pages)
-{
-        if (nr_pages == 0)
-                nr_pages = global_page_state(NR_FILE_DIRTY) +
-                                global_page_state(NR_UNSTABLE_NFS);
-        return pdflush_operation(background_writeout, nr_pages);
-}
-
-static void wb_timer_fn(unsigned long unused);
 static void laptop_timer_fn(unsigned long unused);
 
-static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
 static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
 
 /*
- * Periodic writeback of "old" data.
- *
- * Define "old": the first time one of an inode's pages is dirtied, we mark the
- * dirtying-time in the inode's address_space. So this periodic writeback code
- * just walks the superblock inode list, writing back any inodes which are
- * older than a specific point in time.
- *
- * Try to run once per dirty_writeback_interval. But if a writeback event
- * takes longer than a dirty_writeback_interval interval, then leave a
- * one-second gap.
- *
- * older_than_this takes precedence over nr_to_write. So we'll only write back
- * all dirty pages if they are all attached to "old" mappings.
- */
-static void wb_kupdate(unsigned long arg)
-{
-        unsigned long oldest_jif;
-        unsigned long start_jif;
-        unsigned long next_jif;
-        long nr_to_write;
-        struct writeback_control wbc = {
-                .bdi            = NULL,
-                .sync_mode      = WB_SYNC_NONE,
-                .older_than_this = &oldest_jif,
-                .nr_to_write    = 0,
-                .nonblocking    = 1,
-                .for_kupdate    = 1,
-                .range_cyclic   = 1,
-        };
-
-        sync_supers();
-
-        oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
-        start_jif = jiffies;
-        next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
-        nr_to_write = global_page_state(NR_FILE_DIRTY) +
-                        global_page_state(NR_UNSTABLE_NFS) +
-                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
-        while (nr_to_write > 0) {
-                wbc.more_io = 0;
-                wbc.encountered_congestion = 0;
-                wbc.nr_to_write = MAX_WRITEBACK_PAGES;
-                writeback_inodes(&wbc);
-                if (wbc.nr_to_write > 0) {
-                        if (wbc.encountered_congestion || wbc.more_io)
-                                congestion_wait(BLK_RW_ASYNC, HZ/10);
-                        else
-                                break;  /* All the old data is written */
-                }
-                nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-        }
-        if (time_before(next_jif, jiffies + HZ))
-                next_jif = jiffies + HZ;
-        if (dirty_writeback_interval)
-                mod_timer(&wb_timer, next_jif);
-}
-
-/*
  * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
  */
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
         struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
         proc_dointvec(table, write, file, buffer, length, ppos);
-        if (dirty_writeback_interval)
-                mod_timer(&wb_timer, jiffies +
-                        msecs_to_jiffies(dirty_writeback_interval * 10));
-        else
-                del_timer(&wb_timer);
         return 0;
 }
 
-static void wb_timer_fn(unsigned long unused)
-{
-        if (pdflush_operation(wb_kupdate, 0) < 0)
-                mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
-}
-
-static void laptop_flush(unsigned long unused)
+static void do_laptop_sync(struct work_struct *work)
 {
-        sys_sync();
+        wakeup_flusher_threads(0);
+        kfree(work);
 }
 
 static void laptop_timer_fn(unsigned long unused)
 {
-        pdflush_operation(laptop_flush, 0);
+        struct work_struct *work;
+
+        work = kmalloc(sizeof(*work), GFP_ATOMIC);
+        if (work) {
+                INIT_WORK(work, do_laptop_sync);
+                schedule_work(work);
+        }
 }
 
 /*
@@ -907,8 +786,6 @@ void __init page_writeback_init(void)
 {
         int shift;
 
-        mod_timer(&wb_timer,
-                  jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
         writeback_set_ratelimit();
         register_cpu_notifier(&ratelimit_nb);
 
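One detail worth noting in the page-writeback.c changes above:
laptop_timer_fn() runs in timer (atomic) context, so it cannot block; it
allocates a work item with GFP_ATOMIC, defers the actual sync to process
context via schedule_work(), and the handler frees the item itself. A
small userspace model of that allocate-in-caller, free-in-handler
pattern, with all names hypothetical:

/*
 * Hypothetical model of the laptop_timer_fn()/do_laptop_sync() pattern;
 * not kernel code.
 */
#include <stdio.h>
#include <stdlib.h>

struct work {
        void (*fn)(struct work *);
};

/* Handler frees its own work item, mirroring kfree(work) in do_laptop_sync(). */
static void do_sync(struct work *w)
{
        puts("sync all devices");
        free(w);
}

/* Atomic context: no blocking, so a failed allocation is simply dropped
 * rather than retried (cf. the GFP_ATOMIC kmalloc() + NULL check). */
static void timer_fired(void)
{
        struct work *w = malloc(sizeof(*w));

        if (w) {
                w->fn = do_sync;
                w->fn(w);       /* the kernel would schedule_work() instead */
        }
}

int main(void)
{
        timer_fired();
        return 0;
}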
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 94e86dd6954c..ba8228e0a806 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1720,7 +1720,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
          */
         if (total_scanned > sc->swap_cluster_max +
                                 sc->swap_cluster_max / 2) {
-                wakeup_pdflush(laptop_mode ? 0 : total_scanned);
+                wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
                 sc->may_writepage = 1;
         }
 