author		Linus Torvalds <torvalds@linux-foundation.org>	2009-09-11 12:17:05 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-09-11 12:17:05 -0400
commit		a12e4d304ce701844c639541d90df86e165d03f9 (patch)
tree		6ad7314b63a3303d9aa36f1c7eeb68abf64d3592 /mm
parent		89af571ca633ada14d17746519a179553a732d31 (diff)
parent		500b067c5e6ceea49cf280a02597b1169320e08c (diff)
Merge branch 'writeback' of git://git.kernel.dk/linux-2.6-block
* 'writeback' of git://git.kernel.dk/linux-2.6-block:
  writeback: check for registered bdi in flusher add and inode dirty
  writeback: add name to backing_dev_info
  writeback: add some debug inode list counters to bdi stats
  writeback: get rid of pdflush completely
  writeback: switch to per-bdi threads for flushing data
  writeback: move dirty inodes from super_block to backing_dev_info
  writeback: get rid of generic_sync_sb_inodes() export
Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile         |   2
-rw-r--r--  mm/backing-dev.c    | 381
-rw-r--r--  mm/page-writeback.c | 182
-rw-r--r--  mm/pdflush.c        | 269
-rw-r--r--  mm/swap_state.c     |   1
-rw-r--r--  mm/vmscan.c         |   2
6 files changed, 405 insertions(+), 432 deletions(-)
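For context before the hunks, the visible effect of this series on code that declares its own backing_dev_info is small: every bdi now carries a .name (see the default_backing_dev_info and mm/swap_state.c hunks below), and dirty data is written back by per-bdi "flush-<device>" kthreads created on demand instead of the global pdflush pool. A minimal sketch of a declaration after this merge, assuming a hypothetical driver-private bdi; the example_bdi identifier and the chosen capability flags are illustrative, not taken from this patch:

```c
#include <linux/backing-dev.h>
#include <linux/mm.h>

/*
 * Hypothetical driver-private bdi after this merge: the new .name field
 * identifies it to the per-bdi flusher machinery and the debugfs stats.
 */
static struct backing_dev_info example_bdi = {
	.name		= "example",
	.ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
	.capabilities	= BDI_CAP_MAP_COPY,
};
```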
diff --git a/mm/Makefile b/mm/Makefile
index 5e0bd6426693..147a7a7873c4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
 			   vmalloc.o
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
-			   maccess.o page_alloc.o page-writeback.o pdflush.o \
+			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 			   page_isolation.o mm_init.o $(mmu-y)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c86edd244294..d3ca0dac1111 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,8 +1,11 @@
 
 #include <linux/wait.h>
 #include <linux/backing-dev.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/writeback.h>
@@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 EXPORT_SYMBOL(default_unplug_io_fn);
 
 struct backing_dev_info default_backing_dev_info = {
+	.name		= "default",
 	.ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
 	.state		= 0,
 	.capabilities	= BDI_CAP_MAP_COPY,
@@ -22,6 +26,18 @@ struct backing_dev_info default_backing_dev_info = {
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
 static struct class *bdi_class;
+DEFINE_SPINLOCK(bdi_lock);
+LIST_HEAD(bdi_list);
+LIST_HEAD(bdi_pending_list);
+
+static struct task_struct *sync_supers_tsk;
+static struct timer_list sync_supers_timer;
+
+static int bdi_sync_supers(void *);
+static void sync_supers_timer_fn(unsigned long);
+static void arm_supers_timer(void);
+
+static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
 
 #ifdef CONFIG_DEBUG_FS
 #include <linux/debugfs.h>
@@ -37,9 +53,29 @@ static void bdi_debug_init(void)
 static int bdi_debug_stats_show(struct seq_file *m, void *v)
 {
 	struct backing_dev_info *bdi = m->private;
+	struct bdi_writeback *wb;
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
+	unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
+	struct inode *inode;
+
+	/*
+	 * inode lock is enough here, the bdi->wb_list is protected by
+	 * RCU on the reader side
+	 */
+	nr_wb = nr_dirty = nr_io = nr_more_io = 0;
+	spin_lock(&inode_lock);
+	list_for_each_entry(wb, &bdi->wb_list, list) {
+		nr_wb++;
+		list_for_each_entry(inode, &wb->b_dirty, i_list)
+			nr_dirty++;
+		list_for_each_entry(inode, &wb->b_io, i_list)
+			nr_io++;
+		list_for_each_entry(inode, &wb->b_more_io, i_list)
+			nr_more_io++;
+	}
+	spin_unlock(&inode_lock);
 
 	get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
 
@@ -49,12 +85,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   "BdiReclaimable: %8lu kB\n"
 		   "BdiDirtyThresh: %8lu kB\n"
 		   "DirtyThresh: %8lu kB\n"
-		   "BackgroundThresh: %8lu kB\n",
+		   "BackgroundThresh: %8lu kB\n"
+		   "WriteBack threads:%8lu\n"
+		   "b_dirty: %8lu\n"
+		   "b_io: %8lu\n"
+		   "b_more_io: %8lu\n"
+		   "bdi_list: %8u\n"
+		   "state: %8lx\n"
+		   "wb_mask: %8lx\n"
+		   "wb_list: %8u\n"
+		   "wb_cnt: %8u\n",
 		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
 		   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
-		   K(bdi_thresh),
-		   K(dirty_thresh),
-		   K(background_thresh));
+		   K(bdi_thresh), K(dirty_thresh),
+		   K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
+		   !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
+		   !list_empty(&bdi->wb_list), bdi->wb_cnt);
 #undef K
 
 	return 0;
@@ -185,6 +231,13 @@ static int __init default_bdi_init(void)
 {
 	int err;
 
+	sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
+	BUG_ON(IS_ERR(sync_supers_tsk));
+
+	init_timer(&sync_supers_timer);
+	setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
+	arm_supers_timer();
+
 	err = bdi_init(&default_backing_dev_info);
 	if (!err)
 		bdi_register(&default_backing_dev_info, NULL, "default");
@@ -193,6 +246,248 @@ static int __init default_bdi_init(void)
 }
 subsys_initcall(default_bdi_init);
 
+static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+{
+	memset(wb, 0, sizeof(*wb));
+
+	wb->bdi = bdi;
+	wb->last_old_flush = jiffies;
+	INIT_LIST_HEAD(&wb->b_dirty);
+	INIT_LIST_HEAD(&wb->b_io);
+	INIT_LIST_HEAD(&wb->b_more_io);
+}
+
+static void bdi_task_init(struct backing_dev_info *bdi,
+			  struct bdi_writeback *wb)
+{
+	struct task_struct *tsk = current;
+
+	spin_lock(&bdi->wb_lock);
+	list_add_tail_rcu(&wb->list, &bdi->wb_list);
+	spin_unlock(&bdi->wb_lock);
+
+	tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
+	set_freezable();
+
+	/*
+	 * Our parent may run at a different priority, just set us to normal
+	 */
+	set_user_nice(tsk, 0);
+}
+
+static int bdi_start_fn(void *ptr)
+{
+	struct bdi_writeback *wb = ptr;
+	struct backing_dev_info *bdi = wb->bdi;
+	int ret;
+
+	/*
+	 * Add us to the active bdi_list
+	 */
+	spin_lock(&bdi_lock);
+	list_add(&bdi->bdi_list, &bdi_list);
+	spin_unlock(&bdi_lock);
+
+	bdi_task_init(bdi, wb);
+
+	/*
+	 * Clear pending bit and wakeup anybody waiting to tear us down
+	 */
+	clear_bit(BDI_pending, &bdi->state);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&bdi->state, BDI_pending);
+
+	ret = bdi_writeback_task(wb);
+
+	/*
+	 * Remove us from the list
+	 */
+	spin_lock(&bdi->wb_lock);
+	list_del_rcu(&wb->list);
+	spin_unlock(&bdi->wb_lock);
+
+	/*
+	 * Flush any work that raced with us exiting. No new work
+	 * will be added, since this bdi isn't discoverable anymore.
+	 */
+	if (!list_empty(&bdi->work_list))
+		wb_do_writeback(wb, 1);
+
+	wb->task = NULL;
+	return ret;
+}
+
+int bdi_has_dirty_io(struct backing_dev_info *bdi)
+{
+	return wb_has_dirty_io(&bdi->wb);
+}
+
+static void bdi_flush_io(struct backing_dev_info *bdi)
+{
+	struct writeback_control wbc = {
+		.bdi			= bdi,
+		.sync_mode		= WB_SYNC_NONE,
+		.older_than_this	= NULL,
+		.range_cyclic		= 1,
+		.nr_to_write		= 1024,
+	};
+
+	writeback_inodes_wbc(&wbc);
+}
+
+/*
+ * kupdated() used to do this. We cannot do it from the bdi_forker_task()
+ * or we risk deadlocking on ->s_umount. The longer term solution would be
+ * to implement sync_supers_bdi() or similar and simply do it from the
+ * bdi writeback tasks individually.
+ */
+static int bdi_sync_supers(void *unused)
+{
+	set_user_nice(current, 0);
+
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+
+		/*
+		 * Do this periodically, like kupdated() did before.
+		 */
+		sync_supers();
+	}
+
+	return 0;
+}
+
+static void arm_supers_timer(void)
+{
+	unsigned long next;
+
+	next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
+	mod_timer(&sync_supers_timer, round_jiffies_up(next));
+}
+
+static void sync_supers_timer_fn(unsigned long unused)
+{
+	wake_up_process(sync_supers_tsk);
+	arm_supers_timer();
+}
+
+static int bdi_forker_task(void *ptr)
+{
+	struct bdi_writeback *me = ptr;
+
+	bdi_task_init(me->bdi, me);
+
+	for (;;) {
+		struct backing_dev_info *bdi, *tmp;
+		struct bdi_writeback *wb;
+
+		/*
+		 * Temporary measure, we want to make sure we don't see
+		 * dirty data on the default backing_dev_info
+		 */
+		if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
+			wb_do_writeback(me, 0);
+
+		spin_lock(&bdi_lock);
+
+		/*
+		 * Check if any existing bdi's have dirty data without
+		 * a thread registered. If so, set that up.
+		 */
+		list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
+			if (bdi->wb.task)
+				continue;
+			if (list_empty(&bdi->work_list) &&
+			    !bdi_has_dirty_io(bdi))
+				continue;
+
+			bdi_add_default_flusher_task(bdi);
+		}
+
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (list_empty(&bdi_pending_list)) {
+			unsigned long wait;
+
+			spin_unlock(&bdi_lock);
+			wait = msecs_to_jiffies(dirty_writeback_interval * 10);
+			schedule_timeout(wait);
+			try_to_freeze();
+			continue;
+		}
+
+		__set_current_state(TASK_RUNNING);
+
+		/*
+		 * This is our real job - check for pending entries in
+		 * bdi_pending_list, and create the tasks that got added
+		 */
+		bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
+				 bdi_list);
+		list_del_init(&bdi->bdi_list);
+		spin_unlock(&bdi_lock);
+
+		wb = &bdi->wb;
+		wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
+					dev_name(bdi->dev));
+		/*
+		 * If task creation fails, then readd the bdi to
+		 * the pending list and force writeout of the bdi
+		 * from this forker thread. That will free some memory
+		 * and we can try again.
+		 */
+		if (IS_ERR(wb->task)) {
+			wb->task = NULL;
+
+			/*
+			 * Add this 'bdi' to the back, so we get
+			 * a chance to flush other bdi's to free
+			 * memory.
+			 */
+			spin_lock(&bdi_lock);
+			list_add_tail(&bdi->bdi_list, &bdi_pending_list);
+			spin_unlock(&bdi_lock);
+
+			bdi_flush_io(bdi);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Add the default flusher task that gets created for any bdi
+ * that has dirty data pending writeout
+ */
+void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
+{
+	if (!bdi_cap_writeback_dirty(bdi))
+		return;
+
+	if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
+		printk(KERN_ERR "bdi %p/%s is not registered!\n",
+							bdi, bdi->name);
+		return;
+	}
+
+	/*
+	 * Check with the helper whether to proceed adding a task. Will only
+	 * abort if we two or more simultanous calls to
+	 * bdi_add_default_flusher_task() occured, further additions will block
+	 * waiting for previous additions to finish.
+	 */
+	if (!test_and_set_bit(BDI_pending, &bdi->state)) {
+		list_move_tail(&bdi->bdi_list, &bdi_pending_list);
+
+		/*
+		 * We are now on the pending list, wake up bdi_forker_task()
+		 * to finish the job and add us back to the active bdi_list
+		 */
+		wake_up_process(default_backing_dev_info.wb.task);
+	}
+}
+
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...)
 {
@@ -211,9 +506,35 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		goto exit;
 	}
 
+	spin_lock(&bdi_lock);
+	list_add_tail(&bdi->bdi_list, &bdi_list);
+	spin_unlock(&bdi_lock);
+
 	bdi->dev = dev;
-	bdi_debug_register(bdi, dev_name(dev));
 
+	/*
+	 * Just start the forker thread for our default backing_dev_info,
+	 * and add other bdi's to the list. They will get a thread created
+	 * on-demand when they need it.
+	 */
+	if (bdi_cap_flush_forker(bdi)) {
+		struct bdi_writeback *wb = &bdi->wb;
+
+		wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
+						dev_name(dev));
+		if (IS_ERR(wb->task)) {
+			wb->task = NULL;
+			ret = -ENOMEM;
+
+			spin_lock(&bdi_lock);
+			list_del(&bdi->bdi_list);
+			spin_unlock(&bdi_lock);
+			goto exit;
+		}
+	}
+
+	bdi_debug_register(bdi, dev_name(dev));
+	set_bit(BDI_registered, &bdi->state);
 exit:
 	return ret;
 }
@@ -225,9 +546,42 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
 }
 EXPORT_SYMBOL(bdi_register_dev);
 
+/*
+ * Remove bdi from the global list and shutdown any threads we have running
+ */
+static void bdi_wb_shutdown(struct backing_dev_info *bdi)
+{
+	struct bdi_writeback *wb;
+
+	if (!bdi_cap_writeback_dirty(bdi))
+		return;
+
+	/*
+	 * If setup is pending, wait for that to complete first
+	 */
+	wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
+			TASK_UNINTERRUPTIBLE);
+
+	/*
+	 * Make sure nobody finds us on the bdi_list anymore
+	 */
+	spin_lock(&bdi_lock);
+	list_del(&bdi->bdi_list);
+	spin_unlock(&bdi_lock);
+
+	/*
+	 * Finally, kill the kernel threads. We don't need to be RCU
+	 * safe anymore, since the bdi is gone from visibility.
+	 */
+	list_for_each_entry(wb, &bdi->wb_list, list)
+		kthread_stop(wb->task);
+}
+
 void bdi_unregister(struct backing_dev_info *bdi)
 {
 	if (bdi->dev) {
+		if (!bdi_cap_flush_forker(bdi))
+			bdi_wb_shutdown(bdi);
 		bdi_debug_unregister(bdi);
 		device_unregister(bdi->dev);
 		bdi->dev = NULL;
@@ -237,14 +591,25 @@ EXPORT_SYMBOL(bdi_unregister);
 
 int bdi_init(struct backing_dev_info *bdi)
 {
-	int i;
-	int err;
+	int i, err;
 
 	bdi->dev = NULL;
 
 	bdi->min_ratio = 0;
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = PROP_FRAC_BASE;
+	spin_lock_init(&bdi->wb_lock);
+	INIT_LIST_HEAD(&bdi->bdi_list);
+	INIT_LIST_HEAD(&bdi->wb_list);
+	INIT_LIST_HEAD(&bdi->work_list);
+
+	bdi_wb_init(&bdi->wb, bdi);
+
+	/*
+	 * Just one thread support for now, hard code mask and count
+	 */
+	bdi->wb_mask = 1;
+	bdi->wb_cnt = 1;
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
 		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -269,6 +634,8 @@ void bdi_destroy(struct backing_dev_info *bdi)
 {
 	int i;
 
+	WARN_ON(bdi_has_dirty_io(bdi));
+
 	bdi_unregister(bdi);
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 81627ebcd313..25e7770309b8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,15 +36,6 @@
 #include <linux/pagevec.h>
 
 /*
- * The maximum number of pages to writeout in a single bdflush/kupdate
- * operation. We do this so we don't hold I_SYNC against an inode for
- * enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode. Also, the code reevaluates
- * the dirty each time it has written this many pages.
- */
-#define MAX_WRITEBACK_PAGES	1024
-
-/*
  * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
  * will look to see if it needs to force writeback or throttling.
  */
@@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode);
 /* End of sysctl-exported parameters */
 
 
-static void background_writeout(unsigned long _min_pages);
-
 /*
  * Scale the writeback cache size proportional to the relative writeout speeds.
  *
@@ -320,15 +309,13 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
 /*
  *
  */
-static DEFINE_SPINLOCK(bdi_lock);
 static unsigned int bdi_min_ratio;
 
 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
 {
 	int ret = 0;
-	unsigned long flags;
 
-	spin_lock_irqsave(&bdi_lock, flags);
+	spin_lock(&bdi_lock);
 	if (min_ratio > bdi->max_ratio) {
 		ret = -EINVAL;
 	} else {
@@ -340,27 +327,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
 			ret = -EINVAL;
 		}
 	}
-	spin_unlock_irqrestore(&bdi_lock, flags);
+	spin_unlock(&bdi_lock);
 
 	return ret;
 }
 
 int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
 {
-	unsigned long flags;
 	int ret = 0;
 
 	if (max_ratio > 100)
 		return -EINVAL;
 
-	spin_lock_irqsave(&bdi_lock, flags);
+	spin_lock(&bdi_lock);
 	if (bdi->min_ratio > max_ratio) {
 		ret = -EINVAL;
 	} else {
 		bdi->max_ratio = max_ratio;
 		bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
 	}
-	spin_unlock_irqrestore(&bdi_lock, flags);
+	spin_unlock(&bdi_lock);
 
 	return ret;
 }
@@ -546,7 +532,7 @@ static void balance_dirty_pages(struct address_space *mapping)
 		 * up.
 		 */
 		if (bdi_nr_reclaimable > bdi_thresh) {
-			writeback_inodes(&wbc);
+			writeback_inodes_wbc(&wbc);
 			pages_written += write_chunk - wbc.nr_to_write;
 			get_dirty_limits(&background_thresh, &dirty_thresh,
 				       &bdi_thresh, bdi);
@@ -575,7 +561,7 @@ static void balance_dirty_pages(struct address_space *mapping)
 		if (pages_written >= write_chunk)
 			break;		/* We've done our duty */
 
-		congestion_wait(BLK_RW_ASYNC, HZ/10);
+		schedule_timeout(1);
 	}
 
 	if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -594,10 +580,18 @@ static void balance_dirty_pages(struct address_space *mapping)
 	 * background_thresh, to keep the amount of dirty memory low.
 	 */
 	if ((laptop_mode && pages_written) ||
-	    (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
-			      + global_page_state(NR_UNSTABLE_NFS)
-			      > background_thresh)))
-		pdflush_operation(background_writeout, 0);
+	    (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
+			       + global_page_state(NR_UNSTABLE_NFS))
+					  > background_thresh))) {
+		struct writeback_control wbc = {
+			.bdi		= bdi,
+			.sync_mode	= WB_SYNC_NONE,
+			.nr_to_write	= nr_writeback,
+		};
+
+
+		bdi_start_writeback(&wbc);
+	}
 }
 
 void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -681,153 +675,35 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 	}
 }
 
-/*
- * writeback at least _min_pages, and keep writing until the amount of dirty
- * memory is less than the background threshold, or until we're all clean.
- */
-static void background_writeout(unsigned long _min_pages)
-{
-	long min_pages = _min_pages;
-	struct writeback_control wbc = {
-		.bdi		= NULL,
-		.sync_mode	= WB_SYNC_NONE,
-		.older_than_this = NULL,
-		.nr_to_write	= 0,
-		.nonblocking	= 1,
-		.range_cyclic	= 1,
-	};
-
-	for ( ; ; ) {
-		unsigned long background_thresh;
-		unsigned long dirty_thresh;
-
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
-		if (global_page_state(NR_FILE_DIRTY) +
-			global_page_state(NR_UNSTABLE_NFS) < background_thresh
-				&& min_pages <= 0)
-			break;
-		wbc.more_io = 0;
-		wbc.encountered_congestion = 0;
-		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
-		wbc.pages_skipped = 0;
-		writeback_inodes(&wbc);
-		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
-			/* Wrote less than expected */
-			if (wbc.encountered_congestion || wbc.more_io)
-				congestion_wait(BLK_RW_ASYNC, HZ/10);
-			else
-				break;
-		}
-	}
-}
-
-/*
- * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
- * -1 if all pdflush threads were busy.
- */
-int wakeup_pdflush(long nr_pages)
-{
-	if (nr_pages == 0)
-		nr_pages = global_page_state(NR_FILE_DIRTY) +
-				global_page_state(NR_UNSTABLE_NFS);
-	return pdflush_operation(background_writeout, nr_pages);
-}
-
-static void wb_timer_fn(unsigned long unused);
 static void laptop_timer_fn(unsigned long unused);
 
-static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
 static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
 
 /*
- * Periodic writeback of "old" data.
- *
- * Define "old": the first time one of an inode's pages is dirtied, we mark the
- * dirtying-time in the inode's address_space. So this periodic writeback code
- * just walks the superblock inode list, writing back any inodes which are
- * older than a specific point in time.
- *
- * Try to run once per dirty_writeback_interval. But if a writeback event
- * takes longer than a dirty_writeback_interval interval, then leave a
- * one-second gap.
- *
- * older_than_this takes precedence over nr_to_write. So we'll only write back
- * all dirty pages if they are all attached to "old" mappings.
- */
-static void wb_kupdate(unsigned long arg)
-{
-	unsigned long oldest_jif;
-	unsigned long start_jif;
-	unsigned long next_jif;
-	long nr_to_write;
-	struct writeback_control wbc = {
-		.bdi		= NULL,
-		.sync_mode	= WB_SYNC_NONE,
-		.older_than_this = &oldest_jif,
-		.nr_to_write	= 0,
-		.nonblocking	= 1,
-		.for_kupdate	= 1,
-		.range_cyclic	= 1,
-	};
-
-	sync_supers();
-
-	oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
-	start_jif = jiffies;
-	next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
-	nr_to_write = global_page_state(NR_FILE_DIRTY) +
-			global_page_state(NR_UNSTABLE_NFS) +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-	while (nr_to_write > 0) {
-		wbc.more_io = 0;
-		wbc.encountered_congestion = 0;
-		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
-		writeback_inodes(&wbc);
-		if (wbc.nr_to_write > 0) {
-			if (wbc.encountered_congestion || wbc.more_io)
-				congestion_wait(BLK_RW_ASYNC, HZ/10);
-			else
-				break;	/* All the old data is written */
-		}
-		nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-	}
-	if (time_before(next_jif, jiffies + HZ))
-		next_jif = jiffies + HZ;
-	if (dirty_writeback_interval)
-		mod_timer(&wb_timer, next_jif);
-}
-
-/*
  * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
  */
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec(table, write, file, buffer, length, ppos);
-	if (dirty_writeback_interval)
-		mod_timer(&wb_timer, jiffies +
-			msecs_to_jiffies(dirty_writeback_interval * 10));
-	else
-		del_timer(&wb_timer);
 	return 0;
 }
 
-static void wb_timer_fn(unsigned long unused)
-{
-	if (pdflush_operation(wb_kupdate, 0) < 0)
-		mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
-}
-
-static void laptop_flush(unsigned long unused)
+static void do_laptop_sync(struct work_struct *work)
 {
-	sys_sync();
+	wakeup_flusher_threads(0);
+	kfree(work);
 }
 
 static void laptop_timer_fn(unsigned long unused)
 {
-	pdflush_operation(laptop_flush, 0);
+	struct work_struct *work;
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (work) {
+		INIT_WORK(work, do_laptop_sync);
+		schedule_work(work);
+	}
 }
 
 /*
@@ -910,8 +786,6 @@ void __init page_writeback_init(void)
 {
 	int shift;
 
-	mod_timer(&wb_timer,
-		  jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
 
diff --git a/mm/pdflush.c b/mm/pdflush.c
deleted file mode 100644
index 235ac440c44e..000000000000
--- a/mm/pdflush.c
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * mm/pdflush.c - worker threads for writing back filesystem data
- *
- * Copyright (C) 2002, Linus Torvalds.
- *
- * 09Apr2002	Andrew Morton
- *		Initial version
- * 29Feb2004	kaos@sgi.com
- *		Move worker thread creation to kthread to avoid chewing
- *		up stack space with nested calls to kernel_thread.
- */
-
-#include <linux/sched.h>
-#include <linux/list.h>
-#include <linux/signal.h>
-#include <linux/spinlock.h>
-#include <linux/gfp.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/fs.h>		/* Needed by writeback.h	  */
-#include <linux/writeback.h>	/* Prototypes pdflush_operation() */
-#include <linux/kthread.h>
-#include <linux/cpuset.h>
-#include <linux/freezer.h>
-
-
-/*
- * Minimum and maximum number of pdflush instances
- */
-#define MIN_PDFLUSH_THREADS	2
-#define MAX_PDFLUSH_THREADS	8
-
-static void start_one_pdflush_thread(void);
-
-
-/*
- * The pdflush threads are worker threads for writing back dirty data.
- * Ideally, we'd like one thread per active disk spindle. But the disk
- * topology is very hard to divine at this level. Instead, we take
- * care in various places to prevent more than one pdflush thread from
- * performing writeback against a single filesystem. pdflush threads
- * have the PF_FLUSHER flag set in current->flags to aid in this.
- */
-
-/*
- * All the pdflush threads. Protected by pdflush_lock
- */
-static LIST_HEAD(pdflush_list);
-static DEFINE_SPINLOCK(pdflush_lock);
-
-/*
- * The count of currently-running pdflush threads. Protected
- * by pdflush_lock.
- *
- * Readable by sysctl, but not writable. Published to userspace at
- * /proc/sys/vm/nr_pdflush_threads.
- */
-int nr_pdflush_threads = 0;
-
-/*
- * The time at which the pdflush thread pool last went empty
- */
-static unsigned long last_empty_jifs;
-
-/*
- * The pdflush thread.
- *
- * Thread pool management algorithm:
- *
- * - The minimum and maximum number of pdflush instances are bound
- *   by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
- *
- * - If there have been no idle pdflush instances for 1 second, create
- *   a new one.
- *
- * - If the least-recently-went-to-sleep pdflush thread has been asleep
- *   for more than one second, terminate a thread.
- */
-
-/*
- * A structure for passing work to a pdflush thread. Also for passing
- * state information between pdflush threads. Protected by pdflush_lock.
- */
-struct pdflush_work {
-	struct task_struct *who;	/* The thread */
-	void (*fn)(unsigned long);	/* A callback function */
-	unsigned long arg0;		/* An argument to the callback */
-	struct list_head list;		/* On pdflush_list, when idle */
-	unsigned long when_i_went_to_sleep;
-};
-
-static int __pdflush(struct pdflush_work *my_work)
-{
-	current->flags |= PF_FLUSHER | PF_SWAPWRITE;
-	set_freezable();
-	my_work->fn = NULL;
-	my_work->who = current;
-	INIT_LIST_HEAD(&my_work->list);
-
-	spin_lock_irq(&pdflush_lock);
-	for ( ; ; ) {
-		struct pdflush_work *pdf;
-
-		set_current_state(TASK_INTERRUPTIBLE);
-		list_move(&my_work->list, &pdflush_list);
-		my_work->when_i_went_to_sleep = jiffies;
-		spin_unlock_irq(&pdflush_lock);
-		schedule();
-		try_to_freeze();
-		spin_lock_irq(&pdflush_lock);
-		if (!list_empty(&my_work->list)) {
-			/*
-			 * Someone woke us up, but without removing our control
-			 * structure from the global list. swsusp will do this
-			 * in try_to_freeze()->refrigerator(). Handle it.
-			 */
-			my_work->fn = NULL;
-			continue;
-		}
-		if (my_work->fn == NULL) {
-			printk("pdflush: bogus wakeup\n");
-			continue;
-		}
-		spin_unlock_irq(&pdflush_lock);
-
-		(*my_work->fn)(my_work->arg0);
-
-		spin_lock_irq(&pdflush_lock);
-
-		/*
-		 * Thread creation: For how long have there been zero
-		 * available threads?
-		 *
-		 * To throttle creation, we reset last_empty_jifs.
-		 */
-		if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
-			if (list_empty(&pdflush_list)) {
-				if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
-					last_empty_jifs = jiffies;
-					nr_pdflush_threads++;
-					spin_unlock_irq(&pdflush_lock);
-					start_one_pdflush_thread();
-					spin_lock_irq(&pdflush_lock);
-				}
-			}
-		}
-
-		my_work->fn = NULL;
-
-		/*
-		 * Thread destruction: For how long has the sleepiest
-		 * thread slept?
-		 */
-		if (list_empty(&pdflush_list))
-			continue;
-		if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
-			continue;
-		pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
-		if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
-			/* Limit exit rate */
-			pdf->when_i_went_to_sleep = jiffies;
-			break;					/* exeunt */
-		}
-	}
-	nr_pdflush_threads--;
-	spin_unlock_irq(&pdflush_lock);
-	return 0;
-}
-
-/*
- * Of course, my_work wants to be just a local in __pdflush(). It is
- * separated out in this manner to hopefully prevent the compiler from
- * performing unfortunate optimisations against the auto variables. Because
- * these are visible to other tasks and CPUs. (No problem has actually
- * been observed. This is just paranoia).
- */
-static int pdflush(void *dummy)
-{
-	struct pdflush_work my_work;
-	cpumask_var_t cpus_allowed;
-
-	/*
-	 * Since the caller doesn't even check kthread_run() worked, let's not
-	 * freak out too much if this fails.
-	 */
-	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
-		printk(KERN_WARNING "pdflush failed to allocate cpumask\n");
-		return 0;
-	}
-
-	/*
-	 * pdflush can spend a lot of time doing encryption via dm-crypt. We
-	 * don't want to do that at keventd's priority.
-	 */
-	set_user_nice(current, 0);
-
-	/*
-	 * Some configs put our parent kthread in a limited cpuset,
-	 * which kthread() overrides, forcing cpus_allowed == cpu_all_mask.
-	 * Our needs are more modest - cut back to our cpusets cpus_allowed.
-	 * This is needed as pdflush's are dynamically created and destroyed.
-	 * The boottime pdflush's are easily placed w/o these 2 lines.
-	 */
-	cpuset_cpus_allowed(current, cpus_allowed);
-	set_cpus_allowed_ptr(current, cpus_allowed);
-	free_cpumask_var(cpus_allowed);
-
-	return __pdflush(&my_work);
-}
-
-/*
- * Attempt to wake up a pdflush thread, and get it to do some work for you.
- * Returns zero if it indeed managed to find a worker thread, and passed your
- * payload to it.
- */
-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
-{
-	unsigned long flags;
-	int ret = 0;
-
-	BUG_ON(fn == NULL);	/* Hard to diagnose if it's deferred */
-
-	spin_lock_irqsave(&pdflush_lock, flags);
-	if (list_empty(&pdflush_list)) {
-		ret = -1;
-	} else {
-		struct pdflush_work *pdf;
-
-		pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
-		list_del_init(&pdf->list);
-		if (list_empty(&pdflush_list))
-			last_empty_jifs = jiffies;
-		pdf->fn = fn;
-		pdf->arg0 = arg0;
-		wake_up_process(pdf->who);
-	}
-	spin_unlock_irqrestore(&pdflush_lock, flags);
-
-	return ret;
-}
-
-static void start_one_pdflush_thread(void)
-{
-	struct task_struct *k;
-
-	k = kthread_run(pdflush, NULL, "pdflush");
-	if (unlikely(IS_ERR(k))) {
-		spin_lock_irq(&pdflush_lock);
-		nr_pdflush_threads--;
-		spin_unlock_irq(&pdflush_lock);
-	}
-}
-
-static int __init pdflush_init(void)
-{
-	int i;
-
-	/*
-	 * Pre-set nr_pdflush_threads...  If we fail to create,
-	 * the count will be decremented.
-	 */
-	nr_pdflush_threads = MIN_PDFLUSH_THREADS;
-
-	for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
-		start_one_pdflush_thread();
-	return 0;
-}
-
-module_init(pdflush_init);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 42cd38eba79f..5ae6b8b78c80 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = {
 };
 
 static struct backing_dev_info swap_backing_dev_info = {
+	.name		= "swap",
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
 	.unplug_io_fn	= swap_unplug_io_fn,
 };
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 94e86dd6954c..ba8228e0a806 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1720,7 +1720,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		 */
 		if (total_scanned > sc->swap_cluster_max +
 					sc->swap_cluster_max / 2) {
-			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
+			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
 			sc->may_writepage = 1;
 		}
 
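As a closing note on the API change visible in the last hunk: pdflush_operation() and wakeup_pdflush() no longer exist after this merge, so any remaining caller has to switch to the flusher-thread entry point the same way mm/vmscan.c does above. A hedged sketch of that migration; example_kick_writeback() is an invented wrapper used only for illustration and is not part of this patch:

```c
#include <linux/writeback.h>

/* Illustrative wrapper, not part of this patch. */
static void example_kick_writeback(long nr_pages)
{
	/*
	 * Before this merge the equivalent call was
	 *	wakeup_pdflush(nr_pages);
	 * which handed the work to the global pdflush pool.
	 *
	 * After this merge the per-bdi flusher threads are woken instead;
	 * nr_pages == 0 still means "write back everything dirty".
	 */
	wakeup_flusher_threads(nr_pages);
}
```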