diff options
Diffstat (limited to 'mm/backing-dev.c')
-rw-r--r-- | mm/backing-dev.c | 381 |
1 files changed, 374 insertions, 7 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c86edd244294..d3ca0dac1111 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -1,8 +1,11 @@ | |||
1 | 1 | ||
2 | #include <linux/wait.h> | 2 | #include <linux/wait.h> |
3 | #include <linux/backing-dev.h> | 3 | #include <linux/backing-dev.h> |
4 | #include <linux/kthread.h> | ||
5 | #include <linux/freezer.h> | ||
4 | #include <linux/fs.h> | 6 | #include <linux/fs.h> |
5 | #include <linux/pagemap.h> | 7 | #include <linux/pagemap.h> |
8 | #include <linux/mm.h> | ||
6 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
7 | #include <linux/module.h> | 10 | #include <linux/module.h> |
8 | #include <linux/writeback.h> | 11 | #include <linux/writeback.h> |
@@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | |||
14 | EXPORT_SYMBOL(default_unplug_io_fn); | 17 | EXPORT_SYMBOL(default_unplug_io_fn); |
15 | 18 | ||
16 | struct backing_dev_info default_backing_dev_info = { | 19 | struct backing_dev_info default_backing_dev_info = { |
20 | .name = "default", | ||
17 | .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, | 21 | .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, |
18 | .state = 0, | 22 | .state = 0, |
19 | .capabilities = BDI_CAP_MAP_COPY, | 23 | .capabilities = BDI_CAP_MAP_COPY, |
@@ -22,6 +26,18 @@ struct backing_dev_info default_backing_dev_info = { | |||
22 | EXPORT_SYMBOL_GPL(default_backing_dev_info); | 26 | EXPORT_SYMBOL_GPL(default_backing_dev_info); |
23 | 27 | ||
24 | static struct class *bdi_class; | 28 | static struct class *bdi_class; |
29 | DEFINE_SPINLOCK(bdi_lock); | ||
30 | LIST_HEAD(bdi_list); | ||
31 | LIST_HEAD(bdi_pending_list); | ||
32 | |||
33 | static struct task_struct *sync_supers_tsk; | ||
34 | static struct timer_list sync_supers_timer; | ||
35 | |||
36 | static int bdi_sync_supers(void *); | ||
37 | static void sync_supers_timer_fn(unsigned long); | ||
38 | static void arm_supers_timer(void); | ||
39 | |||
40 | static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); | ||
25 | 41 | ||
26 | #ifdef CONFIG_DEBUG_FS | 42 | #ifdef CONFIG_DEBUG_FS |
27 | #include <linux/debugfs.h> | 43 | #include <linux/debugfs.h> |
@@ -37,9 +53,29 @@ static void bdi_debug_init(void) | |||
37 | static int bdi_debug_stats_show(struct seq_file *m, void *v) | 53 | static int bdi_debug_stats_show(struct seq_file *m, void *v) |
38 | { | 54 | { |
39 | struct backing_dev_info *bdi = m->private; | 55 | struct backing_dev_info *bdi = m->private; |
56 | struct bdi_writeback *wb; | ||
40 | unsigned long background_thresh; | 57 | unsigned long background_thresh; |
41 | unsigned long dirty_thresh; | 58 | unsigned long dirty_thresh; |
42 | unsigned long bdi_thresh; | 59 | unsigned long bdi_thresh; |
60 | unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; | ||
61 | struct inode *inode; | ||
62 | |||
63 | /* | ||
64 | * inode lock is enough here, the bdi->wb_list is protected by | ||
65 | * RCU on the reader side | ||
66 | */ | ||
67 | nr_wb = nr_dirty = nr_io = nr_more_io = 0; | ||
68 | spin_lock(&inode_lock); | ||
69 | list_for_each_entry(wb, &bdi->wb_list, list) { | ||
70 | nr_wb++; | ||
71 | list_for_each_entry(inode, &wb->b_dirty, i_list) | ||
72 | nr_dirty++; | ||
73 | list_for_each_entry(inode, &wb->b_io, i_list) | ||
74 | nr_io++; | ||
75 | list_for_each_entry(inode, &wb->b_more_io, i_list) | ||
76 | nr_more_io++; | ||
77 | } | ||
78 | spin_unlock(&inode_lock); | ||
43 | 79 | ||
44 | get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); | 80 | get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); |
45 | 81 | ||
@@ -49,12 +85,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
49 | "BdiReclaimable: %8lu kB\n" | 85 | "BdiReclaimable: %8lu kB\n" |
50 | "BdiDirtyThresh: %8lu kB\n" | 86 | "BdiDirtyThresh: %8lu kB\n" |
51 | "DirtyThresh: %8lu kB\n" | 87 | "DirtyThresh: %8lu kB\n" |
52 | "BackgroundThresh: %8lu kB\n", | 88 | "BackgroundThresh: %8lu kB\n" |
89 | "WriteBack threads:%8lu\n" | ||
90 | "b_dirty: %8lu\n" | ||
91 | "b_io: %8lu\n" | ||
92 | "b_more_io: %8lu\n" | ||
93 | "bdi_list: %8u\n" | ||
94 | "state: %8lx\n" | ||
95 | "wb_mask: %8lx\n" | ||
96 | "wb_list: %8u\n" | ||
97 | "wb_cnt: %8u\n", | ||
53 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), | 98 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), |
54 | (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), | 99 | (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), |
55 | K(bdi_thresh), | 100 | K(bdi_thresh), K(dirty_thresh), |
56 | K(dirty_thresh), | 101 | K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, |
57 | K(background_thresh)); | 102 | !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask, |
103 | !list_empty(&bdi->wb_list), bdi->wb_cnt); | ||
58 | #undef K | 104 | #undef K |
59 | 105 | ||
60 | return 0; | 106 | return 0; |
@@ -185,6 +231,13 @@ static int __init default_bdi_init(void) | |||
185 | { | 231 | { |
186 | int err; | 232 | int err; |
187 | 233 | ||
234 | sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); | ||
235 | BUG_ON(IS_ERR(sync_supers_tsk)); | ||
236 | |||
237 | init_timer(&sync_supers_timer); | ||
238 | setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); | ||
239 | arm_supers_timer(); | ||
240 | |||
188 | err = bdi_init(&default_backing_dev_info); | 241 | err = bdi_init(&default_backing_dev_info); |
189 | if (!err) | 242 | if (!err) |
190 | bdi_register(&default_backing_dev_info, NULL, "default"); | 243 | bdi_register(&default_backing_dev_info, NULL, "default"); |
@@ -193,6 +246,248 @@ static int __init default_bdi_init(void) | |||
193 | } | 246 | } |
194 | subsys_initcall(default_bdi_init); | 247 | subsys_initcall(default_bdi_init); |
195 | 248 | ||
249 | static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) | ||
250 | { | ||
251 | memset(wb, 0, sizeof(*wb)); | ||
252 | |||
253 | wb->bdi = bdi; | ||
254 | wb->last_old_flush = jiffies; | ||
255 | INIT_LIST_HEAD(&wb->b_dirty); | ||
256 | INIT_LIST_HEAD(&wb->b_io); | ||
257 | INIT_LIST_HEAD(&wb->b_more_io); | ||
258 | } | ||
259 | |||
260 | static void bdi_task_init(struct backing_dev_info *bdi, | ||
261 | struct bdi_writeback *wb) | ||
262 | { | ||
263 | struct task_struct *tsk = current; | ||
264 | |||
265 | spin_lock(&bdi->wb_lock); | ||
266 | list_add_tail_rcu(&wb->list, &bdi->wb_list); | ||
267 | spin_unlock(&bdi->wb_lock); | ||
268 | |||
269 | tsk->flags |= PF_FLUSHER | PF_SWAPWRITE; | ||
270 | set_freezable(); | ||
271 | |||
272 | /* | ||
273 | * Our parent may run at a different priority, just set us to normal | ||
274 | */ | ||
275 | set_user_nice(tsk, 0); | ||
276 | } | ||
277 | |||
278 | static int bdi_start_fn(void *ptr) | ||
279 | { | ||
280 | struct bdi_writeback *wb = ptr; | ||
281 | struct backing_dev_info *bdi = wb->bdi; | ||
282 | int ret; | ||
283 | |||
284 | /* | ||
285 | * Add us to the active bdi_list | ||
286 | */ | ||
287 | spin_lock(&bdi_lock); | ||
288 | list_add(&bdi->bdi_list, &bdi_list); | ||
289 | spin_unlock(&bdi_lock); | ||
290 | |||
291 | bdi_task_init(bdi, wb); | ||
292 | |||
293 | /* | ||
294 | * Clear pending bit and wakeup anybody waiting to tear us down | ||
295 | */ | ||
296 | clear_bit(BDI_pending, &bdi->state); | ||
297 | smp_mb__after_clear_bit(); | ||
298 | wake_up_bit(&bdi->state, BDI_pending); | ||
299 | |||
300 | ret = bdi_writeback_task(wb); | ||
301 | |||
302 | /* | ||
303 | * Remove us from the list | ||
304 | */ | ||
305 | spin_lock(&bdi->wb_lock); | ||
306 | list_del_rcu(&wb->list); | ||
307 | spin_unlock(&bdi->wb_lock); | ||
308 | |||
309 | /* | ||
310 | * Flush any work that raced with us exiting. No new work | ||
311 | * will be added, since this bdi isn't discoverable anymore. | ||
312 | */ | ||
313 | if (!list_empty(&bdi->work_list)) | ||
314 | wb_do_writeback(wb, 1); | ||
315 | |||
316 | wb->task = NULL; | ||
317 | return ret; | ||
318 | } | ||
319 | |||
320 | int bdi_has_dirty_io(struct backing_dev_info *bdi) | ||
321 | { | ||
322 | return wb_has_dirty_io(&bdi->wb); | ||
323 | } | ||
324 | |||
325 | static void bdi_flush_io(struct backing_dev_info *bdi) | ||
326 | { | ||
327 | struct writeback_control wbc = { | ||
328 | .bdi = bdi, | ||
329 | .sync_mode = WB_SYNC_NONE, | ||
330 | .older_than_this = NULL, | ||
331 | .range_cyclic = 1, | ||
332 | .nr_to_write = 1024, | ||
333 | }; | ||
334 | |||
335 | writeback_inodes_wbc(&wbc); | ||
336 | } | ||
337 | |||
338 | /* | ||
339 | * kupdated() used to do this. We cannot do it from the bdi_forker_task() | ||
340 | * or we risk deadlocking on ->s_umount. The longer term solution would be | ||
341 | * to implement sync_supers_bdi() or similar and simply do it from the | ||
342 | * bdi writeback tasks individually. | ||
343 | */ | ||
344 | static int bdi_sync_supers(void *unused) | ||
345 | { | ||
346 | set_user_nice(current, 0); | ||
347 | |||
348 | while (!kthread_should_stop()) { | ||
349 | set_current_state(TASK_INTERRUPTIBLE); | ||
350 | schedule(); | ||
351 | |||
352 | /* | ||
353 | * Do this periodically, like kupdated() did before. | ||
354 | */ | ||
355 | sync_supers(); | ||
356 | } | ||
357 | |||
358 | return 0; | ||
359 | } | ||
360 | |||
361 | static void arm_supers_timer(void) | ||
362 | { | ||
363 | unsigned long next; | ||
364 | |||
365 | next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; | ||
366 | mod_timer(&sync_supers_timer, round_jiffies_up(next)); | ||
367 | } | ||
368 | |||
369 | static void sync_supers_timer_fn(unsigned long unused) | ||
370 | { | ||
371 | wake_up_process(sync_supers_tsk); | ||
372 | arm_supers_timer(); | ||
373 | } | ||
374 | |||
375 | static int bdi_forker_task(void *ptr) | ||
376 | { | ||
377 | struct bdi_writeback *me = ptr; | ||
378 | |||
379 | bdi_task_init(me->bdi, me); | ||
380 | |||
381 | for (;;) { | ||
382 | struct backing_dev_info *bdi, *tmp; | ||
383 | struct bdi_writeback *wb; | ||
384 | |||
385 | /* | ||
386 | * Temporary measure, we want to make sure we don't see | ||
387 | * dirty data on the default backing_dev_info | ||
388 | */ | ||
389 | if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) | ||
390 | wb_do_writeback(me, 0); | ||
391 | |||
392 | spin_lock(&bdi_lock); | ||
393 | |||
394 | /* | ||
395 | * Check if any existing bdi's have dirty data without | ||
396 | * a thread registered. If so, set that up. | ||
397 | */ | ||
398 | list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) { | ||
399 | if (bdi->wb.task) | ||
400 | continue; | ||
401 | if (list_empty(&bdi->work_list) && | ||
402 | !bdi_has_dirty_io(bdi)) | ||
403 | continue; | ||
404 | |||
405 | bdi_add_default_flusher_task(bdi); | ||
406 | } | ||
407 | |||
408 | set_current_state(TASK_INTERRUPTIBLE); | ||
409 | |||
410 | if (list_empty(&bdi_pending_list)) { | ||
411 | unsigned long wait; | ||
412 | |||
413 | spin_unlock(&bdi_lock); | ||
414 | wait = msecs_to_jiffies(dirty_writeback_interval * 10); | ||
415 | schedule_timeout(wait); | ||
416 | try_to_freeze(); | ||
417 | continue; | ||
418 | } | ||
419 | |||
420 | __set_current_state(TASK_RUNNING); | ||
421 | |||
422 | /* | ||
423 | * This is our real job - check for pending entries in | ||
424 | * bdi_pending_list, and create the tasks that got added | ||
425 | */ | ||
426 | bdi = list_entry(bdi_pending_list.next, struct backing_dev_info, | ||
427 | bdi_list); | ||
428 | list_del_init(&bdi->bdi_list); | ||
429 | spin_unlock(&bdi_lock); | ||
430 | |||
431 | wb = &bdi->wb; | ||
432 | wb->task = kthread_run(bdi_start_fn, wb, "flush-%s", | ||
433 | dev_name(bdi->dev)); | ||
434 | /* | ||
435 | * If task creation fails, then readd the bdi to | ||
436 | * the pending list and force writeout of the bdi | ||
437 | * from this forker thread. That will free some memory | ||
438 | * and we can try again. | ||
439 | */ | ||
440 | if (IS_ERR(wb->task)) { | ||
441 | wb->task = NULL; | ||
442 | |||
443 | /* | ||
444 | * Add this 'bdi' to the back, so we get | ||
445 | * a chance to flush other bdi's to free | ||
446 | * memory. | ||
447 | */ | ||
448 | spin_lock(&bdi_lock); | ||
449 | list_add_tail(&bdi->bdi_list, &bdi_pending_list); | ||
450 | spin_unlock(&bdi_lock); | ||
451 | |||
452 | bdi_flush_io(bdi); | ||
453 | } | ||
454 | } | ||
455 | |||
456 | return 0; | ||
457 | } | ||
458 | |||
459 | /* | ||
460 | * Add the default flusher task that gets created for any bdi | ||
461 | * that has dirty data pending writeout | ||
462 | */ | ||
463 | void static bdi_add_default_flusher_task(struct backing_dev_info *bdi) | ||
464 | { | ||
465 | if (!bdi_cap_writeback_dirty(bdi)) | ||
466 | return; | ||
467 | |||
468 | if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) { | ||
469 | printk(KERN_ERR "bdi %p/%s is not registered!\n", | ||
470 | bdi, bdi->name); | ||
471 | return; | ||
472 | } | ||
473 | |||
474 | /* | ||
475 | * Check with the helper whether to proceed adding a task. Will only | ||
476 | * abort if two or more simultaneous calls to | |
477 | * bdi_add_default_flusher_task() occurred; further additions will block | |
478 | * waiting for previous additions to finish. | ||
479 | */ | ||
480 | if (!test_and_set_bit(BDI_pending, &bdi->state)) { | ||
481 | list_move_tail(&bdi->bdi_list, &bdi_pending_list); | ||
482 | |||
483 | /* | ||
484 | * We are now on the pending list, wake up bdi_forker_task() | ||
485 | * to finish the job and add us back to the active bdi_list | ||
486 | */ | ||
487 | wake_up_process(default_backing_dev_info.wb.task); | ||
488 | } | ||
489 | } | ||
490 | |||
196 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, | 491 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, |
197 | const char *fmt, ...) | 492 | const char *fmt, ...) |
198 | { | 493 | { |
@@ -211,9 +506,35 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, | |||
211 | goto exit; | 506 | goto exit; |
212 | } | 507 | } |
213 | 508 | ||
509 | spin_lock(&bdi_lock); | ||
510 | list_add_tail(&bdi->bdi_list, &bdi_list); | ||
511 | spin_unlock(&bdi_lock); | ||
512 | |||
214 | bdi->dev = dev; | 513 | bdi->dev = dev; |
215 | bdi_debug_register(bdi, dev_name(dev)); | ||
216 | 514 | ||
515 | /* | ||
516 | * Just start the forker thread for our default backing_dev_info, | ||
517 | * and add other bdi's to the list. They will get a thread created | ||
518 | * on-demand when they need it. | ||
519 | */ | ||
520 | if (bdi_cap_flush_forker(bdi)) { | ||
521 | struct bdi_writeback *wb = &bdi->wb; | ||
522 | |||
523 | wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s", | ||
524 | dev_name(dev)); | ||
525 | if (IS_ERR(wb->task)) { | ||
526 | wb->task = NULL; | ||
527 | ret = -ENOMEM; | ||
528 | |||
529 | spin_lock(&bdi_lock); | ||
530 | list_del(&bdi->bdi_list); | ||
531 | spin_unlock(&bdi_lock); | ||
532 | goto exit; | ||
533 | } | ||
534 | } | ||
535 | |||
536 | bdi_debug_register(bdi, dev_name(dev)); | ||
537 | set_bit(BDI_registered, &bdi->state); | ||
217 | exit: | 538 | exit: |
218 | return ret; | 539 | return ret; |
219 | } | 540 | } |
@@ -225,9 +546,42 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) | |||
225 | } | 546 | } |
226 | EXPORT_SYMBOL(bdi_register_dev); | 547 | EXPORT_SYMBOL(bdi_register_dev); |
227 | 548 | ||
549 | /* | ||
550 | * Remove bdi from the global list and shut down any threads we have running | |
551 | */ | ||
552 | static void bdi_wb_shutdown(struct backing_dev_info *bdi) | ||
553 | { | ||
554 | struct bdi_writeback *wb; | ||
555 | |||
556 | if (!bdi_cap_writeback_dirty(bdi)) | ||
557 | return; | ||
558 | |||
559 | /* | ||
560 | * If setup is pending, wait for that to complete first | ||
561 | */ | ||
562 | wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, | ||
563 | TASK_UNINTERRUPTIBLE); | ||
564 | |||
565 | /* | ||
566 | * Make sure nobody finds us on the bdi_list anymore | ||
567 | */ | ||
568 | spin_lock(&bdi_lock); | ||
569 | list_del(&bdi->bdi_list); | ||
570 | spin_unlock(&bdi_lock); | ||
571 | |||
572 | /* | ||
573 | * Finally, kill the kernel threads. We don't need to be RCU | ||
574 | * safe anymore, since the bdi is gone from visibility. | ||
575 | */ | ||
576 | list_for_each_entry(wb, &bdi->wb_list, list) | ||
577 | kthread_stop(wb->task); | ||
578 | } | ||
579 | |||
228 | void bdi_unregister(struct backing_dev_info *bdi) | 580 | void bdi_unregister(struct backing_dev_info *bdi) |
229 | { | 581 | { |
230 | if (bdi->dev) { | 582 | if (bdi->dev) { |
583 | if (!bdi_cap_flush_forker(bdi)) | ||
584 | bdi_wb_shutdown(bdi); | ||
231 | bdi_debug_unregister(bdi); | 585 | bdi_debug_unregister(bdi); |
232 | device_unregister(bdi->dev); | 586 | device_unregister(bdi->dev); |
233 | bdi->dev = NULL; | 587 | bdi->dev = NULL; |
@@ -237,14 +591,25 @@ EXPORT_SYMBOL(bdi_unregister); | |||
237 | 591 | ||
238 | int bdi_init(struct backing_dev_info *bdi) | 592 | int bdi_init(struct backing_dev_info *bdi) |
239 | { | 593 | { |
240 | int i; | 594 | int i, err; |
241 | int err; | ||
242 | 595 | ||
243 | bdi->dev = NULL; | 596 | bdi->dev = NULL; |
244 | 597 | ||
245 | bdi->min_ratio = 0; | 598 | bdi->min_ratio = 0; |
246 | bdi->max_ratio = 100; | 599 | bdi->max_ratio = 100; |
247 | bdi->max_prop_frac = PROP_FRAC_BASE; | 600 | bdi->max_prop_frac = PROP_FRAC_BASE; |
601 | spin_lock_init(&bdi->wb_lock); | ||
602 | INIT_LIST_HEAD(&bdi->bdi_list); | ||
603 | INIT_LIST_HEAD(&bdi->wb_list); | ||
604 | INIT_LIST_HEAD(&bdi->work_list); | ||
605 | |||
606 | bdi_wb_init(&bdi->wb, bdi); | ||
607 | |||
608 | /* | ||
609 | * Just one thread support for now, hard code mask and count | ||
610 | */ | ||
611 | bdi->wb_mask = 1; | ||
612 | bdi->wb_cnt = 1; | ||
248 | 613 | ||
249 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { | 614 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { |
250 | err = percpu_counter_init(&bdi->bdi_stat[i], 0); | 615 | err = percpu_counter_init(&bdi->bdi_stat[i], 0); |
@@ -269,6 +634,8 @@ void bdi_destroy(struct backing_dev_info *bdi) | |||
269 | { | 634 | { |
270 | int i; | 635 | int i; |
271 | 636 | ||
637 | WARN_ON(bdi_has_dirty_io(bdi)); | ||
638 | |||
272 | bdi_unregister(bdi); | 639 | bdi_unregister(bdi); |
273 | 640 | ||
274 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) | 641 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) |