author    Jens Axboe <jens.axboe@oracle.com>  2009-09-09 03:08:54 -0400
committer Jens Axboe <jens.axboe@oracle.com>  2009-09-11 03:20:25 -0400
commit    03ba3782e8dcc5b0e1efe440d33084f066e38cae (patch)
tree      e5a6513b411de16a46199530ec98ef9b7f1efc50 /mm/page-writeback.c
parent    66f3b8e2e103a0b93b945764d98e9ba46cb926dd (diff)
writeback: switch to per-bdi threads for flushing data
This gets rid of pdflush for bdi writeout and kupdated style cleaning.
pdflush writeout suffers from lack of locality and also requires more
threads to handle the same workload, since it has to work in a
non-blocking fashion against each queue. This also introduces lumpy
behaviour and potential request starvation, since pdflush can be starved
for queue access if others are accessing it. A sample ffsb workload that
does random writes to files is about 8% faster here on a simple SATA drive
during the benchmark phase. File layout also seems a LOT more smooth in
vmstat:

 r  b   swpd   free   buff  cache   si   so    bi     bo   in    cs us sy id wa
 0  1      0 608848   2652 375372    0    0     0  71024  604    24  1 10 48 42
 0  1      0 549644   2712 433736    0    0     0  60692  505    27  1  8 48 44
 1  0      0 476928   2784 505192    0    0     4  29540  553    24  0  9 53 37
 0  1      0 457972   2808 524008    0    0     0  54876  331    16  0  4 38 58
 0  1      0 366128   2928 614284    0    0     4  92168  710    58  0 13 53 34
 0  1      0 295092   3000 684140    0    0     0  62924  572    23  0  9 53 37
 0  1      0 236592   3064 741704    0    0     4  58256  523    17  0  8 48 44
 0  1      0 165608   3132 811464    0    0     0  57460  560    21  0  8 54 38
 0  1      0 102952   3200 873164    0    0     4  74748  540    29  1 10 48 41
 0  1      0  48604   3252 926472    0    0     0  53248  469    29  0  7 47 45

where vanilla tends to fluctuate a lot in the creation phase:

 r  b   swpd   free   buff  cache   si   so    bi     bo   in    cs us sy id wa
 1  1      0 678716   5792 303380    0    0     0  74064  565    50  1 11 52 36
 1  0      0 662488   5864 319396    0    0     4    352  302   329  0  2 47 51
 0  1      0 599312   5924 381468    0    0     0  78164  516    55  0  9 51 40
 0  1      0 519952   6008 459516    0    0     4  78156  622    56  1 11 52 37
 1  1      0 436640   6092 541632    0    0     0  82244  622    54  0 11 48 41
 0  1      0 436640   6092 541660    0    0     0      8  152    39  0  0 51 49
 0  1      0 332224   6200 644252    0    0     4 102800  728    46  1 13 49 36
 1  0      0 274492   6260 701056    0    0     4  12328  459    49  0  7 50 43
 0  1      0 211220   6324 763356    0    0     0 106940  515    37  1 10 51 39
 1  0      0 160412   6376 813468    0    0     0   8224  415    43  0  6 49 45
 1  1      0  85980   6452 886556    0    0     4 113516  575    39  1 11 54 34
 0  2      0  85968   6452 886620    0    0     0   1640  158   211  0  0 46 54

A 10-disk test with btrfs performs 26% faster with per-bdi flushing. An
SSD-based writeback test on XFS performs over 20% better as well, with
the throughput being very stable around 1GB/sec, where pdflush only
manages 750MB/sec and fluctuates wildly while doing so. Random buffered
writes to many files behave a lot better as well, as do random mmap'ed
writes.

A separate thread is added to sync the super blocks. In the long term,
adding sync_supers_bdi() functionality could get rid of this thread again.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
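A note on the laptop-mode change in the diff below: laptop_timer_fn() runs
in atomic (timer) context and so cannot block in a sync; the patch has it
allocate a work_struct with GFP_ATOMIC, defer the flush to process context
via schedule_work(), and free the item in the handler. The following is a
minimal, self-contained sketch of that timer-to-workqueue hand-off only;
the demo_* names and module scaffolding are illustrative and not part of
the patch, and the unsigned-long timer callback matches the 2009-era API.

#include <linux/module.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/jiffies.h>

static struct timer_list demo_timer;

/* Runs in process context via the shared workqueue; free to block. */
static void demo_work_fn(struct work_struct *work)
{
	pr_info("demo: deferred work running, safe to sleep here\n");
	kfree(work);			/* the handler owns and frees the item */
}

/* Timer callback: atomic context, must not sleep. */
static void demo_timer_fn(unsigned long unused)
{
	struct work_struct *work;

	/* GFP_ATOMIC because sleeping is not allowed here */
	work = kmalloc(sizeof(*work), GFP_ATOMIC);
	if (work) {
		INIT_WORK(work, demo_work_fn);
		schedule_work(work);
	}
}

static int __init demo_init(void)
{
	setup_timer(&demo_timer, demo_timer_fn, 0);
	mod_timer(&demo_timer, jiffies + HZ);	/* fire in one second */
	return 0;
}

static void __exit demo_exit(void)
{
	del_timer_sync(&demo_timer);
	flush_scheduled_work();		/* drain any pending demo_work_fn */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");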
Diffstat (limited to 'mm/page-writeback.c')
-rw-r--r--  mm/page-writeback.c  |  179
 1 file changed, 28 insertions(+), 151 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f8341b6019b..25e7770309b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,15 +36,6 @@
 #include <linux/pagevec.h>
 
 /*
- * The maximum number of pages to writeout in a single bdflush/kupdate
- * operation. We do this so we don't hold I_SYNC against an inode for
- * enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode. Also, the code reevaluates
- * the dirty each time it has written this many pages.
- */
-#define MAX_WRITEBACK_PAGES	1024
-
-/*
  * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
  * will look to see if it needs to force writeback or throttling.
  */
@@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode);
 /* End of sysctl-exported parameters */
 
 
-static void background_writeout(unsigned long _min_pages);
-
 /*
  * Scale the writeback cache size proportional to the relative writeout speeds.
  *
@@ -326,7 +315,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
 {
 	int ret = 0;
 
-	mutex_lock(&bdi_lock);
+	spin_lock(&bdi_lock);
 	if (min_ratio > bdi->max_ratio) {
 		ret = -EINVAL;
 	} else {
@@ -338,7 +327,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
 			ret = -EINVAL;
 		}
 	}
-	mutex_unlock(&bdi_lock);
+	spin_unlock(&bdi_lock);
 
 	return ret;
 }
@@ -350,14 +339,14 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
 	if (max_ratio > 100)
 		return -EINVAL;
 
-	mutex_lock(&bdi_lock);
+	spin_lock(&bdi_lock);
 	if (bdi->min_ratio > max_ratio) {
 		ret = -EINVAL;
 	} else {
 		bdi->max_ratio = max_ratio;
 		bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
 	}
-	mutex_unlock(&bdi_lock);
+	spin_unlock(&bdi_lock);
 
 	return ret;
 }
@@ -543,7 +532,7 @@ static void balance_dirty_pages(struct address_space *mapping)
 		 * up.
 		 */
 		if (bdi_nr_reclaimable > bdi_thresh) {
-			writeback_inodes(&wbc);
+			writeback_inodes_wbc(&wbc);
 			pages_written += write_chunk - wbc.nr_to_write;
 			get_dirty_limits(&background_thresh, &dirty_thresh,
 				       &bdi_thresh, bdi);
@@ -572,7 +561,7 @@ static void balance_dirty_pages(struct address_space *mapping)
 		if (pages_written >= write_chunk)
 			break;		/* We've done our duty */
 
-		congestion_wait(BLK_RW_ASYNC, HZ/10);
+		schedule_timeout(1);
 	}
 
 	if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -591,10 +580,18 @@ static void balance_dirty_pages(struct address_space *mapping)
 	 * background_thresh, to keep the amount of dirty memory low.
 	 */
 	if ((laptop_mode && pages_written) ||
-	    (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
-			       + global_page_state(NR_UNSTABLE_NFS)
-			       > background_thresh)))
-		pdflush_operation(background_writeout, 0);
+	    (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
+					+ global_page_state(NR_UNSTABLE_NFS))
+					> background_thresh))) {
+		struct writeback_control wbc = {
+			.bdi		= bdi,
+			.sync_mode	= WB_SYNC_NONE,
+			.nr_to_write	= nr_writeback,
+		};
+
+
+		bdi_start_writeback(&wbc);
+	}
 }
 
 void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -678,153 +675,35 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 	}
 }
 
-/*
- * writeback at least _min_pages, and keep writing until the amount of dirty
- * memory is less than the background threshold, or until we're all clean.
- */
-static void background_writeout(unsigned long _min_pages)
-{
-	long min_pages = _min_pages;
-	struct writeback_control wbc = {
-		.bdi		= NULL,
-		.sync_mode	= WB_SYNC_NONE,
-		.older_than_this = NULL,
-		.nr_to_write	= 0,
-		.nonblocking	= 1,
-		.range_cyclic	= 1,
-	};
-
-	for ( ; ; ) {
-		unsigned long background_thresh;
-		unsigned long dirty_thresh;
-
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
-		if (global_page_state(NR_FILE_DIRTY) +
-			global_page_state(NR_UNSTABLE_NFS) < background_thresh
-				&& min_pages <= 0)
-			break;
-		wbc.more_io = 0;
-		wbc.encountered_congestion = 0;
-		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
-		wbc.pages_skipped = 0;
-		writeback_inodes(&wbc);
-		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
-			/* Wrote less than expected */
-			if (wbc.encountered_congestion || wbc.more_io)
-				congestion_wait(BLK_RW_ASYNC, HZ/10);
-			else
-				break;
-		}
-	}
-}
-
-/*
- * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
- * -1 if all pdflush threads were busy.
- */
-int wakeup_pdflush(long nr_pages)
-{
-	if (nr_pages == 0)
-		nr_pages = global_page_state(NR_FILE_DIRTY) +
-				global_page_state(NR_UNSTABLE_NFS);
-	return pdflush_operation(background_writeout, nr_pages);
-}
-
-static void wb_timer_fn(unsigned long unused);
 static void laptop_timer_fn(unsigned long unused);
 
-static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
 static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
 
 /*
- * Periodic writeback of "old" data.
- *
- * Define "old": the first time one of an inode's pages is dirtied, we mark the
- * dirtying-time in the inode's address_space. So this periodic writeback code
- * just walks the superblock inode list, writing back any inodes which are
- * older than a specific point in time.
- *
- * Try to run once per dirty_writeback_interval. But if a writeback event
- * takes longer than a dirty_writeback_interval interval, then leave a
- * one-second gap.
- *
- * older_than_this takes precedence over nr_to_write. So we'll only write back
- * all dirty pages if they are all attached to "old" mappings.
- */
-static void wb_kupdate(unsigned long arg)
-{
-	unsigned long oldest_jif;
-	unsigned long start_jif;
-	unsigned long next_jif;
-	long nr_to_write;
-	struct writeback_control wbc = {
-		.bdi		= NULL,
-		.sync_mode	= WB_SYNC_NONE,
-		.older_than_this = &oldest_jif,
-		.nr_to_write	= 0,
-		.nonblocking	= 1,
-		.for_kupdate	= 1,
-		.range_cyclic	= 1,
-	};
-
-	sync_supers();
-
-	oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
-	start_jif = jiffies;
-	next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
-	nr_to_write = global_page_state(NR_FILE_DIRTY) +
-			global_page_state(NR_UNSTABLE_NFS) +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
-	while (nr_to_write > 0) {
-		wbc.more_io = 0;
-		wbc.encountered_congestion = 0;
-		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
-		writeback_inodes(&wbc);
-		if (wbc.nr_to_write > 0) {
-			if (wbc.encountered_congestion || wbc.more_io)
-				congestion_wait(BLK_RW_ASYNC, HZ/10);
-			else
-				break;	/* All the old data is written */
-		}
-		nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-	}
-	if (time_before(next_jif, jiffies + HZ))
-		next_jif = jiffies + HZ;
-	if (dirty_writeback_interval)
-		mod_timer(&wb_timer, next_jif);
-}
-
-/*
  * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
  */
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec(table, write, file, buffer, length, ppos);
-	if (dirty_writeback_interval)
-		mod_timer(&wb_timer, jiffies +
-			msecs_to_jiffies(dirty_writeback_interval * 10));
-	else
-		del_timer(&wb_timer);
 	return 0;
 }
 
-static void wb_timer_fn(unsigned long unused)
+static void do_laptop_sync(struct work_struct *work)
 {
-	if (pdflush_operation(wb_kupdate, 0) < 0)
-		mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
-}
-
-static void laptop_flush(unsigned long unused)
-{
-	sys_sync();
+	wakeup_flusher_threads(0);
+	kfree(work);
 }
 
 static void laptop_timer_fn(unsigned long unused)
 {
-	pdflush_operation(laptop_flush, 0);
+	struct work_struct *work;
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (work) {
+		INIT_WORK(work, do_laptop_sync);
+		schedule_work(work);
+	}
 }
 
 /*
@@ -907,8 +786,6 @@ void __init page_writeback_init(void)
 {
 	int shift;
 
-	mod_timer(&wb_timer,
-		  jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
 