Diffstat (limited to 'mm/page-writeback.c')

 -rw-r--r--  mm/page-writeback.c | 300
 1 file changed, 251 insertions(+), 49 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d821321326e..7845462064f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2,6 +2,7 @@
  * mm/page-writeback.c
  *
  * Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
  * Contains functions related to writing back dirty pages at the
  * address_space level.
@@ -36,7 +37,7 @@
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
- * operation. We do this so we don't hold I_LOCK against an inode for
+ * operation. We do this so we don't hold I_SYNC against an inode for
  * enormous amounts of time, which would block a userspace task which has
  * been forced to throttle against that inode. Also, the code reevaluates
  * the dirty each time it has written this many pages.
@@ -49,8 +50,6 @@
  */
 static long ratelimit_pages = 32;
 
-static int dirty_exceeded __cacheline_aligned_in_smp;	/* Dirty mem may be over limit */
-
 /*
  * When balance_dirty_pages decides that the caller needs to perform some
  * non-background writeback, this is how many pages it will attempt to write.
@@ -103,6 +102,141 @@ EXPORT_SYMBOL(laptop_mode);
 static void background_writeout(unsigned long _min_pages);
 
 /*
+ * Scale the writeback cache size proportional to the relative writeout speeds.
+ *
+ * We do this by keeping a floating proportion between BDIs, based on page
+ * writeback completions [end_page_writeback()]. Those devices that write out
+ * pages fastest will get the larger share, while the slower will get a smaller
+ * share.
+ *
+ * We use page writeout completions because we are interested in getting rid of
+ * dirty pages. Having them written out is the primary goal.
+ *
+ * We introduce a concept of time, a period over which we measure these events,
+ * because demand can/will vary over time. The length of this period itself is
+ * measured in page writeback completions.
+ *
+ */
+static struct prop_descriptor vm_completions;
+static struct prop_descriptor vm_dirties;
+
+static unsigned long determine_dirtyable_memory(void);
+
+/*
+ * couple the period to the dirty_ratio:
+ *
+ *   period/2 ~ roundup_pow_of_two(dirty limit)
+ */
+static int calc_period_shift(void)
+{
+        unsigned long dirty_total;
+
+        dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+        return 2 + ilog2(dirty_total - 1);
+}
+
+/*
+ * update the period when the dirty ratio changes.
+ */
+int dirty_ratio_handler(struct ctl_table *table, int write,
+                struct file *filp, void __user *buffer, size_t *lenp,
+                loff_t *ppos)
+{
+        int old_ratio = vm_dirty_ratio;
+        int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+        if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
+                int shift = calc_period_shift();
+                prop_change_shift(&vm_completions, shift);
+                prop_change_shift(&vm_dirties, shift);
+        }
+        return ret;
+}
+
+/*
+ * Increment the BDI's writeout completion count and the global writeout
+ * completion count. Called from test_clear_page_writeback().
+ */
+static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+        __prop_inc_percpu(&vm_completions, &bdi->completions);
+}
+
+static inline void task_dirty_inc(struct task_struct *tsk)
+{
+        prop_inc_single(&vm_dirties, &tsk->dirties);
+}
+
+/*
+ * Obtain an accurate fraction of the BDI's portion.
+ */
+static void bdi_writeout_fraction(struct backing_dev_info *bdi,
+                long *numerator, long *denominator)
+{
+        if (bdi_cap_writeback_dirty(bdi)) {
+                prop_fraction_percpu(&vm_completions, &bdi->completions,
+                                numerator, denominator);
+        } else {
+                *numerator = 0;
+                *denominator = 1;
+        }
+}
+
+/*
+ * Clip the earned share of dirty pages to that which is actually available.
+ * This avoids exceeding the total dirty_limit when the floating averages
+ * fluctuate too quickly.
+ */
+static void
+clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
+{
+        long avail_dirty;
+
+        avail_dirty = dirty -
+                (global_page_state(NR_FILE_DIRTY) +
+                 global_page_state(NR_WRITEBACK) +
+                 global_page_state(NR_UNSTABLE_NFS));
+
+        if (avail_dirty < 0)
+                avail_dirty = 0;
+
+        avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
+                bdi_stat(bdi, BDI_WRITEBACK);
+
+        *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
+}
+
+static inline void task_dirties_fraction(struct task_struct *tsk,
+                long *numerator, long *denominator)
+{
+        prop_fraction_single(&vm_dirties, &tsk->dirties,
+                        numerator, denominator);
+}
+
+/*
+ * scale the dirty limit
+ *
+ * task specific dirty limit:
+ *
+ *   dirty -= (dirty/8) * p_{t}
+ */
+void task_dirty_limit(struct task_struct *tsk, long *pdirty)
+{
+        long numerator, denominator;
+        long dirty = *pdirty;
+        u64 inv = dirty >> 3;
+
+        task_dirties_fraction(tsk, &numerator, &denominator);
+        inv *= numerator;
+        do_div(inv, denominator);
+
+        dirty -= inv;
+        if (dirty < *pdirty/2)
+                dirty = *pdirty/2;
+
+        *pdirty = dirty;
+}
+
+/*
  * Work out the current dirty-memory clamping and background writeout
  * thresholds.
  *
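The arithmetic added above can be checked outside the kernel. The sketch below is a
plain userspace model of calc_period_shift() and task_dirty_limit(), illustrative only
and not the kernel code: dirtyable_pages, the ratio of 10, and the numerator/denominator
pair are made-up inputs standing in for determine_dirtyable_memory(), vm_dirty_ratio and
the vm_dirties proportion.

#include <stdio.h>
#include <stdint.h>

/* mirrors calc_period_shift(): 2 + ilog2(dirty_total - 1) */
static int period_shift(unsigned long dirty_total)
{
        int bits = 0;

        while ((dirty_total - 1) >> bits)       /* bits ends as ilog2() + 1 */
                bits++;
        return 2 + bits - 1;
}

/* mirrors task_dirty_limit(): dirty -= (dirty/8) * p_{t}, clamped to half */
static long task_limit(long dirty, long numerator, long denominator)
{
        uint64_t inv = (uint64_t)(dirty >> 3) * numerator / denominator;
        long limit = dirty - (long)inv;

        return limit < dirty / 2 ? dirty / 2 : limit;
}

int main(void)
{
        unsigned long dirtyable_pages = 1UL << 20;              /* assumed: 4GB of 4k pages */
        unsigned long dirty_total = dirtyable_pages * 10 / 100; /* assumed vm_dirty_ratio = 10 */

        printf("period shift: %d\n", period_shift(dirty_total));
        /* a task responsible for half of all dirtyings gives up dirty/16 */
        printf("task limit:   %ld of %lu pages\n",
               task_limit((long)dirty_total, 1, 2), dirty_total);
        return 0;
}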
@@ -158,8 +292,8 @@ static unsigned long determine_dirtyable_memory(void)
 }
 
 static void
-get_dirty_limits(long *pbackground, long *pdirty,
-                struct address_space *mapping)
+get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+                struct backing_dev_info *bdi)
 {
         int background_ratio;          /* Percentages */
         int dirty_ratio;
@@ -193,6 +327,23 @@ get_dirty_limits(long *pbackground, long *pdirty,
         }
         *pbackground = background;
         *pdirty = dirty;
+
+        if (bdi) {
+                u64 bdi_dirty = dirty;
+                long numerator, denominator;
+
+                /*
+                 * Calculate this BDI's share of the dirty ratio.
+                 */
+                bdi_writeout_fraction(bdi, &numerator, &denominator);
+
+                bdi_dirty *= numerator;
+                do_div(bdi_dirty, denominator);
+
+                *pbdi_dirty = bdi_dirty;
+                clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
+                task_dirty_limit(current, pbdi_dirty);
+        }
 }
 
 /*
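The new if (bdi) branch turns the single global limit into a per-device one: the global
dirty threshold is scaled by the device's share of recent writeout completions, clipped
to what is actually still available, and finally reduced for the calling task. A toy
calculation of the first two steps, with made-up numbers standing in for the
vm_completions fraction and the global/BDI page counters:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        long dirty_thresh = 100000;     /* assumed global limit, in pages */

        /* assume this BDI completed 3 of every 4 recent writeouts */
        long numerator = 3, denominator = 4;
        uint64_t bdi_thresh = (uint64_t)dirty_thresh * numerator / denominator;

        /* clip to globally unused headroom plus our own dirty/writeback pages */
        long globally_dirty = 90000;    /* stands in for NR_FILE_DIRTY + NR_WRITEBACK + NR_UNSTABLE_NFS */
        long bdi_dirty_and_wb = 20000;  /* stands in for BDI_RECLAIMABLE + BDI_WRITEBACK */
        long avail = dirty_thresh - globally_dirty;

        if (avail < 0)
                avail = 0;
        avail += bdi_dirty_and_wb;
        if ((long)bdi_thresh > avail)
                bdi_thresh = avail;

        printf("bdi_thresh = %llu pages\n", (unsigned long long)bdi_thresh);
        return 0;
}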
@@ -204,9 +355,11 @@ get_dirty_limits(long *pbackground, long *pdirty,
  */
 static void balance_dirty_pages(struct address_space *mapping)
 {
-        long nr_reclaimable;
+        long bdi_nr_reclaimable;
+        long bdi_nr_writeback;
         long background_thresh;
         long dirty_thresh;
+        long bdi_thresh;
         unsigned long pages_written = 0;
         unsigned long write_chunk = sync_writeback_pages();
 
@@ -221,15 +374,15 @@ static void balance_dirty_pages(struct address_space *mapping)
                         .range_cyclic   = 1,
                 };
 
-                get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
-                nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-                                        global_page_state(NR_UNSTABLE_NFS);
-                if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
-                        dirty_thresh)
+                get_dirty_limits(&background_thresh, &dirty_thresh,
+                                &bdi_thresh, bdi);
+                bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+                bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+                if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
                         break;
 
-                if (!dirty_exceeded)
-                        dirty_exceeded = 1;
+                if (!bdi->dirty_exceeded)
+                        bdi->dirty_exceeded = 1;
 
                 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
                  * Unstable writes are a feature of certain networked
@@ -237,26 +390,42 @@ static void balance_dirty_pages(struct address_space *mapping)
                  * written to the server's write cache, but has not yet
                  * been flushed to permanent storage.
                  */
-                if (nr_reclaimable) {
+                if (bdi_nr_reclaimable) {
                         writeback_inodes(&wbc);
-                        get_dirty_limits(&background_thresh,
-                                        &dirty_thresh, mapping);
-                        nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-                                        global_page_state(NR_UNSTABLE_NFS);
-                        if (nr_reclaimable +
-                                global_page_state(NR_WRITEBACK)
-                                        <= dirty_thresh)
-                                                break;
                         pages_written += write_chunk - wbc.nr_to_write;
-                        if (pages_written >= write_chunk)
-                                break;          /* We've done our duty */
+                        get_dirty_limits(&background_thresh, &dirty_thresh,
+                                        &bdi_thresh, bdi);
+                }
+
+                /*
+                 * In order to avoid the stacked BDI deadlock we need
+                 * to ensure we accurately count the 'dirty' pages when
+                 * the threshold is low.
+                 *
+                 * Otherwise it would be possible to get thresh+n pages
+                 * reported dirty, even though there are thresh-m pages
+                 * actually dirty; with m+n sitting in the percpu
+                 * deltas.
+                 */
+                if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+                        bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+                        bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
+                } else if (bdi_nr_reclaimable) {
+                        bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+                        bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
                 }
+
+                if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+                        break;
+                if (pages_written >= write_chunk)
+                        break;          /* We've done our duty */
+
                 congestion_wait(WRITE, HZ/10);
         }
 
-        if (nr_reclaimable + global_page_state(NR_WRITEBACK)
-                <= dirty_thresh && dirty_exceeded)
-                        dirty_exceeded = 0;
+        if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
+                        bdi->dirty_exceeded)
+                bdi->dirty_exceeded = 0;
 
         if (writeback_in_progress(bdi))
                 return;         /* pdflush is already working this queue */
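The new comment is about the error bound of the per-BDI percpu counters: a bdi_stat()-style
read sees only the shared total, which can lag the true value by up to the percpu batch size
times the number of CPUs, while bdi_stat_sum() also folds in the per-CPU deltas. Paying for
the exact sum only when bdi_thresh is within 2*bdi_stat_error() keeps the common path cheap
without letting a small threshold drown in counter noise. A userspace model of that drift
(the batch size and CPU count below are arbitrary, not the kernel's values):

#include <stdio.h>

#define NR_CPUS 4
#define BATCH   32      /* deltas smaller than this stay CPU-local */

static long shared_total;               /* what an approximate read sees */
static long local_delta[NR_CPUS];       /* what an exact sum also folds in */

static void counter_add(int cpu, long n)
{
        local_delta[cpu] += n;
        if (local_delta[cpu] >= BATCH || local_delta[cpu] <= -BATCH) {
                shared_total += local_delta[cpu];       /* flush to the shared word */
                local_delta[cpu] = 0;
        }
}

int main(void)
{
        int cpu;
        long exact;

        /* each CPU dirties a few pages, never enough to flush its delta */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                counter_add(cpu, BATCH - 1);

        exact = shared_total;
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                exact += local_delta[cpu];

        printf("approximate read: %ld, exact sum: %ld, worst-case error: %d\n",
               shared_total, exact, NR_CPUS * BATCH);
        return 0;
}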
@@ -270,7 +439,9 @@ static void balance_dirty_pages(struct address_space *mapping)
          * background_thresh, to keep the amount of dirty memory low.
          */
         if ((laptop_mode && pages_written) ||
-            (!laptop_mode && (nr_reclaimable > background_thresh)))
+            (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
+                              + global_page_state(NR_UNSTABLE_NFS)
+                              > background_thresh)))
                 pdflush_operation(background_writeout, 0);
 }
 
@@ -306,7 +477,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
         unsigned long *p;
 
         ratelimit = ratelimit_pages;
-        if (dirty_exceeded)
+        if (mapping->backing_dev_info->dirty_exceeded)
                 ratelimit = 8;
 
         /*
@@ -331,18 +502,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
         long background_thresh;
         long dirty_thresh;
 
-        if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) {
-                /*
-                 * The caller might hold locks which can prevent IO completion
-                 * or progress in the filesystem. So we cannot just sit here
-                 * waiting for IO to complete.
-                 */
-                congestion_wait(WRITE, HZ/10);
-                return;
-        }
-
         for ( ; ; ) {
-                get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+                get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 
                 /*
                  * Boost the allowable dirty threshold a bit for page
@@ -354,6 +515,14 @@ void throttle_vm_writeout(gfp_t gfp_mask)
                     global_page_state(NR_WRITEBACK) <= dirty_thresh)
                         break;
                 congestion_wait(WRITE, HZ/10);
+
+                /*
+                 * The caller might hold locks which can prevent IO completion
+                 * or progress in the filesystem. So we cannot just sit here
+                 * waiting for IO to complete.
+                 */
+                if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO))
+                        break;
         }
 }
 
@@ -377,11 +546,12 @@ static void background_writeout(unsigned long _min_pages)
                 long background_thresh;
                 long dirty_thresh;
 
-                get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+                get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
                 if (global_page_state(NR_FILE_DIRTY) +
                         global_page_state(NR_UNSTABLE_NFS) < background_thresh
                                 && min_pages <= 0)
                         break;
+                wbc.more_io = 0;
                 wbc.encountered_congestion = 0;
                 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
                 wbc.pages_skipped = 0;
@@ -389,8 +559,9 @@ static void background_writeout(unsigned long _min_pages)
                 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
                 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
                         /* Wrote less than expected */
-                        congestion_wait(WRITE, HZ/10);
-                        if (!wbc.encountered_congestion)
+                        if (wbc.encountered_congestion || wbc.more_io)
+                                congestion_wait(WRITE, HZ/10);
+                        else
                                 break;
                 }
         }
@@ -455,11 +626,12 @@ static void wb_kupdate(unsigned long arg)
                         global_page_state(NR_UNSTABLE_NFS) +
                         (inodes_stat.nr_inodes - inodes_stat.nr_unused);
         while (nr_to_write > 0) {
+                wbc.more_io = 0;
                 wbc.encountered_congestion = 0;
                 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
                 writeback_inodes(&wbc);
                 if (wbc.nr_to_write > 0) {
-                        if (wbc.encountered_congestion)
+                        if (wbc.encountered_congestion || wbc.more_io)
                                 congestion_wait(WRITE, HZ/10);
                         else
                                 break;  /* All the old data is written */
@@ -580,9 +752,15 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
  */
 void __init page_writeback_init(void)
 {
+        int shift;
+
         mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
         writeback_set_ratelimit();
         register_cpu_notifier(&ratelimit_nb);
+
+        shift = calc_period_shift();
+        prop_descriptor_init(&vm_completions, shift);
+        prop_descriptor_init(&vm_dirties, shift);
 }
 
 /**
@@ -672,8 +850,10 @@ retry:
 
                         ret = (*writepage)(page, wbc, data);
 
-                        if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
+                        if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
                                 unlock_page(page);
+                                ret = 0;
+                        }
                         if (ret || (--(wbc->nr_to_write) <= 0))
                                 done = 1;
                         if (wbc->nonblocking && bdi_write_congested(bdi)) {
@@ -827,6 +1007,8 @@ int __set_page_dirty_nobuffers(struct page *page)
                         WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
                         if (mapping_cap_account_dirty(mapping)) {
                                 __inc_zone_page_state(page, NR_FILE_DIRTY);
+                                __inc_bdi_stat(mapping->backing_dev_info,
+                                                BDI_RECLAIMABLE);
                                 task_io_account_write(PAGE_CACHE_SIZE);
                         }
                         radix_tree_tag_set(&mapping->page_tree,
@@ -859,7 +1041,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage);
  * If the mapping doesn't provide a set_page_dirty a_op, then
  * just fall through and assume that it wants buffer_heads.
  */
-int fastcall set_page_dirty(struct page *page)
+static int __set_page_dirty(struct page *page)
 {
         struct address_space *mapping = page_mapping(page);
 
@@ -877,6 +1059,14 @@ int fastcall set_page_dirty(struct page *page)
         }
         return 0;
 }
+
+int fastcall set_page_dirty(struct page *page)
+{
+        int ret = __set_page_dirty(page);
+        if (ret)
+                task_dirty_inc(current);
+        return ret;
+}
 EXPORT_SYMBOL(set_page_dirty);
 
 /*
@@ -961,6 +1151,8 @@ int clear_page_dirty_for_io(struct page *page)
                  */
                 if (TestClearPageDirty(page)) {
                         dec_zone_page_state(page, NR_FILE_DIRTY);
+                        dec_bdi_stat(mapping->backing_dev_info,
+                                        BDI_RECLAIMABLE);
                         return 1;
                 }
                 return 0;
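At this point the reclaimable-page accounting is symmetric: __set_page_dirty_nobuffers()
bumps BDI_RECLAIMABLE when a page becomes dirty, and clear_page_dirty_for_io() drops it
again when the page is handed over for writing; the two hunks below pair BDI_WRITEBACK the
same way in test_set_page_writeback()/test_clear_page_writeback(), with each completed
writeout feeding the vm_completions proportion. A minimal userspace model of that
bookkeeping (a toy, with the two clean/writeback hooks collapsed into one step):

#include <assert.h>
#include <stdio.h>

struct toy_bdi {
        long reclaimable;       /* models BDI_RECLAIMABLE */
        long writeback;         /* models BDI_WRITEBACK */
        long completions;       /* models the vm_completions event count */
};

static void page_dirtied(struct toy_bdi *bdi)      { bdi->reclaimable++; }
static void page_starts_io(struct toy_bdi *bdi)    { bdi->reclaimable--; bdi->writeback++; }
static void page_io_completes(struct toy_bdi *bdi) { bdi->writeback--; bdi->completions++; }

int main(void)
{
        struct toy_bdi bdi = { 0, 0, 0 };
        int i;

        for (i = 0; i < 8; i++)
                page_dirtied(&bdi);
        for (i = 0; i < 8; i++) {
                page_starts_io(&bdi);
                page_io_completes(&bdi);
        }

        /* every dirtied page was eventually written back and counted exactly once */
        assert(bdi.reclaimable == 0 && bdi.writeback == 0 && bdi.completions == 8);
        printf("completions credited to this BDI: %ld\n", bdi.completions);
        return 0;
}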
@@ -975,14 +1167,20 @@ int test_clear_page_writeback(struct page *page)
         int ret;
 
         if (mapping) {
+                struct backing_dev_info *bdi = mapping->backing_dev_info;
                 unsigned long flags;
 
                 write_lock_irqsave(&mapping->tree_lock, flags);
                 ret = TestClearPageWriteback(page);
-                if (ret)
+                if (ret) {
                         radix_tree_tag_clear(&mapping->page_tree,
                                                 page_index(page),
                                                 PAGECACHE_TAG_WRITEBACK);
+                        if (bdi_cap_writeback_dirty(bdi)) {
+                                __dec_bdi_stat(bdi, BDI_WRITEBACK);
+                                __bdi_writeout_inc(bdi);
+                        }
+                }
                 write_unlock_irqrestore(&mapping->tree_lock, flags);
         } else {
                 ret = TestClearPageWriteback(page);
@@ -998,14 +1196,18 @@ int test_set_page_writeback(struct page *page)
         int ret;
 
         if (mapping) {
+                struct backing_dev_info *bdi = mapping->backing_dev_info;
                 unsigned long flags;
 
                 write_lock_irqsave(&mapping->tree_lock, flags);
                 ret = TestSetPageWriteback(page);
-                if (!ret)
+                if (!ret) {
                         radix_tree_tag_set(&mapping->page_tree,
                                                 page_index(page),
                                                 PAGECACHE_TAG_WRITEBACK);
+                        if (bdi_cap_writeback_dirty(bdi))
+                                __inc_bdi_stat(bdi, BDI_WRITEBACK);
+                }
                 if (!PageDirty(page))
                         radix_tree_tag_clear(&mapping->page_tree,
                                                 page_index(page),