Diffstat (limited to 'mm/page-writeback.c')
-rw-r--r-- | mm/page-writeback.c | 244 |
1 file changed, 65 insertions, 179 deletions
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 81627ebcd313..a3b14090b1fb 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,15 +36,6 @@ | |||
36 | #include <linux/pagevec.h> | 36 | #include <linux/pagevec.h> |
37 | 37 | ||
38 | /* | 38 | /* |
39 | * The maximum number of pages to writeout in a single bdflush/kupdate | ||
40 | * operation. We do this so we don't hold I_SYNC against an inode for | ||
41 | * enormous amounts of time, which would block a userspace task which has | ||
42 | * been forced to throttle against that inode. Also, the code reevaluates | ||
43 | * the dirty each time it has written this many pages. | ||
44 | */ | ||
45 | #define MAX_WRITEBACK_PAGES 1024 | ||
46 | |||
47 | /* | ||
48 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited | 39 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited |
49 | * will look to see if it needs to force writeback or throttling. | 40 | * will look to see if it needs to force writeback or throttling. |
50 | */ | 41 | */ |
@@ -53,18 +44,21 @@ static long ratelimit_pages = 32; | |||
53 | /* | 44 | /* |
54 | * When balance_dirty_pages decides that the caller needs to perform some | 45 | * When balance_dirty_pages decides that the caller needs to perform some |
55 | * non-background writeback, this is how many pages it will attempt to write. | 46 | * non-background writeback, this is how many pages it will attempt to write. |
56 | * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably | 47 | * It should be somewhat larger than dirtied pages to ensure that reasonably |
57 | * large amounts of I/O are submitted. | 48 | * large amounts of I/O are submitted. |
58 | */ | 49 | */ |
59 | static inline long sync_writeback_pages(void) | 50 | static inline long sync_writeback_pages(unsigned long dirtied) |
60 | { | 51 | { |
61 | return ratelimit_pages + ratelimit_pages / 2; | 52 | if (dirtied < ratelimit_pages) |
53 | dirtied = ratelimit_pages; | ||
54 | |||
55 | return dirtied + dirtied / 2; | ||
62 | } | 56 | } |
63 | 57 | ||
64 | /* The following parameters are exported via /proc/sys/vm */ | 58 | /* The following parameters are exported via /proc/sys/vm */ |
65 | 59 | ||
66 | /* | 60 | /* |
67 | * Start background writeback (via pdflush) at this percentage | 61 | * Start background writeback (via writeback threads) at this percentage |
68 | */ | 62 | */ |
69 | int dirty_background_ratio = 10; | 63 | int dirty_background_ratio = 10; |
70 | 64 | ||
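Note on the hunk above: sync_writeback_pages() now scales the writeback chunk with the number of pages the calling task actually dirtied, using ratelimit_pages only as a floor, instead of always returning 1.5x ratelimit_pages. A quick userspace sketch of just that arithmetic (standalone C, not kernel code; the sample values are made up):

    #include <stdio.h>

    static long ratelimit_pages = 32;   /* default from this file */

    /* Mirrors the new helper: aim for 1.5x what was dirtied,
     * but never less than 1.5x ratelimit_pages. */
    static long sync_writeback_pages(unsigned long dirtied)
    {
            if (dirtied < ratelimit_pages)
                    dirtied = ratelimit_pages;
            return dirtied + dirtied / 2;
    }

    int main(void)
    {
            unsigned long samples[] = { 8, 32, 1024 };

            for (int i = 0; i < 3; i++)
                    printf("dirtied %lu -> write chunk %ld\n",
                           samples[i], sync_writeback_pages(samples[i]));
            return 0;   /* prints 48, 48 and 1536 */
    }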
@@ -117,8 +111,6 @@ EXPORT_SYMBOL(laptop_mode); | |||
117 | /* End of sysctl-exported parameters */ | 111 | /* End of sysctl-exported parameters */ |
118 | 112 | ||
119 | 113 | ||
120 | static void background_writeout(unsigned long _min_pages); | ||
121 | |||
122 | /* | 114 | /* |
123 | * Scale the writeback cache size proportional to the relative writeout speeds. | 115 | * Scale the writeback cache size proportional to the relative writeout speeds. |
124 | * | 116 | * |
@@ -166,37 +158,37 @@ static void update_completion_period(void) | |||
166 | } | 158 | } |
167 | 159 | ||
168 | int dirty_background_ratio_handler(struct ctl_table *table, int write, | 160 | int dirty_background_ratio_handler(struct ctl_table *table, int write, |
169 | struct file *filp, void __user *buffer, size_t *lenp, | 161 | void __user *buffer, size_t *lenp, |
170 | loff_t *ppos) | 162 | loff_t *ppos) |
171 | { | 163 | { |
172 | int ret; | 164 | int ret; |
173 | 165 | ||
174 | ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | 166 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
175 | if (ret == 0 && write) | 167 | if (ret == 0 && write) |
176 | dirty_background_bytes = 0; | 168 | dirty_background_bytes = 0; |
177 | return ret; | 169 | return ret; |
178 | } | 170 | } |
179 | 171 | ||
180 | int dirty_background_bytes_handler(struct ctl_table *table, int write, | 172 | int dirty_background_bytes_handler(struct ctl_table *table, int write, |
181 | struct file *filp, void __user *buffer, size_t *lenp, | 173 | void __user *buffer, size_t *lenp, |
182 | loff_t *ppos) | 174 | loff_t *ppos) |
183 | { | 175 | { |
184 | int ret; | 176 | int ret; |
185 | 177 | ||
186 | ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); | 178 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); |
187 | if (ret == 0 && write) | 179 | if (ret == 0 && write) |
188 | dirty_background_ratio = 0; | 180 | dirty_background_ratio = 0; |
189 | return ret; | 181 | return ret; |
190 | } | 182 | } |
191 | 183 | ||
192 | int dirty_ratio_handler(struct ctl_table *table, int write, | 184 | int dirty_ratio_handler(struct ctl_table *table, int write, |
193 | struct file *filp, void __user *buffer, size_t *lenp, | 185 | void __user *buffer, size_t *lenp, |
194 | loff_t *ppos) | 186 | loff_t *ppos) |
195 | { | 187 | { |
196 | int old_ratio = vm_dirty_ratio; | 188 | int old_ratio = vm_dirty_ratio; |
197 | int ret; | 189 | int ret; |
198 | 190 | ||
199 | ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | 191 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
200 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { | 192 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { |
201 | update_completion_period(); | 193 | update_completion_period(); |
202 | vm_dirty_bytes = 0; | 194 | vm_dirty_bytes = 0; |
@@ -206,13 +198,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write, | |||
206 | 198 | ||
207 | 199 | ||
208 | int dirty_bytes_handler(struct ctl_table *table, int write, | 200 | int dirty_bytes_handler(struct ctl_table *table, int write, |
209 | struct file *filp, void __user *buffer, size_t *lenp, | 201 | void __user *buffer, size_t *lenp, |
210 | loff_t *ppos) | 202 | loff_t *ppos) |
211 | { | 203 | { |
212 | unsigned long old_bytes = vm_dirty_bytes; | 204 | unsigned long old_bytes = vm_dirty_bytes; |
213 | int ret; | 205 | int ret; |
214 | 206 | ||
215 | ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); | 207 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); |
216 | if (ret == 0 && write && vm_dirty_bytes != old_bytes) { | 208 | if (ret == 0 && write && vm_dirty_bytes != old_bytes) { |
217 | update_completion_period(); | 209 | update_completion_period(); |
218 | vm_dirty_ratio = 0; | 210 | vm_dirty_ratio = 0; |
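The four sysctl handlers above lose their struct file * argument, evidently to match the updated proc_dointvec_minmax()/proc_doulongvec_minmax() prototypes; their behavior is unchanged: writing a *_ratio sysctl zeroes the corresponding *_bytes sysctl and vice versa, so only one knob of each pair is ever in effect. A hypothetical userspace model of how an effective background threshold can then be derived from whichever knob is set (the helper name and the fixed 4 KiB page size are assumptions for illustration, not the kernel's get_dirty_limits() code):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    static int dirty_background_ratio = 10;        /* percent of dirtyable memory */
    static unsigned long dirty_background_bytes;   /* 0 means "use the ratio" */

    /* If the bytes knob is set it wins, otherwise fall back to the ratio. */
    static unsigned long background_thresh_pages(unsigned long dirtyable_pages)
    {
            if (dirty_background_bytes)
                    return dirty_background_bytes / PAGE_SIZE;
            return (dirtyable_pages * dirty_background_ratio) / 100;
    }

    int main(void)
    {
            unsigned long dirtyable = 1UL << 20;    /* 4 GiB worth of 4 KiB pages */

            printf("ratio mode: %lu pages\n", background_thresh_pages(dirtyable));

            /* Emulate writing dirty_background_bytes: the handler zeroes the ratio. */
            dirty_background_bytes = 256UL << 20;
            dirty_background_ratio = 0;
            printf("bytes mode: %lu pages\n", background_thresh_pages(dirtyable));
            return 0;
    }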
@@ -320,15 +312,13 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) | |||
320 | /* | 312 | /* |
321 | * | 313 | * |
322 | */ | 314 | */ |
323 | static DEFINE_SPINLOCK(bdi_lock); | ||
324 | static unsigned int bdi_min_ratio; | 315 | static unsigned int bdi_min_ratio; |
325 | 316 | ||
326 | int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) | 317 | int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) |
327 | { | 318 | { |
328 | int ret = 0; | 319 | int ret = 0; |
329 | unsigned long flags; | ||
330 | 320 | ||
331 | spin_lock_irqsave(&bdi_lock, flags); | 321 | spin_lock_bh(&bdi_lock); |
332 | if (min_ratio > bdi->max_ratio) { | 322 | if (min_ratio > bdi->max_ratio) { |
333 | ret = -EINVAL; | 323 | ret = -EINVAL; |
334 | } else { | 324 | } else { |
@@ -340,27 +330,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) | |||
340 | ret = -EINVAL; | 330 | ret = -EINVAL; |
341 | } | 331 | } |
342 | } | 332 | } |
343 | spin_unlock_irqrestore(&bdi_lock, flags); | 333 | spin_unlock_bh(&bdi_lock); |
344 | 334 | ||
345 | return ret; | 335 | return ret; |
346 | } | 336 | } |
347 | 337 | ||
348 | int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) | 338 | int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) |
349 | { | 339 | { |
350 | unsigned long flags; | ||
351 | int ret = 0; | 340 | int ret = 0; |
352 | 341 | ||
353 | if (max_ratio > 100) | 342 | if (max_ratio > 100) |
354 | return -EINVAL; | 343 | return -EINVAL; |
355 | 344 | ||
356 | spin_lock_irqsave(&bdi_lock, flags); | 345 | spin_lock_bh(&bdi_lock); |
357 | if (bdi->min_ratio > max_ratio) { | 346 | if (bdi->min_ratio > max_ratio) { |
358 | ret = -EINVAL; | 347 | ret = -EINVAL; |
359 | } else { | 348 | } else { |
360 | bdi->max_ratio = max_ratio; | 349 | bdi->max_ratio = max_ratio; |
361 | bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; | 350 | bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; |
362 | } | 351 | } |
363 | spin_unlock_irqrestore(&bdi_lock, flags); | 352 | spin_unlock_bh(&bdi_lock); |
364 | 353 | ||
365 | return ret; | 354 | return ret; |
366 | } | 355 | } |
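Only the lock flavor changes in the two setters above: bdi_lock moves from spin_lock_irqsave() to spin_lock_bh(), and its DEFINE_SPINLOCK is dropped from this file, so the lock must now be defined elsewhere and apparently no longer needs to be IRQ-safe. The validation logic is untouched; a simplified userspace model of those checks (the struct is stripped down and the global min-ratio accounting is omitted):

    #include <errno.h>
    #include <stdio.h>

    struct bdi { unsigned int min_ratio, max_ratio; };

    static int set_min_ratio(struct bdi *bdi, unsigned int min_ratio)
    {
            if (min_ratio > bdi->max_ratio)
                    return -EINVAL;
            bdi->min_ratio = min_ratio;
            return 0;
    }

    static int set_max_ratio(struct bdi *bdi, unsigned int max_ratio)
    {
            if (max_ratio > 100)
                    return -EINVAL;
            if (bdi->min_ratio > max_ratio)
                    return -EINVAL;
            bdi->max_ratio = max_ratio;
            return 0;
    }

    int main(void)
    {
            struct bdi bdi = { .min_ratio = 0, .max_ratio = 100 };

            printf("set_max_ratio(5)  -> %d\n", set_max_ratio(&bdi, 5));
            printf("set_min_ratio(10) -> %d (rejected: above max)\n",
                   set_min_ratio(&bdi, 10));
            return 0;
    }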
@@ -394,7 +383,8 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) | |||
394 | struct zone *z = | 383 | struct zone *z = |
395 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; | 384 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; |
396 | 385 | ||
397 | x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); | 386 | x += zone_page_state(z, NR_FREE_PAGES) + |
387 | zone_reclaimable_pages(z); | ||
398 | } | 388 | } |
399 | /* | 389 | /* |
400 | * Make sure that the number of highmem pages is never larger | 390 | * Make sure that the number of highmem pages is never larger |
@@ -418,7 +408,7 @@ unsigned long determine_dirtyable_memory(void) | |||
418 | { | 408 | { |
419 | unsigned long x; | 409 | unsigned long x; |
420 | 410 | ||
421 | x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); | 411 | x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); |
422 | 412 | ||
423 | if (!vm_highmem_is_dirtyable) | 413 | if (!vm_highmem_is_dirtyable) |
424 | x -= highmem_dirtyable_memory(x); | 414 | x -= highmem_dirtyable_memory(x); |
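In the two hunks above the "dirtyable" memory estimate switches from free plus LRU pages to free plus reclaimable pages (zone_reclaimable_pages()/global_reclaimable_pages()), with highmem optionally excluded. A toy userspace model of the shape of that calculation (field names and numbers are illustrative, and the highmem clamping is simplified relative to highmem_dirtyable_memory()):

    #include <stdio.h>

    struct meminfo {
            unsigned long free_pages;
            unsigned long reclaimable_pages;   /* reclaimable file + anon pages */
            unsigned long highmem_pages;       /* free + reclaimable highmem share */
    };

    static unsigned long determine_dirtyable(const struct meminfo *m,
                                             int highmem_is_dirtyable)
    {
            unsigned long x = m->free_pages + m->reclaimable_pages;

            if (!highmem_is_dirtyable)
                    x -= (m->highmem_pages < x) ? m->highmem_pages : x;

            return x + 1;   /* like the kernel, never report zero */
    }

    int main(void)
    {
            struct meminfo m = {
                    .free_pages        = 100000,
                    .reclaimable_pages = 300000,
                    .highmem_pages     =  50000,
            };

            printf("lowmem only : %lu pages\n", determine_dirtyable(&m, 0));
            printf("with highmem: %lu pages\n", determine_dirtyable(&m, 1));
            return 0;
    }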
@@ -487,10 +477,11 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, | |||
487 | * balance_dirty_pages() must be called by processes which are generating dirty | 477 | * balance_dirty_pages() must be called by processes which are generating dirty |
488 | * data. It looks at the number of dirty pages in the machine and will force | 478 | * data. It looks at the number of dirty pages in the machine and will force |
489 | * the caller to perform writeback if the system is over `vm_dirty_ratio'. | 479 | * the caller to perform writeback if the system is over `vm_dirty_ratio'. |
490 | * If we're over `background_thresh' then pdflush is woken to perform some | 480 | * If we're over `background_thresh' then the writeback threads are woken to |
491 | * writeout. | 481 | * perform some writeout. |
492 | */ | 482 | */ |
493 | static void balance_dirty_pages(struct address_space *mapping) | 483 | static void balance_dirty_pages(struct address_space *mapping, |
484 | unsigned long write_chunk) | ||
494 | { | 485 | { |
495 | long nr_reclaimable, bdi_nr_reclaimable; | 486 | long nr_reclaimable, bdi_nr_reclaimable; |
496 | long nr_writeback, bdi_nr_writeback; | 487 | long nr_writeback, bdi_nr_writeback; |
@@ -498,7 +489,7 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
498 | unsigned long dirty_thresh; | 489 | unsigned long dirty_thresh; |
499 | unsigned long bdi_thresh; | 490 | unsigned long bdi_thresh; |
500 | unsigned long pages_written = 0; | 491 | unsigned long pages_written = 0; |
501 | unsigned long write_chunk = sync_writeback_pages(); | 492 | unsigned long pause = 1; |
502 | 493 | ||
503 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 494 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
504 | 495 | ||
@@ -546,7 +537,7 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
546 | * up. | 537 | * up. |
547 | */ | 538 | */ |
548 | if (bdi_nr_reclaimable > bdi_thresh) { | 539 | if (bdi_nr_reclaimable > bdi_thresh) { |
549 | writeback_inodes(&wbc); | 540 | writeback_inodes_wbc(&wbc); |
550 | pages_written += write_chunk - wbc.nr_to_write; | 541 | pages_written += write_chunk - wbc.nr_to_write; |
551 | get_dirty_limits(&background_thresh, &dirty_thresh, | 542 | get_dirty_limits(&background_thresh, &dirty_thresh, |
552 | &bdi_thresh, bdi); | 543 | &bdi_thresh, bdi); |
@@ -575,7 +566,15 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
575 | if (pages_written >= write_chunk) | 566 | if (pages_written >= write_chunk) |
576 | break; /* We've done our duty */ | 567 | break; /* We've done our duty */ |
577 | 568 | ||
578 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 569 | schedule_timeout_interruptible(pause); |
570 | |||
571 | /* | ||
572 | * Increase the delay for each loop, up to our previous | ||
573 | * default of taking a 100ms nap. | ||
574 | */ | ||
575 | pause <<= 1; | ||
576 | if (pause > HZ / 10) | ||
577 | pause = HZ / 10; | ||
579 | } | 578 | } |
580 | 579 | ||
581 | if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && | 580 | if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && |
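The hunk above replaces the fixed congestion_wait(BLK_RW_ASYNC, HZ/10) nap in the throttling loop with an interruptible sleep that starts at one jiffy and doubles on every iteration, capped at the old 100ms value. A standalone sketch of just that backoff schedule (HZ is assumed to be 1000 here purely for the printout):

    #include <stdio.h>

    #define HZ 1000   /* assumed tick rate, for illustration only */

    int main(void)
    {
            unsigned long pause = 1;   /* jiffies: the first nap is a single tick */

            /* Double the nap each trip around the loop until it reaches
             * the previous fixed value of HZ/10 (100ms at 1000 Hz). */
            for (int loop = 1; loop <= 10; loop++) {
                    printf("loop %2d: sleep %3lu jiffies\n", loop, pause);
                    pause <<= 1;
                    if (pause > HZ / 10)
                            pause = HZ / 10;
            }
            return 0;
    }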
@@ -583,7 +582,7 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
583 | bdi->dirty_exceeded = 0; | 582 | bdi->dirty_exceeded = 0; |
584 | 583 | ||
585 | if (writeback_in_progress(bdi)) | 584 | if (writeback_in_progress(bdi)) |
586 | return; /* pdflush is already working this queue */ | 585 | return; |
587 | 586 | ||
588 | /* | 587 | /* |
589 | * In laptop mode, we wait until hitting the higher threshold before | 588 | * In laptop mode, we wait until hitting the higher threshold before |
@@ -594,10 +593,10 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
594 | * background_thresh, to keep the amount of dirty memory low. | 593 | * background_thresh, to keep the amount of dirty memory low. |
595 | */ | 594 | */ |
596 | if ((laptop_mode && pages_written) || | 595 | if ((laptop_mode && pages_written) || |
597 | (!laptop_mode && (global_page_state(NR_FILE_DIRTY) | 596 | (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) |
598 | + global_page_state(NR_UNSTABLE_NFS) | 597 | + global_page_state(NR_UNSTABLE_NFS)) |
599 | > background_thresh))) | 598 | > background_thresh))) |
600 | pdflush_operation(background_writeout, 0); | 599 | bdi_start_writeback(bdi, NULL, 0); |
601 | } | 600 | } |
602 | 601 | ||
603 | void set_page_dirty_balance(struct page *page, int page_mkwrite) | 602 | void set_page_dirty_balance(struct page *page, int page_mkwrite) |
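The condition for kicking background writeback above only gains clarifying parentheses; what changes is the mechanism, pdflush_operation(background_writeout, 0) becoming bdi_start_writeback() on the task's own backing device. A tiny model of the decision itself (plain parameters stand in for the global page-state counters):

    #include <stdbool.h>
    #include <stdio.h>

    /* Should balance_dirty_pages() kick background writeback on its way out? */
    static bool start_background_writeback(bool laptop_mode,
                                           unsigned long pages_written,
                                           unsigned long dirty_plus_unstable,
                                           unsigned long background_thresh)
    {
            if (laptop_mode)
                    return pages_written > 0;
            return dirty_plus_unstable > background_thresh;
    }

    int main(void)
    {
            printf("normal, over thresh : %d\n",
                   start_background_writeback(false, 0, 12000, 10000));
            printf("laptop, wrote pages : %d\n",
                   start_background_writeback(true, 64, 12000, 10000));
            return 0;
    }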
@@ -610,6 +609,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) | |||
610 | } | 609 | } |
611 | } | 610 | } |
612 | 611 | ||
612 | static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; | ||
613 | |||
613 | /** | 614 | /** |
614 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state | 615 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state |
615 | * @mapping: address_space which was dirtied | 616 | * @mapping: address_space which was dirtied |
@@ -627,7 +628,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) | |||
627 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | 628 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, |
628 | unsigned long nr_pages_dirtied) | 629 | unsigned long nr_pages_dirtied) |
629 | { | 630 | { |
630 | static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; | ||
631 | unsigned long ratelimit; | 631 | unsigned long ratelimit; |
632 | unsigned long *p; | 632 | unsigned long *p; |
633 | 633 | ||
@@ -640,12 +640,13 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | |||
640 | * tasks in balance_dirty_pages(). Period. | 640 | * tasks in balance_dirty_pages(). Period. |
641 | */ | 641 | */ |
642 | preempt_disable(); | 642 | preempt_disable(); |
643 | p = &__get_cpu_var(ratelimits); | 643 | p = &__get_cpu_var(bdp_ratelimits); |
644 | *p += nr_pages_dirtied; | 644 | *p += nr_pages_dirtied; |
645 | if (unlikely(*p >= ratelimit)) { | 645 | if (unlikely(*p >= ratelimit)) { |
646 | ratelimit = sync_writeback_pages(*p); | ||
646 | *p = 0; | 647 | *p = 0; |
647 | preempt_enable(); | 648 | preempt_enable(); |
648 | balance_dirty_pages(mapping); | 649 | balance_dirty_pages(mapping, ratelimit); |
649 | return; | 650 | return; |
650 | } | 651 | } |
651 | preempt_enable(); | 652 | preempt_enable(); |
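The per-CPU counter gains a file-scope name (bdp_ratelimits) in the hunks above, and its accumulated count is now handed to balance_dirty_pages() (scaled by sync_writeback_pages(), shown earlier) instead of being discarded. A userspace model of the ratelimiting, with an ordinary variable standing in for the per-CPU one and preemption control omitted:

    #include <stdio.h>

    static long ratelimit_pages = 32;
    static unsigned long bdp_ratelimits;    /* per-CPU in the kernel */

    static void balance_dirty_pages(unsigned long write_chunk)
    {
            printf("balance_dirty_pages: write_chunk=%lu\n", write_chunk);
    }

    /* Model of balance_dirty_pages_ratelimited_nr(): the expensive balancing
     * pass runs only once this CPU has accumulated ~ratelimit_pages dirtyings,
     * and the chunk it is asked to write scales with that accumulation. */
    static void pages_dirtied(unsigned long nr_pages)
    {
            bdp_ratelimits += nr_pages;
            if (bdp_ratelimits >= (unsigned long)ratelimit_pages) {
                    unsigned long dirtied = bdp_ratelimits;
                    unsigned long chunk = dirtied + dirtied / 2;
                    /* sync_writeback_pages(); its ratelimit_pages floor is
                     * omitted here since dirtied is already above it. */

                    bdp_ratelimits = 0;
                    balance_dirty_pages(chunk);
            }
    }

    int main(void)
    {
            for (int i = 0; i < 64; i++)
                    pages_dirtied(1);       /* trips once every 32 single-page dirtyings */
            pages_dirtied(4096);            /* a large burst requests a larger chunk */
            return 0;
    }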
@@ -681,153 +682,35 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
681 | } | 682 | } |
682 | } | 683 | } |
683 | 684 | ||
684 | /* | ||
685 | * writeback at least _min_pages, and keep writing until the amount of dirty | ||
686 | * memory is less than the background threshold, or until we're all clean. | ||
687 | */ | ||
688 | static void background_writeout(unsigned long _min_pages) | ||
689 | { | ||
690 | long min_pages = _min_pages; | ||
691 | struct writeback_control wbc = { | ||
692 | .bdi = NULL, | ||
693 | .sync_mode = WB_SYNC_NONE, | ||
694 | .older_than_this = NULL, | ||
695 | .nr_to_write = 0, | ||
696 | .nonblocking = 1, | ||
697 | .range_cyclic = 1, | ||
698 | }; | ||
699 | |||
700 | for ( ; ; ) { | ||
701 | unsigned long background_thresh; | ||
702 | unsigned long dirty_thresh; | ||
703 | |||
704 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); | ||
705 | if (global_page_state(NR_FILE_DIRTY) + | ||
706 | global_page_state(NR_UNSTABLE_NFS) < background_thresh | ||
707 | && min_pages <= 0) | ||
708 | break; | ||
709 | wbc.more_io = 0; | ||
710 | wbc.encountered_congestion = 0; | ||
711 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | ||
712 | wbc.pages_skipped = 0; | ||
713 | writeback_inodes(&wbc); | ||
714 | min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; | ||
715 | if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { | ||
716 | /* Wrote less than expected */ | ||
717 | if (wbc.encountered_congestion || wbc.more_io) | ||
718 | congestion_wait(BLK_RW_ASYNC, HZ/10); | ||
719 | else | ||
720 | break; | ||
721 | } | ||
722 | } | ||
723 | } | ||
724 | |||
725 | /* | ||
726 | * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back | ||
727 | * the whole world. Returns 0 if a pdflush thread was dispatched. Returns | ||
728 | * -1 if all pdflush threads were busy. | ||
729 | */ | ||
730 | int wakeup_pdflush(long nr_pages) | ||
731 | { | ||
732 | if (nr_pages == 0) | ||
733 | nr_pages = global_page_state(NR_FILE_DIRTY) + | ||
734 | global_page_state(NR_UNSTABLE_NFS); | ||
735 | return pdflush_operation(background_writeout, nr_pages); | ||
736 | } | ||
737 | |||
738 | static void wb_timer_fn(unsigned long unused); | ||
739 | static void laptop_timer_fn(unsigned long unused); | 685 | static void laptop_timer_fn(unsigned long unused); |
740 | 686 | ||
741 | static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); | ||
742 | static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); | 687 | static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); |
743 | 688 | ||
744 | /* | 689 | /* |
745 | * Periodic writeback of "old" data. | ||
746 | * | ||
747 | * Define "old": the first time one of an inode's pages is dirtied, we mark the | ||
748 | * dirtying-time in the inode's address_space. So this periodic writeback code | ||
749 | * just walks the superblock inode list, writing back any inodes which are | ||
750 | * older than a specific point in time. | ||
751 | * | ||
752 | * Try to run once per dirty_writeback_interval. But if a writeback event | ||
753 | * takes longer than a dirty_writeback_interval interval, then leave a | ||
754 | * one-second gap. | ||
755 | * | ||
756 | * older_than_this takes precedence over nr_to_write. So we'll only write back | ||
757 | * all dirty pages if they are all attached to "old" mappings. | ||
758 | */ | ||
759 | static void wb_kupdate(unsigned long arg) | ||
760 | { | ||
761 | unsigned long oldest_jif; | ||
762 | unsigned long start_jif; | ||
763 | unsigned long next_jif; | ||
764 | long nr_to_write; | ||
765 | struct writeback_control wbc = { | ||
766 | .bdi = NULL, | ||
767 | .sync_mode = WB_SYNC_NONE, | ||
768 | .older_than_this = &oldest_jif, | ||
769 | .nr_to_write = 0, | ||
770 | .nonblocking = 1, | ||
771 | .for_kupdate = 1, | ||
772 | .range_cyclic = 1, | ||
773 | }; | ||
774 | |||
775 | sync_supers(); | ||
776 | |||
777 | oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); | ||
778 | start_jif = jiffies; | ||
779 | next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10); | ||
780 | nr_to_write = global_page_state(NR_FILE_DIRTY) + | ||
781 | global_page_state(NR_UNSTABLE_NFS) + | ||
782 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | ||
783 | while (nr_to_write > 0) { | ||
784 | wbc.more_io = 0; | ||
785 | wbc.encountered_congestion = 0; | ||
786 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | ||
787 | writeback_inodes(&wbc); | ||
788 | if (wbc.nr_to_write > 0) { | ||
789 | if (wbc.encountered_congestion || wbc.more_io) | ||
790 | congestion_wait(BLK_RW_ASYNC, HZ/10); | ||
791 | else | ||
792 | break; /* All the old data is written */ | ||
793 | } | ||
794 | nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; | ||
795 | } | ||
796 | if (time_before(next_jif, jiffies + HZ)) | ||
797 | next_jif = jiffies + HZ; | ||
798 | if (dirty_writeback_interval) | ||
799 | mod_timer(&wb_timer, next_jif); | ||
800 | } | ||
801 | |||
802 | /* | ||
803 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs | 690 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs |
804 | */ | 691 | */ |
805 | int dirty_writeback_centisecs_handler(ctl_table *table, int write, | 692 | int dirty_writeback_centisecs_handler(ctl_table *table, int write, |
806 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 693 | void __user *buffer, size_t *length, loff_t *ppos) |
807 | { | 694 | { |
808 | proc_dointvec(table, write, file, buffer, length, ppos); | 695 | proc_dointvec(table, write, buffer, length, ppos); |
809 | if (dirty_writeback_interval) | ||
810 | mod_timer(&wb_timer, jiffies + | ||
811 | msecs_to_jiffies(dirty_writeback_interval * 10)); | ||
812 | else | ||
813 | del_timer(&wb_timer); | ||
814 | return 0; | 696 | return 0; |
815 | } | 697 | } |
816 | 698 | ||
817 | static void wb_timer_fn(unsigned long unused) | 699 | static void do_laptop_sync(struct work_struct *work) |
818 | { | 700 | { |
819 | if (pdflush_operation(wb_kupdate, 0) < 0) | 701 | wakeup_flusher_threads(0); |
820 | mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */ | 702 | kfree(work); |
821 | } | ||
822 | |||
823 | static void laptop_flush(unsigned long unused) | ||
824 | { | ||
825 | sys_sync(); | ||
826 | } | 703 | } |
827 | 704 | ||
828 | static void laptop_timer_fn(unsigned long unused) | 705 | static void laptop_timer_fn(unsigned long unused) |
829 | { | 706 | { |
830 | pdflush_operation(laptop_flush, 0); | 707 | struct work_struct *work; |
708 | |||
709 | work = kmalloc(sizeof(*work), GFP_ATOMIC); | ||
710 | if (work) { | ||
711 | INIT_WORK(work, do_laptop_sync); | ||
712 | schedule_work(work); | ||
713 | } | ||
831 | } | 714 | } |
832 | 715 | ||
833 | /* | 716 | /* |
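The large removal above is the heart of the patch: background_writeout(), wakeup_pdflush(), wb_kupdate() and the wb_timer all disappear, since periodic and background writeback are now driven by the per-bdi writeback threads, and the dirty_writeback_centisecs handler no longer has a timer to re-arm. Laptop-mode sync is likewise bounced out of the timer into a workqueue: timers run in atomic context, so the work item is allocated with GFP_ATOMIC and frees itself when done. A hypothetical userspace analogue of that self-freeing one-shot job pattern (plain function calls stand in for schedule_work() and the flusher wakeup):

    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-in for struct work_struct: a heap-allocated one-shot job that
     * owns itself and is freed by its own handler, like do_laptop_sync(). */
    struct job {
            void (*fn)(struct job *);
    };

    static void do_laptop_sync(struct job *job)
    {
            printf("wake flusher threads (stand-in for wakeup_flusher_threads(0))\n");
            free(job);                      /* kfree(work) in the kernel */
    }

    static void laptop_timer_fired(void)
    {
            /* GFP_ATOMIC in the kernel, since this runs from a timer. */
            struct job *job = malloc(sizeof(*job));

            if (job) {                      /* on failure the sync is simply skipped */
                    job->fn = do_laptop_sync;
                    job->fn(job);           /* a real workqueue would run this later */
            }
    }

    int main(void)
    {
            laptop_timer_fired();
            return 0;
    }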
@@ -910,8 +793,6 @@ void __init page_writeback_init(void) | |||
910 | { | 793 | { |
911 | int shift; | 794 | int shift; |
912 | 795 | ||
913 | mod_timer(&wb_timer, | ||
914 | jiffies + msecs_to_jiffies(dirty_writeback_interval * 10)); | ||
915 | writeback_set_ratelimit(); | 796 | writeback_set_ratelimit(); |
916 | register_cpu_notifier(&ratelimit_nb); | 797 | register_cpu_notifier(&ratelimit_nb); |
917 | 798 | ||
@@ -1145,12 +1026,10 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) | |||
1145 | 1026 | ||
1146 | if (wbc->nr_to_write <= 0) | 1027 | if (wbc->nr_to_write <= 0) |
1147 | return 0; | 1028 | return 0; |
1148 | wbc->for_writepages = 1; | ||
1149 | if (mapping->a_ops->writepages) | 1029 | if (mapping->a_ops->writepages) |
1150 | ret = mapping->a_ops->writepages(mapping, wbc); | 1030 | ret = mapping->a_ops->writepages(mapping, wbc); |
1151 | else | 1031 | else |
1152 | ret = generic_writepages(mapping, wbc); | 1032 | ret = generic_writepages(mapping, wbc); |
1153 | wbc->for_writepages = 0; | ||
1154 | return ret; | 1033 | return ret; |
1155 | } | 1034 | } |
1156 | 1035 | ||
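The hunk above removes the wbc->for_writepages set/clear around the dispatch; the dispatch itself is unchanged. A self-contained userspace mirror of the remaining logic (the structs are stripped down to the fields used here):

    #include <stdio.h>

    struct writeback_control { long nr_to_write; };

    struct address_space;
    struct address_space_operations {
            int (*writepages)(struct address_space *, struct writeback_control *);
    };
    struct address_space {
            const struct address_space_operations *a_ops;
    };

    static int generic_writepages(struct address_space *mapping,
                                  struct writeback_control *wbc)
    {
            (void)mapping;
            printf("generic_writepages: up to %ld pages\n", wbc->nr_to_write);
            return 0;
    }

    /* Mirror of do_writepages() after the patch: the for_writepages flag is
     * gone, leaving only the dispatch to the filesystem hook or the fallback. */
    static int do_writepages(struct address_space *mapping,
                             struct writeback_control *wbc)
    {
            if (wbc->nr_to_write <= 0)
                    return 0;
            if (mapping->a_ops->writepages)
                    return mapping->a_ops->writepages(mapping, wbc);
            return generic_writepages(mapping, wbc);
    }

    int main(void)
    {
            const struct address_space_operations ops = { .writepages = NULL };
            struct address_space mapping = { .a_ops = &ops };
            struct writeback_control wbc = { .nr_to_write = 1024 };

            return do_writepages(&mapping, &wbc);
    }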
@@ -1274,6 +1153,13 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) | |||
1274 | EXPORT_SYMBOL(redirty_page_for_writepage); | 1153 | EXPORT_SYMBOL(redirty_page_for_writepage); |
1275 | 1154 | ||
1276 | /* | 1155 | /* |
1156 | * Dirty a page. | ||
1157 | * | ||
1158 | * For pages with a mapping this should be done under the page lock | ||
1159 | * for the benefit of asynchronous memory errors who prefer a consistent | ||
1160 | * dirty state. This rule can be broken in some special cases, | ||
1161 | * but should be better not to. | ||
1162 | * | ||
1277 | * If the mapping doesn't provide a set_page_dirty a_op, then | 1163 | * If the mapping doesn't provide a set_page_dirty a_op, then |
1278 | * just fall through and assume that it wants buffer_heads. | 1164 | * just fall through and assume that it wants buffer_heads. |
1279 | */ | 1165 | */ |