Diffstat (limited to 'mm')
-rw-r--r--   mm/backing-dev.c    |  47
-rw-r--r--   mm/filemap.c        |  15
-rw-r--r--   mm/fremap.c         |  26
-rw-r--r--   mm/mmap.c           |   3
-rw-r--r--   mm/nommu.c          |   1
-rw-r--r--   mm/oom_kill.c       | 107
-rw-r--r--   mm/page-writeback.c | 300
-rw-r--r--   mm/page_alloc.c     |  23
-rw-r--r--   mm/readahead.c      |   6
-rw-r--r--   mm/rmap.c           |   4
-rw-r--r--   mm/shmem.c          |  20
-rw-r--r--   mm/slab.c           |  14
-rw-r--r--   mm/slob.c           |   6
-rw-r--r--   mm/slub.c           |  30
-rw-r--r--   mm/swap.c           |   5
-rw-r--r--   mm/tiny-shmem.c     |  19
-rw-r--r--   mm/truncate.c       |   3
-rw-r--r--   mm/vmscan.c         |  40
-rw-r--r--   mm/vmstat.c         |   2
19 files changed, 484 insertions, 187 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f50a2811f9dc..b0ceb29da4c7 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -5,6 +5,41 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 
+int bdi_init(struct backing_dev_info *bdi)
+{
+	int i, j;
+	int err;
+
+	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
+		err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
+		if (err)
+			goto err;
+	}
+
+	bdi->dirty_exceeded = 0;
+	err = prop_local_init_percpu(&bdi->completions);
+
+	if (err) {
+err:
+		for (j = 0; j < i; j++)
+			percpu_counter_destroy(&bdi->bdi_stat[i]);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(bdi_init);
+
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+	int i;
+
+	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+		percpu_counter_destroy(&bdi->bdi_stat[i]);
+
+	prop_local_destroy_percpu(&bdi->completions);
+}
+EXPORT_SYMBOL(bdi_destroy);
+
 static wait_queue_head_t congestion_wqh[2] = {
 	__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 	__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
@@ -55,15 +90,3 @@ long congestion_wait(int rw, long timeout)
 }
 EXPORT_SYMBOL(congestion_wait);
 
-/**
- * congestion_end - wake up sleepers on a congested backing_dev_info
- * @rw: READ or WRITE
- */
-void congestion_end(int rw)
-{
-	wait_queue_head_t *wqh = &congestion_wqh[rw];
-
-	if (waitqueue_active(wqh))
-		wake_up(wqh);
-}
-EXPORT_SYMBOL(congestion_end);
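The bdi_init()/bdi_destroy() pair added above follows the usual partial-initialization cleanup pattern: if setting up one of the per-CPU counters fails, only the counters that were already brought up are torn down before the error is returned. A minimal userspace sketch of that pattern, assuming illustrative names (counter_init, stats_init) that are not part of the kernel API:

#include <stdio.h>
#include <stdlib.h>

#define NR_ITEMS 4

struct counter { long *buf; };

static int counter_init(struct counter *c)
{
	c->buf = calloc(1, sizeof(long));
	return c->buf ? 0 : -1;
}

static void counter_destroy(struct counter *c)
{
	free(c->buf);
	c->buf = NULL;
}

static int stats_init(struct counter stat[NR_ITEMS])
{
	int i, err = 0;

	for (i = 0; i < NR_ITEMS; i++) {
		err = counter_init(&stat[i]);
		if (err)
			break;
	}
	if (err) {
		/* tear down only what was successfully initialized */
		while (--i >= 0)
			counter_destroy(&stat[i]);
	}
	return err;
}

int main(void)
{
	struct counter stat[NR_ITEMS];

	if (stats_init(stat) == 0)
		printf("all %d counters initialized\n", NR_ITEMS);
	return 0;
}

The kernel version additionally initializes the per-BDI completion proportion (prop_local_init_percpu) and funnels its failure through the same error label.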
diff --git a/mm/filemap.c b/mm/filemap.c
index c6049e947cd9..79f24a969cb4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -63,6 +63,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | |||
63 | * ->private_lock (__free_pte->__set_page_dirty_buffers) | 63 | * ->private_lock (__free_pte->__set_page_dirty_buffers) |
64 | * ->swap_lock (exclusive_swap_page, others) | 64 | * ->swap_lock (exclusive_swap_page, others) |
65 | * ->mapping->tree_lock | 65 | * ->mapping->tree_lock |
66 | * ->zone.lock | ||
66 | * | 67 | * |
67 | * ->i_mutex | 68 | * ->i_mutex |
68 | * ->i_mmap_lock (truncate->unmap_mapping_range) | 69 | * ->i_mmap_lock (truncate->unmap_mapping_range) |
@@ -1626,12 +1627,18 @@ int __remove_suid(struct dentry *dentry, int kill) | |||
1626 | 1627 | ||
1627 | int remove_suid(struct dentry *dentry) | 1628 | int remove_suid(struct dentry *dentry) |
1628 | { | 1629 | { |
1629 | int kill = should_remove_suid(dentry); | 1630 | int killsuid = should_remove_suid(dentry); |
1631 | int killpriv = security_inode_need_killpriv(dentry); | ||
1632 | int error = 0; | ||
1630 | 1633 | ||
1631 | if (unlikely(kill)) | 1634 | if (killpriv < 0) |
1632 | return __remove_suid(dentry, kill); | 1635 | return killpriv; |
1636 | if (killpriv) | ||
1637 | error = security_inode_killpriv(dentry); | ||
1638 | if (!error && killsuid) | ||
1639 | error = __remove_suid(dentry, killsuid); | ||
1633 | 1640 | ||
1634 | return 0; | 1641 | return error; |
1635 | } | 1642 | } |
1636 | EXPORT_SYMBOL(remove_suid); | 1643 | EXPORT_SYMBOL(remove_suid); |
1637 | 1644 | ||
diff --git a/mm/fremap.c b/mm/fremap.c
index 95bcb5641c72..14bd3bf7826e 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -5,7 +5,7 @@ | |||
5 | * | 5 | * |
6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 | 6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 |
7 | */ | 7 | */ |
8 | 8 | #include <linux/backing-dev.h> | |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/swap.h> | 10 | #include <linux/swap.h> |
11 | #include <linux/file.h> | 11 | #include <linux/file.h> |
@@ -97,26 +97,28 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, | |||
97 | 97 | ||
98 | } | 98 | } |
99 | 99 | ||
100 | /*** | 100 | /** |
101 | * sys_remap_file_pages - remap arbitrary pages of a shared backing store | 101 | * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma |
102 | * file within an existing vma. | ||
103 | * @start: start of the remapped virtual memory range | 102 | * @start: start of the remapped virtual memory range |
104 | * @size: size of the remapped virtual memory range | 103 | * @size: size of the remapped virtual memory range |
105 | * @prot: new protection bits of the range | 104 | * @prot: new protection bits of the range (see NOTE) |
106 | * @pgoff: to be mapped page of the backing store file | 105 | * @pgoff: to-be-mapped page of the backing store file |
107 | * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. | 106 | * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. |
108 | * | 107 | * |
109 | * this syscall works purely via pagetables, so it's the most efficient | 108 | * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma |
109 | * (shared backing store file). | ||
110 | * | ||
111 | * This syscall works purely via pagetables, so it's the most efficient | ||
110 | * way to map the same (large) file into a given virtual window. Unlike | 112 | * way to map the same (large) file into a given virtual window. Unlike |
111 | * mmap()/mremap() it does not create any new vmas. The new mappings are | 113 | * mmap()/mremap() it does not create any new vmas. The new mappings are |
112 | * also safe across swapout. | 114 | * also safe across swapout. |
113 | * | 115 | * |
114 | * NOTE: the 'prot' parameter right now is ignored, and the vma's default | 116 | * NOTE: the 'prot' parameter right now is ignored (but must be zero), |
115 | * protection is used. Arbitrary protections might be implemented in the | 117 | * and the vma's default protection is used. Arbitrary protections |
116 | * future. | 118 | * might be implemented in the future. |
117 | */ | 119 | */ |
118 | asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, | 120 | asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, |
119 | unsigned long __prot, unsigned long pgoff, unsigned long flags) | 121 | unsigned long prot, unsigned long pgoff, unsigned long flags) |
120 | { | 122 | { |
121 | struct mm_struct *mm = current->mm; | 123 | struct mm_struct *mm = current->mm; |
122 | struct address_space *mapping; | 124 | struct address_space *mapping; |
@@ -125,7 +127,7 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, | |||
125 | int err = -EINVAL; | 127 | int err = -EINVAL; |
126 | int has_write_lock = 0; | 128 | int has_write_lock = 0; |
127 | 129 | ||
128 | if (__prot) | 130 | if (prot) |
129 | return err; | 131 | return err; |
130 | /* | 132 | /* |
131 | * Sanitize the syscall parameters: | 133 | * Sanitize the syscall parameters: |
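For reference, the syscall documented above is normally driven from user space along these lines. This is a hedged usage sketch: the file name and page offsets are made up, it relies on the glibc remap_file_pages() wrapper, and it respects the requirement spelled out in the updated kerneldoc that prot be zero.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	int fd = open("datafile", O_RDWR);
	if (fd < 0)
		return 1;

	/* Map four pages of the file; the vma must be MAP_SHARED. */
	char *win = mmap(NULL, 4 * pg, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);
	if (win == MAP_FAILED)
		return 1;

	/*
	 * Make the first page of the window show file page 3 instead of
	 * file page 0; prot must be 0, flags may be 0 or MAP_NONBLOCK.
	 */
	if (remap_file_pages(win, pg, 0, 3, 0) != 0)
		perror("remap_file_pages");

	munmap(win, 4 * pg);
	close(fd);
	return 0;
}

Because the remapping is done purely in the page tables, no new vma is created for the rearranged window.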
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -7,6 +7,7 @@
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/slab.h> | 9 | #include <linux/slab.h> |
10 | #include <linux/backing-dev.h> | ||
10 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
11 | #include <linux/shm.h> | 12 | #include <linux/shm.h> |
12 | #include <linux/mman.h> | 13 | #include <linux/mman.h> |
@@ -180,8 +181,6 @@ error: | |||
180 | return -ENOMEM; | 181 | return -ENOMEM; |
181 | } | 182 | } |
182 | 183 | ||
183 | EXPORT_SYMBOL(__vm_enough_memory); | ||
184 | |||
185 | /* | 184 | /* |
186 | * Requires inode->i_mapping->i_mmap_lock | 185 | * Requires inode->i_mapping->i_mmap_lock |
187 | */ | 186 | */ |
diff --git a/mm/nommu.c b/mm/nommu.c
index 8ed0cb43118a..42fb84e9e815 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -44,7 +44,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | |||
44 | int heap_stack_gap = 0; | 44 | int heap_stack_gap = 0; |
45 | 45 | ||
46 | EXPORT_SYMBOL(mem_map); | 46 | EXPORT_SYMBOL(mem_map); |
47 | EXPORT_SYMBOL(__vm_enough_memory); | ||
48 | EXPORT_SYMBOL(num_physpages); | 47 | EXPORT_SYMBOL(num_physpages); |
49 | 48 | ||
50 | /* list of shareable VMAs */ | 49 | /* list of shareable VMAs */ |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 41b4e362221d..a64decb5b13f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -27,6 +27,8 @@ | |||
27 | #include <linux/notifier.h> | 27 | #include <linux/notifier.h> |
28 | 28 | ||
29 | int sysctl_panic_on_oom; | 29 | int sysctl_panic_on_oom; |
30 | int sysctl_oom_kill_allocating_task; | ||
31 | static DEFINE_SPINLOCK(zone_scan_mutex); | ||
30 | /* #define DEBUG */ | 32 | /* #define DEBUG */ |
31 | 33 | ||
32 | /** | 34 | /** |
@@ -141,7 +143,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
141 | * because p may have allocated or otherwise mapped memory on | 143 | * because p may have allocated or otherwise mapped memory on |
142 | * this node before. However it will be less likely. | 144 | * this node before. However it will be less likely. |
143 | */ | 145 | */ |
144 | if (!cpuset_excl_nodes_overlap(p)) | 146 | if (!cpuset_mems_allowed_intersects(current, p)) |
145 | points /= 8; | 147 | points /= 8; |
146 | 148 | ||
147 | /* | 149 | /* |
@@ -164,16 +166,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
164 | } | 166 | } |
165 | 167 | ||
166 | /* | 168 | /* |
167 | * Types of limitations to the nodes from which allocations may occur | ||
168 | */ | ||
169 | #define CONSTRAINT_NONE 1 | ||
170 | #define CONSTRAINT_MEMORY_POLICY 2 | ||
171 | #define CONSTRAINT_CPUSET 3 | ||
172 | |||
173 | /* | ||
174 | * Determine the type of allocation constraint. | 169 | * Determine the type of allocation constraint. |
175 | */ | 170 | */ |
176 | static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) | 171 | static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, |
172 | gfp_t gfp_mask) | ||
177 | { | 173 | { |
178 | #ifdef CONFIG_NUMA | 174 | #ifdef CONFIG_NUMA |
179 | struct zone **z; | 175 | struct zone **z; |
@@ -337,12 +333,20 @@ static int oom_kill_task(struct task_struct *p) | |||
337 | return 0; | 333 | return 0; |
338 | } | 334 | } |
339 | 335 | ||
340 | static int oom_kill_process(struct task_struct *p, unsigned long points, | 336 | static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, |
341 | const char *message) | 337 | unsigned long points, const char *message) |
342 | { | 338 | { |
343 | struct task_struct *c; | 339 | struct task_struct *c; |
344 | struct list_head *tsk; | 340 | struct list_head *tsk; |
345 | 341 | ||
342 | if (printk_ratelimit()) { | ||
343 | printk(KERN_WARNING "%s invoked oom-killer: " | ||
344 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", | ||
345 | current->comm, gfp_mask, order, current->oomkilladj); | ||
346 | dump_stack(); | ||
347 | show_mem(); | ||
348 | } | ||
349 | |||
346 | /* | 350 | /* |
347 | * If the task is already exiting, don't alarm the sysadmin or kill | 351 | * If the task is already exiting, don't alarm the sysadmin or kill |
348 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 352 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
@@ -380,6 +384,57 @@ int unregister_oom_notifier(struct notifier_block *nb) | |||
380 | } | 384 | } |
381 | EXPORT_SYMBOL_GPL(unregister_oom_notifier); | 385 | EXPORT_SYMBOL_GPL(unregister_oom_notifier); |
382 | 386 | ||
387 | /* | ||
388 | * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero | ||
389 | * if a parallel OOM killing is already taking place that includes a zone in | ||
390 | * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. | ||
391 | */ | ||
392 | int try_set_zone_oom(struct zonelist *zonelist) | ||
393 | { | ||
394 | struct zone **z; | ||
395 | int ret = 1; | ||
396 | |||
397 | z = zonelist->zones; | ||
398 | |||
399 | spin_lock(&zone_scan_mutex); | ||
400 | do { | ||
401 | if (zone_is_oom_locked(*z)) { | ||
402 | ret = 0; | ||
403 | goto out; | ||
404 | } | ||
405 | } while (*(++z) != NULL); | ||
406 | |||
407 | /* | ||
408 | * Lock each zone in the zonelist under zone_scan_mutex so a parallel | ||
409 | * invocation of try_set_zone_oom() doesn't succeed when it shouldn't. | ||
410 | */ | ||
411 | z = zonelist->zones; | ||
412 | do { | ||
413 | zone_set_flag(*z, ZONE_OOM_LOCKED); | ||
414 | } while (*(++z) != NULL); | ||
415 | out: | ||
416 | spin_unlock(&zone_scan_mutex); | ||
417 | return ret; | ||
418 | } | ||
419 | |||
420 | /* | ||
421 | * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed | ||
422 | * allocation attempts with zonelists containing them may now recall the OOM | ||
423 | * killer, if necessary. | ||
424 | */ | ||
425 | void clear_zonelist_oom(struct zonelist *zonelist) | ||
426 | { | ||
427 | struct zone **z; | ||
428 | |||
429 | z = zonelist->zones; | ||
430 | |||
431 | spin_lock(&zone_scan_mutex); | ||
432 | do { | ||
433 | zone_clear_flag(*z, ZONE_OOM_LOCKED); | ||
434 | } while (*(++z) != NULL); | ||
435 | spin_unlock(&zone_scan_mutex); | ||
436 | } | ||
437 | |||
383 | /** | 438 | /** |
384 | * out_of_memory - kill the "best" process when we run out of memory | 439 | * out_of_memory - kill the "best" process when we run out of memory |
385 | * | 440 | * |
@@ -393,21 +448,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
393 | struct task_struct *p; | 448 | struct task_struct *p; |
394 | unsigned long points = 0; | 449 | unsigned long points = 0; |
395 | unsigned long freed = 0; | 450 | unsigned long freed = 0; |
396 | int constraint; | 451 | enum oom_constraint constraint; |
397 | 452 | ||
398 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); | 453 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); |
399 | if (freed > 0) | 454 | if (freed > 0) |
400 | /* Got some memory back in the last second. */ | 455 | /* Got some memory back in the last second. */ |
401 | return; | 456 | return; |
402 | 457 | ||
403 | if (printk_ratelimit()) { | ||
404 | printk(KERN_WARNING "%s invoked oom-killer: " | ||
405 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", | ||
406 | current->comm, gfp_mask, order, current->oomkilladj); | ||
407 | dump_stack(); | ||
408 | show_mem(); | ||
409 | } | ||
410 | |||
411 | if (sysctl_panic_on_oom == 2) | 458 | if (sysctl_panic_on_oom == 2) |
412 | panic("out of memory. Compulsory panic_on_oom is selected.\n"); | 459 | panic("out of memory. Compulsory panic_on_oom is selected.\n"); |
413 | 460 | ||
@@ -416,23 +463,24 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
416 | * NUMA) that may require different handling. | 463 | * NUMA) that may require different handling. |
417 | */ | 464 | */ |
418 | constraint = constrained_alloc(zonelist, gfp_mask); | 465 | constraint = constrained_alloc(zonelist, gfp_mask); |
419 | cpuset_lock(); | ||
420 | read_lock(&tasklist_lock); | 466 | read_lock(&tasklist_lock); |
421 | 467 | ||
422 | switch (constraint) { | 468 | switch (constraint) { |
423 | case CONSTRAINT_MEMORY_POLICY: | 469 | case CONSTRAINT_MEMORY_POLICY: |
424 | oom_kill_process(current, points, | 470 | oom_kill_process(current, gfp_mask, order, points, |
425 | "No available memory (MPOL_BIND)"); | 471 | "No available memory (MPOL_BIND)"); |
426 | break; | 472 | break; |
427 | 473 | ||
428 | case CONSTRAINT_CPUSET: | ||
429 | oom_kill_process(current, points, | ||
430 | "No available memory in cpuset"); | ||
431 | break; | ||
432 | |||
433 | case CONSTRAINT_NONE: | 474 | case CONSTRAINT_NONE: |
434 | if (sysctl_panic_on_oom) | 475 | if (sysctl_panic_on_oom) |
435 | panic("out of memory. panic_on_oom is selected\n"); | 476 | panic("out of memory. panic_on_oom is selected\n"); |
477 | /* Fall-through */ | ||
478 | case CONSTRAINT_CPUSET: | ||
479 | if (sysctl_oom_kill_allocating_task) { | ||
480 | oom_kill_process(current, gfp_mask, order, points, | ||
481 | "Out of memory (oom_kill_allocating_task)"); | ||
482 | break; | ||
483 | } | ||
436 | retry: | 484 | retry: |
437 | /* | 485 | /* |
438 | * Rambo mode: Shoot down a process and hope it solves whatever | 486 | * Rambo mode: Shoot down a process and hope it solves whatever |
@@ -446,11 +494,11 @@ retry: | |||
446 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 494 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
447 | if (!p) { | 495 | if (!p) { |
448 | read_unlock(&tasklist_lock); | 496 | read_unlock(&tasklist_lock); |
449 | cpuset_unlock(); | ||
450 | panic("Out of memory and no killable processes...\n"); | 497 | panic("Out of memory and no killable processes...\n"); |
451 | } | 498 | } |
452 | 499 | ||
453 | if (oom_kill_process(p, points, "Out of memory")) | 500 | if (oom_kill_process(p, gfp_mask, order, points,
501 | "Out of memory")) | ||
454 | goto retry; | 502 | goto retry; |
455 | 503 | ||
456 | break; | 504 | break; |
@@ -458,7 +506,6 @@ retry: | |||
458 | 506 | ||
459 | out: | 507 | out: |
460 | read_unlock(&tasklist_lock); | 508 | read_unlock(&tasklist_lock); |
461 | cpuset_unlock(); | ||
462 | 509 | ||
463 | /* | 510 | /* |
464 | * Give "p" a good chance of killing itself before we | 511 | * Give "p" a good chance of killing itself before we |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d821321326e3..7845462064f4 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2,6 +2,7 @@ | |||
2 | * mm/page-writeback.c | 2 | * mm/page-writeback.c |
3 | * | 3 | * |
4 | * Copyright (C) 2002, Linus Torvalds. | 4 | * Copyright (C) 2002, Linus Torvalds. |
5 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
5 | * | 6 | * |
6 | * Contains functions related to writing back dirty pages at the | 7 | * Contains functions related to writing back dirty pages at the |
7 | * address_space level. | 8 | * address_space level. |
@@ -36,7 +37,7 @@ | |||
36 | 37 | ||
37 | /* | 38 | /* |
38 | * The maximum number of pages to writeout in a single bdflush/kupdate | 39 | * The maximum number of pages to writeout in a single bdflush/kupdate |
39 | * operation. We do this so we don't hold I_LOCK against an inode for | 40 | * operation. We do this so we don't hold I_SYNC against an inode for |
40 | * enormous amounts of time, which would block a userspace task which has | 41 | * enormous amounts of time, which would block a userspace task which has |
41 | * been forced to throttle against that inode. Also, the code reevaluates | 42 | * been forced to throttle against that inode. Also, the code reevaluates |
42 | * the dirty each time it has written this many pages. | 43 | * the dirty each time it has written this many pages. |
@@ -49,8 +50,6 @@ | |||
49 | */ | 50 | */ |
50 | static long ratelimit_pages = 32; | 51 | static long ratelimit_pages = 32; |
51 | 52 | ||
52 | static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ | ||
53 | |||
54 | /* | 53 | /* |
55 | * When balance_dirty_pages decides that the caller needs to perform some | 54 | * When balance_dirty_pages decides that the caller needs to perform some |
56 | * non-background writeback, this is how many pages it will attempt to write. | 55 | * non-background writeback, this is how many pages it will attempt to write. |
@@ -103,6 +102,141 @@ EXPORT_SYMBOL(laptop_mode); | |||
103 | static void background_writeout(unsigned long _min_pages); | 102 | static void background_writeout(unsigned long _min_pages); |
104 | 103 | ||
105 | /* | 104 | /* |
105 | * Scale the writeback cache size proportional to the relative writeout speeds. | ||
106 | * | ||
107 | * We do this by keeping a floating proportion between BDIs, based on page | ||
108 | * writeback completions [end_page_writeback()]. Those devices that write out | ||
109 | * pages fastest will get the larger share, while the slower will get a smaller | ||
110 | * share. | ||
111 | * | ||
112 | * We use page writeout completions because we are interested in getting rid of | ||
113 | * dirty pages. Having them written out is the primary goal. | ||
114 | * | ||
115 | * We introduce a concept of time, a period over which we measure these events, | ||
116 | * because demand can/will vary over time. The length of this period itself is | ||
117 | * measured in page writeback completions. | ||
118 | * | ||
119 | */ | ||
120 | static struct prop_descriptor vm_completions; | ||
121 | static struct prop_descriptor vm_dirties; | ||
122 | |||
123 | static unsigned long determine_dirtyable_memory(void); | ||
124 | |||
125 | /* | ||
126 | * couple the period to the dirty_ratio: | ||
127 | * | ||
128 | * period/2 ~ roundup_pow_of_two(dirty limit) | ||
129 | */ | ||
130 | static int calc_period_shift(void) | ||
131 | { | ||
132 | unsigned long dirty_total; | ||
133 | |||
134 | dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; | ||
135 | return 2 + ilog2(dirty_total - 1); | ||
136 | } | ||
137 | |||
138 | /* | ||
139 | * update the period when the dirty ratio changes. | ||
140 | */ | ||
141 | int dirty_ratio_handler(struct ctl_table *table, int write, | ||
142 | struct file *filp, void __user *buffer, size_t *lenp, | ||
143 | loff_t *ppos) | ||
144 | { | ||
145 | int old_ratio = vm_dirty_ratio; | ||
146 | int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | ||
147 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { | ||
148 | int shift = calc_period_shift(); | ||
149 | prop_change_shift(&vm_completions, shift); | ||
150 | prop_change_shift(&vm_dirties, shift); | ||
151 | } | ||
152 | return ret; | ||
153 | } | ||
154 | |||
155 | /* | ||
156 | * Increment the BDI's writeout completion count and the global writeout | ||
157 | * completion count. Called from test_clear_page_writeback(). | ||
158 | */ | ||
159 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) | ||
160 | { | ||
161 | __prop_inc_percpu(&vm_completions, &bdi->completions); | ||
162 | } | ||
163 | |||
164 | static inline void task_dirty_inc(struct task_struct *tsk) | ||
165 | { | ||
166 | prop_inc_single(&vm_dirties, &tsk->dirties); | ||
167 | } | ||
168 | |||
169 | /* | ||
170 | * Obtain an accurate fraction of the BDI's portion. | ||
171 | */ | ||
172 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, | ||
173 | long *numerator, long *denominator) | ||
174 | { | ||
175 | if (bdi_cap_writeback_dirty(bdi)) { | ||
176 | prop_fraction_percpu(&vm_completions, &bdi->completions, | ||
177 | numerator, denominator); | ||
178 | } else { | ||
179 | *numerator = 0; | ||
180 | *denominator = 1; | ||
181 | } | ||
182 | } | ||
183 | |||
184 | /* | ||
185 | * Clip the earned share of dirty pages to that which is actually available. | ||
186 | * This avoids exceeding the total dirty_limit when the floating averages | ||
187 | * fluctuate too quickly. | ||
188 | */ | ||
189 | static void | ||
190 | clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) | ||
191 | { | ||
192 | long avail_dirty; | ||
193 | |||
194 | avail_dirty = dirty - | ||
195 | (global_page_state(NR_FILE_DIRTY) + | ||
196 | global_page_state(NR_WRITEBACK) + | ||
197 | global_page_state(NR_UNSTABLE_NFS)); | ||
198 | |||
199 | if (avail_dirty < 0) | ||
200 | avail_dirty = 0; | ||
201 | |||
202 | avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + | ||
203 | bdi_stat(bdi, BDI_WRITEBACK); | ||
204 | |||
205 | *pbdi_dirty = min(*pbdi_dirty, avail_dirty); | ||
206 | } | ||
207 | |||
208 | static inline void task_dirties_fraction(struct task_struct *tsk, | ||
209 | long *numerator, long *denominator) | ||
210 | { | ||
211 | prop_fraction_single(&vm_dirties, &tsk->dirties, | ||
212 | numerator, denominator); | ||
213 | } | ||
214 | |||
215 | /* | ||
216 | * scale the dirty limit | ||
217 | * | ||
218 | * task specific dirty limit: | ||
219 | * | ||
220 | * dirty -= (dirty/8) * p_{t} | ||
221 | */ | ||
222 | void task_dirty_limit(struct task_struct *tsk, long *pdirty) | ||
223 | { | ||
224 | long numerator, denominator; | ||
225 | long dirty = *pdirty; | ||
226 | u64 inv = dirty >> 3; | ||
227 | |||
228 | task_dirties_fraction(tsk, &numerator, &denominator); | ||
229 | inv *= numerator; | ||
230 | do_div(inv, denominator); | ||
231 | |||
232 | dirty -= inv; | ||
233 | if (dirty < *pdirty/2) | ||
234 | dirty = *pdirty/2; | ||
235 | |||
236 | *pdirty = dirty; | ||
237 | } | ||
238 | |||
239 | /* | ||
106 | * Work out the current dirty-memory clamping and background writeout | 240 | * Work out the current dirty-memory clamping and background writeout |
107 | * thresholds. | 241 | * thresholds. |
108 | * | 242 | * |
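The hunk above introduces the floating-proportion machinery: calc_period_shift() couples the measurement period to the dirty limit, and task_dirty_limit() lowers a heavy dirtier's threshold by up to one eighth of the limit, never below half of it. A standalone sketch of that arithmetic follows; the figures are made up, and the kernel works on page counts and per-CPU floating proportions rather than the plain fractions used here.

#include <stdio.h>

static int ilog2(unsigned long v)	/* position of the highest set bit */
{
	int r = -1;
	while (v) {
		v >>= 1;
		r++;
	}
	return r;
}

/* period/2 ~ roundup_pow_of_two(dirty limit), as in calc_period_shift() */
static int calc_period_shift(unsigned long dirtyable_pages, int vm_dirty_ratio)
{
	unsigned long dirty_total = dirtyable_pages * vm_dirty_ratio / 100;
	return 2 + ilog2(dirty_total - 1);
}

/* dirty -= (dirty/8) * p_task, clipped to at least half, as in task_dirty_limit() */
static long task_dirty_limit(long dirty, long numerator, long denominator)
{
	long inv = (dirty >> 3) * numerator / denominator;
	long task_dirty = dirty - inv;

	if (task_dirty < dirty / 2)
		task_dirty = dirty / 2;
	return task_dirty;
}

int main(void)
{
	unsigned long dirtyable = 1UL << 20;	/* ~4GB of dirtyable memory, in pages */

	printf("period shift: %d\n", calc_period_shift(dirtyable, 10));
	/* a task responsible for 3/4 of recent dirtying gets a lower limit */
	printf("task limit: %ld of %d pages\n",
	       task_dirty_limit(100000, 3, 4), 100000);
	return 0;
}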
@@ -158,8 +292,8 @@ static unsigned long determine_dirtyable_memory(void) | |||
158 | } | 292 | } |
159 | 293 | ||
160 | static void | 294 | static void |
161 | get_dirty_limits(long *pbackground, long *pdirty, | 295 | get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, |
162 | struct address_space *mapping) | 296 | struct backing_dev_info *bdi) |
163 | { | 297 | { |
164 | int background_ratio; /* Percentages */ | 298 | int background_ratio; /* Percentages */ |
165 | int dirty_ratio; | 299 | int dirty_ratio; |
@@ -193,6 +327,23 @@ get_dirty_limits(long *pbackground, long *pdirty, | |||
193 | } | 327 | } |
194 | *pbackground = background; | 328 | *pbackground = background; |
195 | *pdirty = dirty; | 329 | *pdirty = dirty; |
330 | |||
331 | if (bdi) { | ||
332 | u64 bdi_dirty = dirty; | ||
333 | long numerator, denominator; | ||
334 | |||
335 | /* | ||
336 | * Calculate this BDI's share of the dirty ratio. | ||
337 | */ | ||
338 | bdi_writeout_fraction(bdi, &numerator, &denominator); | ||
339 | |||
340 | bdi_dirty *= numerator; | ||
341 | do_div(bdi_dirty, denominator); | ||
342 | |||
343 | *pbdi_dirty = bdi_dirty; | ||
344 | clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty); | ||
345 | task_dirty_limit(current, pbdi_dirty); | ||
346 | } | ||
196 | } | 347 | } |
197 | 348 | ||
198 | /* | 349 | /* |
@@ -204,9 +355,11 @@ get_dirty_limits(long *pbackground, long *pdirty, | |||
204 | */ | 355 | */ |
205 | static void balance_dirty_pages(struct address_space *mapping) | 356 | static void balance_dirty_pages(struct address_space *mapping) |
206 | { | 357 | { |
207 | long nr_reclaimable; | 358 | long bdi_nr_reclaimable; |
359 | long bdi_nr_writeback; | ||
208 | long background_thresh; | 360 | long background_thresh; |
209 | long dirty_thresh; | 361 | long dirty_thresh; |
362 | long bdi_thresh; | ||
210 | unsigned long pages_written = 0; | 363 | unsigned long pages_written = 0; |
211 | unsigned long write_chunk = sync_writeback_pages(); | 364 | unsigned long write_chunk = sync_writeback_pages(); |
212 | 365 | ||
@@ -221,15 +374,15 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
221 | .range_cyclic = 1, | 374 | .range_cyclic = 1, |
222 | }; | 375 | }; |
223 | 376 | ||
224 | get_dirty_limits(&background_thresh, &dirty_thresh, mapping); | 377 | get_dirty_limits(&background_thresh, &dirty_thresh, |
225 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + | 378 | &bdi_thresh, bdi); |
226 | global_page_state(NR_UNSTABLE_NFS); | 379 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); |
227 | if (nr_reclaimable + global_page_state(NR_WRITEBACK) <= | 380 | bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); |
228 | dirty_thresh) | 381 | if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) |
229 | break; | 382 | break; |
230 | 383 | ||
231 | if (!dirty_exceeded) | 384 | if (!bdi->dirty_exceeded) |
232 | dirty_exceeded = 1; | 385 | bdi->dirty_exceeded = 1; |
233 | 386 | ||
234 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. | 387 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. |
235 | * Unstable writes are a feature of certain networked | 388 | * Unstable writes are a feature of certain networked |
@@ -237,26 +390,42 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
237 | * written to the server's write cache, but has not yet | 390 | * written to the server's write cache, but has not yet |
238 | * been flushed to permanent storage. | 391 | * been flushed to permanent storage. |
239 | */ | 392 | */ |
240 | if (nr_reclaimable) { | 393 | if (bdi_nr_reclaimable) { |
241 | writeback_inodes(&wbc); | 394 | writeback_inodes(&wbc); |
242 | get_dirty_limits(&background_thresh, | ||
243 | &dirty_thresh, mapping); | ||
244 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + | ||
245 | global_page_state(NR_UNSTABLE_NFS); | ||
246 | if (nr_reclaimable + | ||
247 | global_page_state(NR_WRITEBACK) | ||
248 | <= dirty_thresh) | ||
249 | break; | ||
250 | pages_written += write_chunk - wbc.nr_to_write; | 395 | pages_written += write_chunk - wbc.nr_to_write; |
251 | if (pages_written >= write_chunk) | 396 | get_dirty_limits(&background_thresh, &dirty_thresh, |
252 | break; /* We've done our duty */ | 397 | &bdi_thresh, bdi); |
398 | } | ||
399 | |||
400 | /* | ||
401 | * In order to avoid the stacked BDI deadlock we need | ||
402 | * to ensure we accurately count the 'dirty' pages when | ||
403 | * the threshold is low. | ||
404 | * | ||
405 | * Otherwise it would be possible to get thresh+n pages | ||
406 | * reported dirty, even though there are thresh-m pages | ||
407 | * actually dirty; with m+n sitting in the percpu | ||
408 | * deltas. | ||
409 | */ | ||
410 | if (bdi_thresh < 2*bdi_stat_error(bdi)) { | ||
411 | bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); | ||
412 | bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); | ||
413 | } else if (bdi_nr_reclaimable) { | ||
414 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); | ||
415 | bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); | ||
253 | } | 416 | } |
417 | |||
418 | if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) | ||
419 | break; | ||
420 | if (pages_written >= write_chunk) | ||
421 | break; /* We've done our duty */ | ||
422 | |||
254 | congestion_wait(WRITE, HZ/10); | 423 | congestion_wait(WRITE, HZ/10); |
255 | } | 424 | } |
256 | 425 | ||
257 | if (nr_reclaimable + global_page_state(NR_WRITEBACK) | 426 | if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && |
258 | <= dirty_thresh && dirty_exceeded) | 427 | bdi->dirty_exceeded) |
259 | dirty_exceeded = 0; | 428 | bdi->dirty_exceeded = 0; |
260 | 429 | ||
261 | if (writeback_in_progress(bdi)) | 430 | if (writeback_in_progress(bdi)) |
262 | return; /* pdflush is already working this queue */ | 431 | return; /* pdflush is already working this queue */ |
@@ -270,7 +439,9 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
270 | * background_thresh, to keep the amount of dirty memory low. | 439 | * background_thresh, to keep the amount of dirty memory low. |
271 | */ | 440 | */ |
272 | if ((laptop_mode && pages_written) || | 441 | if ((laptop_mode && pages_written) || |
273 | (!laptop_mode && (nr_reclaimable > background_thresh))) | 442 | (!laptop_mode && (global_page_state(NR_FILE_DIRTY) |
443 | + global_page_state(NR_UNSTABLE_NFS) | ||
444 | > background_thresh))) | ||
274 | pdflush_operation(background_writeout, 0); | 445 | pdflush_operation(background_writeout, 0); |
275 | } | 446 | } |
276 | 447 | ||
@@ -306,7 +477,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | |||
306 | unsigned long *p; | 477 | unsigned long *p; |
307 | 478 | ||
308 | ratelimit = ratelimit_pages; | 479 | ratelimit = ratelimit_pages; |
309 | if (dirty_exceeded) | 480 | if (mapping->backing_dev_info->dirty_exceeded) |
310 | ratelimit = 8; | 481 | ratelimit = 8; |
311 | 482 | ||
312 | /* | 483 | /* |
@@ -331,18 +502,8 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
331 | long background_thresh; | 502 | long background_thresh; |
332 | long dirty_thresh; | 503 | long dirty_thresh; |
333 | 504 | ||
334 | if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) { | ||
335 | /* | ||
336 | * The caller might hold locks which can prevent IO completion | ||
337 | * or progress in the filesystem. So we cannot just sit here | ||
338 | * waiting for IO to complete. | ||
339 | */ | ||
340 | congestion_wait(WRITE, HZ/10); | ||
341 | return; | ||
342 | } | ||
343 | |||
344 | for ( ; ; ) { | 505 | for ( ; ; ) { |
345 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL); | 506 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); |
346 | 507 | ||
347 | /* | 508 | /* |
348 | * Boost the allowable dirty threshold a bit for page | 509 | * Boost the allowable dirty threshold a bit for page |
@@ -354,6 +515,14 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
354 | global_page_state(NR_WRITEBACK) <= dirty_thresh) | 515 | global_page_state(NR_WRITEBACK) <= dirty_thresh) |
355 | break; | 516 | break; |
356 | congestion_wait(WRITE, HZ/10); | 517 | congestion_wait(WRITE, HZ/10); |
518 | |||
519 | /* | ||
520 | * The caller might hold locks which can prevent IO completion | ||
521 | * or progress in the filesystem. So we cannot just sit here | ||
522 | * waiting for IO to complete. | ||
523 | */ | ||
524 | if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) | ||
525 | break; | ||
357 | } | 526 | } |
358 | } | 527 | } |
359 | 528 | ||
@@ -377,11 +546,12 @@ static void background_writeout(unsigned long _min_pages) | |||
377 | long background_thresh; | 546 | long background_thresh; |
378 | long dirty_thresh; | 547 | long dirty_thresh; |
379 | 548 | ||
380 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL); | 549 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); |
381 | if (global_page_state(NR_FILE_DIRTY) + | 550 | if (global_page_state(NR_FILE_DIRTY) + |
382 | global_page_state(NR_UNSTABLE_NFS) < background_thresh | 551 | global_page_state(NR_UNSTABLE_NFS) < background_thresh |
383 | && min_pages <= 0) | 552 | && min_pages <= 0) |
384 | break; | 553 | break; |
554 | wbc.more_io = 0; | ||
385 | wbc.encountered_congestion = 0; | 555 | wbc.encountered_congestion = 0; |
386 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | 556 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; |
387 | wbc.pages_skipped = 0; | 557 | wbc.pages_skipped = 0; |
@@ -389,8 +559,9 @@ static void background_writeout(unsigned long _min_pages) | |||
389 | min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; | 559 | min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; |
390 | if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { | 560 | if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { |
391 | /* Wrote less than expected */ | 561 | /* Wrote less than expected */ |
392 | congestion_wait(WRITE, HZ/10); | 562 | if (wbc.encountered_congestion || wbc.more_io) |
393 | if (!wbc.encountered_congestion) | 563 | congestion_wait(WRITE, HZ/10); |
564 | else | ||
394 | break; | 565 | break; |
395 | } | 566 | } |
396 | } | 567 | } |
@@ -455,11 +626,12 @@ static void wb_kupdate(unsigned long arg) | |||
455 | global_page_state(NR_UNSTABLE_NFS) + | 626 | global_page_state(NR_UNSTABLE_NFS) + |
456 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | 627 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); |
457 | while (nr_to_write > 0) { | 628 | while (nr_to_write > 0) { |
629 | wbc.more_io = 0; | ||
458 | wbc.encountered_congestion = 0; | 630 | wbc.encountered_congestion = 0; |
459 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | 631 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; |
460 | writeback_inodes(&wbc); | 632 | writeback_inodes(&wbc); |
461 | if (wbc.nr_to_write > 0) { | 633 | if (wbc.nr_to_write > 0) { |
462 | if (wbc.encountered_congestion) | 634 | if (wbc.encountered_congestion || wbc.more_io) |
463 | congestion_wait(WRITE, HZ/10); | 635 | congestion_wait(WRITE, HZ/10); |
464 | else | 636 | else |
465 | break; /* All the old data is written */ | 637 | break; /* All the old data is written */ |
@@ -580,9 +752,15 @@ static struct notifier_block __cpuinitdata ratelimit_nb = { | |||
580 | */ | 752 | */ |
581 | void __init page_writeback_init(void) | 753 | void __init page_writeback_init(void) |
582 | { | 754 | { |
755 | int shift; | ||
756 | |||
583 | mod_timer(&wb_timer, jiffies + dirty_writeback_interval); | 757 | mod_timer(&wb_timer, jiffies + dirty_writeback_interval); |
584 | writeback_set_ratelimit(); | 758 | writeback_set_ratelimit(); |
585 | register_cpu_notifier(&ratelimit_nb); | 759 | register_cpu_notifier(&ratelimit_nb); |
760 | |||
761 | shift = calc_period_shift(); | ||
762 | prop_descriptor_init(&vm_completions, shift); | ||
763 | prop_descriptor_init(&vm_dirties, shift); | ||
586 | } | 764 | } |
587 | 765 | ||
588 | /** | 766 | /** |
@@ -672,8 +850,10 @@ retry: | |||
672 | 850 | ||
673 | ret = (*writepage)(page, wbc, data); | 851 | ret = (*writepage)(page, wbc, data); |
674 | 852 | ||
675 | if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) | 853 | if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { |
676 | unlock_page(page); | 854 | unlock_page(page); |
855 | ret = 0; | ||
856 | } | ||
677 | if (ret || (--(wbc->nr_to_write) <= 0)) | 857 | if (ret || (--(wbc->nr_to_write) <= 0)) |
678 | done = 1; | 858 | done = 1; |
679 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | 859 | if (wbc->nonblocking && bdi_write_congested(bdi)) { |
@@ -827,6 +1007,8 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
827 | WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); | 1007 | WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); |
828 | if (mapping_cap_account_dirty(mapping)) { | 1008 | if (mapping_cap_account_dirty(mapping)) { |
829 | __inc_zone_page_state(page, NR_FILE_DIRTY); | 1009 | __inc_zone_page_state(page, NR_FILE_DIRTY); |
1010 | __inc_bdi_stat(mapping->backing_dev_info, | ||
1011 | BDI_RECLAIMABLE); | ||
830 | task_io_account_write(PAGE_CACHE_SIZE); | 1012 | task_io_account_write(PAGE_CACHE_SIZE); |
831 | } | 1013 | } |
832 | radix_tree_tag_set(&mapping->page_tree, | 1014 | radix_tree_tag_set(&mapping->page_tree, |
@@ -859,7 +1041,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage); | |||
859 | * If the mapping doesn't provide a set_page_dirty a_op, then | 1041 | * If the mapping doesn't provide a set_page_dirty a_op, then |
860 | * just fall through and assume that it wants buffer_heads. | 1042 | * just fall through and assume that it wants buffer_heads. |
861 | */ | 1043 | */ |
862 | int fastcall set_page_dirty(struct page *page) | 1044 | static int __set_page_dirty(struct page *page) |
863 | { | 1045 | { |
864 | struct address_space *mapping = page_mapping(page); | 1046 | struct address_space *mapping = page_mapping(page); |
865 | 1047 | ||
@@ -877,6 +1059,14 @@ int fastcall set_page_dirty(struct page *page) | |||
877 | } | 1059 | } |
878 | return 0; | 1060 | return 0; |
879 | } | 1061 | } |
1062 | |||
1063 | int fastcall set_page_dirty(struct page *page) | ||
1064 | { | ||
1065 | int ret = __set_page_dirty(page); | ||
1066 | if (ret) | ||
1067 | task_dirty_inc(current); | ||
1068 | return ret; | ||
1069 | } | ||
880 | EXPORT_SYMBOL(set_page_dirty); | 1070 | EXPORT_SYMBOL(set_page_dirty); |
881 | 1071 | ||
882 | /* | 1072 | /* |
@@ -961,6 +1151,8 @@ int clear_page_dirty_for_io(struct page *page) | |||
961 | */ | 1151 | */ |
962 | if (TestClearPageDirty(page)) { | 1152 | if (TestClearPageDirty(page)) { |
963 | dec_zone_page_state(page, NR_FILE_DIRTY); | 1153 | dec_zone_page_state(page, NR_FILE_DIRTY); |
1154 | dec_bdi_stat(mapping->backing_dev_info, | ||
1155 | BDI_RECLAIMABLE); | ||
964 | return 1; | 1156 | return 1; |
965 | } | 1157 | } |
966 | return 0; | 1158 | return 0; |
@@ -975,14 +1167,20 @@ int test_clear_page_writeback(struct page *page) | |||
975 | int ret; | 1167 | int ret; |
976 | 1168 | ||
977 | if (mapping) { | 1169 | if (mapping) { |
1170 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
978 | unsigned long flags; | 1171 | unsigned long flags; |
979 | 1172 | ||
980 | write_lock_irqsave(&mapping->tree_lock, flags); | 1173 | write_lock_irqsave(&mapping->tree_lock, flags); |
981 | ret = TestClearPageWriteback(page); | 1174 | ret = TestClearPageWriteback(page); |
982 | if (ret) | 1175 | if (ret) { |
983 | radix_tree_tag_clear(&mapping->page_tree, | 1176 | radix_tree_tag_clear(&mapping->page_tree, |
984 | page_index(page), | 1177 | page_index(page), |
985 | PAGECACHE_TAG_WRITEBACK); | 1178 | PAGECACHE_TAG_WRITEBACK); |
1179 | if (bdi_cap_writeback_dirty(bdi)) { | ||
1180 | __dec_bdi_stat(bdi, BDI_WRITEBACK); | ||
1181 | __bdi_writeout_inc(bdi); | ||
1182 | } | ||
1183 | } | ||
986 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 1184 | write_unlock_irqrestore(&mapping->tree_lock, flags); |
987 | } else { | 1185 | } else { |
988 | ret = TestClearPageWriteback(page); | 1186 | ret = TestClearPageWriteback(page); |
@@ -998,14 +1196,18 @@ int test_set_page_writeback(struct page *page) | |||
998 | int ret; | 1196 | int ret; |
999 | 1197 | ||
1000 | if (mapping) { | 1198 | if (mapping) { |
1199 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
1001 | unsigned long flags; | 1200 | unsigned long flags; |
1002 | 1201 | ||
1003 | write_lock_irqsave(&mapping->tree_lock, flags); | 1202 | write_lock_irqsave(&mapping->tree_lock, flags); |
1004 | ret = TestSetPageWriteback(page); | 1203 | ret = TestSetPageWriteback(page); |
1005 | if (!ret) | 1204 | if (!ret) { |
1006 | radix_tree_tag_set(&mapping->page_tree, | 1205 | radix_tree_tag_set(&mapping->page_tree, |
1007 | page_index(page), | 1206 | page_index(page), |
1008 | PAGECACHE_TAG_WRITEBACK); | 1207 | PAGECACHE_TAG_WRITEBACK); |
1208 | if (bdi_cap_writeback_dirty(bdi)) | ||
1209 | __inc_bdi_stat(bdi, BDI_WRITEBACK); | ||
1210 | } | ||
1009 | if (!PageDirty(page)) | 1211 | if (!PageDirty(page)) |
1010 | radix_tree_tag_clear(&mapping->page_tree, | 1212 | radix_tree_tag_clear(&mapping->page_tree, |
1011 | page_index(page), | 1213 | page_index(page), |
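Taken together, the page-writeback.c changes give each backing device its own dirty threshold: get_dirty_limits() scales the global limit by the device's share of recent writeout completions, and clip_bdi_dirty_limit() caps the result at what is still globally available. A simplified sketch of that calculation, using made-up page counts and plain fractions in place of the kernel's per-CPU proportion counters:

#include <stdio.h>

static long min_long(long a, long b) { return a < b ? a : b; }

static long bdi_dirty_limit(long global_dirty_thresh,
			    long completions_num, long completions_den,
			    long global_dirty_pages, long bdi_dirty_pages)
{
	/* this device's share of the global limit (bdi_writeout_fraction) */
	long bdi_thresh = global_dirty_thresh * completions_num / completions_den;

	/* clip_bdi_dirty_limit(): global headroom plus what this device already holds */
	long avail = global_dirty_thresh - global_dirty_pages;
	if (avail < 0)
		avail = 0;
	avail += bdi_dirty_pages;

	return min_long(bdi_thresh, avail);
}

int main(void)
{
	/* a device doing 1/5 of recent writeout, global limit of 120000 pages */
	printf("bdi threshold: %ld pages\n",
	       bdi_dirty_limit(120000, 1, 5, 100000, 8000));
	return 0;
}

balance_dirty_pages() then throttles against this per-device figure (plus the task correction above) instead of the single global threshold, which is what removes the old shared dirty_exceeded flag.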
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d315e1127dc9..43f757fcf30f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/pagevec.h> | 27 | #include <linux/pagevec.h> |
28 | #include <linux/blkdev.h> | 28 | #include <linux/blkdev.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/oom.h> | ||
30 | #include <linux/notifier.h> | 31 | #include <linux/notifier.h> |
31 | #include <linux/topology.h> | 32 | #include <linux/topology.h> |
32 | #include <linux/sysctl.h> | 33 | #include <linux/sysctl.h> |
@@ -489,7 +490,7 @@ static void free_pages_bulk(struct zone *zone, int count, | |||
489 | struct list_head *list, int order) | 490 | struct list_head *list, int order) |
490 | { | 491 | { |
491 | spin_lock(&zone->lock); | 492 | spin_lock(&zone->lock); |
492 | zone->all_unreclaimable = 0; | 493 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); |
493 | zone->pages_scanned = 0; | 494 | zone->pages_scanned = 0; |
494 | while (count--) { | 495 | while (count--) { |
495 | struct page *page; | 496 | struct page *page; |
@@ -506,7 +507,7 @@ static void free_pages_bulk(struct zone *zone, int count, | |||
506 | static void free_one_page(struct zone *zone, struct page *page, int order) | 507 | static void free_one_page(struct zone *zone, struct page *page, int order) |
507 | { | 508 | { |
508 | spin_lock(&zone->lock); | 509 | spin_lock(&zone->lock); |
509 | zone->all_unreclaimable = 0; | 510 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); |
510 | zone->pages_scanned = 0; | 511 | zone->pages_scanned = 0; |
511 | __free_one_page(page, zone, order); | 512 | __free_one_page(page, zone, order); |
512 | spin_unlock(&zone->lock); | 513 | spin_unlock(&zone->lock); |
@@ -1586,6 +1587,11 @@ nofail_alloc: | |||
1586 | if (page) | 1587 | if (page) |
1587 | goto got_pg; | 1588 | goto got_pg; |
1588 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | 1589 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { |
1590 | if (!try_set_zone_oom(zonelist)) { | ||
1591 | schedule_timeout_uninterruptible(1); | ||
1592 | goto restart; | ||
1593 | } | ||
1594 | |||
1589 | /* | 1595 | /* |
1590 | * Go through the zonelist yet one more time, keep | 1596 | * Go through the zonelist yet one more time, keep |
1591 | * very high watermark here, this is only to catch | 1597 | * very high watermark here, this is only to catch |
@@ -1594,14 +1600,19 @@ nofail_alloc: | |||
1594 | */ | 1600 | */ |
1595 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, | 1601 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, |
1596 | zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); | 1602 | zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); |
1597 | if (page) | 1603 | if (page) { |
1604 | clear_zonelist_oom(zonelist); | ||
1598 | goto got_pg; | 1605 | goto got_pg; |
1606 | } | ||
1599 | 1607 | ||
1600 | /* The OOM killer will not help higher order allocs so fail */ | 1608 | /* The OOM killer will not help higher order allocs so fail */ |
1601 | if (order > PAGE_ALLOC_COSTLY_ORDER) | 1609 | if (order > PAGE_ALLOC_COSTLY_ORDER) { |
1610 | clear_zonelist_oom(zonelist); | ||
1602 | goto nopage; | 1611 | goto nopage; |
1612 | } | ||
1603 | 1613 | ||
1604 | out_of_memory(zonelist, gfp_mask, order); | 1614 | out_of_memory(zonelist, gfp_mask, order); |
1615 | clear_zonelist_oom(zonelist); | ||
1605 | goto restart; | 1616 | goto restart; |
1606 | } | 1617 | } |
1607 | 1618 | ||
@@ -1850,7 +1861,7 @@ void show_free_areas(void) | |||
1850 | K(zone_page_state(zone, NR_INACTIVE)), | 1861 | K(zone_page_state(zone, NR_INACTIVE)), |
1851 | K(zone->present_pages), | 1862 | K(zone->present_pages), |
1852 | zone->pages_scanned, | 1863 | zone->pages_scanned, |
1853 | (zone->all_unreclaimable ? "yes" : "no") | 1864 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") |
1854 | ); | 1865 | ); |
1855 | printk("lowmem_reserve[]:"); | 1866 | printk("lowmem_reserve[]:"); |
1856 | for (i = 0; i < MAX_NR_ZONES; i++) | 1867 | for (i = 0; i < MAX_NR_ZONES; i++) |
@@ -3371,7 +3382,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
3371 | zone->nr_scan_active = 0; | 3382 | zone->nr_scan_active = 0; |
3372 | zone->nr_scan_inactive = 0; | 3383 | zone->nr_scan_inactive = 0; |
3373 | zap_zone_vm_stats(zone); | 3384 | zap_zone_vm_stats(zone); |
3374 | atomic_set(&zone->reclaim_in_progress, 0); | 3385 | zone->flags = 0; |
3375 | if (!size) | 3386 | if (!size) |
3376 | continue; | 3387 | continue; |
3377 | 3388 | ||
diff --git a/mm/readahead.c b/mm/readahead.c
index 229788884010..c9c50ca1ec38 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -233,6 +233,12 @@ unsigned long max_sane_readahead(unsigned long nr) | |||
233 | + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); | 233 | + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); |
234 | } | 234 | } |
235 | 235 | ||
236 | static int __init readahead_init(void) | ||
237 | { | ||
238 | return bdi_init(&default_backing_dev_info); | ||
239 | } | ||
240 | subsys_initcall(readahead_init); | ||
241 | |||
236 | /* | 242 | /* |
237 | * Submit IO for the read-ahead request in file_ra_state. | 243 | * Submit IO for the read-ahead request in file_ra_state. |
238 | */ | 244 | */ |
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,6 +36,7 @@
36 | * mapping->tree_lock (widely used, in set_page_dirty, | 36 | * mapping->tree_lock (widely used, in set_page_dirty, |
37 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * within inode_lock in __sync_single_inode) | 38 | * within inode_lock in __sync_single_inode) |
39 | * zone->lock (within radix tree node alloc) | ||
39 | */ | 40 | */ |
40 | 41 | ||
41 | #include <linux/mm.h> | 42 | #include <linux/mm.h> |
@@ -137,8 +138,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) | |||
137 | anon_vma_free(anon_vma); | 138 | anon_vma_free(anon_vma); |
138 | } | 139 | } |
139 | 140 | ||
140 | static void anon_vma_ctor(void *data, struct kmem_cache *cachep, | 141 | static void anon_vma_ctor(struct kmem_cache *cachep, void *data) |
141 | unsigned long flags) | ||
142 | { | 142 | { |
143 | struct anon_vma *anon_vma = data; | 143 | struct anon_vma *anon_vma = data; |
144 | 144 | ||
diff --git a/mm/shmem.c b/mm/shmem.c
index 8a82342a8595..289dbb0a6fd6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2328,8 +2328,7 @@ static void shmem_destroy_inode(struct inode *inode) | |||
2328 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | 2328 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); |
2329 | } | 2329 | } |
2330 | 2330 | ||
2331 | static void init_once(void *foo, struct kmem_cache *cachep, | 2331 | static void init_once(struct kmem_cache *cachep, void *foo) |
2332 | unsigned long flags) | ||
2333 | { | 2332 | { |
2334 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; | 2333 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; |
2335 | 2334 | ||
@@ -2344,9 +2343,7 @@ static int init_inodecache(void) | |||
2344 | { | 2343 | { |
2345 | shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", | 2344 | shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", |
2346 | sizeof(struct shmem_inode_info), | 2345 | sizeof(struct shmem_inode_info), |
2347 | 0, 0, init_once); | 2346 | 0, SLAB_PANIC, init_once); |
2348 | if (shmem_inode_cachep == NULL) | ||
2349 | return -ENOMEM; | ||
2350 | return 0; | 2347 | return 0; |
2351 | } | 2348 | } |
2352 | 2349 | ||
@@ -2464,6 +2461,10 @@ static int __init init_tmpfs(void) | |||
2464 | { | 2461 | { |
2465 | int error; | 2462 | int error; |
2466 | 2463 | ||
2464 | error = bdi_init(&shmem_backing_dev_info); | ||
2465 | if (error) | ||
2466 | goto out4; | ||
2467 | |||
2467 | error = init_inodecache(); | 2468 | error = init_inodecache(); |
2468 | if (error) | 2469 | if (error) |
2469 | goto out3; | 2470 | goto out3; |
@@ -2488,6 +2489,8 @@ out1: | |||
2488 | out2: | 2489 | out2: |
2489 | destroy_inodecache(); | 2490 | destroy_inodecache(); |
2490 | out3: | 2491 | out3: |
2492 | bdi_destroy(&shmem_backing_dev_info); | ||
2493 | out4: | ||
2491 | shm_mnt = ERR_PTR(error); | 2494 | shm_mnt = ERR_PTR(error); |
2492 | return error; | 2495 | return error; |
2493 | } | 2496 | } |
@@ -2540,11 +2543,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
2540 | d_instantiate(dentry, inode); | 2543 | d_instantiate(dentry, inode); |
2541 | inode->i_size = size; | 2544 | inode->i_size = size; |
2542 | inode->i_nlink = 0; /* It is unlinked */ | 2545 | inode->i_nlink = 0; /* It is unlinked */ |
2543 | file->f_path.mnt = mntget(shm_mnt); | 2546 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, |
2544 | file->f_path.dentry = dentry; | 2547 | &shmem_file_operations); |
2545 | file->f_mapping = inode->i_mapping; | ||
2546 | file->f_op = &shmem_file_operations; | ||
2547 | file->f_mode = FMODE_WRITE | FMODE_READ; | ||
2548 | return file; | 2548 | return file; |
2549 | 2549 | ||
2550 | close_file: | 2550 | close_file: |
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -267,11 +267,10 @@ struct array_cache {
267 | unsigned int batchcount; | 267 | unsigned int batchcount; |
268 | unsigned int touched; | 268 | unsigned int touched; |
269 | spinlock_t lock; | 269 | spinlock_t lock; |
270 | void *entry[0]; /* | 270 | void *entry[]; /* |
271 | * Must have this definition in here for the proper | 271 | * Must have this definition in here for the proper |
272 | * alignment of array_cache. Also simplifies accessing | 272 | * alignment of array_cache. Also simplifies accessing |
273 | * the entries. | 273 | * the entries. |
274 | * [0] is for gcc 2.95. It should really be []. | ||
275 | */ | 274 | */ |
276 | }; | 275 | }; |
277 | 276 | ||
@@ -408,7 +407,7 @@ struct kmem_cache { | |||
408 | unsigned int dflags; /* dynamic flags */ | 407 | unsigned int dflags; /* dynamic flags */ |
409 | 408 | ||
410 | /* constructor func */ | 409 | /* constructor func */ |
411 | void (*ctor) (void *, struct kmem_cache *, unsigned long); | 410 | void (*ctor)(struct kmem_cache *, void *); |
412 | 411 | ||
413 | /* 5) cache creation/removal */ | 412 | /* 5) cache creation/removal */ |
414 | const char *name; | 413 | const char *name; |
@@ -2129,7 +2128,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | |||
2129 | struct kmem_cache * | 2128 | struct kmem_cache * |
2130 | kmem_cache_create (const char *name, size_t size, size_t align, | 2129 | kmem_cache_create (const char *name, size_t size, size_t align, |
2131 | unsigned long flags, | 2130 | unsigned long flags, |
2132 | void (*ctor)(void*, struct kmem_cache *, unsigned long)) | 2131 | void (*ctor)(struct kmem_cache *, void *)) |
2133 | { | 2132 | { |
2134 | size_t left_over, slab_size, ralign; | 2133 | size_t left_over, slab_size, ralign; |
2135 | struct kmem_cache *cachep = NULL, *pc; | 2134 | struct kmem_cache *cachep = NULL, *pc; |
@@ -2636,8 +2635,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2636 | * They must also be threaded. | 2635 | * They must also be threaded. |
2637 | */ | 2636 | */ |
2638 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) | 2637 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) |
2639 | cachep->ctor(objp + obj_offset(cachep), cachep, | 2638 | cachep->ctor(cachep, objp + obj_offset(cachep)); |
2640 | 0); | ||
2641 | 2639 | ||
2642 | if (cachep->flags & SLAB_RED_ZONE) { | 2640 | if (cachep->flags & SLAB_RED_ZONE) { |
2643 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) | 2641 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) |
@@ -2653,7 +2651,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2653 | cachep->buffer_size / PAGE_SIZE, 0); | 2651 | cachep->buffer_size / PAGE_SIZE, 0); |
2654 | #else | 2652 | #else |
2655 | if (cachep->ctor) | 2653 | if (cachep->ctor) |
2656 | cachep->ctor(objp, cachep, 0); | 2654 | cachep->ctor(cachep, objp); |
2657 | #endif | 2655 | #endif |
2658 | slab_bufctl(slabp)[i] = i + 1; | 2656 | slab_bufctl(slabp)[i] = i + 1; |
2659 | } | 2657 | } |
@@ -3078,7 +3076,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3078 | #endif | 3076 | #endif |
3079 | objp += obj_offset(cachep); | 3077 | objp += obj_offset(cachep); |
3080 | if (cachep->ctor && cachep->flags & SLAB_POISON) | 3078 | if (cachep->ctor && cachep->flags & SLAB_POISON) |
3081 | cachep->ctor(objp, cachep, 0); | 3079 | cachep->ctor(cachep, objp); |
3082 | #if ARCH_SLAB_MINALIGN | 3080 | #if ARCH_SLAB_MINALIGN |
3083 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { | 3081 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { |
3084 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", | 3082 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", |
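The slab.c hunks above, together with the matching slob.c and slub.c hunks below, converge on a new constructor convention: a ctor now has the prototype void ctor(struct kmem_cache *, void *), taking the cache first and the object second, and no longer receives a flags word. A userspace toy cache, not the kernel allocator, showing a constructor written against that convention; all names here (toy_cache, toy_cache_alloc) are illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_cache {
	const char *name;
	size_t size;
	void (*ctor)(struct toy_cache *, void *);	/* new-style signature */
};

static void *toy_cache_alloc(struct toy_cache *c)
{
	void *obj = malloc(c->size);

	if (obj && c->ctor)
		c->ctor(c, obj);	/* cache first, object second */
	return obj;
}

struct anon_vma { int lock; void *head; };

static void anon_vma_ctor(struct toy_cache *c, void *data)
{
	struct anon_vma *av = data;

	memset(av, 0, sizeof(*av));
	printf("constructed a '%s' object\n", c->name);
}

int main(void)
{
	struct toy_cache cache = {
		.name = "anon_vma", .size = sizeof(struct anon_vma),
		.ctor = anon_vma_ctor,
	};
	void *obj = toy_cache_alloc(&cache);

	free(obj);
	return 0;
}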
diff --git a/mm/slob.c b/mm/slob.c
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -499,12 +499,12 @@ struct kmem_cache {
499 | unsigned int size, align; | 499 | unsigned int size, align; |
500 | unsigned long flags; | 500 | unsigned long flags; |
501 | const char *name; | 501 | const char *name; |
502 | void (*ctor)(void *, struct kmem_cache *, unsigned long); | 502 | void (*ctor)(struct kmem_cache *, void *); |
503 | }; | 503 | }; |
504 | 504 | ||
505 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | 505 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, |
506 | size_t align, unsigned long flags, | 506 | size_t align, unsigned long flags, |
507 | void (*ctor)(void*, struct kmem_cache *, unsigned long)) | 507 | void (*ctor)(struct kmem_cache *, void *)) |
508 | { | 508 | { |
509 | struct kmem_cache *c; | 509 | struct kmem_cache *c; |
510 | 510 | ||
@@ -548,7 +548,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | |||
548 | b = slob_new_page(flags, get_order(c->size), node); | 548 | b = slob_new_page(flags, get_order(c->size), node); |
549 | 549 | ||
550 | if (c->ctor) | 550 | if (c->ctor) |
551 | c->ctor(b, c, 0); | 551 | c->ctor(c, b); |
552 | 552 | ||
553 | return b; | 553 | return b; |
554 | } | 554 | } |
diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -980,7 +980,7 @@ __setup("slub_debug", setup_slub_debug);
980 | 980 | ||
981 | static unsigned long kmem_cache_flags(unsigned long objsize, | 981 | static unsigned long kmem_cache_flags(unsigned long objsize, |
982 | unsigned long flags, const char *name, | 982 | unsigned long flags, const char *name, |
983 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) | 983 | void (*ctor)(struct kmem_cache *, void *)) |
984 | { | 984 | { |
985 | /* | 985 | /* |
986 | * The page->offset field is only 16 bit wide. This is an offset | 986 | * The page->offset field is only 16 bit wide. This is an offset |
@@ -1027,7 +1027,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page, | |||
1027 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} | 1027 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} |
1028 | static inline unsigned long kmem_cache_flags(unsigned long objsize, | 1028 | static inline unsigned long kmem_cache_flags(unsigned long objsize, |
1029 | unsigned long flags, const char *name, | 1029 | unsigned long flags, const char *name, |
1030 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) | 1030 | void (*ctor)(struct kmem_cache *, void *)) |
1031 | { | 1031 | { |
1032 | return flags; | 1032 | return flags; |
1033 | } | 1033 | } |
@@ -1071,7 +1071,7 @@ static void setup_object(struct kmem_cache *s, struct page *page, | |||
1071 | { | 1071 | { |
1072 | setup_object_debug(s, page, object); | 1072 | setup_object_debug(s, page, object); |
1073 | if (unlikely(s->ctor)) | 1073 | if (unlikely(s->ctor)) |
1074 | s->ctor(object, s, 0); | 1074 | s->ctor(s, object); |
1075 | } | 1075 | } |
1076 | 1076 | ||
1077 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | 1077 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) |
@@ -1085,9 +1085,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1085 | 1085 | ||
1086 | BUG_ON(flags & GFP_SLAB_BUG_MASK); | 1086 | BUG_ON(flags & GFP_SLAB_BUG_MASK); |
1087 | 1087 | ||
1088 | if (flags & __GFP_WAIT) | ||
1089 | local_irq_enable(); | ||
1090 | |||
1091 | page = allocate_slab(s, | 1088 | page = allocate_slab(s, |
1092 | flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); | 1089 | flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); |
1093 | if (!page) | 1090 | if (!page) |
@@ -1120,8 +1117,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1120 | page->freelist = start; | 1117 | page->freelist = start; |
1121 | page->inuse = 0; | 1118 | page->inuse = 0; |
1122 | out: | 1119 | out: |
1123 | if (flags & __GFP_WAIT) | ||
1124 | local_irq_disable(); | ||
1125 | return page; | 1120 | return page; |
1126 | } | 1121 | } |
1127 | 1122 | ||
@@ -1505,7 +1500,14 @@ new_slab: | |||
1505 | goto load_freelist; | 1500 | goto load_freelist; |
1506 | } | 1501 | } |
1507 | 1502 | ||
1503 | if (gfpflags & __GFP_WAIT) | ||
1504 | local_irq_enable(); | ||
1505 | |||
1508 | new = new_slab(s, gfpflags, node); | 1506 | new = new_slab(s, gfpflags, node); |
1507 | |||
1508 | if (gfpflags & __GFP_WAIT) | ||
1509 | local_irq_disable(); | ||
1510 | |||
1509 | if (new) { | 1511 | if (new) { |
1510 | c = get_cpu_slab(s, smp_processor_id()); | 1512 | c = get_cpu_slab(s, smp_processor_id()); |
1511 | if (c->page) { | 1513 | if (c->page) { |
@@ -2039,12 +2041,6 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, | |||
2039 | init_kmem_cache_node(n); | 2041 | init_kmem_cache_node(n); |
2040 | atomic_long_inc(&n->nr_slabs); | 2042 | atomic_long_inc(&n->nr_slabs); |
2041 | add_partial(n, page); | 2043 | add_partial(n, page); |
2042 | |||
2043 | /* | ||
2044 | * new_slab() disables interupts. If we do not reenable interrupts here | ||
2045 | * then bootup would continue with interrupts disabled. | ||
2046 | */ | ||
2047 | local_irq_enable(); | ||
2048 | return n; | 2044 | return n; |
2049 | } | 2045 | } |
2050 | 2046 | ||
@@ -2215,7 +2211,7 @@ static int calculate_sizes(struct kmem_cache *s) | |||
2215 | static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | 2211 | static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, |
2216 | const char *name, size_t size, | 2212 | const char *name, size_t size, |
2217 | size_t align, unsigned long flags, | 2213 | size_t align, unsigned long flags, |
2218 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) | 2214 | void (*ctor)(struct kmem_cache *, void *)) |
2219 | { | 2215 | { |
2220 | memset(s, 0, kmem_size); | 2216 | memset(s, 0, kmem_size); |
2221 | s->name = name; | 2217 | s->name = name; |
@@ -2805,7 +2801,7 @@ static int slab_unmergeable(struct kmem_cache *s) | |||
2805 | 2801 | ||
2806 | static struct kmem_cache *find_mergeable(size_t size, | 2802 | static struct kmem_cache *find_mergeable(size_t size, |
2807 | size_t align, unsigned long flags, const char *name, | 2803 | size_t align, unsigned long flags, const char *name, |
2808 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) | 2804 | void (*ctor)(struct kmem_cache *, void *)) |
2809 | { | 2805 | { |
2810 | struct kmem_cache *s; | 2806 | struct kmem_cache *s; |
2811 | 2807 | ||
@@ -2846,7 +2842,7 @@ static struct kmem_cache *find_mergeable(size_t size, | |||
2846 | 2842 | ||
2847 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | 2843 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, |
2848 | size_t align, unsigned long flags, | 2844 | size_t align, unsigned long flags, |
2849 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) | 2845 | void (*ctor)(struct kmem_cache *, void *)) |
2850 | { | 2846 | { |
2851 | struct kmem_cache *s; | 2847 | struct kmem_cache *s; |
2852 | 2848 | ||
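find_mergeable() and kmem_cache_create() above now carry the same two-argument ctor type. As far as I can tell, SLUB's merge logic declines to alias caches that have a constructor, which is why the ctor takes part in the merge decision at all; a hedged sketch with illustrative cache names:

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/string.h>

static void buf_ctor(struct kmem_cache *s, void *obj)
{
        memset(obj, 0, 128);
}

static int __init merge_demo_init(void)
{
        struct kmem_cache *plain, *ctored;

        /* No ctor: a candidate for SLUB's cache merging. */
        plain = kmem_cache_create("plain-128", 128, 0, 0, NULL);

        /* Has a ctor: never merged, so constructed state is preserved. */
        ctored = kmem_cache_create("ctored-128", 128, 0, 0, buf_ctor);

        if (!plain || !ctored)
                return -ENOMEM;
        return 0;
}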
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/percpu.h> | 28 | #include <linux/percpu.h> |
29 | #include <linux/cpu.h> | 29 | #include <linux/cpu.h> |
30 | #include <linux/notifier.h> | 30 | #include <linux/notifier.h> |
31 | #include <linux/backing-dev.h> | ||
31 | 32 | ||
32 | /* How many pages do we try to swap or page in/out together? */ | 33 | /* How many pages do we try to swap or page in/out together? */ |
33 | int page_cluster; | 34 | int page_cluster; |
@@ -547,6 +548,10 @@ void __init swap_setup(void) | |||
547 | { | 548 | { |
548 | unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); | 549 | unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); |
549 | 550 | ||
551 | #ifdef CONFIG_SWAP | ||
552 | bdi_init(swapper_space.backing_dev_info); | ||
553 | #endif | ||
554 | |||
550 | /* Use a smaller cluster for small-memory machines */ | 555 | /* Use a smaller cluster for small-memory machines */ |
551 | if (megs < 16) | 556 | if (megs < 16) |
552 | page_cluster = 2; | 557 | page_cluster = 2; |
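swap_setup() now runs bdi_init() on swapper_space's backing_dev_info so the per-BDI counters behind it exist before they are used. The same init/teardown pairing applies to any embedded backing_dev_info; a sketch with an illustrative driver structure (mydev is not from this patch):

#include <linux/backing-dev.h>

struct mydev {
        struct backing_dev_info bdi;
};

static int mydev_setup(struct mydev *dev)
{
        /* Allocates the percpu BDI counters; pair with bdi_destroy(). */
        return bdi_init(&dev->bdi);
}

static void mydev_teardown(struct mydev *dev)
{
        bdi_destroy(&dev->bdi);
}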
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index 8803471593fd..d436a9c82db7 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c | |||
@@ -66,24 +66,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
66 | if (!dentry) | 66 | if (!dentry) |
67 | goto put_memory; | 67 | goto put_memory; |
68 | 68 | ||
69 | error = -ENFILE; | ||
70 | file = get_empty_filp(); | ||
71 | if (!file) | ||
72 | goto put_dentry; | ||
73 | |||
74 | error = -ENOSPC; | 69 | error = -ENOSPC; |
75 | inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); | 70 | inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); |
76 | if (!inode) | 71 | if (!inode) |
77 | goto close_file; | 72 | goto put_dentry; |
78 | 73 | ||
79 | d_instantiate(dentry, inode); | 74 | d_instantiate(dentry, inode); |
80 | inode->i_nlink = 0; /* It is unlinked */ | 75 | error = -ENFILE; |
76 | file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | ||
77 | &ramfs_file_operations); | ||
78 | if (!file) | ||
79 | goto put_dentry; | ||
81 | 80 | ||
82 | file->f_path.mnt = mntget(shm_mnt); | 81 | inode->i_nlink = 0; /* It is unlinked */ |
83 | file->f_path.dentry = dentry; | ||
84 | file->f_mapping = inode->i_mapping; | ||
85 | file->f_op = &ramfs_file_operations; | ||
86 | file->f_mode = FMODE_WRITE | FMODE_READ; | ||
87 | 82 | ||
88 | /* notify everyone as to the change of file size */ | 83 | /* notify everyone as to the change of file size */ |
89 | error = do_truncate(dentry, size, 0, file); | 84 | error = do_truncate(dentry, size, 0, file); |
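The tiny-shmem hunk above drops get_empty_filp() plus hand-assembled struct file fields in favour of a single alloc_file() call, and only clears i_nlink once the file exists. A hedged sketch of the same conversion with illustrative names (open_internal_file, my_mnt, my_dentry, my_fops):

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>

static struct file *open_internal_file(struct vfsmount *my_mnt,
                                       struct dentry *my_dentry,
                                       const struct file_operations *my_fops)
{
        struct file *file;

        /*
         * Previously: get_empty_filp(), then f_path.mnt, f_path.dentry,
         * f_mapping, f_op and f_mode assigned by hand.  alloc_file()
         * bundles that setup into one call.
         */
        file = alloc_file(my_mnt, my_dentry, FMODE_READ | FMODE_WRITE, my_fops);
        if (!file)
                return ERR_PTR(-ENFILE);

        return file;
}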
diff --git a/mm/truncate.c b/mm/truncate.c index 5cdfbc1a59fd..cadc15653dde 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -8,6 +8,7 @@ | |||
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/kernel.h> | 10 | #include <linux/kernel.h> |
11 | #include <linux/backing-dev.h> | ||
11 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
12 | #include <linux/swap.h> | 13 | #include <linux/swap.h> |
13 | #include <linux/module.h> | 14 | #include <linux/module.h> |
@@ -72,6 +73,8 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) | |||
72 | struct address_space *mapping = page->mapping; | 73 | struct address_space *mapping = page->mapping; |
73 | if (mapping && mapping_cap_account_dirty(mapping)) { | 74 | if (mapping && mapping_cap_account_dirty(mapping)) { |
74 | dec_zone_page_state(page, NR_FILE_DIRTY); | 75 | dec_zone_page_state(page, NR_FILE_DIRTY); |
76 | dec_bdi_stat(mapping->backing_dev_info, | ||
77 | BDI_RECLAIMABLE); | ||
75 | if (account_size) | 78 | if (account_size) |
76 | task_io_account_cancelled_write(account_size); | 79 | task_io_account_cancelled_write(account_size); |
77 | } | 80 | } |
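cancel_dirty_page() now drops the per-BDI reclaimable count alongside the zone dirty count, keeping the two in lock-step. A sketch of both directions of that bookkeeping; account_dirtied() and account_cancelled() are illustrative helpers, not kernel functions:

#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmstat.h>

static void account_dirtied(struct page *page, struct address_space *mapping)
{
        if (mapping_cap_account_dirty(mapping)) {
                inc_zone_page_state(page, NR_FILE_DIRTY);
                inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
        }
}

static void account_cancelled(struct page *page, struct address_space *mapping)
{
        if (mapping_cap_account_dirty(mapping)) {
                dec_zone_page_state(page, NR_FILE_DIRTY);
                dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
        }
}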
diff --git a/mm/vmscan.c b/mm/vmscan.c index bbd194630c5b..e1471385d001 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1108,8 +1108,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone, | |||
1108 | unsigned long nr_to_scan; | 1108 | unsigned long nr_to_scan; |
1109 | unsigned long nr_reclaimed = 0; | 1109 | unsigned long nr_reclaimed = 0; |
1110 | 1110 | ||
1111 | atomic_inc(&zone->reclaim_in_progress); | ||
1112 | |||
1113 | /* | 1111 | /* |
1114 | * Add one to `nr_to_scan' just to make sure that the kernel will | 1112 | * Add one to `nr_to_scan' just to make sure that the kernel will |
1115 | * slowly sift through the active list. | 1113 | * slowly sift through the active list. |
@@ -1148,8 +1146,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone, | |||
1148 | } | 1146 | } |
1149 | 1147 | ||
1150 | throttle_vm_writeout(sc->gfp_mask); | 1148 | throttle_vm_writeout(sc->gfp_mask); |
1151 | |||
1152 | atomic_dec(&zone->reclaim_in_progress); | ||
1153 | return nr_reclaimed; | 1149 | return nr_reclaimed; |
1154 | } | 1150 | } |
1155 | 1151 | ||
@@ -1187,7 +1183,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones, | |||
1187 | 1183 | ||
1188 | note_zone_scanning_priority(zone, priority); | 1184 | note_zone_scanning_priority(zone, priority); |
1189 | 1185 | ||
1190 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1186 | if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY) |
1191 | continue; /* Let kswapd poll it */ | 1187 | continue; /* Let kswapd poll it */ |
1192 | 1188 | ||
1193 | sc->all_unreclaimable = 0; | 1189 | sc->all_unreclaimable = 0; |
@@ -1368,7 +1364,8 @@ loop_again: | |||
1368 | if (!populated_zone(zone)) | 1364 | if (!populated_zone(zone)) |
1369 | continue; | 1365 | continue; |
1370 | 1366 | ||
1371 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1367 | if (zone_is_all_unreclaimable(zone) && |
1368 | priority != DEF_PRIORITY) | ||
1372 | continue; | 1369 | continue; |
1373 | 1370 | ||
1374 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1371 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
@@ -1403,7 +1400,8 @@ loop_again: | |||
1403 | if (!populated_zone(zone)) | 1400 | if (!populated_zone(zone)) |
1404 | continue; | 1401 | continue; |
1405 | 1402 | ||
1406 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1403 | if (zone_is_all_unreclaimable(zone) && |
1404 | priority != DEF_PRIORITY) | ||
1407 | continue; | 1405 | continue; |
1408 | 1406 | ||
1409 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1407 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
@@ -1424,12 +1422,13 @@ loop_again: | |||
1424 | lru_pages); | 1422 | lru_pages); |
1425 | nr_reclaimed += reclaim_state->reclaimed_slab; | 1423 | nr_reclaimed += reclaim_state->reclaimed_slab; |
1426 | total_scanned += sc.nr_scanned; | 1424 | total_scanned += sc.nr_scanned; |
1427 | if (zone->all_unreclaimable) | 1425 | if (zone_is_all_unreclaimable(zone)) |
1428 | continue; | 1426 | continue; |
1429 | if (nr_slab == 0 && zone->pages_scanned >= | 1427 | if (nr_slab == 0 && zone->pages_scanned >= |
1430 | (zone_page_state(zone, NR_ACTIVE) | 1428 | (zone_page_state(zone, NR_ACTIVE) |
1431 | + zone_page_state(zone, NR_INACTIVE)) * 6) | 1429 | + zone_page_state(zone, NR_INACTIVE)) * 6) |
1432 | zone->all_unreclaimable = 1; | 1430 | zone_set_flag(zone, |
1431 | ZONE_ALL_UNRECLAIMABLE); | ||
1433 | /* | 1432 | /* |
1434 | * If we've done a decent amount of scanning and | 1433 | * If we've done a decent amount of scanning and |
1435 | * the reclaim ratio is low, start doing writepage | 1434 | * the reclaim ratio is low, start doing writepage |
@@ -1595,7 +1594,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, | |||
1595 | if (!populated_zone(zone)) | 1594 | if (!populated_zone(zone)) |
1596 | continue; | 1595 | continue; |
1597 | 1596 | ||
1598 | if (zone->all_unreclaimable && prio != DEF_PRIORITY) | 1597 | if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) |
1599 | continue; | 1598 | continue; |
1600 | 1599 | ||
1601 | /* For pass = 0 we don't shrink the active list */ | 1600 | /* For pass = 0 we don't shrink the active list */ |
@@ -1897,6 +1896,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1897 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 1896 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) |
1898 | { | 1897 | { |
1899 | int node_id; | 1898 | int node_id; |
1899 | int ret; | ||
1900 | 1900 | ||
1901 | /* | 1901 | /* |
1902 | * Zone reclaim reclaims unmapped file backed pages and | 1902 | * Zone reclaim reclaims unmapped file backed pages and |
@@ -1914,15 +1914,13 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1914 | <= zone->min_slab_pages) | 1914 | <= zone->min_slab_pages) |
1915 | return 0; | 1915 | return 0; |
1916 | 1916 | ||
1917 | if (zone_is_all_unreclaimable(zone)) | ||
1918 | return 0; | ||
1919 | |||
1917 | /* | 1920 | /* |
1918 | * Avoid concurrent zone reclaims, do not reclaim in a zone that does | 1921 | * Do not scan if the allocation should not be delayed. |
1919 | * not have reclaimable pages and if we should not delay the allocation | ||
1920 | * then do not scan. | ||
1921 | */ | 1922 | */ |
1922 | if (!(gfp_mask & __GFP_WAIT) || | 1923 | if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) |
1923 | zone->all_unreclaimable || | ||
1924 | atomic_read(&zone->reclaim_in_progress) > 0 || | ||
1925 | (current->flags & PF_MEMALLOC)) | ||
1926 | return 0; | 1924 | return 0; |
1927 | 1925 | ||
1928 | /* | 1926 | /* |
@@ -1934,6 +1932,12 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1934 | node_id = zone_to_nid(zone); | 1932 | node_id = zone_to_nid(zone); |
1935 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) | 1933 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) |
1936 | return 0; | 1934 | return 0; |
1937 | return __zone_reclaim(zone, gfp_mask, order); | 1935 | |
1936 | if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) | ||
1937 | return 0; | ||
1938 | ret = __zone_reclaim(zone, gfp_mask, order); | ||
1939 | zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); | ||
1940 | |||
1941 | return ret; | ||
1938 | } | 1942 | } |
1939 | #endif | 1943 | #endif |
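The zone_reclaim() hunk above replaces the old reclaim_in_progress atomic and the all_unreclaimable byte with flag bits: ZONE_RECLAIM_LOCKED acts as a test-and-set try-lock around __zone_reclaim(), cleared when the scan finishes. The same shape in isolation, with an illustrative scan_target structure:

#include <linux/bitops.h>

enum {
        SCAN_IN_PROGRESS,       /* plays the role of ZONE_RECLAIM_LOCKED */
};

struct scan_target {
        unsigned long flags;
};

static int try_scan(struct scan_target *t, int (*do_scan)(struct scan_target *))
{
        int ret;

        /* Only one scanner at a time; concurrent callers back off with 0. */
        if (test_and_set_bit(SCAN_IN_PROGRESS, &t->flags))
                return 0;

        ret = do_scan(t);

        clear_bit(SCAN_IN_PROGRESS, &t->flags);
        return ret;
}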
diff --git a/mm/vmstat.c b/mm/vmstat.c index 3b5e9043e7db..4651bf153f35 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -704,7 +704,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
704 | "\n all_unreclaimable: %u" | 704 | "\n all_unreclaimable: %u" |
705 | "\n prev_priority: %i" | 705 | "\n prev_priority: %i" |
706 | "\n start_pfn: %lu", | 706 | "\n start_pfn: %lu", |
707 | zone->all_unreclaimable, | 707 | zone_is_all_unreclaimable(zone), |
708 | zone->prev_priority, | 708 | zone->prev_priority, |
709 | zone->zone_start_pfn); | 709 | zone->zone_start_pfn); |
710 | seq_putc(m, '\n'); | 710 | seq_putc(m, '\n'); |
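zoneinfo_show_print() and the vmscan call sites above all go through the new zone flag helpers. A hedged sketch of how such helpers are plausibly built on bitops over a flags word in struct zone; the in-tree definitions live in include/linux/mmzone.h and may differ, so the sketch uses my_-prefixed names:

#include <linux/bitops.h>
#include <linux/mmzone.h>

static inline void my_zone_set_flag(struct zone *zone, unsigned int flag)
{
        set_bit(flag, &zone->flags);
}

static inline int my_zone_test_and_set_flag(struct zone *zone, unsigned int flag)
{
        return test_and_set_bit(flag, &zone->flags);
}

static inline void my_zone_clear_flag(struct zone *zone, unsigned int flag)
{
        clear_bit(flag, &zone->flags);
}

static inline int my_zone_is_all_unreclaimable(const struct zone *zone)
{
        return test_bit(ZONE_ALL_UNRECLAIMABLE, &zone->flags);
}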