Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile          |    3
-rw-r--r--  mm/allocpercpu.c     |    2
-rw-r--r--  mm/bootmem.c         |   27
-rw-r--r--  mm/dmapool.c         |  500
-rw-r--r--  mm/fadvise.c         |   16
-rw-r--r--  mm/filemap.c         |   27
-rw-r--r--  mm/filemap_xip.c     |    2
-rw-r--r--  mm/fremap.c          |    5
-rw-r--r--  mm/highmem.c         |    4
-rw-r--r--  mm/hugetlb.c         |    2
-rw-r--r--  mm/internal.h        |    4
-rw-r--r--  mm/memcontrol.c      | 1192
-rw-r--r--  mm/memory.c          |  256
-rw-r--r--  mm/memory_hotplug.c  |    6
-rw-r--r--  mm/migrate.c         |   54
-rw-r--r--  mm/mmap.c            |   10
-rw-r--r--  mm/nommu.c           |   53
-rw-r--r--  mm/oom_kill.c        |   90
-rw-r--r--  mm/page-writeback.c  |   24
-rw-r--r--  mm/page_alloc.c      |  162
-rw-r--r--  mm/page_io.c         |    2
-rw-r--r--  mm/pagewalk.c        |  131
-rw-r--r--  mm/rmap.c            |   53
-rw-r--r--  mm/shmem.c           |  517
-rw-r--r--  mm/slob.c            |   51
-rw-r--r--  mm/slub.c            |  182
-rw-r--r--  mm/sparse.c          |   12
-rw-r--r--  mm/swap.c            |   12
-rw-r--r--  mm/swap_state.c      |  153
-rw-r--r--  mm/swapfile.c        |  150
-rw-r--r--  mm/tiny-shmem.c      |   12
-rw-r--r--  mm/truncate.c        |   10
-rw-r--r--  mm/vmalloc.c         |   74
-rw-r--r--  mm/vmscan.c          |  495
-rw-r--r--  mm/vmstat.c          |   61
35 files changed, 3308 insertions, 1046 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 5c0b0ea7572d..9f117bab5322 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -13,8 +13,10 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 		   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 		   page_isolation.o $(mmu-y)
 
+obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_HAS_DMA)	+= dmapool.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA)	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
@@ -30,4 +32,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_CGROUP_MEM_CONT) += memcontrol.o
 
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 00b02623f008..7e58322b7134 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -98,7 +98,7 @@ EXPORT_SYMBOL_GPL(__percpu_populate_mask);
  */
 void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
 {
-	void *pdata = kzalloc(sizeof(struct percpu_data), gfp);
+	void *pdata = kzalloc(nr_cpu_ids * sizeof(void *), gfp);
 	void *__pdata = __percpu_disguise(pdata);
 
 	if (unlikely(!pdata))
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 00a96970b237..f6ff4337b424 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -111,11 +111,12 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
  * might be used for boot-time allocations - or it might get added
  * to the free page pool later on.
  */
-static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
-					unsigned long size)
+static int __init reserve_bootmem_core(bootmem_data_t *bdata,
+			unsigned long addr, unsigned long size, int flags)
 {
 	unsigned long sidx, eidx;
 	unsigned long i;
+	int ret;
 
 	/*
 	 * round up, partially reserved pages are considered
@@ -133,7 +134,20 @@ static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long add
 #ifdef CONFIG_DEBUG_BOOTMEM
 		printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
 #endif
+		if (flags & BOOTMEM_EXCLUSIVE) {
+			ret = -EBUSY;
+			goto err;
+		}
 	}
+
+	return 0;
+
+err:
+	/* unreserve memory we accidentally reserved */
+	for (i--; i >= sidx; i--)
+		clear_bit(i, bdata->node_bootmem_map);
+
+	return ret;
 }
 
 static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
@@ -374,9 +388,9 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
 }
 
 void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
-				 unsigned long size)
+				 unsigned long size, int flags)
 {
-	reserve_bootmem_core(pgdat->bdata, physaddr, size);
+	reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
 }
 
 void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
@@ -398,9 +412,10 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
 }
 
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
-void __init reserve_bootmem(unsigned long addr, unsigned long size)
+int __init reserve_bootmem(unsigned long addr, unsigned long size,
+			    int flags)
 {
-	reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);
+	return reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size, flags);
 }
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
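A minimal caller sketch for the reworked interface above: reserve_bootmem() now takes a flags argument and returns an error instead of void, so a caller passing BOOTMEM_EXCLUSIVE can detect that part of the range was already reserved. The base and size variables below are placeholders, not taken from this patch:

	if (reserve_bootmem(base, size, BOOTMEM_EXCLUSIVE) < 0) {
		/* some page in [base, base + size) was already reserved (-EBUSY) */
		printk(KERN_INFO "exclusive bootmem reservation failed\n");
		return;
	}
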
diff --git a/mm/dmapool.c b/mm/dmapool.c
new file mode 100644
index 000000000000..34aaac451a96
--- /dev/null
+++ b/mm/dmapool.c
@@ -0,0 +1,500 @@
1/*
2 * DMA Pool allocator
3 *
4 * Copyright 2001 David Brownell
5 * Copyright 2007 Intel Corporation
6 * Author: Matthew Wilcox <willy@linux.intel.com>
7 *
8 * This software may be redistributed and/or modified under the terms of
9 * the GNU General Public License ("GPL") version 2 as published by the
10 * Free Software Foundation.
11 *
12 * This allocator returns small blocks of a given size which are DMA-able by
13 * the given device. It uses the dma_alloc_coherent page allocator to get
14 * new pages, then splits them up into blocks of the required size.
15 * Many older drivers still have their own code to do this.
16 *
17 * The current design of this allocator is fairly simple. The pool is
18 * represented by the 'struct dma_pool' which keeps a doubly-linked list of
19 * allocated pages. Each page in the page_list is split into blocks of at
20 * least 'size' bytes. Free blocks are tracked in an unsorted singly-linked
21 * list of free blocks within the page. Used blocks aren't tracked, but we
22 * keep a count of how many are currently allocated from each page.
23 */
24
25#include <linux/device.h>
26#include <linux/dma-mapping.h>
27#include <linux/dmapool.h>
28#include <linux/kernel.h>
29#include <linux/list.h>
30#include <linux/module.h>
31#include <linux/mutex.h>
32#include <linux/poison.h>
33#include <linux/sched.h>
34#include <linux/slab.h>
35#include <linux/spinlock.h>
36#include <linux/string.h>
37#include <linux/types.h>
38#include <linux/wait.h>
39
40struct dma_pool { /* the pool */
41 struct list_head page_list;
42 spinlock_t lock;
43 size_t size;
44 struct device *dev;
45 size_t allocation;
46 size_t boundary;
47 char name[32];
48 wait_queue_head_t waitq;
49 struct list_head pools;
50};
51
52struct dma_page { /* cacheable header for 'allocation' bytes */
53 struct list_head page_list;
54 void *vaddr;
55 dma_addr_t dma;
56 unsigned int in_use;
57 unsigned int offset;
58};
59
60#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000)
61
62static DEFINE_MUTEX(pools_lock);
63
64static ssize_t
65show_pools(struct device *dev, struct device_attribute *attr, char *buf)
66{
67 unsigned temp;
68 unsigned size;
69 char *next;
70 struct dma_page *page;
71 struct dma_pool *pool;
72
73 next = buf;
74 size = PAGE_SIZE;
75
76 temp = scnprintf(next, size, "poolinfo - 0.1\n");
77 size -= temp;
78 next += temp;
79
80 mutex_lock(&pools_lock);
81 list_for_each_entry(pool, &dev->dma_pools, pools) {
82 unsigned pages = 0;
83 unsigned blocks = 0;
84
85 list_for_each_entry(page, &pool->page_list, page_list) {
86 pages++;
87 blocks += page->in_use;
88 }
89
90 /* per-pool info, no real statistics yet */
91 temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n",
92 pool->name, blocks,
93 pages * (pool->allocation / pool->size),
94 pool->size, pages);
95 size -= temp;
96 next += temp;
97 }
98 mutex_unlock(&pools_lock);
99
100 return PAGE_SIZE - size;
101}
102
103static DEVICE_ATTR(pools, S_IRUGO, show_pools, NULL);
104
105/**
106 * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
107 * @name: name of pool, for diagnostics
108 * @dev: device that will be doing the DMA
109 * @size: size of the blocks in this pool.
110 * @align: alignment requirement for blocks; must be a power of two
111 * @boundary: returned blocks won't cross this power of two boundary
112 * Context: !in_interrupt()
113 *
114 * Returns a dma allocation pool with the requested characteristics, or
115 * null if one can't be created. Given one of these pools, dma_pool_alloc()
116 * may be used to allocate memory. Such memory will all have "consistent"
117 * DMA mappings, accessible by the device and its driver without using
118 * cache flushing primitives. The actual size of blocks allocated may be
119 * larger than requested because of alignment.
120 *
121 * If @boundary is nonzero, objects returned from dma_pool_alloc() won't
122 * cross that size boundary. This is useful for devices which have
123 * addressing restrictions on individual DMA transfers, such as not crossing
124 * boundaries of 4KBytes.
125 */
126struct dma_pool *dma_pool_create(const char *name, struct device *dev,
127 size_t size, size_t align, size_t boundary)
128{
129 struct dma_pool *retval;
130 size_t allocation;
131
132 if (align == 0) {
133 align = 1;
134 } else if (align & (align - 1)) {
135 return NULL;
136 }
137
138 if (size == 0) {
139 return NULL;
140 } else if (size < 4) {
141 size = 4;
142 }
143
144 if ((size % align) != 0)
145 size = ALIGN(size, align);
146
147 allocation = max_t(size_t, size, PAGE_SIZE);
148
149 if (!boundary) {
150 boundary = allocation;
151 } else if ((boundary < size) || (boundary & (boundary - 1))) {
152 return NULL;
153 }
154
155 retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
156 if (!retval)
157 return retval;
158
159 strlcpy(retval->name, name, sizeof(retval->name));
160
161 retval->dev = dev;
162
163 INIT_LIST_HEAD(&retval->page_list);
164 spin_lock_init(&retval->lock);
165 retval->size = size;
166 retval->boundary = boundary;
167 retval->allocation = allocation;
168 init_waitqueue_head(&retval->waitq);
169
170 if (dev) {
171 int ret;
172
173 mutex_lock(&pools_lock);
174 if (list_empty(&dev->dma_pools))
175 ret = device_create_file(dev, &dev_attr_pools);
176 else
177 ret = 0;
178 /* note: not currently insisting "name" be unique */
179 if (!ret)
180 list_add(&retval->pools, &dev->dma_pools);
181 else {
182 kfree(retval);
183 retval = NULL;
184 }
185 mutex_unlock(&pools_lock);
186 } else
187 INIT_LIST_HEAD(&retval->pools);
188
189 return retval;
190}
191EXPORT_SYMBOL(dma_pool_create);
192
193static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
194{
195 unsigned int offset = 0;
196 unsigned int next_boundary = pool->boundary;
197
198 do {
199 unsigned int next = offset + pool->size;
200 if (unlikely((next + pool->size) >= next_boundary)) {
201 next = next_boundary;
202 next_boundary += pool->boundary;
203 }
204 *(int *)(page->vaddr + offset) = next;
205 offset = next;
206 } while (offset < pool->allocation);
207}
208
209static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
210{
211 struct dma_page *page;
212
213 page = kmalloc(sizeof(*page), mem_flags);
214 if (!page)
215 return NULL;
216 page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation,
217 &page->dma, mem_flags);
218 if (page->vaddr) {
219#ifdef CONFIG_DEBUG_SLAB
220 memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
221#endif
222 pool_initialise_page(pool, page);
223 list_add(&page->page_list, &pool->page_list);
224 page->in_use = 0;
225 page->offset = 0;
226 } else {
227 kfree(page);
228 page = NULL;
229 }
230 return page;
231}
232
233static inline int is_page_busy(struct dma_page *page)
234{
235 return page->in_use != 0;
236}
237
238static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
239{
240 dma_addr_t dma = page->dma;
241
242#ifdef CONFIG_DEBUG_SLAB
243 memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
244#endif
245 dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma);
246 list_del(&page->page_list);
247 kfree(page);
248}
249
250/**
251 * dma_pool_destroy - destroys a pool of dma memory blocks.
252 * @pool: dma pool that will be destroyed
253 * Context: !in_interrupt()
254 *
255 * Caller guarantees that no more memory from the pool is in use,
256 * and that nothing will try to use the pool after this call.
257 */
258void dma_pool_destroy(struct dma_pool *pool)
259{
260 mutex_lock(&pools_lock);
261 list_del(&pool->pools);
262 if (pool->dev && list_empty(&pool->dev->dma_pools))
263 device_remove_file(pool->dev, &dev_attr_pools);
264 mutex_unlock(&pools_lock);
265
266 while (!list_empty(&pool->page_list)) {
267 struct dma_page *page;
268 page = list_entry(pool->page_list.next,
269 struct dma_page, page_list);
270 if (is_page_busy(page)) {
271 if (pool->dev)
272 dev_err(pool->dev,
273 "dma_pool_destroy %s, %p busy\n",
274 pool->name, page->vaddr);
275 else
276 printk(KERN_ERR
277 "dma_pool_destroy %s, %p busy\n",
278 pool->name, page->vaddr);
279 /* leak the still-in-use consistent memory */
280 list_del(&page->page_list);
281 kfree(page);
282 } else
283 pool_free_page(pool, page);
284 }
285
286 kfree(pool);
287}
288EXPORT_SYMBOL(dma_pool_destroy);
289
290/**
291 * dma_pool_alloc - get a block of consistent memory
292 * @pool: dma pool that will produce the block
293 * @mem_flags: GFP_* bitmask
294 * @handle: pointer to dma address of block
295 *
296 * This returns the kernel virtual address of a currently unused block,
297 * and reports its dma address through the handle.
298 * If such a memory block can't be allocated, %NULL is returned.
299 */
300void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
301 dma_addr_t *handle)
302{
303 unsigned long flags;
304 struct dma_page *page;
305 size_t offset;
306 void *retval;
307
308 spin_lock_irqsave(&pool->lock, flags);
309 restart:
310 list_for_each_entry(page, &pool->page_list, page_list) {
311 if (page->offset < pool->allocation)
312 goto ready;
313 }
314 page = pool_alloc_page(pool, GFP_ATOMIC);
315 if (!page) {
316 if (mem_flags & __GFP_WAIT) {
317 DECLARE_WAITQUEUE(wait, current);
318
319 __set_current_state(TASK_INTERRUPTIBLE);
320 __add_wait_queue(&pool->waitq, &wait);
321 spin_unlock_irqrestore(&pool->lock, flags);
322
323 schedule_timeout(POOL_TIMEOUT_JIFFIES);
324
325 spin_lock_irqsave(&pool->lock, flags);
326 __remove_wait_queue(&pool->waitq, &wait);
327 goto restart;
328 }
329 retval = NULL;
330 goto done;
331 }
332
333 ready:
334 page->in_use++;
335 offset = page->offset;
336 page->offset = *(int *)(page->vaddr + offset);
337 retval = offset + page->vaddr;
338 *handle = offset + page->dma;
339#ifdef CONFIG_DEBUG_SLAB
340 memset(retval, POOL_POISON_ALLOCATED, pool->size);
341#endif
342 done:
343 spin_unlock_irqrestore(&pool->lock, flags);
344 return retval;
345}
346EXPORT_SYMBOL(dma_pool_alloc);
347
348static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
349{
350 unsigned long flags;
351 struct dma_page *page;
352
353 spin_lock_irqsave(&pool->lock, flags);
354 list_for_each_entry(page, &pool->page_list, page_list) {
355 if (dma < page->dma)
356 continue;
357 if (dma < (page->dma + pool->allocation))
358 goto done;
359 }
360 page = NULL;
361 done:
362 spin_unlock_irqrestore(&pool->lock, flags);
363 return page;
364}
365
366/**
367 * dma_pool_free - put block back into dma pool
368 * @pool: the dma pool holding the block
369 * @vaddr: virtual address of block
370 * @dma: dma address of block
371 *
372 * Caller promises neither device nor driver will again touch this block
373 * unless it is first re-allocated.
374 */
375void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
376{
377 struct dma_page *page;
378 unsigned long flags;
379 unsigned int offset;
380
381 page = pool_find_page(pool, dma);
382 if (!page) {
383 if (pool->dev)
384 dev_err(pool->dev,
385 "dma_pool_free %s, %p/%lx (bad dma)\n",
386 pool->name, vaddr, (unsigned long)dma);
387 else
388 printk(KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n",
389 pool->name, vaddr, (unsigned long)dma);
390 return;
391 }
392
393 offset = vaddr - page->vaddr;
394#ifdef CONFIG_DEBUG_SLAB
395 if ((dma - page->dma) != offset) {
396 if (pool->dev)
397 dev_err(pool->dev,
398 "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
399 pool->name, vaddr, (unsigned long long)dma);
400 else
401 printk(KERN_ERR
402 "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
403 pool->name, vaddr, (unsigned long long)dma);
404 return;
405 }
406 {
407 unsigned int chain = page->offset;
408 while (chain < pool->allocation) {
409 if (chain != offset) {
410 chain = *(int *)(page->vaddr + chain);
411 continue;
412 }
413 if (pool->dev)
414 dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
415 "already free\n", pool->name,
416 (unsigned long long)dma);
417 else
418 printk(KERN_ERR "dma_pool_free %s, dma %Lx "
419 "already free\n", pool->name,
420 (unsigned long long)dma);
421 return;
422 }
423 }
424 memset(vaddr, POOL_POISON_FREED, pool->size);
425#endif
426
427 spin_lock_irqsave(&pool->lock, flags);
428 page->in_use--;
429 *(int *)vaddr = page->offset;
430 page->offset = offset;
431 if (waitqueue_active(&pool->waitq))
432 wake_up_locked(&pool->waitq);
433 /*
434 * Resist a temptation to do
435 * if (!is_page_busy(page)) pool_free_page(pool, page);
436 * Better have a few empty pages hang around.
437 */
438 spin_unlock_irqrestore(&pool->lock, flags);
439}
440EXPORT_SYMBOL(dma_pool_free);
441
442/*
443 * Managed DMA pool
444 */
445static void dmam_pool_release(struct device *dev, void *res)
446{
447 struct dma_pool *pool = *(struct dma_pool **)res;
448
449 dma_pool_destroy(pool);
450}
451
452static int dmam_pool_match(struct device *dev, void *res, void *match_data)
453{
454 return *(struct dma_pool **)res == match_data;
455}
456
457/**
458 * dmam_pool_create - Managed dma_pool_create()
459 * @name: name of pool, for diagnostics
460 * @dev: device that will be doing the DMA
461 * @size: size of the blocks in this pool.
462 * @align: alignment requirement for blocks; must be a power of two
463 * @allocation: returned blocks won't cross this boundary (or zero)
464 *
465 * Managed dma_pool_create(). DMA pool created with this function is
466 * automatically destroyed on driver detach.
467 */
468struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
469 size_t size, size_t align, size_t allocation)
470{
471 struct dma_pool **ptr, *pool;
472
473 ptr = devres_alloc(dmam_pool_release, sizeof(*ptr), GFP_KERNEL);
474 if (!ptr)
475 return NULL;
476
477 pool = *ptr = dma_pool_create(name, dev, size, align, allocation);
478 if (pool)
479 devres_add(dev, ptr);
480 else
481 devres_free(ptr);
482
483 return pool;
484}
485EXPORT_SYMBOL(dmam_pool_create);
486
487/**
488 * dmam_pool_destroy - Managed dma_pool_destroy()
489 * @pool: dma pool that will be destroyed
490 *
491 * Managed dma_pool_destroy().
492 */
493void dmam_pool_destroy(struct dma_pool *pool)
494{
495 struct device *dev = pool->dev;
496
497 dma_pool_destroy(pool);
498 WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool));
499}
500EXPORT_SYMBOL(dmam_pool_destroy);
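
The comments above describe the dma_pool API now provided from mm/dmapool.c. A brief usage sketch against the signatures introduced in this file; the pool name, device pointer and block geometry are illustrative only, not part of the patch:

	struct dma_pool *pool;
	dma_addr_t dma;
	void *vaddr;

	/* 64-byte blocks, 16-byte aligned, never crossing a 4 KiB boundary */
	pool = dma_pool_create("example", &pdev->dev, 64, 16, 4096);
	if (!pool)
		return -ENOMEM;

	vaddr = dma_pool_alloc(pool, GFP_KERNEL, &dma);
	if (vaddr) {
		/* hand "dma" to the device, use "vaddr" from the CPU */
		dma_pool_free(pool, vaddr, dma);
	}

	dma_pool_destroy(pool);	/* caller must have freed every block first */

Drivers that want the pool torn down automatically on detach can call dmam_pool_create() with the same argument list instead.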
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 0df4c899e979..3c0f1e99f5e4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -49,9 +49,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 		goto out;
 	}
 
-	if (mapping->a_ops->get_xip_page)
-		/* no bad return value, but ignore advice */
+	if (mapping->a_ops->get_xip_page) {
+		switch (advice) {
+		case POSIX_FADV_NORMAL:
+		case POSIX_FADV_RANDOM:
+		case POSIX_FADV_SEQUENTIAL:
+		case POSIX_FADV_WILLNEED:
+		case POSIX_FADV_NOREUSE:
+		case POSIX_FADV_DONTNEED:
+			/* no bad return value, but ignore advice */
+			break;
+		default:
+			ret = -EINVAL;
+		}
 		goto out;
+	}
 
 	/* Careful about overflows. Len == 0 means "as much as possible" */
 	endbyte = offset + len;
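From user space, the visible effect of the hunk above is that an fadvise call on an XIP-backed file now rejects unknown advice values instead of silently accepting them; the recognised POSIX_FADV_* values are still accepted (and ignored). A hedged illustration, assuming fd refers to such a file and <fcntl.h> with _XOPEN_SOURCE >= 600 is available:

	int ok  = posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED); /* returns 0, advice ignored */
	int bad = posix_fadvise(fd, 0, 0, 42);                  /* bogus advice: now returns EINVAL */
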
diff --git a/mm/filemap.c b/mm/filemap.c
index 76bea88cbebc..5357fcc4643b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 /*
@@ -65,7 +66,6 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *  ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *    ->swap_lock		(exclusive_swap_page, others)
  *      ->mapping->tree_lock
- *        ->zone.lock
  *
  *  ->i_mutex
  *    ->i_mmap_lock		(truncate->unmap_mapping_range)
@@ -119,6 +119,7 @@ void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
+	mem_cgroup_uncharge_page(page);
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
@@ -459,8 +460,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 int add_to_page_cache(struct page *page, struct address_space *mapping,
 		pgoff_t offset, gfp_t gfp_mask)
 {
-	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+	int error = mem_cgroup_cache_charge(page, current->mm,
+					gfp_mask & ~__GFP_HIGHMEM);
+	if (error)
+		goto out;
 
+	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 	if (error == 0) {
 		write_lock_irq(&mapping->tree_lock);
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
@@ -471,10 +476,14 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 			page->index = offset;
 			mapping->nrpages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
-		}
+		} else
+			mem_cgroup_uncharge_page(page);
+
 		write_unlock_irq(&mapping->tree_lock);
 		radix_tree_preload_end();
-	}
+	} else
+		mem_cgroup_uncharge_page(page);
+out:
 	return error;
 }
 EXPORT_SYMBOL(add_to_page_cache);
@@ -528,7 +537,7 @@ static inline void wake_up_page(struct page *page, int bit)
 	__wake_up_bit(page_waitqueue(page), &page->flags, bit);
 }
 
-void fastcall wait_on_page_bit(struct page *page, int bit_nr)
+void wait_on_page_bit(struct page *page, int bit_nr)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
 
@@ -552,7 +561,7 @@ EXPORT_SYMBOL(wait_on_page_bit);
  * the clear_bit and the read of the waitqueue (to avoid SMP races with a
  * parallel wait_on_page_locked()).
  */
-void fastcall unlock_page(struct page *page)
+void unlock_page(struct page *page)
 {
 	smp_mb__before_clear_bit();
 	if (!TestClearPageLocked(page))
@@ -586,7 +595,7 @@ EXPORT_SYMBOL(end_page_writeback);
  * chances are that on the second loop, the block layer's plug list is empty,
  * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
  */
-void fastcall __lock_page(struct page *page)
+void __lock_page(struct page *page)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
@@ -607,7 +616,7 @@ int fastcall __lock_page_killable(struct page *page)
  * Variant of lock_page that does not require the caller to hold a reference
  * on the page's mapping.
  */
-void fastcall __lock_page_nosync(struct page *page)
+void __lock_page_nosync(struct page *page)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
@@ -1277,7 +1286,7 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
  * This adds the requested page to the page cache if it isn't already there,
  * and schedules an I/O to read in its contents from disk.
  */
-static int fastcall page_cache_read(struct file * file, pgoff_t offset)
+static int page_cache_read(struct file *file, pgoff_t offset)
 {
 	struct address_space *mapping = file->f_mapping;
 	struct page *page;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index f874ae818ad3..0420a0292b03 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -431,7 +431,7 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
 	else
 		return PTR_ERR(page);
 	}
-	zero_user_page(page, offset, length, KM_USER0);
+	zero_user(page, offset, length);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/fremap.c b/mm/fremap.c
index 14bd3bf7826e..69a37c2bdf81 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -190,10 +190,13 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 	 */
 	if (mapping_cap_account_dirty(mapping)) {
 		unsigned long addr;
+		struct file *file = vma->vm_file;
 
 		flags &= MAP_NONBLOCK;
-		addr = mmap_region(vma->vm_file, start, size,
+		get_file(file);
+		addr = mmap_region(file, start, size,
 			flags, vma->vm_flags, pgoff, 1);
+		fput(file);
 		if (IS_ERR_VALUE(addr)) {
 			err = addr;
 		} else {
diff --git a/mm/highmem.c b/mm/highmem.c
index 7a967bc35152..35d47733cde4 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -163,7 +163,7 @@ start:
 	return vaddr;
 }
 
-void fastcall *kmap_high(struct page *page)
+void *kmap_high(struct page *page)
 {
 	unsigned long vaddr;
 
@@ -185,7 +185,7 @@ void fastcall *kmap_high(struct page *page)
 
 EXPORT_SYMBOL(kmap_high);
 
-void fastcall kunmap_high(struct page *page)
+void kunmap_high(struct page *page)
 {
 	unsigned long vaddr;
 	unsigned long nr;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index db861d8b6c28..1a5642074e34 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -813,6 +813,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	spin_unlock(&mm->page_table_lock);
 	copy_huge_page(new_page, old_page, address, vma);
+	__SetPageUptodate(new_page);
 	spin_lock(&mm->page_table_lock);
 
 	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -858,6 +859,7 @@ retry:
 		goto out;
 	}
 	clear_huge_page(page, address);
+	__SetPageUptodate(page);
 
 	if (vma->vm_flags & VM_SHARED) {
 		int err;
diff --git a/mm/internal.h b/mm/internal.h
index 953f941ea867..5a9a6200e034 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -24,7 +24,7 @@ static inline void set_page_count(struct page *page, int v)
  */
 static inline void set_page_refcounted(struct page *page)
 {
-	VM_BUG_ON(PageCompound(page) && PageTail(page));
+	VM_BUG_ON(PageTail(page));
 	VM_BUG_ON(atomic_read(&page->_count));
 	set_page_count(page, 1);
 }
@@ -34,7 +34,7 @@ static inline void __put_page(struct page *page)
 	atomic_dec(&page->_count);
 }
 
-extern void fastcall __init __free_pages_bootmem(struct page *page,
+extern void __init __free_pages_bootmem(struct page *page,
 						unsigned int order);
 
 /*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
new file mode 100644
index 000000000000..5c2c702af617
--- /dev/null
+++ b/mm/memcontrol.c
@@ -0,0 +1,1192 @@
1/* memcontrol.c - Memory Controller
2 *
3 * Copyright IBM Corporation, 2007
4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5 *
6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 */
19
20#include <linux/res_counter.h>
21#include <linux/memcontrol.h>
22#include <linux/cgroup.h>
23#include <linux/mm.h>
24#include <linux/smp.h>
25#include <linux/page-flags.h>
26#include <linux/backing-dev.h>
27#include <linux/bit_spinlock.h>
28#include <linux/rcupdate.h>
29#include <linux/swap.h>
30#include <linux/spinlock.h>
31#include <linux/fs.h>
32#include <linux/seq_file.h>
33
34#include <asm/uaccess.h>
35
36struct cgroup_subsys mem_cgroup_subsys;
37static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
38
39/*
40 * Statistics for memory cgroup.
41 */
42enum mem_cgroup_stat_index {
43 /*
44 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
45 */
46 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
47 MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */
48
49 MEM_CGROUP_STAT_NSTATS,
50};
51
52struct mem_cgroup_stat_cpu {
53 s64 count[MEM_CGROUP_STAT_NSTATS];
54} ____cacheline_aligned_in_smp;
55
56struct mem_cgroup_stat {
57 struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
58};
59
60/*
61 * For accounting under irq disable, no need for increment preempt count.
62 */
63static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat,
64 enum mem_cgroup_stat_index idx, int val)
65{
66 int cpu = smp_processor_id();
67 stat->cpustat[cpu].count[idx] += val;
68}
69
70static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
71 enum mem_cgroup_stat_index idx)
72{
73 int cpu;
74 s64 ret = 0;
75 for_each_possible_cpu(cpu)
76 ret += stat->cpustat[cpu].count[idx];
77 return ret;
78}
79
80/*
81 * per-zone information in memory controller.
82 */
83
84enum mem_cgroup_zstat_index {
85 MEM_CGROUP_ZSTAT_ACTIVE,
86 MEM_CGROUP_ZSTAT_INACTIVE,
87
88 NR_MEM_CGROUP_ZSTAT,
89};
90
91struct mem_cgroup_per_zone {
92 /*
93 * spin_lock to protect the per cgroup LRU
94 */
95 spinlock_t lru_lock;
96 struct list_head active_list;
97 struct list_head inactive_list;
98 unsigned long count[NR_MEM_CGROUP_ZSTAT];
99};
100/* Macro for accessing counter */
101#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
102
103struct mem_cgroup_per_node {
104 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
105};
106
107struct mem_cgroup_lru_info {
108 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
109};
110
111/*
112 * The memory controller data structure. The memory controller controls both
113 * page cache and RSS per cgroup. We would eventually like to provide
114 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
115 * to help the administrator determine what knobs to tune.
116 *
117 * TODO: Add a water mark for the memory controller. Reclaim will begin when
118 * we hit the water mark. May be even add a low water mark, such that
119 * no reclaim occurs from a cgroup at it's low water mark, this is
120 * a feature that will be implemented much later in the future.
121 */
122struct mem_cgroup {
123 struct cgroup_subsys_state css;
124 /*
125 * the counter to account for memory usage
126 */
127 struct res_counter res;
128 /*
129 * Per cgroup active and inactive list, similar to the
130 * per zone LRU lists.
131 */
132 struct mem_cgroup_lru_info info;
133
134 int prev_priority; /* for recording reclaim priority */
135 /*
136 * statistics.
137 */
138 struct mem_cgroup_stat stat;
139};
140
141/*
142 * We use the lower bit of the page->page_cgroup pointer as a bit spin
143 * lock. We need to ensure that page->page_cgroup is atleast two
144 * byte aligned (based on comments from Nick Piggin)
145 */
146#define PAGE_CGROUP_LOCK_BIT 0x0
147#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
148
149/*
150 * A page_cgroup page is associated with every page descriptor. The
151 * page_cgroup helps us identify information about the cgroup
152 */
153struct page_cgroup {
154 struct list_head lru; /* per cgroup LRU list */
155 struct page *page;
156 struct mem_cgroup *mem_cgroup;
157 atomic_t ref_cnt; /* Helpful when pages move b/w */
158 /* mapped and cached states */
159 int flags;
160};
161#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
162#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
163
164static inline int page_cgroup_nid(struct page_cgroup *pc)
165{
166 return page_to_nid(pc->page);
167}
168
169static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
170{
171 return page_zonenum(pc->page);
172}
173
174enum {
175 MEM_CGROUP_TYPE_UNSPEC = 0,
176 MEM_CGROUP_TYPE_MAPPED,
177 MEM_CGROUP_TYPE_CACHED,
178 MEM_CGROUP_TYPE_ALL,
179 MEM_CGROUP_TYPE_MAX,
180};
181
182enum charge_type {
183 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
184 MEM_CGROUP_CHARGE_TYPE_MAPPED,
185};
186
187
188/*
189 * Always modified under lru lock. Then, not necessary to preempt_disable()
190 */
191static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
192 bool charge)
193{
194 int val = (charge)? 1 : -1;
195 struct mem_cgroup_stat *stat = &mem->stat;
196 VM_BUG_ON(!irqs_disabled());
197
198 if (flags & PAGE_CGROUP_FLAG_CACHE)
199 __mem_cgroup_stat_add_safe(stat,
200 MEM_CGROUP_STAT_CACHE, val);
201 else
202 __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
203}
204
205static inline struct mem_cgroup_per_zone *
206mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
207{
208 BUG_ON(!mem->info.nodeinfo[nid]);
209 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
210}
211
212static inline struct mem_cgroup_per_zone *
213page_cgroup_zoneinfo(struct page_cgroup *pc)
214{
215 struct mem_cgroup *mem = pc->mem_cgroup;
216 int nid = page_cgroup_nid(pc);
217 int zid = page_cgroup_zid(pc);
218
219 return mem_cgroup_zoneinfo(mem, nid, zid);
220}
221
222static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
223 enum mem_cgroup_zstat_index idx)
224{
225 int nid, zid;
226 struct mem_cgroup_per_zone *mz;
227 u64 total = 0;
228
229 for_each_online_node(nid)
230 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
231 mz = mem_cgroup_zoneinfo(mem, nid, zid);
232 total += MEM_CGROUP_ZSTAT(mz, idx);
233 }
234 return total;
235}
236
237static struct mem_cgroup init_mem_cgroup;
238
239static inline
240struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
241{
242 return container_of(cgroup_subsys_state(cont,
243 mem_cgroup_subsys_id), struct mem_cgroup,
244 css);
245}
246
247static inline
248struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
249{
250 return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
251 struct mem_cgroup, css);
252}
253
254void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
255{
256 struct mem_cgroup *mem;
257
258 mem = mem_cgroup_from_task(p);
259 css_get(&mem->css);
260 mm->mem_cgroup = mem;
261}
262
263void mm_free_cgroup(struct mm_struct *mm)
264{
265 css_put(&mm->mem_cgroup->css);
266}
267
268static inline int page_cgroup_locked(struct page *page)
269{
270 return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
271 &page->page_cgroup);
272}
273
274void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
275{
276 int locked;
277
278 /*
279 * While resetting the page_cgroup we might not hold the
280 * page_cgroup lock. free_hot_cold_page() is an example
281 * of such a scenario
282 */
283 if (pc)
284 VM_BUG_ON(!page_cgroup_locked(page));
285 locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
286 page->page_cgroup = ((unsigned long)pc | locked);
287}
288
289struct page_cgroup *page_get_page_cgroup(struct page *page)
290{
291 return (struct page_cgroup *)
292 (page->page_cgroup & ~PAGE_CGROUP_LOCK);
293}
294
295static void __always_inline lock_page_cgroup(struct page *page)
296{
297 bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
298 VM_BUG_ON(!page_cgroup_locked(page));
299}
300
301static void __always_inline unlock_page_cgroup(struct page *page)
302{
303 bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
304}
305
306/*
307 * Tie new page_cgroup to struct page under lock_page_cgroup()
308 * This can fail if the page has been tied to a page_cgroup.
309 * If success, returns 0.
310 */
311static int page_cgroup_assign_new_page_cgroup(struct page *page,
312 struct page_cgroup *pc)
313{
314 int ret = 0;
315
316 lock_page_cgroup(page);
317 if (!page_get_page_cgroup(page))
318 page_assign_page_cgroup(page, pc);
319 else /* A page is tied to other pc. */
320 ret = 1;
321 unlock_page_cgroup(page);
322 return ret;
323}
324
325/*
326 * Clear page->page_cgroup member under lock_page_cgroup().
327 * If given "pc" value is different from one page->page_cgroup,
328 * page->cgroup is not cleared.
329 * Returns a value of page->page_cgroup at lock taken.
330 * A can can detect failure of clearing by following
331 * clear_page_cgroup(page, pc) == pc
332 */
333
334static struct page_cgroup *clear_page_cgroup(struct page *page,
335 struct page_cgroup *pc)
336{
337 struct page_cgroup *ret;
338 /* lock and clear */
339 lock_page_cgroup(page);
340 ret = page_get_page_cgroup(page);
341 if (likely(ret == pc))
342 page_assign_page_cgroup(page, NULL);
343 unlock_page_cgroup(page);
344 return ret;
345}
346
347static void __mem_cgroup_remove_list(struct page_cgroup *pc)
348{
349 int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
350 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
351
352 if (from)
353 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
354 else
355 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
356
357 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
358 list_del_init(&pc->lru);
359}
360
361static void __mem_cgroup_add_list(struct page_cgroup *pc)
362{
363 int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
364 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
365
366 if (!to) {
367 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
368 list_add(&pc->lru, &mz->inactive_list);
369 } else {
370 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
371 list_add(&pc->lru, &mz->active_list);
372 }
373 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
374}
375
376static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
377{
378 int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
379 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
380
381 if (from)
382 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
383 else
384 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
385
386 if (active) {
387 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
388 pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
389 list_move(&pc->lru, &mz->active_list);
390 } else {
391 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
392 pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
393 list_move(&pc->lru, &mz->inactive_list);
394 }
395}
396
397int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
398{
399 int ret;
400
401 task_lock(task);
402 ret = task->mm && mm_cgroup(task->mm) == mem;
403 task_unlock(task);
404 return ret;
405}
406
407/*
408 * This routine assumes that the appropriate zone's lru lock is already held
409 */
410void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
411{
412 struct mem_cgroup_per_zone *mz;
413 unsigned long flags;
414
415 if (!pc)
416 return;
417
418 mz = page_cgroup_zoneinfo(pc);
419 spin_lock_irqsave(&mz->lru_lock, flags);
420 __mem_cgroup_move_lists(pc, active);
421 spin_unlock_irqrestore(&mz->lru_lock, flags);
422}
423
424/*
425 * Calculate mapped_ratio under memory controller. This will be used in
426 * vmscan.c for deteremining we have to reclaim mapped pages.
427 */
428int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
429{
430 long total, rss;
431
432 /*
433 * usage is recorded in bytes. But, here, we assume the number of
434 * physical pages can be represented by "long" on any arch.
435 */
436 total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
437 rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
438 return (int)((rss * 100L) / total);
439}
440/*
441 * This function is called from vmscan.c. In page reclaiming loop. balance
442 * between active and inactive list is calculated. For memory controller
443 * page reclaiming, we should use using mem_cgroup's imbalance rather than
444 * zone's global lru imbalance.
445 */
446long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
447{
448 unsigned long active, inactive;
449 /* active and inactive are the number of pages. 'long' is ok.*/
450 active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
451 inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
452 return (long) (active / (inactive + 1));
453}
454
455/*
456 * prev_priority control...this will be used in memory reclaim path.
457 */
458int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
459{
460 return mem->prev_priority;
461}
462
463void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
464{
465 if (priority < mem->prev_priority)
466 mem->prev_priority = priority;
467}
468
469void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
470{
471 mem->prev_priority = priority;
472}
473
474/*
475 * Calculate # of pages to be scanned in this priority/zone.
476 * See also vmscan.c
477 *
478 * priority starts from "DEF_PRIORITY" and decremented in each loop.
479 * (see include/linux/mmzone.h)
480 */
481
482long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
483 struct zone *zone, int priority)
484{
485 long nr_active;
486 int nid = zone->zone_pgdat->node_id;
487 int zid = zone_idx(zone);
488 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
489
490 nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE);
491 return (nr_active >> priority);
492}
493
494long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
495 struct zone *zone, int priority)
496{
497 long nr_inactive;
498 int nid = zone->zone_pgdat->node_id;
499 int zid = zone_idx(zone);
500 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
501
502 nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
503
504 return (nr_inactive >> priority);
505}
506
507unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
508 struct list_head *dst,
509 unsigned long *scanned, int order,
510 int mode, struct zone *z,
511 struct mem_cgroup *mem_cont,
512 int active)
513{
514 unsigned long nr_taken = 0;
515 struct page *page;
516 unsigned long scan;
517 LIST_HEAD(pc_list);
518 struct list_head *src;
519 struct page_cgroup *pc, *tmp;
520 int nid = z->zone_pgdat->node_id;
521 int zid = zone_idx(z);
522 struct mem_cgroup_per_zone *mz;
523
524 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
525 if (active)
526 src = &mz->active_list;
527 else
528 src = &mz->inactive_list;
529
530
531 spin_lock(&mz->lru_lock);
532 scan = 0;
533 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
534 if (scan >= nr_to_scan)
535 break;
536 page = pc->page;
537 VM_BUG_ON(!pc);
538
539 if (unlikely(!PageLRU(page)))
540 continue;
541
542 if (PageActive(page) && !active) {
543 __mem_cgroup_move_lists(pc, true);
544 continue;
545 }
546 if (!PageActive(page) && active) {
547 __mem_cgroup_move_lists(pc, false);
548 continue;
549 }
550
551 scan++;
552 list_move(&pc->lru, &pc_list);
553
554 if (__isolate_lru_page(page, mode) == 0) {
555 list_move(&page->lru, dst);
556 nr_taken++;
557 }
558 }
559
560 list_splice(&pc_list, src);
561 spin_unlock(&mz->lru_lock);
562
563 *scanned = scan;
564 return nr_taken;
565}
566
567/*
568 * Charge the memory controller for page usage.
569 * Return
570 * 0 if the charge was successful
571 * < 0 if the cgroup is over its limit
572 */
573static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
574 gfp_t gfp_mask, enum charge_type ctype)
575{
576 struct mem_cgroup *mem;
577 struct page_cgroup *pc;
578 unsigned long flags;
579 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
580 struct mem_cgroup_per_zone *mz;
581
582 /*
583 * Should page_cgroup's go to their own slab?
584 * One could optimize the performance of the charging routine
585 * by saving a bit in the page_flags and using it as a lock
586 * to see if the cgroup page already has a page_cgroup associated
587 * with it
588 */
589retry:
590 if (page) {
591 lock_page_cgroup(page);
592 pc = page_get_page_cgroup(page);
593 /*
594 * The page_cgroup exists and
595 * the page has already been accounted.
596 */
597 if (pc) {
598 if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
599 /* this page is under being uncharged ? */
600 unlock_page_cgroup(page);
601 cpu_relax();
602 goto retry;
603 } else {
604 unlock_page_cgroup(page);
605 goto done;
606 }
607 }
608 unlock_page_cgroup(page);
609 }
610
611 pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
612 if (pc == NULL)
613 goto err;
614
615 /*
616 * We always charge the cgroup the mm_struct belongs to.
617 * The mm_struct's mem_cgroup changes on task migration if the
618 * thread group leader migrates. It's possible that mm is not
619 * set, if so charge the init_mm (happens for pagecache usage).
620 */
621 if (!mm)
622 mm = &init_mm;
623
624 rcu_read_lock();
625 mem = rcu_dereference(mm->mem_cgroup);
626 /*
627 * For every charge from the cgroup, increment reference
628 * count
629 */
630 css_get(&mem->css);
631 rcu_read_unlock();
632
633 /*
634 * If we created the page_cgroup, we should free it on exceeding
635 * the cgroup limit.
636 */
637 while (res_counter_charge(&mem->res, PAGE_SIZE)) {
638 if (!(gfp_mask & __GFP_WAIT))
639 goto out;
640
641 if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
642 continue;
643
644 /*
645 * try_to_free_mem_cgroup_pages() might not give us a full
646 * picture of reclaim. Some pages are reclaimed and might be
647 * moved to swap cache or just unmapped from the cgroup.
648 * Check the limit again to see if the reclaim reduced the
649 * current usage of the cgroup before giving up
650 */
651 if (res_counter_check_under_limit(&mem->res))
652 continue;
653
654 if (!nr_retries--) {
655 mem_cgroup_out_of_memory(mem, gfp_mask);
656 goto out;
657 }
658 congestion_wait(WRITE, HZ/10);
659 }
660
661 atomic_set(&pc->ref_cnt, 1);
662 pc->mem_cgroup = mem;
663 pc->page = page;
664 pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
665 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
666 pc->flags |= PAGE_CGROUP_FLAG_CACHE;
667
668 if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) {
669 /*
670 * Another charge has been added to this page already.
671 * We take lock_page_cgroup(page) again and read
672 * page->cgroup, increment refcnt.... just retry is OK.
673 */
674 res_counter_uncharge(&mem->res, PAGE_SIZE);
675 css_put(&mem->css);
676 kfree(pc);
677 if (!page)
678 goto done;
679 goto retry;
680 }
681
682 mz = page_cgroup_zoneinfo(pc);
683 spin_lock_irqsave(&mz->lru_lock, flags);
684 /* Update statistics vector */
685 __mem_cgroup_add_list(pc);
686 spin_unlock_irqrestore(&mz->lru_lock, flags);
687
688done:
689 return 0;
690out:
691 css_put(&mem->css);
692 kfree(pc);
693err:
694 return -ENOMEM;
695}
696
697int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
698 gfp_t gfp_mask)
699{
700 return mem_cgroup_charge_common(page, mm, gfp_mask,
701 MEM_CGROUP_CHARGE_TYPE_MAPPED);
702}
703
704/*
705 * See if the cached pages should be charged at all?
706 */
707int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
708 gfp_t gfp_mask)
709{
710 int ret = 0;
711 if (!mm)
712 mm = &init_mm;
713
714 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
715 MEM_CGROUP_CHARGE_TYPE_CACHE);
716 return ret;
717}
718
719/*
720 * Uncharging is always a welcome operation, we never complain, simply
721 * uncharge. This routine should be called with lock_page_cgroup held
722 */
723void mem_cgroup_uncharge(struct page_cgroup *pc)
724{
725 struct mem_cgroup *mem;
726 struct mem_cgroup_per_zone *mz;
727 struct page *page;
728 unsigned long flags;
729
730 /*
731 * Check if our page_cgroup is valid
732 */
733 if (!pc)
734 return;
735
736 if (atomic_dec_and_test(&pc->ref_cnt)) {
737 page = pc->page;
738 mz = page_cgroup_zoneinfo(pc);
739 /*
740 * get page->cgroup and clear it under lock.
741 * force_empty can drop page->cgroup without checking refcnt.
742 */
743 unlock_page_cgroup(page);
744 if (clear_page_cgroup(page, pc) == pc) {
745 mem = pc->mem_cgroup;
746 css_put(&mem->css);
747 res_counter_uncharge(&mem->res, PAGE_SIZE);
748 spin_lock_irqsave(&mz->lru_lock, flags);
749 __mem_cgroup_remove_list(pc);
750 spin_unlock_irqrestore(&mz->lru_lock, flags);
751 kfree(pc);
752 }
753 lock_page_cgroup(page);
754 }
755}
756
757void mem_cgroup_uncharge_page(struct page *page)
758{
759 lock_page_cgroup(page);
760 mem_cgroup_uncharge(page_get_page_cgroup(page));
761 unlock_page_cgroup(page);
762}
763
764/*
765 * Returns non-zero if a page (under migration) has valid page_cgroup member.
766 * Refcnt of page_cgroup is incremented.
767 */
768
769int mem_cgroup_prepare_migration(struct page *page)
770{
771 struct page_cgroup *pc;
772 int ret = 0;
773 lock_page_cgroup(page);
774 pc = page_get_page_cgroup(page);
775 if (pc && atomic_inc_not_zero(&pc->ref_cnt))
776 ret = 1;
777 unlock_page_cgroup(page);
778 return ret;
779}
780
781void mem_cgroup_end_migration(struct page *page)
782{
783 struct page_cgroup *pc;
784
785 lock_page_cgroup(page);
786 pc = page_get_page_cgroup(page);
787 mem_cgroup_uncharge(pc);
788 unlock_page_cgroup(page);
789}
790/*
791 * We know both *page* and *newpage* are now not-on-LRU and Pg_locked.
792 * And no race with uncharge() routines because page_cgroup for *page*
793 * has extra one reference by mem_cgroup_prepare_migration.
794 */
795
796void mem_cgroup_page_migration(struct page *page, struct page *newpage)
797{
798 struct page_cgroup *pc;
799 struct mem_cgroup *mem;
800 unsigned long flags;
801 struct mem_cgroup_per_zone *mz;
802retry:
803 pc = page_get_page_cgroup(page);
804 if (!pc)
805 return;
806 mem = pc->mem_cgroup;
807 mz = page_cgroup_zoneinfo(pc);
808 if (clear_page_cgroup(page, pc) != pc)
809 goto retry;
810 spin_lock_irqsave(&mz->lru_lock, flags);
811
812 __mem_cgroup_remove_list(pc);
813 spin_unlock_irqrestore(&mz->lru_lock, flags);
814
815 pc->page = newpage;
816 lock_page_cgroup(newpage);
817 page_assign_page_cgroup(newpage, pc);
818 unlock_page_cgroup(newpage);
819
820 mz = page_cgroup_zoneinfo(pc);
821 spin_lock_irqsave(&mz->lru_lock, flags);
822 __mem_cgroup_add_list(pc);
823 spin_unlock_irqrestore(&mz->lru_lock, flags);
824 return;
825}
826
827/*
828 * This routine traverse page_cgroup in given list and drop them all.
829 * This routine ignores page_cgroup->ref_cnt.
830 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
831 */
832#define FORCE_UNCHARGE_BATCH (128)
833static void
834mem_cgroup_force_empty_list(struct mem_cgroup *mem,
835 struct mem_cgroup_per_zone *mz,
836 int active)
837{
838 struct page_cgroup *pc;
839 struct page *page;
840 int count;
841 unsigned long flags;
842 struct list_head *list;
843
844 if (active)
845 list = &mz->active_list;
846 else
847 list = &mz->inactive_list;
848
849 if (list_empty(list))
850 return;
851retry:
852 count = FORCE_UNCHARGE_BATCH;
853 spin_lock_irqsave(&mz->lru_lock, flags);
854
855 while (--count && !list_empty(list)) {
856 pc = list_entry(list->prev, struct page_cgroup, lru);
857 page = pc->page;
858 /* Avoid race with charge */
859 atomic_set(&pc->ref_cnt, 0);
860 if (clear_page_cgroup(page, pc) == pc) {
861 css_put(&mem->css);
862 res_counter_uncharge(&mem->res, PAGE_SIZE);
863 __mem_cgroup_remove_list(pc);
864 kfree(pc);
865 } else /* being uncharged ? ...do relax */
866 break;
867 }
868 spin_unlock_irqrestore(&mz->lru_lock, flags);
869 if (!list_empty(list)) {
870 cond_resched();
871 goto retry;
872 }
873 return;
874}
875
876/*
877 * make mem_cgroup's charge to be 0 if there is no task.
878 * This enables deleting this mem_cgroup.
879 */
880
881int mem_cgroup_force_empty(struct mem_cgroup *mem)
882{
883 int ret = -EBUSY;
884 int node, zid;
885 css_get(&mem->css);
886 /*
887 * page reclaim code (kswapd etc..) will move pages between
888 * active_list <-> inactive_list while we don't take a lock.
889 * So, we have to do loop here until all lists are empty.
890 */
891 while (mem->res.usage > 0) {
892 if (atomic_read(&mem->css.cgroup->count) > 0)
893 goto out;
894 for_each_node_state(node, N_POSSIBLE)
895 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
896 struct mem_cgroup_per_zone *mz;
897 mz = mem_cgroup_zoneinfo(mem, node, zid);
898 /* drop all page_cgroup in active_list */
899 mem_cgroup_force_empty_list(mem, mz, 1);
900 /* drop all page_cgroup in inactive_list */
901 mem_cgroup_force_empty_list(mem, mz, 0);
902 }
903 }
904 ret = 0;
905out:
906 css_put(&mem->css);
907 return ret;
908}
909
910
911
912int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
913{
914 *tmp = memparse(buf, &buf);
915 if (*buf != '\0')
916 return -EINVAL;
917
918 /*
919 * Round up the value to the closest page size
920 */
921 *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
922 return 0;
923}
924
925static ssize_t mem_cgroup_read(struct cgroup *cont,
926 struct cftype *cft, struct file *file,
927 char __user *userbuf, size_t nbytes, loff_t *ppos)
928{
929 return res_counter_read(&mem_cgroup_from_cont(cont)->res,
930 cft->private, userbuf, nbytes, ppos,
931 NULL);
932}
933
934static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
935 struct file *file, const char __user *userbuf,
936 size_t nbytes, loff_t *ppos)
937{
938 return res_counter_write(&mem_cgroup_from_cont(cont)->res,
939 cft->private, userbuf, nbytes, ppos,
940 mem_cgroup_write_strategy);
941}
942
943static ssize_t mem_force_empty_write(struct cgroup *cont,
944 struct cftype *cft, struct file *file,
945 const char __user *userbuf,
946 size_t nbytes, loff_t *ppos)
947{
948 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
949 int ret;
950 ret = mem_cgroup_force_empty(mem);
951 if (!ret)
952 ret = nbytes;
953 return ret;
954}
955
956/*
957 * Note: This should be removed once cgroup supports write-only files.
958 */
959
960static ssize_t mem_force_empty_read(struct cgroup *cont,
961 struct cftype *cft,
962 struct file *file, char __user *userbuf,
963 size_t nbytes, loff_t *ppos)
964{
965 return -EINVAL;
966}
967
968
969static const struct mem_cgroup_stat_desc {
970 const char *msg;
971 u64 unit;
972} mem_cgroup_stat_desc[] = {
973 [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
974 [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
975};
976
977static int mem_control_stat_show(struct seq_file *m, void *arg)
978{
979 struct cgroup *cont = m->private;
980 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
981 struct mem_cgroup_stat *stat = &mem_cont->stat;
982 int i;
983
984 for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
985 s64 val;
986
987 val = mem_cgroup_read_stat(stat, i);
988 val *= mem_cgroup_stat_desc[i].unit;
989 seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg,
990 (long long)val);
991 }
992 /* showing # of active pages */
993 {
994 unsigned long active, inactive;
995
996 inactive = mem_cgroup_get_all_zonestat(mem_cont,
997 MEM_CGROUP_ZSTAT_INACTIVE);
998 active = mem_cgroup_get_all_zonestat(mem_cont,
999 MEM_CGROUP_ZSTAT_ACTIVE);
1000 seq_printf(m, "active %ld\n", (active) * PAGE_SIZE);
1001 seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE);
1002 }
1003 return 0;
1004}
1005
1006static const struct file_operations mem_control_stat_file_operations = {
1007 .read = seq_read,
1008 .llseek = seq_lseek,
1009 .release = single_release,
1010};
1011
1012static int mem_control_stat_open(struct inode *unused, struct file *file)
1013{
1014 /* XXX __d_cont */
1015 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
1016
1017 file->f_op = &mem_control_stat_file_operations;
1018 return single_open(file, mem_control_stat_show, cont);
1019}
1020
1021
1022
1023static struct cftype mem_cgroup_files[] = {
1024 {
1025 .name = "usage_in_bytes",
1026 .private = RES_USAGE,
1027 .read = mem_cgroup_read,
1028 },
1029 {
1030 .name = "limit_in_bytes",
1031 .private = RES_LIMIT,
1032 .write = mem_cgroup_write,
1033 .read = mem_cgroup_read,
1034 },
1035 {
1036 .name = "failcnt",
1037 .private = RES_FAILCNT,
1038 .read = mem_cgroup_read,
1039 },
1040 {
1041 .name = "force_empty",
1042 .write = mem_force_empty_write,
1043 .read = mem_force_empty_read,
1044 },
1045 {
1046 .name = "stat",
1047 .open = mem_control_stat_open,
1048 },
1049};
1050
1051static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1052{
1053 struct mem_cgroup_per_node *pn;
1054 struct mem_cgroup_per_zone *mz;
1055 int zone;
1056 /*
1057 * This routine is called for each possible node.
1058 * But it's a BUG to call kmalloc() against an offline node.
1059 *
1060 * TODO: this routine can waste a lot of memory for nodes which will
1061 * never be onlined. It would be better to use a memory hotplug
1062 * callback function.
1063 */
1064 if (node_state(node, N_HIGH_MEMORY))
1065 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, node);
1066 else
1067 pn = kmalloc(sizeof(*pn), GFP_KERNEL);
1068 if (!pn)
1069 return 1;
1070
1071 mem->info.nodeinfo[node] = pn;
1072 memset(pn, 0, sizeof(*pn));
1073
1074 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
1075 mz = &pn->zoneinfo[zone];
1076 INIT_LIST_HEAD(&mz->active_list);
1077 INIT_LIST_HEAD(&mz->inactive_list);
1078 spin_lock_init(&mz->lru_lock);
1079 }
1080 return 0;
1081}
1082
1083static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1084{
1085 kfree(mem->info.nodeinfo[node]);
1086}
1087
1088
1089static struct mem_cgroup init_mem_cgroup;
1090
1091static struct cgroup_subsys_state *
1092mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
1093{
1094 struct mem_cgroup *mem;
1095 int node;
1096
1097 if (unlikely((cont->parent) == NULL)) {
1098 mem = &init_mem_cgroup;
1099 init_mm.mem_cgroup = mem;
1100 } else
1101 mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
1102
1103 if (mem == NULL)
1104 return NULL;
1105
1106 res_counter_init(&mem->res);
1107
1108 memset(&mem->info, 0, sizeof(mem->info));
1109
1110 for_each_node_state(node, N_POSSIBLE)
1111 if (alloc_mem_cgroup_per_zone_info(mem, node))
1112 goto free_out;
1113
1114 return &mem->css;
1115free_out:
1116 for_each_node_state(node, N_POSSIBLE)
1117 free_mem_cgroup_per_zone_info(mem, node);
1118 if (cont->parent != NULL)
1119 kfree(mem);
1120 return NULL;
1121}
1122
1123static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
1124 struct cgroup *cont)
1125{
1126 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1127 mem_cgroup_force_empty(mem);
1128}
1129
1130static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1131 struct cgroup *cont)
1132{
1133 int node;
1134 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1135
1136 for_each_node_state(node, N_POSSIBLE)
1137 free_mem_cgroup_per_zone_info(mem, node);
1138
1139 kfree(mem_cgroup_from_cont(cont));
1140}
1141
1142static int mem_cgroup_populate(struct cgroup_subsys *ss,
1143 struct cgroup *cont)
1144{
1145 return cgroup_add_files(cont, ss, mem_cgroup_files,
1146 ARRAY_SIZE(mem_cgroup_files));
1147}
1148
1149static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1150 struct cgroup *cont,
1151 struct cgroup *old_cont,
1152 struct task_struct *p)
1153{
1154 struct mm_struct *mm;
1155 struct mem_cgroup *mem, *old_mem;
1156
1157 mm = get_task_mm(p);
1158 if (mm == NULL)
1159 return;
1160
1161 mem = mem_cgroup_from_cont(cont);
1162 old_mem = mem_cgroup_from_cont(old_cont);
1163
1164 if (mem == old_mem)
1165 goto out;
1166
1167 /*
1168 * Only thread group leaders are allowed to migrate, the mm_struct is
1169 * in effect owned by the leader
1170 */
1171 if (p->tgid != p->pid)
1172 goto out;
1173
1174 css_get(&mem->css);
1175 rcu_assign_pointer(mm->mem_cgroup, mem);
1176 css_put(&old_mem->css);
1177
1178out:
1179 mmput(mm);
1180 return;
1181}
1182
1183struct cgroup_subsys mem_cgroup_subsys = {
1184 .name = "memory",
1185 .subsys_id = mem_cgroup_subsys_id,
1186 .create = mem_cgroup_create,
1187 .pre_destroy = mem_cgroup_pre_destroy,
1188 .destroy = mem_cgroup_destroy,
1189 .populate = mem_cgroup_populate,
1190 .attach = mem_cgroup_move_task,
1191 .early_init = 0,
1192};
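
For orientation on how the cftype table above is used: once the "memory" subsystem is mounted, each entry appears as a file named memory.<name> in the cgroup directory, read and written through the res_counter helpers. A minimal userspace sketch, assuming a group already exists at /cgroups/demo (the mount point, the 64M limit and the thin error handling are illustrative):

#include <stdio.h>

static void write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return;
	}
	fputs(val, f);
	fclose(f);
}

static void dump_file(const char *path)
{
	char line[128];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	printf("%s:\n", path);
	while (fgets(line, sizeof(line), f))
		printf("  %s", line);
	fclose(f);
}

int main(void)
{
	const char *base = "/cgroups/demo";	/* hypothetical mount point + group */
	char path[256];

	/* RES_LIMIT: the value goes through memparse() and is rounded up to a page. */
	snprintf(path, sizeof(path), "%s/memory.limit_in_bytes", base);
	write_str(path, "64M");

	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", base);
	dump_file(path);			/* RES_USAGE */
	snprintf(path, sizeof(path), "%s/memory.failcnt", base);
	dump_file(path);			/* RES_FAILCNT */
	snprintf(path, sizeof(path), "%s/memory.stat", base);
	dump_file(path);			/* cache, rss, active, inactive */

	/*
	 * Any write triggers mem_cgroup_force_empty(), which only succeeds
	 * once no task is attached to the group.
	 */
	snprintf(path, sizeof(path), "%s/memory.force_empty", base);
	write_str(path, "1");
	return 0;
}
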
diff --git a/mm/memory.c b/mm/memory.c
index d902d0e25edc..153a54b2013c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -50,6 +50,7 @@
50#include <linux/delayacct.h> 50#include <linux/delayacct.h>
51#include <linux/init.h> 51#include <linux/init.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/memcontrol.h>
53 54
54#include <asm/pgalloc.h> 55#include <asm/pgalloc.h>
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
@@ -82,7 +83,18 @@ void * high_memory;
82EXPORT_SYMBOL(num_physpages); 83EXPORT_SYMBOL(num_physpages);
83EXPORT_SYMBOL(high_memory); 84EXPORT_SYMBOL(high_memory);
84 85
85int randomize_va_space __read_mostly = 1; 86/*
87 * Randomize the address space (stacks, mmaps, brk, etc.).
88 *
89 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
90 * as ancient (libc5 based) binaries can segfault. )
91 */
92int randomize_va_space __read_mostly =
93#ifdef CONFIG_COMPAT_BRK
94 1;
95#else
96 2;
97#endif
86 98
87static int __init disable_randmaps(char *s) 99static int __init disable_randmaps(char *s)
88{ 100{
@@ -305,7 +317,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
305 spin_lock(&mm->page_table_lock); 317 spin_lock(&mm->page_table_lock);
306 if (pmd_present(*pmd)) { /* Another has populated it */ 318 if (pmd_present(*pmd)) { /* Another has populated it */
307 pte_lock_deinit(new); 319 pte_lock_deinit(new);
308 pte_free(new); 320 pte_free(mm, new);
309 } else { 321 } else {
310 mm->nr_ptes++; 322 mm->nr_ptes++;
311 inc_zone_page_state(new, NR_PAGETABLE); 323 inc_zone_page_state(new, NR_PAGETABLE);
@@ -323,7 +335,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
323 335
324 spin_lock(&init_mm.page_table_lock); 336 spin_lock(&init_mm.page_table_lock);
325 if (pmd_present(*pmd)) /* Another has populated it */ 337 if (pmd_present(*pmd)) /* Another has populated it */
326 pte_free_kernel(new); 338 pte_free_kernel(&init_mm, new);
327 else 339 else
328 pmd_populate_kernel(&init_mm, pmd, new); 340 pmd_populate_kernel(&init_mm, pmd, new);
329 spin_unlock(&init_mm.page_table_lock); 341 spin_unlock(&init_mm.page_table_lock);
@@ -1109,7 +1121,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1109} 1121}
1110EXPORT_SYMBOL(get_user_pages); 1122EXPORT_SYMBOL(get_user_pages);
1111 1123
1112pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) 1124pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
1125 spinlock_t **ptl)
1113{ 1126{
1114 pgd_t * pgd = pgd_offset(mm, addr); 1127 pgd_t * pgd = pgd_offset(mm, addr);
1115 pud_t * pud = pud_alloc(mm, pgd, addr); 1128 pud_t * pud = pud_alloc(mm, pgd, addr);
@@ -1132,16 +1145,20 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
1132{ 1145{
1133 int retval; 1146 int retval;
1134 pte_t *pte; 1147 pte_t *pte;
1135 spinlock_t *ptl; 1148 spinlock_t *ptl;
1149
1150 retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
1151 if (retval)
1152 goto out;
1136 1153
1137 retval = -EINVAL; 1154 retval = -EINVAL;
1138 if (PageAnon(page)) 1155 if (PageAnon(page))
1139 goto out; 1156 goto out_uncharge;
1140 retval = -ENOMEM; 1157 retval = -ENOMEM;
1141 flush_dcache_page(page); 1158 flush_dcache_page(page);
1142 pte = get_locked_pte(mm, addr, &ptl); 1159 pte = get_locked_pte(mm, addr, &ptl);
1143 if (!pte) 1160 if (!pte)
1144 goto out; 1161 goto out_uncharge;
1145 retval = -EBUSY; 1162 retval = -EBUSY;
1146 if (!pte_none(*pte)) 1163 if (!pte_none(*pte))
1147 goto out_unlock; 1164 goto out_unlock;
@@ -1153,8 +1170,12 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
1153 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 1170 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1154 1171
1155 retval = 0; 1172 retval = 0;
1173 pte_unmap_unlock(pte, ptl);
1174 return retval;
1156out_unlock: 1175out_unlock:
1157 pte_unmap_unlock(pte, ptl); 1176 pte_unmap_unlock(pte, ptl);
1177out_uncharge:
1178 mem_cgroup_uncharge_page(page);
1158out: 1179out:
1159 return retval; 1180 return retval;
1160} 1181}
@@ -1517,10 +1538,8 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
1517 memset(kaddr, 0, PAGE_SIZE); 1538 memset(kaddr, 0, PAGE_SIZE);
1518 kunmap_atomic(kaddr, KM_USER0); 1539 kunmap_atomic(kaddr, KM_USER0);
1519 flush_dcache_page(dst); 1540 flush_dcache_page(dst);
1520 return; 1541 } else
1521 1542 copy_user_highpage(dst, src, va, vma);
1522 }
1523 copy_user_highpage(dst, src, va, vma);
1524} 1543}
1525 1544
1526/* 1545/*
@@ -1629,6 +1648,10 @@ gotten:
1629 if (!new_page) 1648 if (!new_page)
1630 goto oom; 1649 goto oom;
1631 cow_user_page(new_page, old_page, address, vma); 1650 cow_user_page(new_page, old_page, address, vma);
1651 __SetPageUptodate(new_page);
1652
1653 if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
1654 goto oom_free_new;
1632 1655
1633 /* 1656 /*
1634 * Re-check the pte - we dropped the lock 1657 * Re-check the pte - we dropped the lock
@@ -1661,7 +1684,9 @@ gotten:
1661 /* Free the old page.. */ 1684 /* Free the old page.. */
1662 new_page = old_page; 1685 new_page = old_page;
1663 ret |= VM_FAULT_WRITE; 1686 ret |= VM_FAULT_WRITE;
1664 } 1687 } else
1688 mem_cgroup_uncharge_page(new_page);
1689
1665 if (new_page) 1690 if (new_page)
1666 page_cache_release(new_page); 1691 page_cache_release(new_page);
1667 if (old_page) 1692 if (old_page)
@@ -1685,6 +1710,8 @@ unlock:
1685 put_page(dirty_page); 1710 put_page(dirty_page);
1686 } 1711 }
1687 return ret; 1712 return ret;
1713oom_free_new:
1714 __free_page(new_page);
1688oom: 1715oom:
1689 if (old_page) 1716 if (old_page)
1690 page_cache_release(old_page); 1717 page_cache_release(old_page);
@@ -1909,50 +1936,49 @@ EXPORT_SYMBOL(unmap_mapping_range);
1909 */ 1936 */
1910int vmtruncate(struct inode * inode, loff_t offset) 1937int vmtruncate(struct inode * inode, loff_t offset)
1911{ 1938{
1912 struct address_space *mapping = inode->i_mapping; 1939 if (inode->i_size < offset) {
1913 unsigned long limit; 1940 unsigned long limit;
1914 1941
1915 if (inode->i_size < offset) 1942 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1916 goto do_expand; 1943 if (limit != RLIM_INFINITY && offset > limit)
1917 /* 1944 goto out_sig;
1918 * truncation of in-use swapfiles is disallowed - it would cause 1945 if (offset > inode->i_sb->s_maxbytes)
1919 * subsequent swapout to scribble on the now-freed blocks. 1946 goto out_big;
1920 */ 1947 i_size_write(inode, offset);
1921 if (IS_SWAPFILE(inode)) 1948 } else {
1922 goto out_busy; 1949 struct address_space *mapping = inode->i_mapping;
1923 i_size_write(inode, offset); 1950
1951 /*
1952 * truncation of in-use swapfiles is disallowed - it would
1953 * cause subsequent swapout to scribble on the now-freed
1954 * blocks.
1955 */
1956 if (IS_SWAPFILE(inode))
1957 return -ETXTBSY;
1958 i_size_write(inode, offset);
1959
1960 /*
1961 * unmap_mapping_range is called twice, first simply for
1962 * efficiency so that truncate_inode_pages does fewer
1963 * single-page unmaps. However after this first call, and
1964 * before truncate_inode_pages finishes, it is possible for
1965 * private pages to be COWed, which remain after
1966 * truncate_inode_pages finishes, hence the second
1967 * unmap_mapping_range call must be made for correctness.
1968 */
1969 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1970 truncate_inode_pages(mapping, offset);
1971 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1972 }
1924 1973
1925 /*
1926 * unmap_mapping_range is called twice, first simply for efficiency
1927 * so that truncate_inode_pages does fewer single-page unmaps. However
1928 * after this first call, and before truncate_inode_pages finishes,
1929 * it is possible for private pages to be COWed, which remain after
1930 * truncate_inode_pages finishes, hence the second unmap_mapping_range
1931 * call must be made for correctness.
1932 */
1933 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1934 truncate_inode_pages(mapping, offset);
1935 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1936 goto out_truncate;
1937
1938do_expand:
1939 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1940 if (limit != RLIM_INFINITY && offset > limit)
1941 goto out_sig;
1942 if (offset > inode->i_sb->s_maxbytes)
1943 goto out_big;
1944 i_size_write(inode, offset);
1945
1946out_truncate:
1947 if (inode->i_op && inode->i_op->truncate) 1974 if (inode->i_op && inode->i_op->truncate)
1948 inode->i_op->truncate(inode); 1975 inode->i_op->truncate(inode);
1949 return 0; 1976 return 0;
1977
1950out_sig: 1978out_sig:
1951 send_sig(SIGXFSZ, current, 0); 1979 send_sig(SIGXFSZ, current, 0);
1952out_big: 1980out_big:
1953 return -EFBIG; 1981 return -EFBIG;
1954out_busy:
1955 return -ETXTBSY;
1956} 1982}
1957EXPORT_SYMBOL(vmtruncate); 1983EXPORT_SYMBOL(vmtruncate);
1958 1984
@@ -1980,67 +2006,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
1980 return 0; 2006 return 0;
1981} 2007}
1982 2008
1983/**
1984 * swapin_readahead - swap in pages in hope we need them soon
1985 * @entry: swap entry of this memory
1986 * @addr: address to start
1987 * @vma: user vma this addresses belong to
1988 *
1989 * Primitive swap readahead code. We simply read an aligned block of
1990 * (1 << page_cluster) entries in the swap area. This method is chosen
1991 * because it doesn't cost us any seek time. We also make sure to queue
1992 * the 'original' request together with the readahead ones...
1993 *
1994 * This has been extended to use the NUMA policies from the mm triggering
1995 * the readahead.
1996 *
1997 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
1998 */
1999void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
2000{
2001#ifdef CONFIG_NUMA
2002 struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
2003#endif
2004 int i, num;
2005 struct page *new_page;
2006 unsigned long offset;
2007
2008 /*
2009 * Get the number of handles we should do readahead io to.
2010 */
2011 num = valid_swaphandles(entry, &offset);
2012 for (i = 0; i < num; offset++, i++) {
2013 /* Ok, do the async read-ahead now */
2014 new_page = read_swap_cache_async(swp_entry(swp_type(entry),
2015 offset), vma, addr);
2016 if (!new_page)
2017 break;
2018 page_cache_release(new_page);
2019#ifdef CONFIG_NUMA
2020 /*
2021 * Find the next applicable VMA for the NUMA policy.
2022 */
2023 addr += PAGE_SIZE;
2024 if (addr == 0)
2025 vma = NULL;
2026 if (vma) {
2027 if (addr >= vma->vm_end) {
2028 vma = next_vma;
2029 next_vma = vma ? vma->vm_next : NULL;
2030 }
2031 if (vma && addr < vma->vm_start)
2032 vma = NULL;
2033 } else {
2034 if (next_vma && addr >= next_vma->vm_start) {
2035 vma = next_vma;
2036 next_vma = vma->vm_next;
2037 }
2038 }
2039#endif
2040 }
2041 lru_add_drain(); /* Push any new pages onto the LRU now */
2042}
2043
2044/* 2009/*
2045 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2010 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2046 * but allow concurrent faults), and pte mapped but not yet locked. 2011 * but allow concurrent faults), and pte mapped but not yet locked.
@@ -2068,8 +2033,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2068 page = lookup_swap_cache(entry); 2033 page = lookup_swap_cache(entry);
2069 if (!page) { 2034 if (!page) {
2070 grab_swap_token(); /* Contend for token _before_ read-in */ 2035 grab_swap_token(); /* Contend for token _before_ read-in */
2071 swapin_readahead(entry, address, vma); 2036 page = swapin_readahead(entry,
2072 page = read_swap_cache_async(entry, vma, address); 2037 GFP_HIGHUSER_MOVABLE, vma, address);
2073 if (!page) { 2038 if (!page) {
2074 /* 2039 /*
2075 * Back out if somebody else faulted in this pte 2040 * Back out if somebody else faulted in this pte
@@ -2087,6 +2052,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2087 count_vm_event(PGMAJFAULT); 2052 count_vm_event(PGMAJFAULT);
2088 } 2053 }
2089 2054
2055 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2056 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2057 ret = VM_FAULT_OOM;
2058 goto out;
2059 }
2060
2090 mark_page_accessed(page); 2061 mark_page_accessed(page);
2091 lock_page(page); 2062 lock_page(page);
2092 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2063 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2124,8 +2095,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2124 if (write_access) { 2095 if (write_access) {
2125 /* XXX: We could OR the do_wp_page code with this one? */ 2096 /* XXX: We could OR the do_wp_page code with this one? */
2126 if (do_wp_page(mm, vma, address, 2097 if (do_wp_page(mm, vma, address,
2127 page_table, pmd, ptl, pte) & VM_FAULT_OOM) 2098 page_table, pmd, ptl, pte) & VM_FAULT_OOM) {
2099 mem_cgroup_uncharge_page(page);
2128 ret = VM_FAULT_OOM; 2100 ret = VM_FAULT_OOM;
2101 }
2129 goto out; 2102 goto out;
2130 } 2103 }
2131 2104
@@ -2136,6 +2109,7 @@ unlock:
2136out: 2109out:
2137 return ret; 2110 return ret;
2138out_nomap: 2111out_nomap:
2112 mem_cgroup_uncharge_page(page);
2139 pte_unmap_unlock(page_table, ptl); 2113 pte_unmap_unlock(page_table, ptl);
2140 unlock_page(page); 2114 unlock_page(page);
2141 page_cache_release(page); 2115 page_cache_release(page);
@@ -2163,6 +2137,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2163 page = alloc_zeroed_user_highpage_movable(vma, address); 2137 page = alloc_zeroed_user_highpage_movable(vma, address);
2164 if (!page) 2138 if (!page)
2165 goto oom; 2139 goto oom;
2140 __SetPageUptodate(page);
2141
2142 if (mem_cgroup_charge(page, mm, GFP_KERNEL))
2143 goto oom_free_page;
2166 2144
2167 entry = mk_pte(page, vma->vm_page_prot); 2145 entry = mk_pte(page, vma->vm_page_prot);
2168 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2146 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2181,8 +2159,11 @@ unlock:
2181 pte_unmap_unlock(page_table, ptl); 2159 pte_unmap_unlock(page_table, ptl);
2182 return 0; 2160 return 0;
2183release: 2161release:
2162 mem_cgroup_uncharge_page(page);
2184 page_cache_release(page); 2163 page_cache_release(page);
2185 goto unlock; 2164 goto unlock;
2165oom_free_page:
2166 __free_page(page);
2186oom: 2167oom:
2187 return VM_FAULT_OOM; 2168 return VM_FAULT_OOM;
2188} 2169}
@@ -2263,6 +2244,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2263 goto out; 2244 goto out;
2264 } 2245 }
2265 copy_user_highpage(page, vmf.page, address, vma); 2246 copy_user_highpage(page, vmf.page, address, vma);
2247 __SetPageUptodate(page);
2266 } else { 2248 } else {
2267 /* 2249 /*
2268 * If the page will be shareable, see if the backing 2250 * If the page will be shareable, see if the backing
@@ -2295,6 +2277,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2295 2277
2296 } 2278 }
2297 2279
2280 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2281 ret = VM_FAULT_OOM;
2282 goto out;
2283 }
2284
2298 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2285 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2299 2286
2300 /* 2287 /*
@@ -2330,6 +2317,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2330 /* no need to invalidate: a not-present page won't be cached */ 2317 /* no need to invalidate: a not-present page won't be cached */
2331 update_mmu_cache(vma, address, entry); 2318 update_mmu_cache(vma, address, entry);
2332 } else { 2319 } else {
2320 mem_cgroup_uncharge_page(page);
2333 if (anon) 2321 if (anon)
2334 page_cache_release(page); 2322 page_cache_release(page);
2335 else 2323 else
@@ -2563,7 +2551,7 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2563 2551
2564 spin_lock(&mm->page_table_lock); 2552 spin_lock(&mm->page_table_lock);
2565 if (pgd_present(*pgd)) /* Another has populated it */ 2553 if (pgd_present(*pgd)) /* Another has populated it */
2566 pud_free(new); 2554 pud_free(mm, new);
2567 else 2555 else
2568 pgd_populate(mm, pgd, new); 2556 pgd_populate(mm, pgd, new);
2569 spin_unlock(&mm->page_table_lock); 2557 spin_unlock(&mm->page_table_lock);
@@ -2585,12 +2573,12 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2585 spin_lock(&mm->page_table_lock); 2573 spin_lock(&mm->page_table_lock);
2586#ifndef __ARCH_HAS_4LEVEL_HACK 2574#ifndef __ARCH_HAS_4LEVEL_HACK
2587 if (pud_present(*pud)) /* Another has populated it */ 2575 if (pud_present(*pud)) /* Another has populated it */
2588 pmd_free(new); 2576 pmd_free(mm, new);
2589 else 2577 else
2590 pud_populate(mm, pud, new); 2578 pud_populate(mm, pud, new);
2591#else 2579#else
2592 if (pgd_present(*pud)) /* Another has populated it */ 2580 if (pgd_present(*pud)) /* Another has populated it */
2593 pmd_free(new); 2581 pmd_free(mm, new);
2594 else 2582 else
2595 pgd_populate(mm, pud, new); 2583 pgd_populate(mm, pud, new);
2596#endif /* __ARCH_HAS_4LEVEL_HACK */ 2584#endif /* __ARCH_HAS_4LEVEL_HACK */
@@ -2618,46 +2606,6 @@ int make_pages_present(unsigned long addr, unsigned long end)
2618 return ret == len ? 0 : -1; 2606 return ret == len ? 0 : -1;
2619} 2607}
2620 2608
2621/*
2622 * Map a vmalloc()-space virtual address to the physical page.
2623 */
2624struct page * vmalloc_to_page(void * vmalloc_addr)
2625{
2626 unsigned long addr = (unsigned long) vmalloc_addr;
2627 struct page *page = NULL;
2628 pgd_t *pgd = pgd_offset_k(addr);
2629 pud_t *pud;
2630 pmd_t *pmd;
2631 pte_t *ptep, pte;
2632
2633 if (!pgd_none(*pgd)) {
2634 pud = pud_offset(pgd, addr);
2635 if (!pud_none(*pud)) {
2636 pmd = pmd_offset(pud, addr);
2637 if (!pmd_none(*pmd)) {
2638 ptep = pte_offset_map(pmd, addr);
2639 pte = *ptep;
2640 if (pte_present(pte))
2641 page = pte_page(pte);
2642 pte_unmap(ptep);
2643 }
2644 }
2645 }
2646 return page;
2647}
2648
2649EXPORT_SYMBOL(vmalloc_to_page);
2650
2651/*
2652 * Map a vmalloc()-space virtual address to the physical page frame number.
2653 */
2654unsigned long vmalloc_to_pfn(void * vmalloc_addr)
2655{
2656 return page_to_pfn(vmalloc_to_page(vmalloc_addr));
2657}
2658
2659EXPORT_SYMBOL(vmalloc_to_pfn);
2660
2661#if !defined(__HAVE_ARCH_GATE_AREA) 2609#if !defined(__HAVE_ARCH_GATE_AREA)
2662 2610
2663#if defined(AT_SYSINFO_EHDR) 2611#if defined(AT_SYSINFO_EHDR)
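
A recurring shape in the memory.c hunks above: mem_cgroup_charge() is called up front, and every later failure path has to fall through an uncharge before returning. Reduced to its skeleton (demo_insert() and demo_map_page() are illustrative stand-ins, not kernel functions):

#include <linux/gfp.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>

/* Hypothetical mapping step standing in for get_locked_pte()/set_pte_at(). */
static int demo_map_page(struct mm_struct *mm, struct page *page)
{
	return 0;
}

static int demo_insert(struct mm_struct *mm, struct page *page)
{
	int ret;

	ret = mem_cgroup_charge(page, mm, GFP_KERNEL);
	if (ret)
		goto out;			/* nothing charged, nothing to undo */

	ret = demo_map_page(mm, page);
	if (ret)
		goto out_uncharge;		/* any later failure undoes the charge */
	return 0;

out_uncharge:
	mem_cgroup_uncharge_page(page);
out:
	return ret;
}
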
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9512a544d044..7469c503580d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -481,8 +481,6 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
481 return offlined; 481 return offlined;
482} 482}
483 483
484extern void drain_all_local_pages(void);
485
486int offline_pages(unsigned long start_pfn, 484int offline_pages(unsigned long start_pfn,
487 unsigned long end_pfn, unsigned long timeout) 485 unsigned long end_pfn, unsigned long timeout)
488{ 486{
@@ -540,7 +538,7 @@ repeat:
540 lru_add_drain_all(); 538 lru_add_drain_all();
541 flush_scheduled_work(); 539 flush_scheduled_work();
542 cond_resched(); 540 cond_resched();
543 drain_all_local_pages(); 541 drain_all_pages();
544 } 542 }
545 543
546 pfn = scan_lru_pages(start_pfn, end_pfn); 544 pfn = scan_lru_pages(start_pfn, end_pfn);
@@ -563,7 +561,7 @@ repeat:
563 flush_scheduled_work(); 561 flush_scheduled_work();
564 yield(); 562 yield();
565 /* drain pcp pages, this is synchronous. */ 563 /* drain pcp pages, this is synchronous. */
566 drain_all_local_pages(); 564 drain_all_pages();
567 /* check again */ 565 /* check again */
568 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 566 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
569 if (offlined_pages < 0) { 567 if (offlined_pages < 0) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 6a207e8d17ea..a73504ff5ab9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -29,6 +29,7 @@
29#include <linux/mempolicy.h> 29#include <linux/mempolicy.h>
30#include <linux/vmalloc.h> 30#include <linux/vmalloc.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/memcontrol.h>
32 33
33#include "internal.h" 34#include "internal.h"
34 35
@@ -115,11 +116,6 @@ int putback_lru_pages(struct list_head *l)
115 return count; 116 return count;
116} 117}
117 118
118static inline int is_swap_pte(pte_t pte)
119{
120 return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
121}
122
123/* 119/*
124 * Restore a potential migration pte to a working pte entry 120 * Restore a potential migration pte to a working pte entry
125 */ 121 */
@@ -157,6 +153,11 @@ static void remove_migration_pte(struct vm_area_struct *vma,
157 return; 153 return;
158 } 154 }
159 155
156 if (mem_cgroup_charge(new, mm, GFP_KERNEL)) {
157 pte_unmap(ptep);
158 return;
159 }
160
160 ptl = pte_lockptr(mm, pmd); 161 ptl = pte_lockptr(mm, pmd);
161 spin_lock(ptl); 162 spin_lock(ptl);
162 pte = *ptep; 163 pte = *ptep;
@@ -592,9 +593,10 @@ static int move_to_new_page(struct page *newpage, struct page *page)
592 else 593 else
593 rc = fallback_migrate_page(mapping, newpage, page); 594 rc = fallback_migrate_page(mapping, newpage, page);
594 595
595 if (!rc) 596 if (!rc) {
597 mem_cgroup_page_migration(page, newpage);
596 remove_migration_ptes(page, newpage); 598 remove_migration_ptes(page, newpage);
597 else 599 } else
598 newpage->mapping = NULL; 600 newpage->mapping = NULL;
599 601
600 unlock_page(newpage); 602 unlock_page(newpage);
@@ -613,6 +615,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
613 int *result = NULL; 615 int *result = NULL;
614 struct page *newpage = get_new_page(page, private, &result); 616 struct page *newpage = get_new_page(page, private, &result);
615 int rcu_locked = 0; 617 int rcu_locked = 0;
618 int charge = 0;
616 619
617 if (!newpage) 620 if (!newpage)
618 return -ENOMEM; 621 return -ENOMEM;
@@ -645,23 +648,46 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
645 rcu_read_lock(); 648 rcu_read_lock();
646 rcu_locked = 1; 649 rcu_locked = 1;
647 } 650 }
651
648 /* 652 /*
649 * This is a corner case handling. 653 * Corner case handling:
650 * When a new swap-cache is read into, it is linked to LRU 654 * 1. When a new swap-cache page is read into, it is added to the LRU
651 * and treated as swapcache but has no rmap yet. 655 * and treated as swapcache but it has no rmap yet.
652 * Calling try_to_unmap() against a page->mapping==NULL page is 656 * Calling try_to_unmap() against a page->mapping==NULL page will
653 * BUG. So handle it here. 657 * trigger a BUG. So handle it here.
658 * 2. An orphaned page (see truncate_complete_page) might have
659 * fs-private metadata. The page can be picked up due to memory
660 * offlining. Everywhere else except page reclaim, the page is
661 * invisible to the vm, so the page can not be migrated. So try to
662 * free the metadata, so the page can be freed.
654 */ 663 */
655 if (!page->mapping) 664 if (!page->mapping) {
665 if (!PageAnon(page) && PagePrivate(page)) {
666 /*
667 * Go direct to try_to_free_buffers() here because
668 * a) that's what try_to_release_page() would do anyway
669 * b) we may be under rcu_read_lock() here, so we can't
670 * use GFP_KERNEL which is what try_to_release_page()
671 * needs to be effective.
672 */
673 try_to_free_buffers(page);
674 }
656 goto rcu_unlock; 675 goto rcu_unlock;
676 }
677
678 charge = mem_cgroup_prepare_migration(page);
657 /* Establish migration ptes or remove ptes */ 679 /* Establish migration ptes or remove ptes */
658 try_to_unmap(page, 1); 680 try_to_unmap(page, 1);
659 681
660 if (!page_mapped(page)) 682 if (!page_mapped(page))
661 rc = move_to_new_page(newpage, page); 683 rc = move_to_new_page(newpage, page);
662 684
663 if (rc) 685 if (rc) {
664 remove_migration_ptes(page, page); 686 remove_migration_ptes(page, page);
687 if (charge)
688 mem_cgroup_end_migration(page);
689 } else if (charge)
690 mem_cgroup_end_migration(newpage);
665rcu_unlock: 691rcu_unlock:
666 if (rcu_locked) 692 if (rcu_locked)
667 rcu_read_unlock(); 693 rcu_read_unlock();
diff --git a/mm/mmap.c b/mm/mmap.c
index d2b6d44962b7..ad6e4eaf34f8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -36,6 +36,10 @@
36#define arch_mmap_check(addr, len, flags) (0) 36#define arch_mmap_check(addr, len, flags) (0)
37#endif 37#endif
38 38
39#ifndef arch_rebalance_pgtables
40#define arch_rebalance_pgtables(addr, len) (addr)
41#endif
42
39static void unmap_region(struct mm_struct *mm, 43static void unmap_region(struct mm_struct *mm,
40 struct vm_area_struct *vma, struct vm_area_struct *prev, 44 struct vm_area_struct *vma, struct vm_area_struct *prev,
41 unsigned long start, unsigned long end); 45 unsigned long start, unsigned long end);
@@ -241,7 +245,7 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
241 245
242 down_write(&mm->mmap_sem); 246 down_write(&mm->mmap_sem);
243 247
244 if (brk < mm->end_code) 248 if (brk < mm->start_brk)
245 goto out; 249 goto out;
246 250
247 /* 251 /*
@@ -1424,7 +1428,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1424 if (addr & ~PAGE_MASK) 1428 if (addr & ~PAGE_MASK)
1425 return -EINVAL; 1429 return -EINVAL;
1426 1430
1427 return addr; 1431 return arch_rebalance_pgtables(addr, len);
1428} 1432}
1429 1433
1430EXPORT_SYMBOL(get_unmapped_area); 1434EXPORT_SYMBOL(get_unmapped_area);
@@ -2216,7 +2220,7 @@ int install_special_mapping(struct mm_struct *mm,
2216 vma->vm_start = addr; 2220 vma->vm_start = addr;
2217 vma->vm_end = addr + len; 2221 vma->vm_end = addr + len;
2218 2222
2219 vma->vm_flags = vm_flags | mm->def_flags; 2223 vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
2220 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 2224 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
2221 2225
2222 vma->vm_ops = &special_mapping_vmops; 2226 vma->vm_ops = &special_mapping_vmops;
diff --git a/mm/nommu.c b/mm/nommu.c
index b989cb928a7c..5d8ae086f74e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -10,6 +10,7 @@
10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> 10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> 11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> 12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
13 * Copyright (c) 2007 Paul Mundt <lethal@linux-sh.org>
13 */ 14 */
14 15
15#include <linux/module.h> 16#include <linux/module.h>
@@ -167,7 +168,7 @@ EXPORT_SYMBOL(get_user_pages);
167DEFINE_RWLOCK(vmlist_lock); 168DEFINE_RWLOCK(vmlist_lock);
168struct vm_struct *vmlist; 169struct vm_struct *vmlist;
169 170
170void vfree(void *addr) 171void vfree(const void *addr)
171{ 172{
172 kfree(addr); 173 kfree(addr);
173} 174}
@@ -183,13 +184,33 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
183} 184}
184EXPORT_SYMBOL(__vmalloc); 185EXPORT_SYMBOL(__vmalloc);
185 186
186struct page * vmalloc_to_page(void *addr) 187void *vmalloc_user(unsigned long size)
188{
189 void *ret;
190
191 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
192 PAGE_KERNEL);
193 if (ret) {
194 struct vm_area_struct *vma;
195
196 down_write(&current->mm->mmap_sem);
197 vma = find_vma(current->mm, (unsigned long)ret);
198 if (vma)
199 vma->vm_flags |= VM_USERMAP;
200 up_write(&current->mm->mmap_sem);
201 }
202
203 return ret;
204}
205EXPORT_SYMBOL(vmalloc_user);
206
207struct page *vmalloc_to_page(const void *addr)
187{ 208{
188 return virt_to_page(addr); 209 return virt_to_page(addr);
189} 210}
190EXPORT_SYMBOL(vmalloc_to_page); 211EXPORT_SYMBOL(vmalloc_to_page);
191 212
192unsigned long vmalloc_to_pfn(void *addr) 213unsigned long vmalloc_to_pfn(const void *addr)
193{ 214{
194 return page_to_pfn(virt_to_page(addr)); 215 return page_to_pfn(virt_to_page(addr));
195} 216}
@@ -253,10 +274,17 @@ EXPORT_SYMBOL(vmalloc_32);
253 * 274 *
254 * The resulting memory area is 32bit addressable and zeroed so it can be 275 * The resulting memory area is 32bit addressable and zeroed so it can be
255 * mapped to userspace without leaking data. 276 * mapped to userspace without leaking data.
277 *
278 * VM_USERMAP is set on the corresponding VMA so that subsequent calls to
279 * remap_vmalloc_range() are permissible.
256 */ 280 */
257void *vmalloc_32_user(unsigned long size) 281void *vmalloc_32_user(unsigned long size)
258{ 282{
259 return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); 283 /*
284 * We'll have to sort out the ZONE_DMA bits for 64-bit,
285 * but for now this can simply use vmalloc_user() directly.
286 */
287 return vmalloc_user(size);
260} 288}
261EXPORT_SYMBOL(vmalloc_32_user); 289EXPORT_SYMBOL(vmalloc_32_user);
262 290
@@ -267,7 +295,7 @@ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_
267} 295}
268EXPORT_SYMBOL(vmap); 296EXPORT_SYMBOL(vmap);
269 297
270void vunmap(void *addr) 298void vunmap(const void *addr)
271{ 299{
272 BUG(); 300 BUG();
273} 301}
@@ -1216,6 +1244,21 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
1216} 1244}
1217EXPORT_SYMBOL(remap_pfn_range); 1245EXPORT_SYMBOL(remap_pfn_range);
1218 1246
1247int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1248 unsigned long pgoff)
1249{
1250 unsigned int size = vma->vm_end - vma->vm_start;
1251
1252 if (!(vma->vm_flags & VM_USERMAP))
1253 return -EINVAL;
1254
1255 vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT));
1256 vma->vm_end = vma->vm_start + size;
1257
1258 return 0;
1259}
1260EXPORT_SYMBOL(remap_vmalloc_range);
1261
1219void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 1262void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1220{ 1263{
1221} 1264}
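
The vmalloc_user()/remap_vmalloc_range() pairing added above is meant for drivers that expose a vmalloc'd buffer to userspace through mmap: vmalloc_user() zeroes the buffer and flags it VM_USERMAP, which is what remap_vmalloc_range() checks before remapping. A minimal sketch of such an mmap handler (demo_buf, demo_mmap and the surrounding driver are hypothetical; a real driver would allocate at open or probe time):

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

#define DEMO_BUF_SIZE	(64 * 1024)

static void *demo_buf;

static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (!demo_buf)
		demo_buf = vmalloc_user(DEMO_BUF_SIZE);
	if (!demo_buf)
		return -ENOMEM;

	if (vma->vm_end - vma->vm_start > DEMO_BUF_SIZE)
		return -EINVAL;

	/* Requires the VM_USERMAP flag set up by vmalloc_user()/vmalloc_32_user(). */
	return remap_vmalloc_range(vma, demo_buf, vma->vm_pgoff);
}

static const struct file_operations demo_fops = {
	.mmap	= demo_mmap,
};
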
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 96473b482099..4194b9db0104 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -25,9 +25,11 @@
25#include <linux/cpuset.h> 25#include <linux/cpuset.h>
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/notifier.h> 27#include <linux/notifier.h>
28#include <linux/memcontrol.h>
28 29
29int sysctl_panic_on_oom; 30int sysctl_panic_on_oom;
30int sysctl_oom_kill_allocating_task; 31int sysctl_oom_kill_allocating_task;
32int sysctl_oom_dump_tasks;
31static DEFINE_SPINLOCK(zone_scan_mutex); 33static DEFINE_SPINLOCK(zone_scan_mutex);
32/* #define DEBUG */ 34/* #define DEBUG */
33 35
@@ -50,7 +52,8 @@ static DEFINE_SPINLOCK(zone_scan_mutex);
50 * of least surprise ... (be careful when you change it) 52 * of least surprise ... (be careful when you change it)
51 */ 53 */
52 54
53unsigned long badness(struct task_struct *p, unsigned long uptime) 55unsigned long badness(struct task_struct *p, unsigned long uptime,
56 struct mem_cgroup *mem)
54{ 57{
55 unsigned long points, cpu_time, run_time, s; 58 unsigned long points, cpu_time, run_time, s;
56 struct mm_struct *mm; 59 struct mm_struct *mm;
@@ -125,8 +128,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
125 * Superuser processes are usually more important, so we make it 128 * Superuser processes are usually more important, so we make it
126 * less likely that we kill those. 129 * less likely that we kill those.
127 */ 130 */
128 if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) || 131 if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE))
129 p->uid == 0 || p->euid == 0)
130 points /= 4; 132 points /= 4;
131 133
132 /* 134 /*
@@ -135,7 +137,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
135 * tend to only have this flag set on applications they think 137 * tend to only have this flag set on applications they think
136 * of as important. 138 * of as important.
137 */ 139 */
138 if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) 140 if (__capable(p, CAP_SYS_RAWIO))
139 points /= 4; 141 points /= 4;
140 142
141 /* 143 /*
@@ -194,7 +196,8 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
194 * 196 *
195 * (not docbooked, we don't want this one cluttering up the manual) 197 * (not docbooked, we don't want this one cluttering up the manual)
196 */ 198 */
197static struct task_struct *select_bad_process(unsigned long *ppoints) 199static struct task_struct *select_bad_process(unsigned long *ppoints,
200 struct mem_cgroup *mem)
198{ 201{
199 struct task_struct *g, *p; 202 struct task_struct *g, *p;
200 struct task_struct *chosen = NULL; 203 struct task_struct *chosen = NULL;
@@ -214,6 +217,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
214 /* skip the init task */ 217 /* skip the init task */
215 if (is_global_init(p)) 218 if (is_global_init(p))
216 continue; 219 continue;
220 if (mem && !task_in_mem_cgroup(p, mem))
221 continue;
217 222
218 /* 223 /*
219 * This task already has access to memory reserves and is 224 * This task already has access to memory reserves and is
@@ -248,7 +253,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
248 if (p->oomkilladj == OOM_DISABLE) 253 if (p->oomkilladj == OOM_DISABLE)
249 continue; 254 continue;
250 255
251 points = badness(p, uptime.tv_sec); 256 points = badness(p, uptime.tv_sec, mem);
252 if (points > *ppoints || !chosen) { 257 if (points > *ppoints || !chosen) {
253 chosen = p; 258 chosen = p;
254 *ppoints = points; 259 *ppoints = points;
@@ -259,6 +264,41 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
259} 264}
260 265
261/** 266/**
267 * Dumps the current memory state of all system tasks, excluding kernel threads.
268 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
269 * score, and name.
270 *
271 * If mem is non-NULL, only tasks that are members of that mem_cgroup are
272 * shown.
273 *
274 * Call with tasklist_lock read-locked.
275 */
276static void dump_tasks(const struct mem_cgroup *mem)
277{
278 struct task_struct *g, *p;
279
280 printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
281 "name\n");
282 do_each_thread(g, p) {
283 /*
284 * total_vm and rss sizes do not exist for tasks with a
285 * detached mm so there's no need to report them.
286 */
287 if (!p->mm)
288 continue;
289 if (mem && !task_in_mem_cgroup(p, mem))
290 continue;
291
292 task_lock(p);
293 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
294 p->pid, p->uid, p->tgid, p->mm->total_vm,
295 get_mm_rss(p->mm), (int)task_cpu(p), p->oomkilladj,
296 p->comm);
297 task_unlock(p);
298 } while_each_thread(g, p);
299}
300
301/**
262 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO 302 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
263 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO 303 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
264 * set. 304 * set.
@@ -335,7 +375,8 @@ static int oom_kill_task(struct task_struct *p)
335} 375}
336 376
337static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 377static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
338 unsigned long points, const char *message) 378 unsigned long points, struct mem_cgroup *mem,
379 const char *message)
339{ 380{
340 struct task_struct *c; 381 struct task_struct *c;
341 382
@@ -345,6 +386,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
345 current->comm, gfp_mask, order, current->oomkilladj); 386 current->comm, gfp_mask, order, current->oomkilladj);
346 dump_stack(); 387 dump_stack();
347 show_mem(); 388 show_mem();
389 if (sysctl_oom_dump_tasks)
390 dump_tasks(mem);
348 } 391 }
349 392
350 /* 393 /*
@@ -369,6 +412,31 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
369 return oom_kill_task(p); 412 return oom_kill_task(p);
370} 413}
371 414
415#ifdef CONFIG_CGROUP_MEM_CONT
416void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
417{
418 unsigned long points = 0;
419 struct task_struct *p;
420
421 cgroup_lock();
422 rcu_read_lock();
423retry:
424 p = select_bad_process(&points, mem);
425 if (PTR_ERR(p) == -1UL)
426 goto out;
427
428 if (!p)
429 p = current;
430
431 if (oom_kill_process(p, gfp_mask, 0, points, mem,
432 "Memory cgroup out of memory"))
433 goto retry;
434out:
435 rcu_read_unlock();
436 cgroup_unlock();
437}
438#endif
439
372static BLOCKING_NOTIFIER_HEAD(oom_notify_list); 440static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
373 441
374int register_oom_notifier(struct notifier_block *nb) 442int register_oom_notifier(struct notifier_block *nb)
@@ -466,7 +534,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
466 534
467 switch (constraint) { 535 switch (constraint) {
468 case CONSTRAINT_MEMORY_POLICY: 536 case CONSTRAINT_MEMORY_POLICY:
469 oom_kill_process(current, gfp_mask, order, points, 537 oom_kill_process(current, gfp_mask, order, points, NULL,
470 "No available memory (MPOL_BIND)"); 538 "No available memory (MPOL_BIND)");
471 break; 539 break;
472 540
@@ -476,7 +544,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
476 /* Fall-through */ 544 /* Fall-through */
477 case CONSTRAINT_CPUSET: 545 case CONSTRAINT_CPUSET:
478 if (sysctl_oom_kill_allocating_task) { 546 if (sysctl_oom_kill_allocating_task) {
479 oom_kill_process(current, gfp_mask, order, points, 547 oom_kill_process(current, gfp_mask, order, points, NULL,
480 "Out of memory (oom_kill_allocating_task)"); 548 "Out of memory (oom_kill_allocating_task)");
481 break; 549 break;
482 } 550 }
@@ -485,7 +553,7 @@ retry:
485 * Rambo mode: Shoot down a process and hope it solves whatever 553 * Rambo mode: Shoot down a process and hope it solves whatever
486 * issues we may have. 554 * issues we may have.
487 */ 555 */
488 p = select_bad_process(&points); 556 p = select_bad_process(&points, NULL);
489 557
490 if (PTR_ERR(p) == -1UL) 558 if (PTR_ERR(p) == -1UL)
491 goto out; 559 goto out;
@@ -496,7 +564,7 @@ retry:
496 panic("Out of memory and no killable processes...\n"); 564 panic("Out of memory and no killable processes...\n");
497 } 565 }
498 566
499 if (oom_kill_process(p, gfp_mask, order, points, 567 if (oom_kill_process(p, gfp_mask, order, points, NULL,
500 "Out of memory")) 568 "Out of memory"))
501 goto retry; 569 goto retry;
502 570
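
The new sysctl_oom_dump_tasks switch is wired up as a vm.* sysctl outside this diff; assuming the usual naming it shows up as /proc/sys/vm/oom_dump_tasks. A small userspace sketch that turns it on and reads it back:

#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/vm/oom_dump_tasks";	/* assumed sysctl path */
	FILE *f;
	int val = -1;

	f = fopen(path, "w");
	if (f) {
		fputs("1\n", f);	/* dump the per-task table on every OOM kill */
		fclose(f);
	} else {
		perror(path);
	}

	f = fopen(path, "r");
	if (f) {
		if (fscanf(f, "%d", &val) == 1)
			printf("oom_dump_tasks = %d\n", val);
		fclose(f);
	}
	return 0;
}
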
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3d3848fa6324..5e00f1772c20 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void)
69int dirty_background_ratio = 5; 69int dirty_background_ratio = 5;
70 70
71/* 71/*
72 * free highmem will not be subtracted from the total free memory
73 * for calculating free ratios if vm_highmem_is_dirtyable is true
74 */
75int vm_highmem_is_dirtyable;
76
77/*
72 * The generator of dirty data starts writeback at this percentage 78 * The generator of dirty data starts writeback at this percentage
73 */ 79 */
74int vm_dirty_ratio = 10; 80int vm_dirty_ratio = 10;
@@ -219,7 +225,7 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
219 * 225 *
220 * dirty -= (dirty/8) * p_{t} 226 * dirty -= (dirty/8) * p_{t}
221 */ 227 */
222void task_dirty_limit(struct task_struct *tsk, long *pdirty) 228static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
223{ 229{
224 long numerator, denominator; 230 long numerator, denominator;
225 long dirty = *pdirty; 231 long dirty = *pdirty;
@@ -287,7 +293,10 @@ static unsigned long determine_dirtyable_memory(void)
287 x = global_page_state(NR_FREE_PAGES) 293 x = global_page_state(NR_FREE_PAGES)
288 + global_page_state(NR_INACTIVE) 294 + global_page_state(NR_INACTIVE)
289 + global_page_state(NR_ACTIVE); 295 + global_page_state(NR_ACTIVE);
290 x -= highmem_dirtyable_memory(x); 296
297 if (!vm_highmem_is_dirtyable)
298 x -= highmem_dirtyable_memory(x);
299
291 return x + 1; /* Ensure that we never return 0 */ 300 return x + 1; /* Ensure that we never return 0 */
292} 301}
293 302
@@ -558,6 +567,7 @@ static void background_writeout(unsigned long _min_pages)
558 global_page_state(NR_UNSTABLE_NFS) < background_thresh 567 global_page_state(NR_UNSTABLE_NFS) < background_thresh
559 && min_pages <= 0) 568 && min_pages <= 0)
560 break; 569 break;
570 wbc.more_io = 0;
561 wbc.encountered_congestion = 0; 571 wbc.encountered_congestion = 0;
562 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 572 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
563 wbc.pages_skipped = 0; 573 wbc.pages_skipped = 0;
@@ -565,8 +575,9 @@ static void background_writeout(unsigned long _min_pages)
565 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 575 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
566 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { 576 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
567 /* Wrote less than expected */ 577 /* Wrote less than expected */
568 congestion_wait(WRITE, HZ/10); 578 if (wbc.encountered_congestion || wbc.more_io)
569 if (!wbc.encountered_congestion) 579 congestion_wait(WRITE, HZ/10);
580 else
570 break; 581 break;
571 } 582 }
572 } 583 }
@@ -631,11 +642,12 @@ static void wb_kupdate(unsigned long arg)
631 global_page_state(NR_UNSTABLE_NFS) + 642 global_page_state(NR_UNSTABLE_NFS) +
632 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 643 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
633 while (nr_to_write > 0) { 644 while (nr_to_write > 0) {
645 wbc.more_io = 0;
634 wbc.encountered_congestion = 0; 646 wbc.encountered_congestion = 0;
635 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 647 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
636 writeback_inodes(&wbc); 648 writeback_inodes(&wbc);
637 if (wbc.nr_to_write > 0) { 649 if (wbc.nr_to_write > 0) {
638 if (wbc.encountered_congestion) 650 if (wbc.encountered_congestion || wbc.more_io)
639 congestion_wait(WRITE, HZ/10); 651 congestion_wait(WRITE, HZ/10);
640 else 652 else
641 break; /* All the old data is written */ 653 break; /* All the old data is written */
@@ -1064,7 +1076,7 @@ static int __set_page_dirty(struct page *page)
1064 return 0; 1076 return 0;
1065} 1077}
1066 1078
1067int fastcall set_page_dirty(struct page *page) 1079int set_page_dirty(struct page *page)
1068{ 1080{
1069 int ret = __set_page_dirty(page); 1081 int ret = __set_page_dirty(page);
1070 if (ret) 1082 if (ret)
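
determine_dirtyable_memory() above now counts free + inactive + active pages and only subtracts the highmem share when vm_highmem_is_dirtyable is left at 0. A rough userspace approximation of the same sum, reading /proc/vmstat (the counter names nr_free_pages, nr_inactive and nr_active are an assumption tied to this kernel generation, and the highmem correction is skipped):

#include <stdio.h>
#include <string.h>

static unsigned long vmstat_read(const char *key)
{
	char name[64];
	unsigned long val;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return 0;
	while (fscanf(f, "%63s %lu", name, &val) == 2) {
		if (!strcmp(name, key)) {
			fclose(f);
			return val;
		}
	}
	fclose(f);
	return 0;
}

int main(void)
{
	unsigned long dirtyable = vmstat_read("nr_free_pages") +
				  vmstat_read("nr_inactive") +
				  vmstat_read("nr_active") + 1;

	/* The kernel would further subtract highmem here unless
	 * vm.highmem_is_dirtyable is set. */
	printf("approx. dirtyable pages: %lu\n", dirtyable);
	return 0;
}
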
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b2838c24e582..26a54a17dc9f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -43,6 +43,7 @@
43#include <linux/backing-dev.h> 43#include <linux/backing-dev.h>
44#include <linux/fault-inject.h> 44#include <linux/fault-inject.h>
45#include <linux/page-isolation.h> 45#include <linux/page-isolation.h>
46#include <linux/memcontrol.h>
46 47
47#include <asm/tlbflush.h> 48#include <asm/tlbflush.h>
48#include <asm/div64.h> 49#include <asm/div64.h>
@@ -537,7 +538,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
537/* 538/*
538 * permit the bootmem allocator to evade page validation on high-order frees 539 * permit the bootmem allocator to evade page validation on high-order frees
539 */ 540 */
540void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) 541void __init __free_pages_bootmem(struct page *page, unsigned int order)
541{ 542{
542 if (order == 0) { 543 if (order == 0) {
543 __ClearPageReserved(page); 544 __ClearPageReserved(page);
@@ -890,31 +891,51 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
890} 891}
891#endif 892#endif
892 893
893static void __drain_pages(unsigned int cpu) 894/*
895 * Drain pages of the indicated processor.
896 *
897 * The processor must either be the current processor and the
898 * thread pinned to the current processor or a processor that
899 * is not online.
900 */
901static void drain_pages(unsigned int cpu)
894{ 902{
895 unsigned long flags; 903 unsigned long flags;
896 struct zone *zone; 904 struct zone *zone;
897 int i;
898 905
899 for_each_zone(zone) { 906 for_each_zone(zone) {
900 struct per_cpu_pageset *pset; 907 struct per_cpu_pageset *pset;
908 struct per_cpu_pages *pcp;
901 909
902 if (!populated_zone(zone)) 910 if (!populated_zone(zone))
903 continue; 911 continue;
904 912
905 pset = zone_pcp(zone, cpu); 913 pset = zone_pcp(zone, cpu);
906 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 914
907 struct per_cpu_pages *pcp; 915 pcp = &pset->pcp;
908 916 local_irq_save(flags);
909 pcp = &pset->pcp[i]; 917 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
910 local_irq_save(flags); 918 pcp->count = 0;
911 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 919 local_irq_restore(flags);
912 pcp->count = 0;
913 local_irq_restore(flags);
914 }
915 } 920 }
916} 921}
917 922
923/*
924 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
925 */
926void drain_local_pages(void *arg)
927{
928 drain_pages(smp_processor_id());
929}
930
931/*
932 * Spill all the per-cpu pages from all CPUs back into the buddy allocator
933 */
934void drain_all_pages(void)
935{
936 on_each_cpu(drain_local_pages, NULL, 0, 1);
937}
938
918#ifdef CONFIG_HIBERNATION 939#ifdef CONFIG_HIBERNATION
919 940
920void mark_free_pages(struct zone *zone) 941void mark_free_pages(struct zone *zone)
@@ -952,40 +973,9 @@ void mark_free_pages(struct zone *zone)
952#endif /* CONFIG_PM */ 973#endif /* CONFIG_PM */
953 974
954/* 975/*
955 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
956 */
957void drain_local_pages(void)
958{
959 unsigned long flags;
960
961 local_irq_save(flags);
962 __drain_pages(smp_processor_id());
963 local_irq_restore(flags);
964}
965
966void smp_drain_local_pages(void *arg)
967{
968 drain_local_pages();
969}
970
971/*
972 * Spill all the per-cpu pages from all CPUs back into the buddy allocator
973 */
974void drain_all_local_pages(void)
975{
976 unsigned long flags;
977
978 local_irq_save(flags);
979 __drain_pages(smp_processor_id());
980 local_irq_restore(flags);
981
982 smp_call_function(smp_drain_local_pages, NULL, 0, 1);
983}
984
985/*
986 * Free a 0-order page 976 * Free a 0-order page
987 */ 977 */
988static void fastcall free_hot_cold_page(struct page *page, int cold) 978static void free_hot_cold_page(struct page *page, int cold)
989{ 979{
990 struct zone *zone = page_zone(page); 980 struct zone *zone = page_zone(page);
991 struct per_cpu_pages *pcp; 981 struct per_cpu_pages *pcp;
@@ -998,13 +988,17 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
998 988
999 if (!PageHighMem(page)) 989 if (!PageHighMem(page))
1000 debug_check_no_locks_freed(page_address(page), PAGE_SIZE); 990 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
991 VM_BUG_ON(page_get_page_cgroup(page));
1001 arch_free_page(page, 0); 992 arch_free_page(page, 0);
1002 kernel_map_pages(page, 1, 0); 993 kernel_map_pages(page, 1, 0);
1003 994
1004 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 995 pcp = &zone_pcp(zone, get_cpu())->pcp;
1005 local_irq_save(flags); 996 local_irq_save(flags);
1006 __count_vm_event(PGFREE); 997 __count_vm_event(PGFREE);
1007 list_add(&page->lru, &pcp->list); 998 if (cold)
999 list_add_tail(&page->lru, &pcp->list);
1000 else
1001 list_add(&page->lru, &pcp->list);
1008 set_page_private(page, get_pageblock_migratetype(page)); 1002 set_page_private(page, get_pageblock_migratetype(page));
1009 pcp->count++; 1003 pcp->count++;
1010 if (pcp->count >= pcp->high) { 1004 if (pcp->count >= pcp->high) {
@@ -1015,12 +1009,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
1015 put_cpu(); 1009 put_cpu();
1016} 1010}
1017 1011
1018void fastcall free_hot_page(struct page *page) 1012void free_hot_page(struct page *page)
1019{ 1013{
1020 free_hot_cold_page(page, 0); 1014 free_hot_cold_page(page, 0);
1021} 1015}
1022 1016
1023void fastcall free_cold_page(struct page *page) 1017void free_cold_page(struct page *page)
1024{ 1018{
1025 free_hot_cold_page(page, 1); 1019 free_hot_cold_page(page, 1);
1026} 1020}
@@ -1062,7 +1056,7 @@ again:
1062 if (likely(order == 0)) { 1056 if (likely(order == 0)) {
1063 struct per_cpu_pages *pcp; 1057 struct per_cpu_pages *pcp;
1064 1058
1065 pcp = &zone_pcp(zone, cpu)->pcp[cold]; 1059 pcp = &zone_pcp(zone, cpu)->pcp;
1066 local_irq_save(flags); 1060 local_irq_save(flags);
1067 if (!pcp->count) { 1061 if (!pcp->count) {
1068 pcp->count = rmqueue_bulk(zone, 0, 1062 pcp->count = rmqueue_bulk(zone, 0,
@@ -1072,9 +1066,15 @@ again:
1072 } 1066 }
1073 1067
1074 /* Find a page of the appropriate migrate type */ 1068 /* Find a page of the appropriate migrate type */
1075 list_for_each_entry(page, &pcp->list, lru) 1069 if (cold) {
1076 if (page_private(page) == migratetype) 1070 list_for_each_entry_reverse(page, &pcp->list, lru)
1077 break; 1071 if (page_private(page) == migratetype)
1072 break;
1073 } else {
1074 list_for_each_entry(page, &pcp->list, lru)
1075 if (page_private(page) == migratetype)
1076 break;
1077 }
1078 1078
1079 /* Allocate more to the pcp list if necessary */ 1079 /* Allocate more to the pcp list if necessary */
1080 if (unlikely(&page->lru == &pcp->list)) { 1080 if (unlikely(&page->lru == &pcp->list)) {
@@ -1569,7 +1569,7 @@ nofail_alloc:
1569 cond_resched(); 1569 cond_resched();
1570 1570
1571 if (order != 0) 1571 if (order != 0)
1572 drain_all_local_pages(); 1572 drain_all_pages();
1573 1573
1574 if (likely(did_some_progress)) { 1574 if (likely(did_some_progress)) {
1575 page = get_page_from_freelist(gfp_mask, order, 1575 page = get_page_from_freelist(gfp_mask, order,
@@ -1643,7 +1643,7 @@ EXPORT_SYMBOL(__alloc_pages);
1643/* 1643/*
1644 * Common helper functions. 1644 * Common helper functions.
1645 */ 1645 */
1646fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 1646unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1647{ 1647{
1648 struct page * page; 1648 struct page * page;
1649 page = alloc_pages(gfp_mask, order); 1649 page = alloc_pages(gfp_mask, order);
@@ -1654,7 +1654,7 @@ fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1654 1654
1655EXPORT_SYMBOL(__get_free_pages); 1655EXPORT_SYMBOL(__get_free_pages);
1656 1656
1657fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) 1657unsigned long get_zeroed_page(gfp_t gfp_mask)
1658{ 1658{
1659 struct page * page; 1659 struct page * page;
1660 1660
@@ -1680,7 +1680,7 @@ void __pagevec_free(struct pagevec *pvec)
1680 free_hot_cold_page(pvec->pages[i], pvec->cold); 1680 free_hot_cold_page(pvec->pages[i], pvec->cold);
1681} 1681}
1682 1682
1683fastcall void __free_pages(struct page *page, unsigned int order) 1683void __free_pages(struct page *page, unsigned int order)
1684{ 1684{
1685 if (put_page_testzero(page)) { 1685 if (put_page_testzero(page)) {
1686 if (order == 0) 1686 if (order == 0)
@@ -1692,7 +1692,7 @@ fastcall void __free_pages(struct page *page, unsigned int order)
1692 1692
1693EXPORT_SYMBOL(__free_pages); 1693EXPORT_SYMBOL(__free_pages);
1694 1694
1695fastcall void free_pages(unsigned long addr, unsigned int order) 1695void free_pages(unsigned long addr, unsigned int order)
1696{ 1696{
1697 if (addr != 0) { 1697 if (addr != 0) {
1698 VM_BUG_ON(!virt_addr_valid((void *)addr)); 1698 VM_BUG_ON(!virt_addr_valid((void *)addr));
@@ -1801,12 +1801,9 @@ void show_free_areas(void)
1801 1801
1802 pageset = zone_pcp(zone, cpu); 1802 pageset = zone_pcp(zone, cpu);
1803 1803
1804 printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " 1804 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
1805 "Cold: hi:%5d, btch:%4d usd:%4d\n", 1805 cpu, pageset->pcp.high,
1806 cpu, pageset->pcp[0].high, 1806 pageset->pcp.batch, pageset->pcp.count);
1807 pageset->pcp[0].batch, pageset->pcp[0].count,
1808 pageset->pcp[1].high, pageset->pcp[1].batch,
1809 pageset->pcp[1].count);
1810 } 1807 }
1811 } 1808 }
1812 1809
@@ -1879,6 +1876,8 @@ void show_free_areas(void)
1879 printk("= %lukB\n", K(total)); 1876 printk("= %lukB\n", K(total));
1880 } 1877 }
1881 1878
1879 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
1880
1882 show_swap_cache_info(); 1881 show_swap_cache_info();
1883} 1882}
1884 1883
@@ -2528,6 +2527,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2528 set_page_links(page, zone, nid, pfn); 2527 set_page_links(page, zone, nid, pfn);
2529 init_page_count(page); 2528 init_page_count(page);
2530 reset_page_mapcount(page); 2529 reset_page_mapcount(page);
2530 page_assign_page_cgroup(page, NULL);
2531 SetPageReserved(page); 2531 SetPageReserved(page);
2532 2532
2533 /* 2533 /*
@@ -2551,8 +2551,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2551 } 2551 }
2552} 2552}
2553 2553
2554static void __meminit zone_init_free_lists(struct pglist_data *pgdat, 2554static void __meminit zone_init_free_lists(struct zone *zone)
2555 struct zone *zone, unsigned long size)
2556{ 2555{
2557 int order, t; 2556 int order, t;
2558 for_each_migratetype_order(order, t) { 2557 for_each_migratetype_order(order, t) {
@@ -2604,17 +2603,11 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2604 2603
2605 memset(p, 0, sizeof(*p)); 2604 memset(p, 0, sizeof(*p));
2606 2605
2607 pcp = &p->pcp[0]; /* hot */ 2606 pcp = &p->pcp;
2608 pcp->count = 0; 2607 pcp->count = 0;
2609 pcp->high = 6 * batch; 2608 pcp->high = 6 * batch;
2610 pcp->batch = max(1UL, 1 * batch); 2609 pcp->batch = max(1UL, 1 * batch);
2611 INIT_LIST_HEAD(&pcp->list); 2610 INIT_LIST_HEAD(&pcp->list);
2612
2613 pcp = &p->pcp[1]; /* cold*/
2614 pcp->count = 0;
2615 pcp->high = 2 * batch;
2616 pcp->batch = max(1UL, batch/2);
2617 INIT_LIST_HEAD(&pcp->list);
2618} 2611}
2619 2612
2620/* 2613/*
@@ -2627,7 +2620,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
2627{ 2620{
2628 struct per_cpu_pages *pcp; 2621 struct per_cpu_pages *pcp;
2629 2622
2630 pcp = &p->pcp[0]; /* hot list */ 2623 pcp = &p->pcp;
2631 pcp->high = high; 2624 pcp->high = high;
2632 pcp->batch = max(1UL, high/4); 2625 pcp->batch = max(1UL, high/4);
2633 if ((high/4) > (PAGE_SHIFT * 8)) 2626 if ((high/4) > (PAGE_SHIFT * 8))
@@ -2831,7 +2824,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
2831 2824
2832 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 2825 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
2833 2826
2834 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 2827 zone_init_free_lists(zone);
2835 2828
2836 return 0; 2829 return 0;
2837} 2830}
@@ -3978,10 +3971,23 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
3978 int cpu = (unsigned long)hcpu; 3971 int cpu = (unsigned long)hcpu;
3979 3972
3980 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 3973 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
3981 local_irq_disable(); 3974 drain_pages(cpu);
3982 __drain_pages(cpu); 3975
3976 /*
3977 * Spill the event counters of the dead processor
3978 * into the current processors event counters.
3979 * This artificially elevates the count of the current
3980 * processor.
3981 */
3983 vm_events_fold_cpu(cpu); 3982 vm_events_fold_cpu(cpu);
3984 local_irq_enable(); 3983
3984 /*
3985 * Zero the differential counters of the dead processor
3986 * so that the vm statistics are consistent.
3987 *
3988 * This is only okay since the processor is dead and cannot
3989 * race with what we are doing.
3990 */
3985 refresh_cpu_vm_stats(cpu); 3991 refresh_cpu_vm_stats(cpu);
3986 } 3992 }
3987 return NOTIFY_OK; 3993 return NOTIFY_OK;
@@ -4480,7 +4486,7 @@ int set_migratetype_isolate(struct page *page)
4480out: 4486out:
4481 spin_unlock_irqrestore(&zone->lock, flags); 4487 spin_unlock_irqrestore(&zone->lock, flags);
4482 if (!ret) 4488 if (!ret)
4483 drain_all_local_pages(); 4489 drain_all_pages();
4484 return ret; 4490 return ret;
4485} 4491}
4486 4492
diff --git a/mm/page_io.c b/mm/page_io.c
index 3b97f6850273..065c4480eaf0 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -126,7 +126,7 @@ int swap_readpage(struct file *file, struct page *page)
126 int ret = 0; 126 int ret = 0;
127 127
128 BUG_ON(!PageLocked(page)); 128 BUG_ON(!PageLocked(page));
129 ClearPageUptodate(page); 129 BUG_ON(PageUptodate(page));
130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page, 130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
131 end_swap_bio_read); 131 end_swap_bio_read);
132 if (bio == NULL) { 132 if (bio == NULL) {
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
new file mode 100644
index 000000000000..b4f27d22da91
--- /dev/null
+++ b/mm/pagewalk.c
@@ -0,0 +1,131 @@
1#include <linux/mm.h>
2#include <linux/highmem.h>
3#include <linux/sched.h>
4
5static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
6 const struct mm_walk *walk, void *private)
7{
8 pte_t *pte;
9 int err = 0;
10
11 pte = pte_offset_map(pmd, addr);
12 do {
13 err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private);
14 if (err)
15 break;
16 } while (pte++, addr += PAGE_SIZE, addr != end);
17
18 pte_unmap(pte);
19 return err;
20}
21
22static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
23 const struct mm_walk *walk, void *private)
24{
25 pmd_t *pmd;
26 unsigned long next;
27 int err = 0;
28
29 pmd = pmd_offset(pud, addr);
30 do {
31 next = pmd_addr_end(addr, end);
32 if (pmd_none_or_clear_bad(pmd)) {
33 if (walk->pte_hole)
34 err = walk->pte_hole(addr, next, private);
35 if (err)
36 break;
37 continue;
38 }
39 if (walk->pmd_entry)
40 err = walk->pmd_entry(pmd, addr, next, private);
41 if (!err && walk->pte_entry)
42 err = walk_pte_range(pmd, addr, next, walk, private);
43 if (err)
44 break;
45 } while (pmd++, addr = next, addr != end);
46
47 return err;
48}
49
50static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
51 const struct mm_walk *walk, void *private)
52{
53 pud_t *pud;
54 unsigned long next;
55 int err = 0;
56
57 pud = pud_offset(pgd, addr);
58 do {
59 next = pud_addr_end(addr, end);
60 if (pud_none_or_clear_bad(pud)) {
61 if (walk->pte_hole)
62 err = walk->pte_hole(addr, next, private);
63 if (err)
64 break;
65 continue;
66 }
67 if (walk->pud_entry)
68 err = walk->pud_entry(pud, addr, next, private);
69 if (!err && (walk->pmd_entry || walk->pte_entry))
70 err = walk_pmd_range(pud, addr, next, walk, private);
71 if (err)
72 break;
73 } while (pud++, addr = next, addr != end);
74
75 return err;
76}
77
78/**
79 * walk_page_range - walk a memory map's page tables with a callback
80 * @mm - memory map to walk
81 * @addr - starting address
82 * @end - ending address
83 * @walk - set of callbacks to invoke for each level of the tree
84 * @private - private data passed to the callback function
85 *
86 * Recursively walk the page table for the memory area in a VMA,
87 * calling supplied callbacks. Callbacks are called in-order (first
88 * PGD, first PUD, first PMD, first PTE, second PTE... second PMD,
89 * etc.). If lower-level callbacks are omitted, walking depth is reduced.
90 *
91 * Each callback receives an entry pointer, the start and end of the
92 * associated range, and a caller-supplied private data pointer.
93 *
94 * No locks are taken, but the bottom level iterator will map PTE
95 * directories from highmem if necessary.
96 *
97 * If any callback returns a non-zero value, the walk is aborted and
98 * the return value is propagated back to the caller. Otherwise 0 is returned.
99 */
100int walk_page_range(const struct mm_struct *mm,
101 unsigned long addr, unsigned long end,
102 const struct mm_walk *walk, void *private)
103{
104 pgd_t *pgd;
105 unsigned long next;
106 int err = 0;
107
108 if (addr >= end)
109 return err;
110
111 pgd = pgd_offset(mm, addr);
112 do {
113 next = pgd_addr_end(addr, end);
114 if (pgd_none_or_clear_bad(pgd)) {
115 if (walk->pte_hole)
116 err = walk->pte_hole(addr, next, private);
117 if (err)
118 break;
119 continue;
120 }
121 if (walk->pgd_entry)
122 err = walk->pgd_entry(pgd, addr, next, private);
123 if (!err &&
124 (walk->pud_entry || walk->pmd_entry || walk->pte_entry))
125 err = walk_pud_range(pgd, addr, next, walk, private);
126 if (err)
127 break;
128 } while (pgd++, addr = next, addr != end);
129
130 return err;
131}
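[Editor's note] walk_page_range() above descends the page-table tree level by level, calls whichever per-level callback is supplied, reports unmapped ranges through pte_hole, and aborts the walk as soon as any callback returns non-zero. As a rough userspace analogue of that shape (not kernel code; the table layout and every name below are invented), a two-level table walk with entry and hole callbacks looks like this:

#include <stdio.h>
#include <stddef.h>

#define DIR_ENTRIES  4
#define LEAF_ENTRIES 4

/* Invented callback set, loosely shaped like the mm_walk callbacks above. */
struct table_walk {
	int (*leaf_entry)(long value, size_t index, void *private);
	int (*hole)(size_t first, size_t last, void *private);
};

/* Walk dir[] (a directory of leaf tables); NULL directory slots are holes. */
static int walk_table(long *dir[DIR_ENTRIES], const struct table_walk *walk,
		      void *private)
{
	int err = 0;
	size_t d, l;

	for (d = 0; d < DIR_ENTRIES && !err; d++) {
		size_t base = d * LEAF_ENTRIES;

		if (!dir[d]) {			/* hole: no leaf table here */
			if (walk->hole)
				err = walk->hole(base, base + LEAF_ENTRIES - 1,
						 private);
			continue;
		}
		for (l = 0; l < LEAF_ENTRIES && !err; l++)
			err = walk->leaf_entry(dir[d][l], base + l, private);
	}
	return err;	/* first non-zero callback result, or 0 on completion */
}

static int print_entry(long value, size_t index, void *private)
{
	(void)private;
	printf("entry %zu = %ld\n", index, value);
	return 0;	/* returning non-zero here would abort the walk */
}

static int print_hole(size_t first, size_t last, void *private)
{
	(void)private;
	printf("hole  %zu..%zu\n", first, last);
	return 0;
}

int main(void)
{
	long leaf0[LEAF_ENTRIES] = { 1, 2, 3, 4 };
	long leaf2[LEAF_ENTRIES] = { 9, 8, 7, 6 };
	long *dir[DIR_ENTRIES] = { leaf0, NULL, leaf2, NULL };
	struct table_walk walk = { .leaf_entry = print_entry, .hole = print_hole };

	return walk_table(dir, &walk, NULL);
}

As in the kernel version, omitting a lower-level callback simply stops the descent early, and error propagation is uniform: whatever the first failing callback returns is what the top-level caller sees.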
diff --git a/mm/rmap.c b/mm/rmap.c
index dbc2ca2057a5..a0e92a263d12 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,7 +36,6 @@
36 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_lock in __sync_single_inode) 38 * within inode_lock in __sync_single_inode)
39 * zone->lock (within radix tree node alloc)
40 */ 39 */
41 40
42#include <linux/mm.h> 41#include <linux/mm.h>
@@ -49,6 +48,7 @@
49#include <linux/rcupdate.h> 48#include <linux/rcupdate.h>
50#include <linux/module.h> 49#include <linux/module.h>
51#include <linux/kallsyms.h> 50#include <linux/kallsyms.h>
51#include <linux/memcontrol.h>
52 52
53#include <asm/tlbflush.h> 53#include <asm/tlbflush.h>
54 54
@@ -284,7 +284,10 @@ static int page_referenced_one(struct page *page,
284 if (!pte) 284 if (!pte)
285 goto out; 285 goto out;
286 286
287 if (ptep_clear_flush_young(vma, address, pte)) 287 if (vma->vm_flags & VM_LOCKED) {
288 referenced++;
289 *mapcount = 1; /* break early from loop */
290 } else if (ptep_clear_flush_young(vma, address, pte))
288 referenced++; 291 referenced++;
289 292
290 /* Pretend the page is referenced if the task has the 293 /* Pretend the page is referenced if the task has the
@@ -299,7 +302,8 @@ out:
299 return referenced; 302 return referenced;
300} 303}
301 304
302static int page_referenced_anon(struct page *page) 305static int page_referenced_anon(struct page *page,
306 struct mem_cgroup *mem_cont)
303{ 307{
304 unsigned int mapcount; 308 unsigned int mapcount;
305 struct anon_vma *anon_vma; 309 struct anon_vma *anon_vma;
@@ -312,6 +316,13 @@ static int page_referenced_anon(struct page *page)
312 316
313 mapcount = page_mapcount(page); 317 mapcount = page_mapcount(page);
314 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 318 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
319 /*
320 * If we are reclaiming on behalf of a cgroup, skip
321 * counting on behalf of references from different
322 * cgroups
323 */
324 if (mem_cont && (mm_cgroup(vma->vm_mm) != mem_cont))
325 continue;
315 referenced += page_referenced_one(page, vma, &mapcount); 326 referenced += page_referenced_one(page, vma, &mapcount);
316 if (!mapcount) 327 if (!mapcount)
317 break; 328 break;
@@ -332,7 +343,8 @@ static int page_referenced_anon(struct page *page)
332 * 343 *
333 * This function is only called from page_referenced for object-based pages. 344 * This function is only called from page_referenced for object-based pages.
334 */ 345 */
335static int page_referenced_file(struct page *page) 346static int page_referenced_file(struct page *page,
347 struct mem_cgroup *mem_cont)
336{ 348{
337 unsigned int mapcount; 349 unsigned int mapcount;
338 struct address_space *mapping = page->mapping; 350 struct address_space *mapping = page->mapping;
@@ -365,6 +377,13 @@ static int page_referenced_file(struct page *page)
365 mapcount = page_mapcount(page); 377 mapcount = page_mapcount(page);
366 378
367 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 379 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
380 /*
381 * If we are reclaiming on behalf of a cgroup, skip
382 * counting on behalf of references from different
383 * cgroups
384 */
385 if (mem_cont && (mm_cgroup(vma->vm_mm) != mem_cont))
386 continue;
368 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) 387 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
369 == (VM_LOCKED|VM_MAYSHARE)) { 388 == (VM_LOCKED|VM_MAYSHARE)) {
370 referenced++; 389 referenced++;
@@ -387,7 +406,8 @@ static int page_referenced_file(struct page *page)
387 * Quick test_and_clear_referenced for all mappings to a page, 406 * Quick test_and_clear_referenced for all mappings to a page,
388 * returns the number of ptes which referenced the page. 407 * returns the number of ptes which referenced the page.
389 */ 408 */
390int page_referenced(struct page *page, int is_locked) 409int page_referenced(struct page *page, int is_locked,
410 struct mem_cgroup *mem_cont)
391{ 411{
392 int referenced = 0; 412 int referenced = 0;
393 413
@@ -399,14 +419,15 @@ int page_referenced(struct page *page, int is_locked)
399 419
400 if (page_mapped(page) && page->mapping) { 420 if (page_mapped(page) && page->mapping) {
401 if (PageAnon(page)) 421 if (PageAnon(page))
402 referenced += page_referenced_anon(page); 422 referenced += page_referenced_anon(page, mem_cont);
403 else if (is_locked) 423 else if (is_locked)
404 referenced += page_referenced_file(page); 424 referenced += page_referenced_file(page, mem_cont);
405 else if (TestSetPageLocked(page)) 425 else if (TestSetPageLocked(page))
406 referenced++; 426 referenced++;
407 else { 427 else {
408 if (page->mapping) 428 if (page->mapping)
409 referenced += page_referenced_file(page); 429 referenced +=
430 page_referenced_file(page, mem_cont);
410 unlock_page(page); 431 unlock_page(page);
411 } 432 }
412 } 433 }
@@ -552,8 +573,14 @@ void page_add_anon_rmap(struct page *page,
552 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 573 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
553 if (atomic_inc_and_test(&page->_mapcount)) 574 if (atomic_inc_and_test(&page->_mapcount))
554 __page_set_anon_rmap(page, vma, address); 575 __page_set_anon_rmap(page, vma, address);
555 else 576 else {
556 __page_check_anon_rmap(page, vma, address); 577 __page_check_anon_rmap(page, vma, address);
578 /*
579 * We unconditionally charged during prepare, we uncharge here
580 * This takes care of balancing the reference counts
581 */
582 mem_cgroup_uncharge_page(page);
583 }
557} 584}
558 585
559/* 586/*
@@ -584,6 +611,12 @@ void page_add_file_rmap(struct page *page)
584{ 611{
585 if (atomic_inc_and_test(&page->_mapcount)) 612 if (atomic_inc_and_test(&page->_mapcount))
586 __inc_zone_page_state(page, NR_FILE_MAPPED); 613 __inc_zone_page_state(page, NR_FILE_MAPPED);
614 else
615 /*
616 * We unconditionally charged during prepare, we uncharge here
617 * This takes care of balancing the reference counts
618 */
619 mem_cgroup_uncharge_page(page);
587} 620}
588 621
589#ifdef CONFIG_DEBUG_VM 622#ifdef CONFIG_DEBUG_VM
@@ -644,6 +677,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
644 page_clear_dirty(page); 677 page_clear_dirty(page);
645 set_page_dirty(page); 678 set_page_dirty(page);
646 } 679 }
680 mem_cgroup_uncharge_page(page);
681
647 __dec_zone_page_state(page, 682 __dec_zone_page_state(page,
648 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); 683 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
649 } 684 }
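[Editor's note] The "unconditionally charged during prepare, we uncharge here" comments above describe a balancing pattern: the caller charges the page to its cgroup before taking locks, and the rmap side gives the charge back whenever the page turns out to be mapped already, so each page is accounted once rather than once per mapping. A minimal userspace sketch of that bookkeeping, with invented names standing in for the memcontrol API:

#include <assert.h>
#include <stdio.h>

/* Toy stand-ins for a cgroup charge counter and a page mapcount. */
static long cgroup_charge;

struct toy_page {
	int mapcount;	/* -1 means unmapped, matching _mapcount semantics */
};

static void charge(void)   { cgroup_charge++; }	/* "charge during prepare" */
static void uncharge(void) { cgroup_charge--; }

/* Mirrors the shape of page_add_file_rmap(): only the first mapping keeps the charge. */
static void add_rmap(struct toy_page *page)
{
	if (++page->mapcount == 0) {
		/* first mapping: the prepare-time charge is kept */
	} else {
		/* already mapped and already charged once: give it back */
		uncharge();
	}
}

int main(void)
{
	struct toy_page page = { .mapcount = -1 };

	charge(); add_rmap(&page);	/* first mapping */
	charge(); add_rmap(&page);	/* second mapping of the same page */
	assert(cgroup_charge == 1);	/* one charge per page, not per mapping */
	printf("charge = %ld, mapcount = %d\n", cgroup_charge, page.mapcount);
	return 0;
}

The page_remove_rmap() hunk is the other half of the same ledger: the charge is dropped when the last mapping goes away.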
diff --git a/mm/shmem.c b/mm/shmem.c
index 51b3d6ccddab..85bed948fafc 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -78,11 +78,10 @@
78 78
79/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ 79/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
80enum sgp_type { 80enum sgp_type {
81 SGP_QUICK, /* don't try more than file page cache lookup */
82 SGP_READ, /* don't exceed i_size, don't allocate page */ 81 SGP_READ, /* don't exceed i_size, don't allocate page */
83 SGP_CACHE, /* don't exceed i_size, may allocate page */ 82 SGP_CACHE, /* don't exceed i_size, may allocate page */
83 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */
84 SGP_WRITE, /* may exceed i_size, may allocate page */ 84 SGP_WRITE, /* may exceed i_size, may allocate page */
85 SGP_FAULT, /* same as SGP_CACHE, return with page locked */
86}; 85};
87 86
88static int shmem_getpage(struct inode *inode, unsigned long idx, 87static int shmem_getpage(struct inode *inode, unsigned long idx,
@@ -194,7 +193,7 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
194}; 193};
195 194
196static LIST_HEAD(shmem_swaplist); 195static LIST_HEAD(shmem_swaplist);
197static DEFINE_SPINLOCK(shmem_swaplist_lock); 196static DEFINE_MUTEX(shmem_swaplist_mutex);
198 197
199static void shmem_free_blocks(struct inode *inode, long pages) 198static void shmem_free_blocks(struct inode *inode, long pages)
200{ 199{
@@ -207,6 +206,31 @@ static void shmem_free_blocks(struct inode *inode, long pages)
207 } 206 }
208} 207}
209 208
209static int shmem_reserve_inode(struct super_block *sb)
210{
211 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
212 if (sbinfo->max_inodes) {
213 spin_lock(&sbinfo->stat_lock);
214 if (!sbinfo->free_inodes) {
215 spin_unlock(&sbinfo->stat_lock);
216 return -ENOSPC;
217 }
218 sbinfo->free_inodes--;
219 spin_unlock(&sbinfo->stat_lock);
220 }
221 return 0;
222}
223
224static void shmem_free_inode(struct super_block *sb)
225{
226 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
227 if (sbinfo->max_inodes) {
228 spin_lock(&sbinfo->stat_lock);
229 sbinfo->free_inodes++;
230 spin_unlock(&sbinfo->stat_lock);
231 }
232}
233
210/* 234/*
211 * shmem_recalc_inode - recalculate the size of an inode 235 * shmem_recalc_inode - recalculate the size of an inode
212 * 236 *
@@ -731,6 +755,8 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
731 (void) shmem_getpage(inode, 755 (void) shmem_getpage(inode,
732 attr->ia_size>>PAGE_CACHE_SHIFT, 756 attr->ia_size>>PAGE_CACHE_SHIFT,
733 &page, SGP_READ, NULL); 757 &page, SGP_READ, NULL);
758 if (page)
759 unlock_page(page);
734 } 760 }
735 /* 761 /*
736 * Reset SHMEM_PAGEIN flag so that shmem_truncate can 762 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
@@ -762,7 +788,6 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
762 788
763static void shmem_delete_inode(struct inode *inode) 789static void shmem_delete_inode(struct inode *inode)
764{ 790{
765 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
766 struct shmem_inode_info *info = SHMEM_I(inode); 791 struct shmem_inode_info *info = SHMEM_I(inode);
767 792
768 if (inode->i_op->truncate == shmem_truncate) { 793 if (inode->i_op->truncate == shmem_truncate) {
@@ -771,17 +796,13 @@ static void shmem_delete_inode(struct inode *inode)
771 inode->i_size = 0; 796 inode->i_size = 0;
772 shmem_truncate(inode); 797 shmem_truncate(inode);
773 if (!list_empty(&info->swaplist)) { 798 if (!list_empty(&info->swaplist)) {
774 spin_lock(&shmem_swaplist_lock); 799 mutex_lock(&shmem_swaplist_mutex);
775 list_del_init(&info->swaplist); 800 list_del_init(&info->swaplist);
776 spin_unlock(&shmem_swaplist_lock); 801 mutex_unlock(&shmem_swaplist_mutex);
777 } 802 }
778 } 803 }
779 BUG_ON(inode->i_blocks); 804 BUG_ON(inode->i_blocks);
780 if (sbinfo->max_inodes) { 805 shmem_free_inode(inode->i_sb);
781 spin_lock(&sbinfo->stat_lock);
782 sbinfo->free_inodes++;
783 spin_unlock(&sbinfo->stat_lock);
784 }
785 clear_inode(inode); 806 clear_inode(inode);
786} 807}
787 808
@@ -807,19 +828,22 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
807 struct page *subdir; 828 struct page *subdir;
808 swp_entry_t *ptr; 829 swp_entry_t *ptr;
809 int offset; 830 int offset;
831 int error;
810 832
811 idx = 0; 833 idx = 0;
812 ptr = info->i_direct; 834 ptr = info->i_direct;
813 spin_lock(&info->lock); 835 spin_lock(&info->lock);
836 if (!info->swapped) {
837 list_del_init(&info->swaplist);
838 goto lost2;
839 }
814 limit = info->next_index; 840 limit = info->next_index;
815 size = limit; 841 size = limit;
816 if (size > SHMEM_NR_DIRECT) 842 if (size > SHMEM_NR_DIRECT)
817 size = SHMEM_NR_DIRECT; 843 size = SHMEM_NR_DIRECT;
818 offset = shmem_find_swp(entry, ptr, ptr+size); 844 offset = shmem_find_swp(entry, ptr, ptr+size);
819 if (offset >= 0) { 845 if (offset >= 0)
820 shmem_swp_balance_unmap();
821 goto found; 846 goto found;
822 }
823 if (!info->i_indirect) 847 if (!info->i_indirect)
824 goto lost2; 848 goto lost2;
825 849
@@ -829,6 +853,14 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
829 for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { 853 for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
830 if (unlikely(idx == stage)) { 854 if (unlikely(idx == stage)) {
831 shmem_dir_unmap(dir-1); 855 shmem_dir_unmap(dir-1);
856 if (cond_resched_lock(&info->lock)) {
857 /* check it has not been truncated */
858 if (limit > info->next_index) {
859 limit = info->next_index;
860 if (idx >= limit)
861 goto lost2;
862 }
863 }
832 dir = shmem_dir_map(info->i_indirect) + 864 dir = shmem_dir_map(info->i_indirect) +
833 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; 865 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
834 while (!*dir) { 866 while (!*dir) {
@@ -849,11 +881,11 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
849 if (size > ENTRIES_PER_PAGE) 881 if (size > ENTRIES_PER_PAGE)
850 size = ENTRIES_PER_PAGE; 882 size = ENTRIES_PER_PAGE;
851 offset = shmem_find_swp(entry, ptr, ptr+size); 883 offset = shmem_find_swp(entry, ptr, ptr+size);
884 shmem_swp_unmap(ptr);
852 if (offset >= 0) { 885 if (offset >= 0) {
853 shmem_dir_unmap(dir); 886 shmem_dir_unmap(dir);
854 goto found; 887 goto found;
855 } 888 }
856 shmem_swp_unmap(ptr);
857 } 889 }
858 } 890 }
859lost1: 891lost1:
@@ -863,19 +895,69 @@ lost2:
863 return 0; 895 return 0;
864found: 896found:
865 idx += offset; 897 idx += offset;
866 inode = &info->vfs_inode; 898 inode = igrab(&info->vfs_inode);
867 if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) {
868 info->flags |= SHMEM_PAGEIN;
869 shmem_swp_set(info, ptr + offset, 0);
870 }
871 shmem_swp_unmap(ptr);
872 spin_unlock(&info->lock); 899 spin_unlock(&info->lock);
900
873 /* 901 /*
874 * Decrement swap count even when the entry is left behind: 902 * Move _head_ to start search for next from here.
875 * try_to_unuse will skip over mms, then reincrement count. 903 * But be careful: shmem_delete_inode checks list_empty without taking
904 * mutex, and there's an instant in list_move_tail when info->swaplist
905 * would appear empty, if it were the only one on shmem_swaplist. We
906 * could avoid doing it if inode NULL; or use this minor optimization.
876 */ 907 */
877 swap_free(entry); 908 if (shmem_swaplist.next != &info->swaplist)
878 return 1; 909 list_move_tail(&shmem_swaplist, &info->swaplist);
910 mutex_unlock(&shmem_swaplist_mutex);
911
912 error = 1;
913 if (!inode)
914 goto out;
915 /* Precharge page while we can wait, compensate afterwards */
916 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
917 if (error)
918 goto out;
919 error = radix_tree_preload(GFP_KERNEL);
920 if (error)
921 goto uncharge;
922 error = 1;
923
924 spin_lock(&info->lock);
925 ptr = shmem_swp_entry(info, idx, NULL);
926 if (ptr && ptr->val == entry.val)
927 error = add_to_page_cache(page, inode->i_mapping,
928 idx, GFP_NOWAIT);
929 if (error == -EEXIST) {
930 struct page *filepage = find_get_page(inode->i_mapping, idx);
931 error = 1;
932 if (filepage) {
933 /*
934 * There might be a more uptodate page coming down
935 * from a stacked writepage: forget our swappage if so.
936 */
937 if (PageUptodate(filepage))
938 error = 0;
939 page_cache_release(filepage);
940 }
941 }
942 if (!error) {
943 delete_from_swap_cache(page);
944 set_page_dirty(page);
945 info->flags |= SHMEM_PAGEIN;
946 shmem_swp_set(info, ptr, 0);
947 swap_free(entry);
948 error = 1; /* not an error, but entry was found */
949 }
950 if (ptr)
951 shmem_swp_unmap(ptr);
952 spin_unlock(&info->lock);
953 radix_tree_preload_end();
954uncharge:
955 mem_cgroup_uncharge_page(page);
956out:
957 unlock_page(page);
958 page_cache_release(page);
959 iput(inode); /* allows for NULL */
960 return error;
879} 961}
880 962
881/* 963/*
@@ -887,20 +969,16 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
887 struct shmem_inode_info *info; 969 struct shmem_inode_info *info;
888 int found = 0; 970 int found = 0;
889 971
890 spin_lock(&shmem_swaplist_lock); 972 mutex_lock(&shmem_swaplist_mutex);
891 list_for_each_safe(p, next, &shmem_swaplist) { 973 list_for_each_safe(p, next, &shmem_swaplist) {
892 info = list_entry(p, struct shmem_inode_info, swaplist); 974 info = list_entry(p, struct shmem_inode_info, swaplist);
893 if (!info->swapped) 975 found = shmem_unuse_inode(info, entry, page);
894 list_del_init(&info->swaplist); 976 cond_resched();
895 else if (shmem_unuse_inode(info, entry, page)) { 977 if (found)
896 /* move head to start search for next from here */ 978 goto out;
897 list_move_tail(&shmem_swaplist, &info->swaplist);
898 found = 1;
899 break;
900 }
901 } 979 }
902 spin_unlock(&shmem_swaplist_lock); 980 mutex_unlock(&shmem_swaplist_mutex);
903 return found; 981out: return found; /* 0 or 1 or -ENOMEM */
904} 982}
905 983
906/* 984/*
@@ -915,54 +993,65 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
915 struct inode *inode; 993 struct inode *inode;
916 994
917 BUG_ON(!PageLocked(page)); 995 BUG_ON(!PageLocked(page));
918 /*
919 * shmem_backing_dev_info's capabilities prevent regular writeback or
920 * sync from ever calling shmem_writepage; but a stacking filesystem
921 * may use the ->writepage of its underlying filesystem, in which case
922 * we want to do nothing when that underlying filesystem is tmpfs
923 * (writing out to swap is useful as a response to memory pressure, but
924 * of no use to stabilize the data) - just redirty the page, unlock it
925 * and claim success in this case. AOP_WRITEPAGE_ACTIVATE, and the
926 * page_mapped check below, must be avoided unless we're in reclaim.
927 */
928 if (!wbc->for_reclaim) {
929 set_page_dirty(page);
930 unlock_page(page);
931 return 0;
932 }
933 BUG_ON(page_mapped(page));
934
935 mapping = page->mapping; 996 mapping = page->mapping;
936 index = page->index; 997 index = page->index;
937 inode = mapping->host; 998 inode = mapping->host;
938 info = SHMEM_I(inode); 999 info = SHMEM_I(inode);
939 if (info->flags & VM_LOCKED) 1000 if (info->flags & VM_LOCKED)
940 goto redirty; 1001 goto redirty;
941 swap = get_swap_page(); 1002 if (!total_swap_pages)
942 if (!swap.val)
943 goto redirty; 1003 goto redirty;
944 1004
1005 /*
1006 * shmem_backing_dev_info's capabilities prevent regular writeback or
1007 * sync from ever calling shmem_writepage; but a stacking filesystem
1008 * may use the ->writepage of its underlying filesystem, in which case
1009 * tmpfs should write out to swap only in response to memory pressure,
1010 * and not for pdflush or sync. However, in those cases, we do still
1011 * want to check if there's a redundant swappage to be discarded.
1012 */
1013 if (wbc->for_reclaim)
1014 swap = get_swap_page();
1015 else
1016 swap.val = 0;
1017
945 spin_lock(&info->lock); 1018 spin_lock(&info->lock);
946 shmem_recalc_inode(inode);
947 if (index >= info->next_index) { 1019 if (index >= info->next_index) {
948 BUG_ON(!(info->flags & SHMEM_TRUNCATE)); 1020 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
949 goto unlock; 1021 goto unlock;
950 } 1022 }
951 entry = shmem_swp_entry(info, index, NULL); 1023 entry = shmem_swp_entry(info, index, NULL);
952 BUG_ON(!entry); 1024 if (entry->val) {
953 BUG_ON(entry->val); 1025 /*
1026 * The more uptodate page coming down from a stacked
1027 * writepage should replace our old swappage.
1028 */
1029 free_swap_and_cache(*entry);
1030 shmem_swp_set(info, entry, 0);
1031 }
1032 shmem_recalc_inode(inode);
954 1033
955 if (move_to_swap_cache(page, swap) == 0) { 1034 if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1035 remove_from_page_cache(page);
956 shmem_swp_set(info, entry, swap.val); 1036 shmem_swp_set(info, entry, swap.val);
957 shmem_swp_unmap(entry); 1037 shmem_swp_unmap(entry);
1038 if (list_empty(&info->swaplist))
1039 inode = igrab(inode);
1040 else
1041 inode = NULL;
958 spin_unlock(&info->lock); 1042 spin_unlock(&info->lock);
959 if (list_empty(&info->swaplist)) { 1043 swap_duplicate(swap);
960 spin_lock(&shmem_swaplist_lock); 1044 BUG_ON(page_mapped(page));
1045 page_cache_release(page); /* pagecache ref */
1046 set_page_dirty(page);
1047 unlock_page(page);
1048 if (inode) {
1049 mutex_lock(&shmem_swaplist_mutex);
961 /* move instead of add in case we're racing */ 1050 /* move instead of add in case we're racing */
962 list_move_tail(&info->swaplist, &shmem_swaplist); 1051 list_move_tail(&info->swaplist, &shmem_swaplist);
963 spin_unlock(&shmem_swaplist_lock); 1052 mutex_unlock(&shmem_swaplist_mutex);
1053 iput(inode);
964 } 1054 }
965 unlock_page(page);
966 return 0; 1055 return 0;
967 } 1056 }
968 1057
@@ -972,7 +1061,10 @@ unlock:
972 swap_free(swap); 1061 swap_free(swap);
973redirty: 1062redirty:
974 set_page_dirty(page); 1063 set_page_dirty(page);
975 return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */ 1064 if (wbc->for_reclaim)
1065 return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */
1066 unlock_page(page);
1067 return 0;
976} 1068}
977 1069
978#ifdef CONFIG_NUMA 1070#ifdef CONFIG_NUMA
@@ -1025,53 +1117,33 @@ out:
1025 return err; 1117 return err;
1026} 1118}
1027 1119
1028static struct page *shmem_swapin_async(struct shared_policy *p, 1120static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
1029 swp_entry_t entry, unsigned long idx) 1121 struct shmem_inode_info *info, unsigned long idx)
1030{ 1122{
1031 struct page *page;
1032 struct vm_area_struct pvma; 1123 struct vm_area_struct pvma;
1124 struct page *page;
1033 1125
1034 /* Create a pseudo vma that just contains the policy */ 1126 /* Create a pseudo vma that just contains the policy */
1035 memset(&pvma, 0, sizeof(struct vm_area_struct)); 1127 pvma.vm_start = 0;
1036 pvma.vm_end = PAGE_SIZE;
1037 pvma.vm_pgoff = idx; 1128 pvma.vm_pgoff = idx;
1038 pvma.vm_policy = mpol_shared_policy_lookup(p, idx); 1129 pvma.vm_ops = NULL;
1039 page = read_swap_cache_async(entry, &pvma, 0); 1130 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
1131 page = swapin_readahead(entry, gfp, &pvma, 0);
1040 mpol_free(pvma.vm_policy); 1132 mpol_free(pvma.vm_policy);
1041 return page; 1133 return page;
1042} 1134}
1043 1135
1044static struct page *shmem_swapin(struct shmem_inode_info *info, 1136static struct page *shmem_alloc_page(gfp_t gfp,
1045 swp_entry_t entry, unsigned long idx) 1137 struct shmem_inode_info *info, unsigned long idx)
1046{
1047 struct shared_policy *p = &info->policy;
1048 int i, num;
1049 struct page *page;
1050 unsigned long offset;
1051
1052 num = valid_swaphandles(entry, &offset);
1053 for (i = 0; i < num; offset++, i++) {
1054 page = shmem_swapin_async(p,
1055 swp_entry(swp_type(entry), offset), idx);
1056 if (!page)
1057 break;
1058 page_cache_release(page);
1059 }
1060 lru_add_drain(); /* Push any new pages onto the LRU now */
1061 return shmem_swapin_async(p, entry, idx);
1062}
1063
1064static struct page *
1065shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
1066 unsigned long idx)
1067{ 1138{
1068 struct vm_area_struct pvma; 1139 struct vm_area_struct pvma;
1069 struct page *page; 1140 struct page *page;
1070 1141
1071 memset(&pvma, 0, sizeof(struct vm_area_struct)); 1142 /* Create a pseudo vma that just contains the policy */
1072 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); 1143 pvma.vm_start = 0;
1073 pvma.vm_pgoff = idx; 1144 pvma.vm_pgoff = idx;
1074 pvma.vm_end = PAGE_SIZE; 1145 pvma.vm_ops = NULL;
1146 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
1075 page = alloc_page_vma(gfp, &pvma, 0); 1147 page = alloc_page_vma(gfp, &pvma, 0);
1076 mpol_free(pvma.vm_policy); 1148 mpol_free(pvma.vm_policy);
1077 return page; 1149 return page;
@@ -1083,15 +1155,14 @@ static inline int shmem_parse_mpol(char *value, int *policy,
1083 return 1; 1155 return 1;
1084} 1156}
1085 1157
1086static inline struct page * 1158static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
1087shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx) 1159 struct shmem_inode_info *info, unsigned long idx)
1088{ 1160{
1089 swapin_readahead(entry, 0, NULL); 1161 return swapin_readahead(entry, gfp, NULL, 0);
1090 return read_swap_cache_async(entry, NULL, 0);
1091} 1162}
1092 1163
1093static inline struct page * 1164static inline struct page *shmem_alloc_page(gfp_t gfp,
1094shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx) 1165 struct shmem_inode_info *info, unsigned long idx)
1095{ 1166{
1096 return alloc_page(gfp); 1167 return alloc_page(gfp);
1097} 1168}
@@ -1114,6 +1185,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
1114 struct page *swappage; 1185 struct page *swappage;
1115 swp_entry_t *entry; 1186 swp_entry_t *entry;
1116 swp_entry_t swap; 1187 swp_entry_t swap;
1188 gfp_t gfp;
1117 int error; 1189 int error;
1118 1190
1119 if (idx >= SHMEM_MAX_INDEX) 1191 if (idx >= SHMEM_MAX_INDEX)
@@ -1126,7 +1198,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
1126 * Normally, filepage is NULL on entry, and either found 1198 * Normally, filepage is NULL on entry, and either found
1127 * uptodate immediately, or allocated and zeroed, or read 1199 * uptodate immediately, or allocated and zeroed, or read
1128 * in under swappage, which is then assigned to filepage. 1200 * in under swappage, which is then assigned to filepage.
1129 * But shmem_readpage and shmem_write_begin pass in a locked 1201 * But shmem_readpage (required for splice) passes in a locked
1130 * filepage, which may be found not uptodate by other callers 1202 * filepage, which may be found not uptodate by other callers
1131 * too, and may need to be copied from the swappage read in. 1203 * too, and may need to be copied from the swappage read in.
1132 */ 1204 */
@@ -1136,8 +1208,17 @@ repeat:
1136 if (filepage && PageUptodate(filepage)) 1208 if (filepage && PageUptodate(filepage))
1137 goto done; 1209 goto done;
1138 error = 0; 1210 error = 0;
1139 if (sgp == SGP_QUICK) 1211 gfp = mapping_gfp_mask(mapping);
1140 goto failed; 1212 if (!filepage) {
1213 /*
1214 * Try to preload while we can wait, to not make a habit of
1215 * draining atomic reserves; but don't latch on to this cpu.
1216 */
1217 error = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
1218 if (error)
1219 goto failed;
1220 radix_tree_preload_end();
1221 }
1141 1222
1142 spin_lock(&info->lock); 1223 spin_lock(&info->lock);
1143 shmem_recalc_inode(inode); 1224 shmem_recalc_inode(inode);
@@ -1160,7 +1241,7 @@ repeat:
1160 *type |= VM_FAULT_MAJOR; 1241 *type |= VM_FAULT_MAJOR;
1161 } 1242 }
1162 spin_unlock(&info->lock); 1243 spin_unlock(&info->lock);
1163 swappage = shmem_swapin(info, swap, idx); 1244 swappage = shmem_swapin(swap, gfp, info, idx);
1164 if (!swappage) { 1245 if (!swappage) {
1165 spin_lock(&info->lock); 1246 spin_lock(&info->lock);
1166 entry = shmem_swp_alloc(info, idx, sgp); 1247 entry = shmem_swp_alloc(info, idx, sgp);
@@ -1218,13 +1299,15 @@ repeat:
1218 SetPageUptodate(filepage); 1299 SetPageUptodate(filepage);
1219 set_page_dirty(filepage); 1300 set_page_dirty(filepage);
1220 swap_free(swap); 1301 swap_free(swap);
1221 } else if (!(error = move_from_swap_cache( 1302 } else if (!(error = add_to_page_cache(
1222 swappage, idx, mapping))) { 1303 swappage, mapping, idx, GFP_NOWAIT))) {
1223 info->flags |= SHMEM_PAGEIN; 1304 info->flags |= SHMEM_PAGEIN;
1224 shmem_swp_set(info, entry, 0); 1305 shmem_swp_set(info, entry, 0);
1225 shmem_swp_unmap(entry); 1306 shmem_swp_unmap(entry);
1307 delete_from_swap_cache(swappage);
1226 spin_unlock(&info->lock); 1308 spin_unlock(&info->lock);
1227 filepage = swappage; 1309 filepage = swappage;
1310 set_page_dirty(filepage);
1228 swap_free(swap); 1311 swap_free(swap);
1229 } else { 1312 } else {
1230 shmem_swp_unmap(entry); 1313 shmem_swp_unmap(entry);
@@ -1232,8 +1315,11 @@ repeat:
1232 unlock_page(swappage); 1315 unlock_page(swappage);
1233 page_cache_release(swappage); 1316 page_cache_release(swappage);
1234 if (error == -ENOMEM) { 1317 if (error == -ENOMEM) {
1235 /* let kswapd refresh zone for GFP_ATOMICs */ 1318 /* allow reclaim from this memory cgroup */
1236 congestion_wait(WRITE, HZ/50); 1319 error = mem_cgroup_cache_charge(NULL,
1320 current->mm, gfp & ~__GFP_HIGHMEM);
1321 if (error)
1322 goto failed;
1237 } 1323 }
1238 goto repeat; 1324 goto repeat;
1239 } 1325 }
@@ -1272,9 +1358,7 @@ repeat:
1272 1358
1273 if (!filepage) { 1359 if (!filepage) {
1274 spin_unlock(&info->lock); 1360 spin_unlock(&info->lock);
1275 filepage = shmem_alloc_page(mapping_gfp_mask(mapping), 1361 filepage = shmem_alloc_page(gfp, info, idx);
1276 info,
1277 idx);
1278 if (!filepage) { 1362 if (!filepage) {
1279 shmem_unacct_blocks(info->flags, 1); 1363 shmem_unacct_blocks(info->flags, 1);
1280 shmem_free_blocks(inode, 1); 1364 shmem_free_blocks(inode, 1);
@@ -1282,6 +1366,17 @@ repeat:
1282 goto failed; 1366 goto failed;
1283 } 1367 }
1284 1368
1369 /* Precharge page while we can wait, compensate after */
1370 error = mem_cgroup_cache_charge(filepage, current->mm,
1371 gfp & ~__GFP_HIGHMEM);
1372 if (error) {
1373 page_cache_release(filepage);
1374 shmem_unacct_blocks(info->flags, 1);
1375 shmem_free_blocks(inode, 1);
1376 filepage = NULL;
1377 goto failed;
1378 }
1379
1285 spin_lock(&info->lock); 1380 spin_lock(&info->lock);
1286 entry = shmem_swp_alloc(info, idx, sgp); 1381 entry = shmem_swp_alloc(info, idx, sgp);
1287 if (IS_ERR(entry)) 1382 if (IS_ERR(entry))
@@ -1291,8 +1386,9 @@ repeat:
1291 shmem_swp_unmap(entry); 1386 shmem_swp_unmap(entry);
1292 } 1387 }
1293 if (error || swap.val || 0 != add_to_page_cache_lru( 1388 if (error || swap.val || 0 != add_to_page_cache_lru(
1294 filepage, mapping, idx, GFP_ATOMIC)) { 1389 filepage, mapping, idx, GFP_NOWAIT)) {
1295 spin_unlock(&info->lock); 1390 spin_unlock(&info->lock);
1391 mem_cgroup_uncharge_page(filepage);
1296 page_cache_release(filepage); 1392 page_cache_release(filepage);
1297 shmem_unacct_blocks(info->flags, 1); 1393 shmem_unacct_blocks(info->flags, 1);
1298 shmem_free_blocks(inode, 1); 1394 shmem_free_blocks(inode, 1);
@@ -1301,6 +1397,7 @@ repeat:
1301 goto failed; 1397 goto failed;
1302 goto repeat; 1398 goto repeat;
1303 } 1399 }
1400 mem_cgroup_uncharge_page(filepage);
1304 info->flags |= SHMEM_PAGEIN; 1401 info->flags |= SHMEM_PAGEIN;
1305 } 1402 }
1306 1403
@@ -1309,14 +1406,11 @@ repeat:
1309 clear_highpage(filepage); 1406 clear_highpage(filepage);
1310 flush_dcache_page(filepage); 1407 flush_dcache_page(filepage);
1311 SetPageUptodate(filepage); 1408 SetPageUptodate(filepage);
1409 if (sgp == SGP_DIRTY)
1410 set_page_dirty(filepage);
1312 } 1411 }
1313done: 1412done:
1314 if (*pagep != filepage) { 1413 *pagep = filepage;
1315 *pagep = filepage;
1316 if (sgp != SGP_FAULT)
1317 unlock_page(filepage);
1318
1319 }
1320 return 0; 1414 return 0;
1321 1415
1322failed: 1416failed:
@@ -1336,7 +1430,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1336 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 1430 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1337 return VM_FAULT_SIGBUS; 1431 return VM_FAULT_SIGBUS;
1338 1432
1339 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_FAULT, &ret); 1433 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1340 if (error) 1434 if (error)
1341 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1435 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1342 1436
@@ -1399,15 +1493,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1399 struct shmem_inode_info *info; 1493 struct shmem_inode_info *info;
1400 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1494 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1401 1495
1402 if (sbinfo->max_inodes) { 1496 if (shmem_reserve_inode(sb))
1403 spin_lock(&sbinfo->stat_lock); 1497 return NULL;
1404 if (!sbinfo->free_inodes) {
1405 spin_unlock(&sbinfo->stat_lock);
1406 return NULL;
1407 }
1408 sbinfo->free_inodes--;
1409 spin_unlock(&sbinfo->stat_lock);
1410 }
1411 1498
1412 inode = new_inode(sb); 1499 inode = new_inode(sb);
1413 if (inode) { 1500 if (inode) {
@@ -1451,11 +1538,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1451 NULL); 1538 NULL);
1452 break; 1539 break;
1453 } 1540 }
1454 } else if (sbinfo->max_inodes) { 1541 } else
1455 spin_lock(&sbinfo->stat_lock); 1542 shmem_free_inode(sb);
1456 sbinfo->free_inodes++;
1457 spin_unlock(&sbinfo->stat_lock);
1458 }
1459 return inode; 1543 return inode;
1460} 1544}
1461 1545
@@ -1494,123 +1578,30 @@ shmem_write_end(struct file *file, struct address_space *mapping,
1494{ 1578{
1495 struct inode *inode = mapping->host; 1579 struct inode *inode = mapping->host;
1496 1580
1581 if (pos + copied > inode->i_size)
1582 i_size_write(inode, pos + copied);
1583
1584 unlock_page(page);
1497 set_page_dirty(page); 1585 set_page_dirty(page);
1498 page_cache_release(page); 1586 page_cache_release(page);
1499 1587
1500 if (pos+copied > inode->i_size)
1501 i_size_write(inode, pos+copied);
1502
1503 return copied; 1588 return copied;
1504} 1589}
1505 1590
1506static ssize_t
1507shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
1508{
1509 struct inode *inode = file->f_path.dentry->d_inode;
1510 loff_t pos;
1511 unsigned long written;
1512 ssize_t err;
1513
1514 if ((ssize_t) count < 0)
1515 return -EINVAL;
1516
1517 if (!access_ok(VERIFY_READ, buf, count))
1518 return -EFAULT;
1519
1520 mutex_lock(&inode->i_mutex);
1521
1522 pos = *ppos;
1523 written = 0;
1524
1525 err = generic_write_checks(file, &pos, &count, 0);
1526 if (err || !count)
1527 goto out;
1528
1529 err = remove_suid(file->f_path.dentry);
1530 if (err)
1531 goto out;
1532
1533 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1534
1535 do {
1536 struct page *page = NULL;
1537 unsigned long bytes, index, offset;
1538 char *kaddr;
1539 int left;
1540
1541 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
1542 index = pos >> PAGE_CACHE_SHIFT;
1543 bytes = PAGE_CACHE_SIZE - offset;
1544 if (bytes > count)
1545 bytes = count;
1546
1547 /*
1548 * We don't hold page lock across copy from user -
1549 * what would it guard against? - so no deadlock here.
1550 * But it still may be a good idea to prefault below.
1551 */
1552
1553 err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL);
1554 if (err)
1555 break;
1556
1557 left = bytes;
1558 if (PageHighMem(page)) {
1559 volatile unsigned char dummy;
1560 __get_user(dummy, buf);
1561 __get_user(dummy, buf + bytes - 1);
1562
1563 kaddr = kmap_atomic(page, KM_USER0);
1564 left = __copy_from_user_inatomic(kaddr + offset,
1565 buf, bytes);
1566 kunmap_atomic(kaddr, KM_USER0);
1567 }
1568 if (left) {
1569 kaddr = kmap(page);
1570 left = __copy_from_user(kaddr + offset, buf, bytes);
1571 kunmap(page);
1572 }
1573
1574 written += bytes;
1575 count -= bytes;
1576 pos += bytes;
1577 buf += bytes;
1578 if (pos > inode->i_size)
1579 i_size_write(inode, pos);
1580
1581 flush_dcache_page(page);
1582 set_page_dirty(page);
1583 mark_page_accessed(page);
1584 page_cache_release(page);
1585
1586 if (left) {
1587 pos -= left;
1588 written -= left;
1589 err = -EFAULT;
1590 break;
1591 }
1592
1593 /*
1594 * Our dirty pages are not counted in nr_dirty,
1595 * and we do not attempt to balance dirty pages.
1596 */
1597
1598 cond_resched();
1599 } while (count);
1600
1601 *ppos = pos;
1602 if (written)
1603 err = written;
1604out:
1605 mutex_unlock(&inode->i_mutex);
1606 return err;
1607}
1608
1609static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) 1591static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1610{ 1592{
1611 struct inode *inode = filp->f_path.dentry->d_inode; 1593 struct inode *inode = filp->f_path.dentry->d_inode;
1612 struct address_space *mapping = inode->i_mapping; 1594 struct address_space *mapping = inode->i_mapping;
1613 unsigned long index, offset; 1595 unsigned long index, offset;
1596 enum sgp_type sgp = SGP_READ;
1597
1598 /*
1599 * Might this read be for a stacking filesystem? Then when reading
1600 * holes of a sparse file, we actually need to allocate those pages,
1601 * and even mark them dirty, so it cannot exceed the max_blocks limit.
1602 */
1603 if (segment_eq(get_fs(), KERNEL_DS))
1604 sgp = SGP_DIRTY;
1614 1605
1615 index = *ppos >> PAGE_CACHE_SHIFT; 1606 index = *ppos >> PAGE_CACHE_SHIFT;
1616 offset = *ppos & ~PAGE_CACHE_MASK; 1607 offset = *ppos & ~PAGE_CACHE_MASK;
@@ -1629,12 +1620,14 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1629 break; 1620 break;
1630 } 1621 }
1631 1622
1632 desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL); 1623 desc->error = shmem_getpage(inode, index, &page, sgp, NULL);
1633 if (desc->error) { 1624 if (desc->error) {
1634 if (desc->error == -EINVAL) 1625 if (desc->error == -EINVAL)
1635 desc->error = 0; 1626 desc->error = 0;
1636 break; 1627 break;
1637 } 1628 }
1629 if (page)
1630 unlock_page(page);
1638 1631
1639 /* 1632 /*
1640 * We must evaluate after, since reads (unlike writes) 1633 * We must evaluate after, since reads (unlike writes)
@@ -1798,22 +1791,16 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
1798static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 1791static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1799{ 1792{
1800 struct inode *inode = old_dentry->d_inode; 1793 struct inode *inode = old_dentry->d_inode;
1801 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1794 int ret;
1802 1795
1803 /* 1796 /*
1804 * No ordinary (disk based) filesystem counts links as inodes; 1797 * No ordinary (disk based) filesystem counts links as inodes;
1805 * but each new link needs a new dentry, pinning lowmem, and 1798 * but each new link needs a new dentry, pinning lowmem, and
1806 * tmpfs dentries cannot be pruned until they are unlinked. 1799 * tmpfs dentries cannot be pruned until they are unlinked.
1807 */ 1800 */
1808 if (sbinfo->max_inodes) { 1801 ret = shmem_reserve_inode(inode->i_sb);
1809 spin_lock(&sbinfo->stat_lock); 1802 if (ret)
1810 if (!sbinfo->free_inodes) { 1803 goto out;
1811 spin_unlock(&sbinfo->stat_lock);
1812 return -ENOSPC;
1813 }
1814 sbinfo->free_inodes--;
1815 spin_unlock(&sbinfo->stat_lock);
1816 }
1817 1804
1818 dir->i_size += BOGO_DIRENT_SIZE; 1805 dir->i_size += BOGO_DIRENT_SIZE;
1819 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1806 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -1821,21 +1808,16 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
1821 atomic_inc(&inode->i_count); /* New dentry reference */ 1808 atomic_inc(&inode->i_count); /* New dentry reference */
1822 dget(dentry); /* Extra pinning count for the created dentry */ 1809 dget(dentry); /* Extra pinning count for the created dentry */
1823 d_instantiate(dentry, inode); 1810 d_instantiate(dentry, inode);
1824 return 0; 1811out:
1812 return ret;
1825} 1813}
1826 1814
1827static int shmem_unlink(struct inode *dir, struct dentry *dentry) 1815static int shmem_unlink(struct inode *dir, struct dentry *dentry)
1828{ 1816{
1829 struct inode *inode = dentry->d_inode; 1817 struct inode *inode = dentry->d_inode;
1830 1818
1831 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) { 1819 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
1832 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1820 shmem_free_inode(inode->i_sb);
1833 if (sbinfo->max_inodes) {
1834 spin_lock(&sbinfo->stat_lock);
1835 sbinfo->free_inodes++;
1836 spin_unlock(&sbinfo->stat_lock);
1837 }
1838 }
1839 1821
1840 dir->i_size -= BOGO_DIRENT_SIZE; 1822 dir->i_size -= BOGO_DIRENT_SIZE;
1841 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1823 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -1924,6 +1906,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1924 iput(inode); 1906 iput(inode);
1925 return error; 1907 return error;
1926 } 1908 }
1909 unlock_page(page);
1927 inode->i_op = &shmem_symlink_inode_operations; 1910 inode->i_op = &shmem_symlink_inode_operations;
1928 kaddr = kmap_atomic(page, KM_USER0); 1911 kaddr = kmap_atomic(page, KM_USER0);
1929 memcpy(kaddr, symname, len); 1912 memcpy(kaddr, symname, len);
@@ -1951,6 +1934,8 @@ static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
1951 struct page *page = NULL; 1934 struct page *page = NULL;
1952 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 1935 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
1953 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); 1936 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
1937 if (page)
1938 unlock_page(page);
1954 return page; 1939 return page;
1955} 1940}
1956 1941
@@ -1996,8 +1981,7 @@ static int shmem_xattr_security_get(struct inode *inode, const char *name,
1996{ 1981{
1997 if (strcmp(name, "") == 0) 1982 if (strcmp(name, "") == 0)
1998 return -EINVAL; 1983 return -EINVAL;
1999 return security_inode_getsecurity(inode, name, buffer, size, 1984 return xattr_getsecurity(inode, name, buffer, size);
2000 -EOPNOTSUPP);
2001} 1985}
2002 1986
2003static int shmem_xattr_security_set(struct inode *inode, const char *name, 1987static int shmem_xattr_security_set(struct inode *inode, const char *name,
@@ -2138,7 +2122,7 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
2138 } 2122 }
2139 if (*rest) 2123 if (*rest)
2140 goto bad_val; 2124 goto bad_val;
2141 *blocks = size >> PAGE_CACHE_SHIFT; 2125 *blocks = DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
2142 } else if (!strcmp(this_char,"nr_blocks")) { 2126 } else if (!strcmp(this_char,"nr_blocks")) {
2143 *blocks = memparse(value,&rest); 2127 *blocks = memparse(value,&rest);
2144 if (*rest) 2128 if (*rest)
@@ -2375,7 +2359,8 @@ static const struct file_operations shmem_file_operations = {
2375#ifdef CONFIG_TMPFS 2359#ifdef CONFIG_TMPFS
2376 .llseek = generic_file_llseek, 2360 .llseek = generic_file_llseek,
2377 .read = shmem_file_read, 2361 .read = shmem_file_read,
2378 .write = shmem_file_write, 2362 .write = do_sync_write,
2363 .aio_write = generic_file_aio_write,
2379 .fsync = simple_sync_file, 2364 .fsync = simple_sync_file,
2380 .splice_read = generic_file_splice_read, 2365 .splice_read = generic_file_splice_read,
2381 .splice_write = generic_file_splice_write, 2366 .splice_write = generic_file_splice_write,
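[Editor's note] Both shmem_getpage() and the reworked shmem_unuse_inode() above follow the same allocation discipline: preload the radix tree while the code may still sleep, then perform the actual add_to_page_cache() with GFP_NOWAIT under info->lock. The general shape — allocate before taking the lock, only link under the lock, discard the preallocation if it goes unused — in a self-contained userspace sketch (illustrative names only, not the radix-tree API):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	int key;
	struct node *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *list_head;

/* Insert key unless it already exists; never allocates while holding the lock. */
static int insert_preloaded(int key)
{
	/* "preload" while we can still sleep and fail gracefully */
	struct node *new = malloc(sizeof(*new));
	struct node *n;
	int err = 0;

	if (!new)
		return -1;			/* -ENOMEM analogue */
	new->key = key;

	pthread_mutex_lock(&list_lock);		/* spin_lock(&info->lock) analogue */
	for (n = list_head; n; n = n->next)
		if (n->key == key) {
			err = 1;		/* -EEXIST analogue */
			break;
		}
	if (!err) {
		new->next = list_head;		/* link only, no allocation here */
		list_head = new;
	}
	pthread_mutex_unlock(&list_lock);

	if (err)
		free(new);			/* preload unused: give it back */
	return err;
}

int main(void)
{
	printf("%d %d\n", insert_preloaded(42), insert_preloaded(42));
	return 0;	/* prints "0 1": first insert succeeds, duplicate is rejected */
}

The -EEXIST branch in shmem_unuse_inode() plays the same role as the duplicate check here: finding the entry already present is treated as a benign race, not a failure, and the precharged resources are simply returned.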
diff --git a/mm/slob.c b/mm/slob.c
index 773a7aa80ab5..e2c3c0ec5463 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -12,10 +12,17 @@
12 * allocator is as little as 2 bytes, however typically most architectures 12 * allocator is as little as 2 bytes, however typically most architectures
13 * will require 4 bytes on 32-bit and 8 bytes on 64-bit. 13 * will require 4 bytes on 32-bit and 8 bytes on 64-bit.
14 * 14 *
15 * The slob heap is a linked list of pages from alloc_pages(), and 15 * The slob heap is a set of linked list of pages from alloc_pages(),
16 * within each page, there is a singly-linked list of free blocks (slob_t). 16 * and within each page, there is a singly-linked list of free blocks
17 * The heap is grown on demand and allocation from the heap is currently 17 * (slob_t). The heap is grown on demand. To reduce fragmentation,
18 * first-fit. 18 * heap pages are segregated into three lists, with objects less than
19 * 256 bytes, objects less than 1024 bytes, and all other objects.
20 *
21 * Allocation from heap involves first searching for a page with
22 * sufficient free blocks (using a next-fit-like approach) followed by
23 * a first-fit scan of the page. Deallocation inserts objects back
24 * into the free list in address order, so this is effectively an
25 * address-ordered first fit.
19 * 26 *
20 * Above this is an implementation of kmalloc/kfree. Blocks returned 27 * Above this is an implementation of kmalloc/kfree. Blocks returned
21 * from kmalloc are prepended with a 4-byte header with the kmalloc size. 28 * from kmalloc are prepended with a 4-byte header with the kmalloc size.
@@ -110,9 +117,13 @@ static inline void free_slob_page(struct slob_page *sp)
110} 117}
111 118
112/* 119/*
113 * All (partially) free slob pages go on this list. 120 * All partially free slob pages go on these lists.
114 */ 121 */
115static LIST_HEAD(free_slob_pages); 122#define SLOB_BREAK1 256
123#define SLOB_BREAK2 1024
124static LIST_HEAD(free_slob_small);
125static LIST_HEAD(free_slob_medium);
126static LIST_HEAD(free_slob_large);
116 127
117/* 128/*
118 * slob_page: True for all slob pages (false for bigblock pages) 129 * slob_page: True for all slob pages (false for bigblock pages)
@@ -140,9 +151,9 @@ static inline int slob_page_free(struct slob_page *sp)
140 return test_bit(PG_private, &sp->flags); 151 return test_bit(PG_private, &sp->flags);
141} 152}
142 153
143static inline void set_slob_page_free(struct slob_page *sp) 154static void set_slob_page_free(struct slob_page *sp, struct list_head *list)
144{ 155{
145 list_add(&sp->list, &free_slob_pages); 156 list_add(&sp->list, list);
146 __set_bit(PG_private, &sp->flags); 157 __set_bit(PG_private, &sp->flags);
147} 158}
148 159
@@ -294,12 +305,20 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
294{ 305{
295 struct slob_page *sp; 306 struct slob_page *sp;
296 struct list_head *prev; 307 struct list_head *prev;
308 struct list_head *slob_list;
297 slob_t *b = NULL; 309 slob_t *b = NULL;
298 unsigned long flags; 310 unsigned long flags;
299 311
312 if (size < SLOB_BREAK1)
313 slob_list = &free_slob_small;
314 else if (size < SLOB_BREAK2)
315 slob_list = &free_slob_medium;
316 else
317 slob_list = &free_slob_large;
318
300 spin_lock_irqsave(&slob_lock, flags); 319 spin_lock_irqsave(&slob_lock, flags);
301 /* Iterate through each partially free page, try to find room */ 320 /* Iterate through each partially free page, try to find room */
302 list_for_each_entry(sp, &free_slob_pages, list) { 321 list_for_each_entry(sp, slob_list, list) {
303#ifdef CONFIG_NUMA 322#ifdef CONFIG_NUMA
304 /* 323 /*
305 * If there's a node specification, search for a partial 324 * If there's a node specification, search for a partial
@@ -321,9 +340,9 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
321 /* Improve fragment distribution and reduce our average 340 /* Improve fragment distribution and reduce our average
322 * search time by starting our next search here. (see 341 * search time by starting our next search here. (see
323 * Knuth vol 1, sec 2.5, pg 449) */ 342 * Knuth vol 1, sec 2.5, pg 449) */
324 if (prev != free_slob_pages.prev && 343 if (prev != slob_list->prev &&
325 free_slob_pages.next != prev->next) 344 slob_list->next != prev->next)
326 list_move_tail(&free_slob_pages, prev->next); 345 list_move_tail(slob_list, prev->next);
327 break; 346 break;
328 } 347 }
329 spin_unlock_irqrestore(&slob_lock, flags); 348 spin_unlock_irqrestore(&slob_lock, flags);
@@ -341,7 +360,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
341 sp->free = b; 360 sp->free = b;
342 INIT_LIST_HEAD(&sp->list); 361 INIT_LIST_HEAD(&sp->list);
343 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); 362 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
344 set_slob_page_free(sp); 363 set_slob_page_free(sp, slob_list);
345 b = slob_page_alloc(sp, size, align); 364 b = slob_page_alloc(sp, size, align);
346 BUG_ON(!b); 365 BUG_ON(!b);
347 spin_unlock_irqrestore(&slob_lock, flags); 366 spin_unlock_irqrestore(&slob_lock, flags);
@@ -387,7 +406,7 @@ static void slob_free(void *block, int size)
387 set_slob(b, units, 406 set_slob(b, units,
388 (void *)((unsigned long)(b + 407 (void *)((unsigned long)(b +
389 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); 408 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
390 set_slob_page_free(sp); 409 set_slob_page_free(sp, &free_slob_small);
391 goto out; 410 goto out;
392 } 411 }
393 412
@@ -398,6 +417,10 @@ static void slob_free(void *block, int size)
398 sp->units += units; 417 sp->units += units;
399 418
400 if (b < sp->free) { 419 if (b < sp->free) {
420 if (b + units == sp->free) {
421 units += slob_units(sp->free);
422 sp->free = slob_next(sp->free);
423 }
401 set_slob(b, units, sp->free); 424 set_slob(b, units, sp->free);
402 sp->free = b; 425 sp->free = b;
403 } else { 426 } else {
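
The four lines added to slob_free() above merge a freed block with the free block that immediately follows it. A simplified model of that forward merge, with sizes in abstract units rather than the kernel's slob_t encoding:

#include <stddef.h>

/* Toy model: if a freed block ends exactly where the next free block
 * starts, combine them into one larger free block instead of leaving two
 * adjacent fragments on the page. */
struct toy_block { size_t start; size_t units; };

static int toy_try_merge(struct toy_block *freed, const struct toy_block *next_free)
{
	if (freed->start + freed->units == next_free->start) {
		freed->units += next_free->units;	/* one contiguous free block */
		return 1;				/* caller drops next_free */
	}
	return 0;
}
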
diff --git a/mm/slub.c b/mm/slub.c
index 5cc4b7dddb50..3f056677fa8f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -247,7 +247,10 @@ static void sysfs_slab_remove(struct kmem_cache *);
247static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 247static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
248static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 248static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
249 { return 0; } 249 { return 0; }
250static inline void sysfs_slab_remove(struct kmem_cache *s) {} 250static inline void sysfs_slab_remove(struct kmem_cache *s)
251{
252 kfree(s);
253}
251#endif 254#endif
252 255
253/******************************************************************** 256/********************************************************************
@@ -354,22 +357,22 @@ static void print_section(char *text, u8 *addr, unsigned int length)
354 printk(KERN_ERR "%8s 0x%p: ", text, addr + i); 357 printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
355 newline = 0; 358 newline = 0;
356 } 359 }
357 printk(" %02x", addr[i]); 360 printk(KERN_CONT " %02x", addr[i]);
358 offset = i % 16; 361 offset = i % 16;
359 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; 362 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
360 if (offset == 15) { 363 if (offset == 15) {
361 printk(" %s\n",ascii); 364 printk(KERN_CONT " %s\n", ascii);
362 newline = 1; 365 newline = 1;
363 } 366 }
364 } 367 }
365 if (!newline) { 368 if (!newline) {
366 i %= 16; 369 i %= 16;
367 while (i < 16) { 370 while (i < 16) {
368 printk(" "); 371 printk(KERN_CONT " ");
369 ascii[i] = ' '; 372 ascii[i] = ' ';
370 i++; 373 i++;
371 } 374 }
372 printk(" %s\n", ascii); 375 printk(KERN_CONT " %s\n", ascii);
373 } 376 }
374} 377}
375 378
@@ -529,7 +532,7 @@ static void init_object(struct kmem_cache *s, void *object, int active)
529 532
530 if (s->flags & __OBJECT_POISON) { 533 if (s->flags & __OBJECT_POISON) {
531 memset(p, POISON_FREE, s->objsize - 1); 534 memset(p, POISON_FREE, s->objsize - 1);
532 p[s->objsize -1] = POISON_END; 535 p[s->objsize - 1] = POISON_END;
533 } 536 }
534 537
535 if (s->flags & SLAB_RED_ZONE) 538 if (s->flags & SLAB_RED_ZONE)
@@ -558,7 +561,7 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
558 561
559static int check_bytes_and_report(struct kmem_cache *s, struct page *page, 562static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
560 u8 *object, char *what, 563 u8 *object, char *what,
561 u8* start, unsigned int value, unsigned int bytes) 564 u8 *start, unsigned int value, unsigned int bytes)
562{ 565{
563 u8 *fault; 566 u8 *fault;
564 u8 *end; 567 u8 *end;
@@ -692,7 +695,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
692 (!check_bytes_and_report(s, page, p, "Poison", p, 695 (!check_bytes_and_report(s, page, p, "Poison", p,
693 POISON_FREE, s->objsize - 1) || 696 POISON_FREE, s->objsize - 1) ||
694 !check_bytes_and_report(s, page, p, "Poison", 697 !check_bytes_and_report(s, page, p, "Poison",
695 p + s->objsize -1, POISON_END, 1))) 698 p + s->objsize - 1, POISON_END, 1)))
696 return 0; 699 return 0;
697 /* 700 /*
698 * check_pad_bytes cleans up on its own. 701 * check_pad_bytes cleans up on its own.
@@ -900,8 +903,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
900 "SLUB <none>: no slab for object 0x%p.\n", 903 "SLUB <none>: no slab for object 0x%p.\n",
901 object); 904 object);
902 dump_stack(); 905 dump_stack();
903 } 906 } else
904 else
905 object_err(s, page, object, 907 object_err(s, page, object,
906 "page slab pointer corrupt."); 908 "page slab pointer corrupt.");
907 goto fail; 909 goto fail;
@@ -947,7 +949,7 @@ static int __init setup_slub_debug(char *str)
947 /* 949 /*
948 * Determine which debug features should be switched on 950 * Determine which debug features should be switched on
949 */ 951 */
950 for ( ;*str && *str != ','; str++) { 952 for (; *str && *str != ','; str++) {
951 switch (tolower(*str)) { 953 switch (tolower(*str)) {
952 case 'f': 954 case 'f':
953 slub_debug |= SLAB_DEBUG_FREE; 955 slub_debug |= SLAB_DEBUG_FREE;
@@ -966,7 +968,7 @@ static int __init setup_slub_debug(char *str)
966 break; 968 break;
967 default: 969 default:
968 printk(KERN_ERR "slub_debug option '%c' " 970 printk(KERN_ERR "slub_debug option '%c' "
969 "unknown. skipped\n",*str); 971 "unknown. skipped\n", *str);
970 } 972 }
971 } 973 }
972 974
@@ -1039,7 +1041,7 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize,
1039 */ 1041 */
1040static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 1042static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1041{ 1043{
1042 struct page * page; 1044 struct page *page;
1043 int pages = 1 << s->order; 1045 int pages = 1 << s->order;
1044 1046
1045 if (s->order) 1047 if (s->order)
@@ -1135,7 +1137,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1135 mod_zone_page_state(page_zone(page), 1137 mod_zone_page_state(page_zone(page),
1136 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1138 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1137 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1139 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1138 - pages); 1140 -pages);
1139 1141
1140 __free_pages(page, s->order); 1142 __free_pages(page, s->order);
1141} 1143}
@@ -1195,19 +1197,15 @@ static __always_inline int slab_trylock(struct page *page)
1195/* 1197/*
1196 * Management of partially allocated slabs 1198 * Management of partially allocated slabs
1197 */ 1199 */
1198static void add_partial_tail(struct kmem_cache_node *n, struct page *page) 1200static void add_partial(struct kmem_cache_node *n,
1199{ 1201 struct page *page, int tail)
1200 spin_lock(&n->list_lock);
1201 n->nr_partial++;
1202 list_add_tail(&page->lru, &n->partial);
1203 spin_unlock(&n->list_lock);
1204}
1205
1206static void add_partial(struct kmem_cache_node *n, struct page *page)
1207{ 1202{
1208 spin_lock(&n->list_lock); 1203 spin_lock(&n->list_lock);
1209 n->nr_partial++; 1204 n->nr_partial++;
1210 list_add(&page->lru, &n->partial); 1205 if (tail)
1206 list_add_tail(&page->lru, &n->partial);
1207 else
1208 list_add(&page->lru, &n->partial);
1211 spin_unlock(&n->list_lock); 1209 spin_unlock(&n->list_lock);
1212} 1210}
1213 1211
@@ -1292,7 +1290,8 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1292 * expensive if we do it every time we are trying to find a slab 1290 * expensive if we do it every time we are trying to find a slab
1293 * with available objects. 1291 * with available objects.
1294 */ 1292 */
1295 if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) 1293 if (!s->remote_node_defrag_ratio ||
1294 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1296 return NULL; 1295 return NULL;
1297 1296
1298 zonelist = &NODE_DATA(slab_node(current->mempolicy)) 1297 zonelist = &NODE_DATA(slab_node(current->mempolicy))
@@ -1335,7 +1334,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1335 * 1334 *
1336 * On exit the slab lock will have been dropped. 1335 * On exit the slab lock will have been dropped.
1337 */ 1336 */
1338static void unfreeze_slab(struct kmem_cache *s, struct page *page) 1337static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1339{ 1338{
1340 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1339 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1341 1340
@@ -1343,7 +1342,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page)
1343 if (page->inuse) { 1342 if (page->inuse) {
1344 1343
1345 if (page->freelist) 1344 if (page->freelist)
1346 add_partial(n, page); 1345 add_partial(n, page, tail);
1347 else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) 1346 else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER))
1348 add_full(n, page); 1347 add_full(n, page);
1349 slab_unlock(page); 1348 slab_unlock(page);
@@ -1358,7 +1357,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page)
1358 * partial list stays small. kmem_cache_shrink can 1357 * partial list stays small. kmem_cache_shrink can
1359 * reclaim empty slabs from the partial list. 1358 * reclaim empty slabs from the partial list.
1360 */ 1359 */
1361 add_partial_tail(n, page); 1360 add_partial(n, page, 1);
1362 slab_unlock(page); 1361 slab_unlock(page);
1363 } else { 1362 } else {
1364 slab_unlock(page); 1363 slab_unlock(page);
@@ -1373,6 +1372,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page)
1373static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1372static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1374{ 1373{
1375 struct page *page = c->page; 1374 struct page *page = c->page;
1375 int tail = 1;
1376 /* 1376 /*
1377 * Merge cpu freelist into freelist. Typically we get here 1377 * Merge cpu freelist into freelist. Typically we get here
1378 * because both freelists are empty. So this is unlikely 1378 * because both freelists are empty. So this is unlikely
@@ -1381,6 +1381,8 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1381 while (unlikely(c->freelist)) { 1381 while (unlikely(c->freelist)) {
1382 void **object; 1382 void **object;
1383 1383
1384 tail = 0; /* Hot objects. Put the slab first */
1385
1384 /* Retrieve object from cpu_freelist */ 1386 /* Retrieve object from cpu_freelist */
1385 object = c->freelist; 1387 object = c->freelist;
1386 c->freelist = c->freelist[c->offset]; 1388 c->freelist = c->freelist[c->offset];
@@ -1391,7 +1393,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1391 page->inuse--; 1393 page->inuse--;
1392 } 1394 }
1393 c->page = NULL; 1395 c->page = NULL;
1394 unfreeze_slab(s, page); 1396 unfreeze_slab(s, page, tail);
1395} 1397}
1396 1398
1397static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1399static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
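
Taken together, the add_partial()/unfreeze_slab()/deactivate_slab() changes above encode one heuristic: a deactivated slab that still had objects on its cpu freelist is presumed cache-hot and queued at the head of the partial list, while a fully drained slab goes to the tail. A toy sketch of that placement using a plain sentinel-headed circular list (not the kernel's struct list_head):

/* head is a circular sentinel: when empty, head->next == head->prev == head. */
struct toy_slab { struct toy_slab *prev, *next; };

static void toy_add_partial(struct toy_slab *head, struct toy_slab *sp, int tail)
{
	struct toy_slab *pos = tail ? head->prev : head;	/* last node or sentinel */

	/* Insert sp right after pos: after the sentinel = list head (hot slab),
	 * after the last node = list tail (cold slab). */
	sp->next = pos->next;
	sp->prev = pos;
	pos->next->prev = sp;
	pos->next = sp;
}
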
@@ -1539,7 +1541,7 @@ debug:
1539 * 1541 *
1540 * Otherwise we can simply pick the next object from the lockless free list. 1542 * Otherwise we can simply pick the next object from the lockless free list.
1541 */ 1543 */
1542static void __always_inline *slab_alloc(struct kmem_cache *s, 1544static __always_inline void *slab_alloc(struct kmem_cache *s,
1543 gfp_t gfpflags, int node, void *addr) 1545 gfp_t gfpflags, int node, void *addr)
1544{ 1546{
1545 void **object; 1547 void **object;
@@ -1613,7 +1615,7 @@ checks_ok:
1613 * then add it. 1615 * then add it.
1614 */ 1616 */
1615 if (unlikely(!prior)) 1617 if (unlikely(!prior))
1616 add_partial_tail(get_node(s, page_to_nid(page)), page); 1618 add_partial(get_node(s, page_to_nid(page)), page, 1);
1617 1619
1618out_unlock: 1620out_unlock:
1619 slab_unlock(page); 1621 slab_unlock(page);
@@ -1647,7 +1649,7 @@ debug:
1647 * If fastpath is not possible then fall back to __slab_free where we deal 1649 * If fastpath is not possible then fall back to __slab_free where we deal
1648 * with all sorts of special processing. 1650 * with all sorts of special processing.
1649 */ 1651 */
1650static void __always_inline slab_free(struct kmem_cache *s, 1652static __always_inline void slab_free(struct kmem_cache *s,
1651 struct page *page, void *x, void *addr) 1653 struct page *page, void *x, void *addr)
1652{ 1654{
1653 void **object = (void *)x; 1655 void **object = (void *)x;
@@ -1997,6 +1999,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
1997{ 1999{
1998 struct page *page; 2000 struct page *page;
1999 struct kmem_cache_node *n; 2001 struct kmem_cache_node *n;
2002 unsigned long flags;
2000 2003
2001 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); 2004 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
2002 2005
@@ -2021,7 +2024,14 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
2021#endif 2024#endif
2022 init_kmem_cache_node(n); 2025 init_kmem_cache_node(n);
2023 atomic_long_inc(&n->nr_slabs); 2026 atomic_long_inc(&n->nr_slabs);
2024 add_partial(n, page); 2027 /*
2028 * lockdep requires consistent irq usage for each lock
2029 * so even though there cannot be a race this early in
2030 * the boot sequence, we still disable irqs.
2031 */
2032 local_irq_save(flags);
2033 add_partial(n, page, 0);
2034 local_irq_restore(flags);
2025 return n; 2035 return n;
2026} 2036}
2027 2037
@@ -2206,7 +2216,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2206 2216
2207 s->refcount = 1; 2217 s->refcount = 1;
2208#ifdef CONFIG_NUMA 2218#ifdef CONFIG_NUMA
2209 s->defrag_ratio = 100; 2219 s->remote_node_defrag_ratio = 100;
2210#endif 2220#endif
2211 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 2221 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
2212 goto error; 2222 goto error;
@@ -2228,7 +2238,7 @@ error:
2228 */ 2238 */
2229int kmem_ptr_validate(struct kmem_cache *s, const void *object) 2239int kmem_ptr_validate(struct kmem_cache *s, const void *object)
2230{ 2240{
2231 struct page * page; 2241 struct page *page;
2232 2242
2233 page = get_object_page(object); 2243 page = get_object_page(object);
2234 2244
@@ -2322,7 +2332,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
2322 if (kmem_cache_close(s)) 2332 if (kmem_cache_close(s))
2323 WARN_ON(1); 2333 WARN_ON(1);
2324 sysfs_slab_remove(s); 2334 sysfs_slab_remove(s);
2325 kfree(s);
2326 } else 2335 } else
2327 up_write(&slub_lock); 2336 up_write(&slub_lock);
2328} 2337}
@@ -2341,7 +2350,7 @@ static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT];
2341 2350
2342static int __init setup_slub_min_order(char *str) 2351static int __init setup_slub_min_order(char *str)
2343{ 2352{
2344 get_option (&str, &slub_min_order); 2353 get_option(&str, &slub_min_order);
2345 2354
2346 return 1; 2355 return 1;
2347} 2356}
@@ -2350,7 +2359,7 @@ __setup("slub_min_order=", setup_slub_min_order);
2350 2359
2351static int __init setup_slub_max_order(char *str) 2360static int __init setup_slub_max_order(char *str)
2352{ 2361{
2353 get_option (&str, &slub_max_order); 2362 get_option(&str, &slub_max_order);
2354 2363
2355 return 1; 2364 return 1;
2356} 2365}
@@ -2359,7 +2368,7 @@ __setup("slub_max_order=", setup_slub_max_order);
2359 2368
2360static int __init setup_slub_min_objects(char *str) 2369static int __init setup_slub_min_objects(char *str)
2361{ 2370{
2362 get_option (&str, &slub_min_objects); 2371 get_option(&str, &slub_min_objects);
2363 2372
2364 return 1; 2373 return 1;
2365} 2374}
@@ -2605,6 +2614,19 @@ void kfree(const void *x)
2605} 2614}
2606EXPORT_SYMBOL(kfree); 2615EXPORT_SYMBOL(kfree);
2607 2616
2617static unsigned long count_partial(struct kmem_cache_node *n)
2618{
2619 unsigned long flags;
2620 unsigned long x = 0;
2621 struct page *page;
2622
2623 spin_lock_irqsave(&n->list_lock, flags);
2624 list_for_each_entry(page, &n->partial, lru)
2625 x += page->inuse;
2626 spin_unlock_irqrestore(&n->list_lock, flags);
2627 return x;
2628}
2629
2608/* 2630/*
2609 * kmem_cache_shrink removes empty slabs from the partial lists and sorts 2631 * kmem_cache_shrink removes empty slabs from the partial lists and sorts
2610 * the remaining slabs by the number of items in use. The slabs with the 2632 * the remaining slabs by the number of items in use. The slabs with the
@@ -2931,7 +2953,7 @@ static struct kmem_cache *find_mergeable(size_t size,
2931 * Check if alignment is compatible. 2953 * Check if alignment is compatible.
2932 * Courtesy of Adrian Drzewiecki 2954 * Courtesy of Adrian Drzewiecki
2933 */ 2955 */
2934 if ((s->size & ~(align -1)) != s->size) 2956 if ((s->size & ~(align - 1)) != s->size)
2935 continue; 2957 continue;
2936 2958
2937 if (s->size - size >= sizeof(void *)) 2959 if (s->size - size >= sizeof(void *))
@@ -3040,8 +3062,9 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3040 return NOTIFY_OK; 3062 return NOTIFY_OK;
3041} 3063}
3042 3064
3043static struct notifier_block __cpuinitdata slab_notifier = 3065static struct notifier_block __cpuinitdata slab_notifier = {
3044 { &slab_cpuup_callback, NULL, 0 }; 3066 &slab_cpuup_callback, NULL, 0
3067};
3045 3068
3046#endif 3069#endif
3047 3070
@@ -3076,19 +3099,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3076 return slab_alloc(s, gfpflags, node, caller); 3099 return slab_alloc(s, gfpflags, node, caller);
3077} 3100}
3078 3101
3079static unsigned long count_partial(struct kmem_cache_node *n)
3080{
3081 unsigned long flags;
3082 unsigned long x = 0;
3083 struct page *page;
3084
3085 spin_lock_irqsave(&n->list_lock, flags);
3086 list_for_each_entry(page, &n->partial, lru)
3087 x += page->inuse;
3088 spin_unlock_irqrestore(&n->list_lock, flags);
3089 return x;
3090}
3091
3092#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) 3102#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
3093static int validate_slab(struct kmem_cache *s, struct page *page, 3103static int validate_slab(struct kmem_cache *s, struct page *page,
3094 unsigned long *map) 3104 unsigned long *map)
@@ -3390,7 +3400,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
3390static int list_locations(struct kmem_cache *s, char *buf, 3400static int list_locations(struct kmem_cache *s, char *buf,
3391 enum track_item alloc) 3401 enum track_item alloc)
3392{ 3402{
3393 int n = 0; 3403 int len = 0;
3394 unsigned long i; 3404 unsigned long i;
3395 struct loc_track t = { 0, 0, NULL }; 3405 struct loc_track t = { 0, 0, NULL };
3396 int node; 3406 int node;
@@ -3421,54 +3431,54 @@ static int list_locations(struct kmem_cache *s, char *buf,
3421 for (i = 0; i < t.count; i++) { 3431 for (i = 0; i < t.count; i++) {
3422 struct location *l = &t.loc[i]; 3432 struct location *l = &t.loc[i];
3423 3433
3424 if (n > PAGE_SIZE - 100) 3434 if (len > PAGE_SIZE - 100)
3425 break; 3435 break;
3426 n += sprintf(buf + n, "%7ld ", l->count); 3436 len += sprintf(buf + len, "%7ld ", l->count);
3427 3437
3428 if (l->addr) 3438 if (l->addr)
3429 n += sprint_symbol(buf + n, (unsigned long)l->addr); 3439 len += sprint_symbol(buf + len, (unsigned long)l->addr);
3430 else 3440 else
3431 n += sprintf(buf + n, "<not-available>"); 3441 len += sprintf(buf + len, "<not-available>");
3432 3442
3433 if (l->sum_time != l->min_time) { 3443 if (l->sum_time != l->min_time) {
3434 unsigned long remainder; 3444 unsigned long remainder;
3435 3445
3436 n += sprintf(buf + n, " age=%ld/%ld/%ld", 3446 len += sprintf(buf + len, " age=%ld/%ld/%ld",
3437 l->min_time, 3447 l->min_time,
3438 div_long_long_rem(l->sum_time, l->count, &remainder), 3448 div_long_long_rem(l->sum_time, l->count, &remainder),
3439 l->max_time); 3449 l->max_time);
3440 } else 3450 } else
3441 n += sprintf(buf + n, " age=%ld", 3451 len += sprintf(buf + len, " age=%ld",
3442 l->min_time); 3452 l->min_time);
3443 3453
3444 if (l->min_pid != l->max_pid) 3454 if (l->min_pid != l->max_pid)
3445 n += sprintf(buf + n, " pid=%ld-%ld", 3455 len += sprintf(buf + len, " pid=%ld-%ld",
3446 l->min_pid, l->max_pid); 3456 l->min_pid, l->max_pid);
3447 else 3457 else
3448 n += sprintf(buf + n, " pid=%ld", 3458 len += sprintf(buf + len, " pid=%ld",
3449 l->min_pid); 3459 l->min_pid);
3450 3460
3451 if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && 3461 if (num_online_cpus() > 1 && !cpus_empty(l->cpus) &&
3452 n < PAGE_SIZE - 60) { 3462 len < PAGE_SIZE - 60) {
3453 n += sprintf(buf + n, " cpus="); 3463 len += sprintf(buf + len, " cpus=");
3454 n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50, 3464 len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
3455 l->cpus); 3465 l->cpus);
3456 } 3466 }
3457 3467
3458 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && 3468 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) &&
3459 n < PAGE_SIZE - 60) { 3469 len < PAGE_SIZE - 60) {
3460 n += sprintf(buf + n, " nodes="); 3470 len += sprintf(buf + len, " nodes=");
3461 n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50, 3471 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50,
3462 l->nodes); 3472 l->nodes);
3463 } 3473 }
3464 3474
3465 n += sprintf(buf + n, "\n"); 3475 len += sprintf(buf + len, "\n");
3466 } 3476 }
3467 3477
3468 free_loc_track(&t); 3478 free_loc_track(&t);
3469 if (!t.count) 3479 if (!t.count)
3470 n += sprintf(buf, "No data\n"); 3480 len += sprintf(buf, "No data\n");
3471 return n; 3481 return len;
3472} 3482}
3473 3483
3474enum slab_stat_type { 3484enum slab_stat_type {
@@ -3498,7 +3508,6 @@ static unsigned long slab_objects(struct kmem_cache *s,
3498 3508
3499 for_each_possible_cpu(cpu) { 3509 for_each_possible_cpu(cpu) {
3500 struct page *page; 3510 struct page *page;
3501 int node;
3502 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 3511 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3503 3512
3504 if (!c) 3513 if (!c)
@@ -3510,8 +3519,6 @@ static unsigned long slab_objects(struct kmem_cache *s,
3510 continue; 3519 continue;
3511 if (page) { 3520 if (page) {
3512 if (flags & SO_CPU) { 3521 if (flags & SO_CPU) {
3513 int x = 0;
3514
3515 if (flags & SO_OBJECTS) 3522 if (flags & SO_OBJECTS)
3516 x = page->inuse; 3523 x = page->inuse;
3517 else 3524 else
@@ -3848,24 +3855,24 @@ static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
3848SLAB_ATTR_RO(free_calls); 3855SLAB_ATTR_RO(free_calls);
3849 3856
3850#ifdef CONFIG_NUMA 3857#ifdef CONFIG_NUMA
3851static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf) 3858static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
3852{ 3859{
3853 return sprintf(buf, "%d\n", s->defrag_ratio / 10); 3860 return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10);
3854} 3861}
3855 3862
3856static ssize_t defrag_ratio_store(struct kmem_cache *s, 3863static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
3857 const char *buf, size_t length) 3864 const char *buf, size_t length)
3858{ 3865{
3859 int n = simple_strtoul(buf, NULL, 10); 3866 int n = simple_strtoul(buf, NULL, 10);
3860 3867
3861 if (n < 100) 3868 if (n < 100)
3862 s->defrag_ratio = n * 10; 3869 s->remote_node_defrag_ratio = n * 10;
3863 return length; 3870 return length;
3864} 3871}
3865SLAB_ATTR(defrag_ratio); 3872SLAB_ATTR(remote_node_defrag_ratio);
3866#endif 3873#endif
3867 3874
3868static struct attribute * slab_attrs[] = { 3875static struct attribute *slab_attrs[] = {
3869 &slab_size_attr.attr, 3876 &slab_size_attr.attr,
3870 &object_size_attr.attr, 3877 &object_size_attr.attr,
3871 &objs_per_slab_attr.attr, 3878 &objs_per_slab_attr.attr,
@@ -3893,7 +3900,7 @@ static struct attribute * slab_attrs[] = {
3893 &cache_dma_attr.attr, 3900 &cache_dma_attr.attr,
3894#endif 3901#endif
3895#ifdef CONFIG_NUMA 3902#ifdef CONFIG_NUMA
3896 &defrag_ratio_attr.attr, 3903 &remote_node_defrag_ratio_attr.attr,
3897#endif 3904#endif
3898 NULL 3905 NULL
3899}; 3906};
@@ -3940,6 +3947,13 @@ static ssize_t slab_attr_store(struct kobject *kobj,
3940 return err; 3947 return err;
3941} 3948}
3942 3949
3950static void kmem_cache_release(struct kobject *kobj)
3951{
3952 struct kmem_cache *s = to_slab(kobj);
3953
3954 kfree(s);
3955}
3956
3943static struct sysfs_ops slab_sysfs_ops = { 3957static struct sysfs_ops slab_sysfs_ops = {
3944 .show = slab_attr_show, 3958 .show = slab_attr_show,
3945 .store = slab_attr_store, 3959 .store = slab_attr_store,
@@ -3947,6 +3961,7 @@ static struct sysfs_ops slab_sysfs_ops = {
3947 3961
3948static struct kobj_type slab_ktype = { 3962static struct kobj_type slab_ktype = {
3949 .sysfs_ops = &slab_sysfs_ops, 3963 .sysfs_ops = &slab_sysfs_ops,
3964 .release = kmem_cache_release
3950}; 3965};
3951 3966
3952static int uevent_filter(struct kset *kset, struct kobject *kobj) 3967static int uevent_filter(struct kset *kset, struct kobject *kobj)
@@ -4048,6 +4063,7 @@ static void sysfs_slab_remove(struct kmem_cache *s)
4048{ 4063{
4049 kobject_uevent(&s->kobj, KOBJ_REMOVE); 4064 kobject_uevent(&s->kobj, KOBJ_REMOVE);
4050 kobject_del(&s->kobj); 4065 kobject_del(&s->kobj);
4066 kobject_put(&s->kobj);
4051} 4067}
4052 4068
4053/* 4069/*
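
The net effect of the slub sysfs changes above: kmem_cache_destroy() no longer calls kfree(s) directly; sysfs_slab_remove() now drops the last kobject reference, and the new kmem_cache_release() callback frees the cache when that reference count reaches zero. A minimal userspace model of that handover, with hypothetical toy_* names:

#include <stdlib.h>

struct toy_cache {
	int refcount;				/* stands in for the kobject refcount */
	void (*release)(struct toy_cache *s);	/* stands in for kobj_type.release */
};

static void toy_release(struct toy_cache *s)
{
	free(s);				/* the kfree(s) now lives here */
}

static void toy_put(struct toy_cache *s)
{
	if (--s->refcount == 0)
		s->release(s);			/* last reference frees the object */
}
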
diff --git a/mm/sparse.c b/mm/sparse.c
index a2183cb5d524..f6a43c09c322 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -237,7 +237,7 @@ static unsigned long *__kmalloc_section_usemap(void)
237} 237}
238#endif /* CONFIG_MEMORY_HOTPLUG */ 238#endif /* CONFIG_MEMORY_HOTPLUG */
239 239
240static unsigned long *sparse_early_usemap_alloc(unsigned long pnum) 240static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
241{ 241{
242 unsigned long *usemap; 242 unsigned long *usemap;
243 struct mem_section *ms = __nr_to_section(pnum); 243 struct mem_section *ms = __nr_to_section(pnum);
@@ -353,17 +353,9 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
353 return __kmalloc_section_memmap(nr_pages); 353 return __kmalloc_section_memmap(nr_pages);
354} 354}
355 355
356static int vaddr_in_vmalloc_area(void *addr)
357{
358 if (addr >= (void *)VMALLOC_START &&
359 addr < (void *)VMALLOC_END)
360 return 1;
361 return 0;
362}
363
364static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) 356static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
365{ 357{
366 if (vaddr_in_vmalloc_area(memmap)) 358 if (is_vmalloc_addr(memmap))
367 vfree(memmap); 359 vfree(memmap);
368 else 360 else
369 free_pages((unsigned long)memmap, 361 free_pages((unsigned long)memmap,
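
For context, is_vmalloc_addr() is simply a range test against the vmalloc virtual window, which is why the open-coded helper above could be dropped. A userspace stand-in with placeholder bounds (the real VMALLOC_START/VMALLOC_END are per-architecture):

/* Placeholder bounds; the kernel uses the architecture's VMALLOC_START/END. */
#define TOY_VMALLOC_START 0xc0000000UL
#define TOY_VMALLOC_END   0xe0000000UL

static int toy_is_vmalloc_addr(const void *x)
{
	unsigned long addr = (unsigned long)x;

	return addr >= TOY_VMALLOC_START && addr < TOY_VMALLOC_END;
}
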
diff --git a/mm/swap.c b/mm/swap.c
index 9ac88323d237..710a20bb9749 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -29,6 +29,7 @@
29#include <linux/cpu.h> 29#include <linux/cpu.h>
30#include <linux/notifier.h> 30#include <linux/notifier.h>
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/memcontrol.h>
32 33
33/* How many pages do we try to swap or page in/out together? */ 34/* How many pages do we try to swap or page in/out together? */
34int page_cluster; 35int page_cluster;
@@ -41,7 +42,7 @@ static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, };
41 * This path almost never happens for VM activity - pages are normally 42 * This path almost never happens for VM activity - pages are normally
42 * freed via pagevecs. But it gets used by networking. 43 * freed via pagevecs. But it gets used by networking.
43 */ 44 */
44static void fastcall __page_cache_release(struct page *page) 45static void __page_cache_release(struct page *page)
45{ 46{
46 if (PageLRU(page)) { 47 if (PageLRU(page)) {
47 unsigned long flags; 48 unsigned long flags;
@@ -165,7 +166,7 @@ int rotate_reclaimable_page(struct page *page)
165/* 166/*
166 * FIXME: speed this up? 167 * FIXME: speed this up?
167 */ 168 */
168void fastcall activate_page(struct page *page) 169void activate_page(struct page *page)
169{ 170{
170 struct zone *zone = page_zone(page); 171 struct zone *zone = page_zone(page);
171 172
@@ -175,6 +176,7 @@ void fastcall activate_page(struct page *page)
175 SetPageActive(page); 176 SetPageActive(page);
176 add_page_to_active_list(zone, page); 177 add_page_to_active_list(zone, page);
177 __count_vm_event(PGACTIVATE); 178 __count_vm_event(PGACTIVATE);
179 mem_cgroup_move_lists(page_get_page_cgroup(page), true);
178 } 180 }
179 spin_unlock_irq(&zone->lru_lock); 181 spin_unlock_irq(&zone->lru_lock);
180} 182}
@@ -186,7 +188,7 @@ void fastcall activate_page(struct page *page)
186 * inactive,referenced -> active,unreferenced 188 * inactive,referenced -> active,unreferenced
187 * active,unreferenced -> active,referenced 189 * active,unreferenced -> active,referenced
188 */ 190 */
189void fastcall mark_page_accessed(struct page *page) 191void mark_page_accessed(struct page *page)
190{ 192{
191 if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { 193 if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
192 activate_page(page); 194 activate_page(page);
@@ -202,7 +204,7 @@ EXPORT_SYMBOL(mark_page_accessed);
202 * lru_cache_add: add a page to the page lists 204 * lru_cache_add: add a page to the page lists
203 * @page: the page to add 205 * @page: the page to add
204 */ 206 */
205void fastcall lru_cache_add(struct page *page) 207void lru_cache_add(struct page *page)
206{ 208{
207 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); 209 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
208 210
@@ -212,7 +214,7 @@ void fastcall lru_cache_add(struct page *page)
212 put_cpu_var(lru_add_pvecs); 214 put_cpu_var(lru_add_pvecs);
213} 215}
214 216
215void fastcall lru_cache_add_active(struct page *page) 217void lru_cache_add_active(struct page *page)
216{ 218{
217 struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); 219 struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
218 220
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b52635601dfe..ec42f01a8d02 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -10,6 +10,7 @@
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/kernel_stat.h> 11#include <linux/kernel_stat.h>
12#include <linux/swap.h> 12#include <linux/swap.h>
13#include <linux/swapops.h>
13#include <linux/init.h> 14#include <linux/init.h>
14#include <linux/pagemap.h> 15#include <linux/pagemap.h>
15#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
@@ -51,26 +52,22 @@ static struct {
51 unsigned long del_total; 52 unsigned long del_total;
52 unsigned long find_success; 53 unsigned long find_success;
53 unsigned long find_total; 54 unsigned long find_total;
54 unsigned long noent_race;
55 unsigned long exist_race;
56} swap_cache_info; 55} swap_cache_info;
57 56
58void show_swap_cache_info(void) 57void show_swap_cache_info(void)
59{ 58{
60 printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n", 59 printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n",
61 swap_cache_info.add_total, swap_cache_info.del_total, 60 swap_cache_info.add_total, swap_cache_info.del_total,
62 swap_cache_info.find_success, swap_cache_info.find_total, 61 swap_cache_info.find_success, swap_cache_info.find_total);
63 swap_cache_info.noent_race, swap_cache_info.exist_race);
64 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); 62 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
65 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); 63 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
66} 64}
67 65
68/* 66/*
69 * __add_to_swap_cache resembles add_to_page_cache on swapper_space, 67 * add_to_swap_cache resembles add_to_page_cache on swapper_space,
70 * but sets SwapCache flag and private instead of mapping and index. 68 * but sets SwapCache flag and private instead of mapping and index.
71 */ 69 */
72static int __add_to_swap_cache(struct page *page, swp_entry_t entry, 70int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
73 gfp_t gfp_mask)
74{ 71{
75 int error; 72 int error;
76 73
@@ -88,6 +85,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
88 set_page_private(page, entry.val); 85 set_page_private(page, entry.val);
89 total_swapcache_pages++; 86 total_swapcache_pages++;
90 __inc_zone_page_state(page, NR_FILE_PAGES); 87 __inc_zone_page_state(page, NR_FILE_PAGES);
88 INC_CACHE_INFO(add_total);
91 } 89 }
92 write_unlock_irq(&swapper_space.tree_lock); 90 write_unlock_irq(&swapper_space.tree_lock);
93 radix_tree_preload_end(); 91 radix_tree_preload_end();
@@ -95,31 +93,6 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
95 return error; 93 return error;
96} 94}
97 95
98static int add_to_swap_cache(struct page *page, swp_entry_t entry)
99{
100 int error;
101
102 BUG_ON(PageLocked(page));
103 if (!swap_duplicate(entry)) {
104 INC_CACHE_INFO(noent_race);
105 return -ENOENT;
106 }
107 SetPageLocked(page);
108 error = __add_to_swap_cache(page, entry, GFP_KERNEL);
109 /*
110 * Anon pages are already on the LRU, we don't run lru_cache_add here.
111 */
112 if (error) {
113 ClearPageLocked(page);
114 swap_free(entry);
115 if (error == -EEXIST)
116 INC_CACHE_INFO(exist_race);
117 return error;
118 }
119 INC_CACHE_INFO(add_total);
120 return 0;
121}
122
123/* 96/*
124 * This must be called only on pages that have 97 * This must be called only on pages that have
125 * been verified to be in the swap cache. 98 * been verified to be in the swap cache.
@@ -152,6 +125,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
152 int err; 125 int err;
153 126
154 BUG_ON(!PageLocked(page)); 127 BUG_ON(!PageLocked(page));
128 BUG_ON(!PageUptodate(page));
155 129
156 for (;;) { 130 for (;;) {
157 entry = get_swap_page(); 131 entry = get_swap_page();
@@ -169,18 +143,15 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
169 /* 143 /*
170 * Add it to the swap cache and mark it dirty 144 * Add it to the swap cache and mark it dirty
171 */ 145 */
172 err = __add_to_swap_cache(page, entry, 146 err = add_to_swap_cache(page, entry,
173 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); 147 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
174 148
175 switch (err) { 149 switch (err) {
176 case 0: /* Success */ 150 case 0: /* Success */
177 SetPageUptodate(page);
178 SetPageDirty(page); 151 SetPageDirty(page);
179 INC_CACHE_INFO(add_total);
180 return 1; 152 return 1;
181 case -EEXIST: 153 case -EEXIST:
182 /* Raced with "speculative" read_swap_cache_async */ 154 /* Raced with "speculative" read_swap_cache_async */
183 INC_CACHE_INFO(exist_race);
184 swap_free(entry); 155 swap_free(entry);
185 continue; 156 continue;
186 default: 157 default:
@@ -211,40 +182,6 @@ void delete_from_swap_cache(struct page *page)
211 page_cache_release(page); 182 page_cache_release(page);
212} 183}
213 184
214/*
215 * Strange swizzling function only for use by shmem_writepage
216 */
217int move_to_swap_cache(struct page *page, swp_entry_t entry)
218{
219 int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
220 if (!err) {
221 remove_from_page_cache(page);
222 page_cache_release(page); /* pagecache ref */
223 if (!swap_duplicate(entry))
224 BUG();
225 SetPageDirty(page);
226 INC_CACHE_INFO(add_total);
227 } else if (err == -EEXIST)
228 INC_CACHE_INFO(exist_race);
229 return err;
230}
231
232/*
233 * Strange swizzling function for shmem_getpage (and shmem_unuse)
234 */
235int move_from_swap_cache(struct page *page, unsigned long index,
236 struct address_space *mapping)
237{
238 int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
239 if (!err) {
240 delete_from_swap_cache(page);
241 /* shift page from clean_pages to dirty_pages list */
242 ClearPageDirty(page);
243 set_page_dirty(page);
244 }
245 return err;
246}
247
248/* 185/*
249 * If we are the only user, then try to free up the swap cache. 186 * If we are the only user, then try to free up the swap cache.
250 * 187 *
@@ -317,7 +254,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
317 * A failure return means that either the page allocation failed or that 254 * A failure return means that either the page allocation failed or that
318 * the swap entry is no longer in use. 255 * the swap entry is no longer in use.
319 */ 256 */
320struct page *read_swap_cache_async(swp_entry_t entry, 257struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
321 struct vm_area_struct *vma, unsigned long addr) 258 struct vm_area_struct *vma, unsigned long addr)
322{ 259{
323 struct page *found_page, *new_page = NULL; 260 struct page *found_page, *new_page = NULL;
@@ -337,23 +274,27 @@ struct page *read_swap_cache_async(swp_entry_t entry,
337 * Get a new page to read into from swap. 274 * Get a new page to read into from swap.
338 */ 275 */
339 if (!new_page) { 276 if (!new_page) {
340 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, 277 new_page = alloc_page_vma(gfp_mask, vma, addr);
341 vma, addr);
342 if (!new_page) 278 if (!new_page)
343 break; /* Out of memory */ 279 break; /* Out of memory */
344 } 280 }
345 281
346 /* 282 /*
283 * Swap entry may have been freed since our caller observed it.
284 */
285 if (!swap_duplicate(entry))
286 break;
287
288 /*
347 * Associate the page with swap entry in the swap cache. 289 * Associate the page with swap entry in the swap cache.
348 * May fail (-ENOENT) if swap entry has been freed since 290 * May fail (-EEXIST) if there is already a page associated
349 * our caller observed it. May fail (-EEXIST) if there 291 * with this entry in the swap cache: added by a racing
350 * is already a page associated with this entry in the 292 * read_swap_cache_async, or add_to_swap or shmem_writepage
351 * swap cache: added by a racing read_swap_cache_async, 293 * re-using the just freed swap entry for an existing page.
352 * or by try_to_swap_out (or shmem_writepage) re-using
353 * the just freed swap entry for an existing page.
354 * May fail (-ENOMEM) if radix-tree node allocation failed. 294 * May fail (-ENOMEM) if radix-tree node allocation failed.
355 */ 295 */
356 err = add_to_swap_cache(new_page, entry); 296 SetPageLocked(new_page);
297 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
357 if (!err) { 298 if (!err) {
358 /* 299 /*
359 * Initiate read into locked page and return. 300 * Initiate read into locked page and return.
@@ -362,9 +303,57 @@ struct page *read_swap_cache_async(swp_entry_t entry,
362 swap_readpage(NULL, new_page); 303 swap_readpage(NULL, new_page);
363 return new_page; 304 return new_page;
364 } 305 }
365 } while (err != -ENOENT && err != -ENOMEM); 306 ClearPageLocked(new_page);
307 swap_free(entry);
308 } while (err != -ENOMEM);
366 309
367 if (new_page) 310 if (new_page)
368 page_cache_release(new_page); 311 page_cache_release(new_page);
369 return found_page; 312 return found_page;
370} 313}
314
315/**
316 * swapin_readahead - swap in pages in hope we need them soon
317 * @entry: swap entry of this memory
318 * @vma: user vma this address belongs to
319 * @addr: target address for mempolicy
320 *
321 * Returns the struct page for entry and addr, after queueing swapin.
322 *
323 * Primitive swap readahead code. We simply read an aligned block of
324 * (1 << page_cluster) entries in the swap area. This method is chosen
325 * because it doesn't cost us any seek time. We also make sure to queue
326 * the 'original' request together with the readahead ones...
327 *
328 * This has been extended to use the NUMA policies from the mm triggering
329 * the readahead.
330 *
331 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
332 */
333struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
334 struct vm_area_struct *vma, unsigned long addr)
335{
336 int nr_pages;
337 struct page *page;
338 unsigned long offset;
339 unsigned long end_offset;
340
341 /*
342 * Get starting offset for readaround, and number of pages to read.
343 * Adjust starting address by readbehind (for NUMA interleave case)?
344 * No, it's very unlikely that swap layout would follow vma layout,
345 * more likely that neighbouring swap pages came from the same node:
346 * so use the same "addr" to choose the same node for each swap read.
347 */
348 nr_pages = valid_swaphandles(entry, &offset);
349 for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
350 /* Ok, do the async read-ahead now */
351 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
352 gfp_mask, vma, addr);
353 if (!page)
354 break;
355 page_cache_release(page);
356 }
357 lru_add_drain(); /* Push any new pages onto the LRU now */
358 return read_swap_cache_async(entry, gfp_mask, vma, addr);
359}
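
The readahead window described in the comment block above is plain arithmetic: round the target swap offset down to a (1 << page_cluster)-aligned block, skip offset 0 (the swap header), and queue the whole block. A self-contained sketch of that window computation:

#include <stdio.h>

/* Model of the swap readahead window: align the target down to a
 * (1 << page_cluster)-sized block, never including the swap header page. */
static void readahead_window(unsigned long target, int page_cluster,
			     unsigned long *start, unsigned long *end)
{
	unsigned long base = (target >> page_cluster) << page_cluster;

	*start = base ? base : 1;	/* offset 0 is the swap header */
	*end = base + (1UL << page_cluster);
}

int main(void)
{
	unsigned long start, end;

	readahead_window(1037, 3, &start, &end);	  /* 8-page cluster */
	printf("read offsets [%lu, %lu)\n", start, end);  /* [1032, 1040) */
	return 0;
}
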
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f071648e1360..02ccab5ad9d9 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -27,6 +27,7 @@
27#include <linux/mutex.h> 27#include <linux/mutex.h>
28#include <linux/capability.h> 28#include <linux/capability.h>
29#include <linux/syscalls.h> 29#include <linux/syscalls.h>
30#include <linux/memcontrol.h>
30 31
31#include <asm/pgtable.h> 32#include <asm/pgtable.h>
32#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
@@ -506,9 +507,24 @@ unsigned int count_swap_pages(int type, int free)
506 * just let do_wp_page work it out if a write is requested later - to 507 * just let do_wp_page work it out if a write is requested later - to
507 * force COW, vm_page_prot omits write permission from any private vma. 508 * force COW, vm_page_prot omits write permission from any private vma.
508 */ 509 */
509static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, 510static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
510 unsigned long addr, swp_entry_t entry, struct page *page) 511 unsigned long addr, swp_entry_t entry, struct page *page)
511{ 512{
513 spinlock_t *ptl;
514 pte_t *pte;
515 int ret = 1;
516
517 if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
518 ret = -ENOMEM;
519
520 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
521 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
522 if (ret > 0)
523 mem_cgroup_uncharge_page(page);
524 ret = 0;
525 goto out;
526 }
527
512 inc_mm_counter(vma->vm_mm, anon_rss); 528 inc_mm_counter(vma->vm_mm, anon_rss);
513 get_page(page); 529 get_page(page);
514 set_pte_at(vma->vm_mm, addr, pte, 530 set_pte_at(vma->vm_mm, addr, pte,
@@ -520,6 +536,9 @@ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
520 * immediately swapped out again after swapon. 536 * immediately swapped out again after swapon.
521 */ 537 */
522 activate_page(page); 538 activate_page(page);
539out:
540 pte_unmap_unlock(pte, ptl);
541 return ret;
523} 542}
524 543
525static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 544static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -528,23 +547,34 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
528{ 547{
529 pte_t swp_pte = swp_entry_to_pte(entry); 548 pte_t swp_pte = swp_entry_to_pte(entry);
530 pte_t *pte; 549 pte_t *pte;
531 spinlock_t *ptl; 550 int ret = 0;
532 int found = 0;
533 551
534 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 552 /*
553 * We don't actually need pte lock while scanning for swp_pte: since
554 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
555 * page table while we're scanning; though it could get zapped, and on
556 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
557 * of unmatched parts which look like swp_pte, so unuse_pte must
558 * recheck under pte lock. Scanning without pte lock lets it be
559 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
560 */
561 pte = pte_offset_map(pmd, addr);
535 do { 562 do {
536 /* 563 /*
537 * swapoff spends a _lot_ of time in this loop! 564 * swapoff spends a _lot_ of time in this loop!
538 * Test inline before going to call unuse_pte. 565 * Test inline before going to call unuse_pte.
539 */ 566 */
540 if (unlikely(pte_same(*pte, swp_pte))) { 567 if (unlikely(pte_same(*pte, swp_pte))) {
541 unuse_pte(vma, pte++, addr, entry, page); 568 pte_unmap(pte);
542 found = 1; 569 ret = unuse_pte(vma, pmd, addr, entry, page);
543 break; 570 if (ret)
571 goto out;
572 pte = pte_offset_map(pmd, addr);
544 } 573 }
545 } while (pte++, addr += PAGE_SIZE, addr != end); 574 } while (pte++, addr += PAGE_SIZE, addr != end);
546 pte_unmap_unlock(pte - 1, ptl); 575 pte_unmap(pte - 1);
547 return found; 576out:
577 return ret;
548} 578}
549 579
550static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, 580static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
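
The new comment in unuse_pte_range() describes a common pattern: scan optimistically without the page-table lock, then have unuse_pte() recheck the entry under the lock before acting, since the unlocked read may be stale or (on 32-bit PAE) torn. A hedged userspace model of the same pattern using a plain mutex:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Optimistic pre-check without the lock; the real update is only performed
 * after rechecking the value under the lock. */
static bool claim_slot(volatile int *slot, int expected)
{
	bool claimed = false;

	if (*slot != expected)		/* unlocked scan: cheap, may be stale */
		return false;

	pthread_mutex_lock(&table_lock);
	if (*slot == expected) {	/* recheck under the lock */
		*slot = 0;		/* perform the real update */
		claimed = true;
	}
	pthread_mutex_unlock(&table_lock);
	return claimed;
}
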
@@ -553,14 +583,16 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
553{ 583{
554 pmd_t *pmd; 584 pmd_t *pmd;
555 unsigned long next; 585 unsigned long next;
586 int ret;
556 587
557 pmd = pmd_offset(pud, addr); 588 pmd = pmd_offset(pud, addr);
558 do { 589 do {
559 next = pmd_addr_end(addr, end); 590 next = pmd_addr_end(addr, end);
560 if (pmd_none_or_clear_bad(pmd)) 591 if (pmd_none_or_clear_bad(pmd))
561 continue; 592 continue;
562 if (unuse_pte_range(vma, pmd, addr, next, entry, page)) 593 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
563 return 1; 594 if (ret)
595 return ret;
564 } while (pmd++, addr = next, addr != end); 596 } while (pmd++, addr = next, addr != end);
565 return 0; 597 return 0;
566} 598}
@@ -571,14 +603,16 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
571{ 603{
572 pud_t *pud; 604 pud_t *pud;
573 unsigned long next; 605 unsigned long next;
606 int ret;
574 607
575 pud = pud_offset(pgd, addr); 608 pud = pud_offset(pgd, addr);
576 do { 609 do {
577 next = pud_addr_end(addr, end); 610 next = pud_addr_end(addr, end);
578 if (pud_none_or_clear_bad(pud)) 611 if (pud_none_or_clear_bad(pud))
579 continue; 612 continue;
580 if (unuse_pmd_range(vma, pud, addr, next, entry, page)) 613 ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
581 return 1; 614 if (ret)
615 return ret;
582 } while (pud++, addr = next, addr != end); 616 } while (pud++, addr = next, addr != end);
583 return 0; 617 return 0;
584} 618}
@@ -588,6 +622,7 @@ static int unuse_vma(struct vm_area_struct *vma,
588{ 622{
589 pgd_t *pgd; 623 pgd_t *pgd;
590 unsigned long addr, end, next; 624 unsigned long addr, end, next;
625 int ret;
591 626
592 if (page->mapping) { 627 if (page->mapping) {
593 addr = page_address_in_vma(page, vma); 628 addr = page_address_in_vma(page, vma);
@@ -605,8 +640,9 @@ static int unuse_vma(struct vm_area_struct *vma,
605 next = pgd_addr_end(addr, end); 640 next = pgd_addr_end(addr, end);
606 if (pgd_none_or_clear_bad(pgd)) 641 if (pgd_none_or_clear_bad(pgd))
607 continue; 642 continue;
608 if (unuse_pud_range(vma, pgd, addr, next, entry, page)) 643 ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
609 return 1; 644 if (ret)
645 return ret;
610 } while (pgd++, addr = next, addr != end); 646 } while (pgd++, addr = next, addr != end);
611 return 0; 647 return 0;
612} 648}
@@ -615,6 +651,7 @@ static int unuse_mm(struct mm_struct *mm,
615 swp_entry_t entry, struct page *page) 651 swp_entry_t entry, struct page *page)
616{ 652{
617 struct vm_area_struct *vma; 653 struct vm_area_struct *vma;
654 int ret = 0;
618 655
619 if (!down_read_trylock(&mm->mmap_sem)) { 656 if (!down_read_trylock(&mm->mmap_sem)) {
620 /* 657 /*
@@ -627,15 +664,11 @@ static int unuse_mm(struct mm_struct *mm,
627 lock_page(page); 664 lock_page(page);
628 } 665 }
629 for (vma = mm->mmap; vma; vma = vma->vm_next) { 666 for (vma = mm->mmap; vma; vma = vma->vm_next) {
630 if (vma->anon_vma && unuse_vma(vma, entry, page)) 667 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
631 break; 668 break;
632 } 669 }
633 up_read(&mm->mmap_sem); 670 up_read(&mm->mmap_sem);
634 /* 671 return (ret < 0)? ret: 0;
635 * Currently unuse_mm cannot fail, but leave error handling
636 * at call sites for now, since we change it from time to time.
637 */
638 return 0;
639} 672}
640 673
641/* 674/*
@@ -730,7 +763,8 @@ static int try_to_unuse(unsigned int type)
730 */ 763 */
731 swap_map = &si->swap_map[i]; 764 swap_map = &si->swap_map[i];
732 entry = swp_entry(type, i); 765 entry = swp_entry(type, i);
733 page = read_swap_cache_async(entry, NULL, 0); 766 page = read_swap_cache_async(entry,
767 GFP_HIGHUSER_MOVABLE, NULL, 0);
734 if (!page) { 768 if (!page) {
735 /* 769 /*
736 * Either swap_duplicate() failed because entry 770 * Either swap_duplicate() failed because entry
@@ -789,7 +823,7 @@ static int try_to_unuse(unsigned int type)
789 atomic_inc(&new_start_mm->mm_users); 823 atomic_inc(&new_start_mm->mm_users);
790 atomic_inc(&prev_mm->mm_users); 824 atomic_inc(&prev_mm->mm_users);
791 spin_lock(&mmlist_lock); 825 spin_lock(&mmlist_lock);
792 while (*swap_map > 1 && !retval && 826 while (*swap_map > 1 && !retval && !shmem &&
793 (p = p->next) != &start_mm->mmlist) { 827 (p = p->next) != &start_mm->mmlist) {
794 mm = list_entry(p, struct mm_struct, mmlist); 828 mm = list_entry(p, struct mm_struct, mmlist);
795 if (!atomic_inc_not_zero(&mm->mm_users)) 829 if (!atomic_inc_not_zero(&mm->mm_users))
@@ -821,6 +855,13 @@ static int try_to_unuse(unsigned int type)
821 mmput(start_mm); 855 mmput(start_mm);
822 start_mm = new_start_mm; 856 start_mm = new_start_mm;
823 } 857 }
858 if (shmem) {
859 /* page has already been unlocked and released */
860 if (shmem > 0)
861 continue;
862 retval = shmem;
863 break;
864 }
824 if (retval) { 865 if (retval) {
825 unlock_page(page); 866 unlock_page(page);
826 page_cache_release(page); 867 page_cache_release(page);
@@ -859,12 +900,6 @@ static int try_to_unuse(unsigned int type)
859 * read from disk into another page. Splitting into two 900 * read from disk into another page. Splitting into two
860 * pages would be incorrect if swap supported "shared 901 * pages would be incorrect if swap supported "shared
861 * private" pages, but they are handled by tmpfs files. 902 * private" pages, but they are handled by tmpfs files.
862 *
863 * Note shmem_unuse already deleted a swappage from
864 * the swap cache, unless the move to filepage failed:
865 * in which case it left swappage in cache, lowered its
866 * swap count to pass quickly through the loops above,
867 * and now we must reincrement count to try again later.
868 */ 903 */
869 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { 904 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
870 struct writeback_control wbc = { 905 struct writeback_control wbc = {
@@ -875,12 +910,8 @@ static int try_to_unuse(unsigned int type)
875 lock_page(page); 910 lock_page(page);
876 wait_on_page_writeback(page); 911 wait_on_page_writeback(page);
877 } 912 }
878 if (PageSwapCache(page)) { 913 if (PageSwapCache(page))
879 if (shmem) 914 delete_from_swap_cache(page);
880 swap_duplicate(entry);
881 else
882 delete_from_swap_cache(page);
883 }
884 915
885 /* 916 /*
886 * So we could skip searching mms once swap count went 917 * So we could skip searching mms once swap count went
@@ -1768,31 +1799,48 @@ get_swap_info_struct(unsigned type)
1768 */ 1799 */
1769int valid_swaphandles(swp_entry_t entry, unsigned long *offset) 1800int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
1770{ 1801{
1802 struct swap_info_struct *si;
1771 int our_page_cluster = page_cluster; 1803 int our_page_cluster = page_cluster;
1772 int ret = 0, i = 1 << our_page_cluster; 1804 pgoff_t target, toff;
1773 unsigned long toff; 1805 pgoff_t base, end;
1774 struct swap_info_struct *swapdev = swp_type(entry) + swap_info; 1806 int nr_pages = 0;
1775 1807
1776 if (!our_page_cluster) /* no readahead */ 1808 if (!our_page_cluster) /* no readahead */
1777 return 0; 1809 return 0;
1778 toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster; 1810
1779 if (!toff) /* first page is swap header */ 1811 si = &swap_info[swp_type(entry)];
1780 toff++, i--; 1812 target = swp_offset(entry);
1781 *offset = toff; 1813 base = (target >> our_page_cluster) << our_page_cluster;
1814 end = base + (1 << our_page_cluster);
1815 if (!base) /* first page is swap header */
1816 base++;
1782 1817
1783 spin_lock(&swap_lock); 1818 spin_lock(&swap_lock);
1784 do { 1819 if (end > si->max) /* don't go beyond end of map */
1785 /* Don't read-ahead past the end of the swap area */ 1820 end = si->max;
1786 if (toff >= swapdev->max) 1821
1822 /* Count contiguous allocated slots above our target */
1823 for (toff = target; ++toff < end; nr_pages++) {
1824 /* Don't read in free or bad pages */
1825 if (!si->swap_map[toff])
1787 break; 1826 break;
1827 if (si->swap_map[toff] == SWAP_MAP_BAD)
1828 break;
1829 }
1830 /* Count contiguous allocated slots below our target */
1831 for (toff = target; --toff >= base; nr_pages++) {
1788 /* Don't read in free or bad pages */ 1832 /* Don't read in free or bad pages */
1789 if (!swapdev->swap_map[toff]) 1833 if (!si->swap_map[toff])
1790 break; 1834 break;
1791 if (swapdev->swap_map[toff] == SWAP_MAP_BAD) 1835 if (si->swap_map[toff] == SWAP_MAP_BAD)
1792 break; 1836 break;
1793 toff++; 1837 }
1794 ret++;
1795 } while (--i);
1796 spin_unlock(&swap_lock); 1838 spin_unlock(&swap_lock);
1797 return ret; 1839
1840 /*
1841 * Indicate starting offset, and return number of pages to get:
1842 * if only 1, say 0, since there's then no readahead to be done.
1843 */
1844 *offset = ++toff;
1845 return nr_pages? ++nr_pages: 0;
1798} 1846}
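
The rewritten valid_swaphandles() above no longer blindly reads the whole cluster; it counts how many contiguous allocated slots sit above and below the target inside the aligned window. A standalone sketch of that counting (base must be >= 1, i.e. past the swap header; TOY_SWAP_MAP_BAD stands in for SWAP_MAP_BAD):

#include <stddef.h>

#define TOY_SWAP_MAP_BAD 0x7f		/* stand-in for SWAP_MAP_BAD */

static int toy_count_readahead(const unsigned char *swap_map,
			       size_t target, size_t base, size_t end,
			       size_t *start)
{
	size_t toff;
	int nr_pages = 0;

	/* contiguous allocated slots above the target */
	for (toff = target; ++toff < end; nr_pages++)
		if (!swap_map[toff] || swap_map[toff] == TOY_SWAP_MAP_BAD)
			break;
	/* contiguous allocated slots below the target (base >= 1) */
	for (toff = target; toff-- > base; nr_pages++)
		if (!swap_map[toff] || swap_map[toff] == TOY_SWAP_MAP_BAD)
			break;

	*start = toff + 1;			/* first offset worth reading */
	return nr_pages ? nr_pages + 1 : 0;	/* +1 for the target slot itself */
}
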
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index d436a9c82db7..702083638c16 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -121,18 +121,6 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
121 return 0; 121 return 0;
122} 122}
123 123
124#if 0
125int shmem_mmap(struct file *file, struct vm_area_struct *vma)
126{
127 file_accessed(file);
128#ifndef CONFIG_MMU
129 return ramfs_nommu_mmap(file, vma);
130#else
131 return 0;
132#endif
133}
134#endif /* 0 */
135
136#ifndef CONFIG_MMU 124#ifndef CONFIG_MMU
137unsigned long shmem_get_unmapped_area(struct file *file, 125unsigned long shmem_get_unmapped_area(struct file *file,
138 unsigned long addr, 126 unsigned long addr,
diff --git a/mm/truncate.c b/mm/truncate.c
index cadc15653dde..c35c49e54fb6 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -21,7 +21,7 @@
21 21
22 22
23/** 23/**
24 * do_invalidatepage - invalidate part of all of a page 24 * do_invalidatepage - invalidate part or all of a page
25 * @page: the page which is affected 25 * @page: the page which is affected
26 * @offset: the index of the truncation point 26 * @offset: the index of the truncation point
27 * 27 *
@@ -48,7 +48,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
48 48
49static inline void truncate_partial_page(struct page *page, unsigned partial) 49static inline void truncate_partial_page(struct page *page, unsigned partial)
50{ 50{
51 zero_user_page(page, partial, PAGE_CACHE_SIZE - partial, KM_USER0); 51 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
52 if (PagePrivate(page)) 52 if (PagePrivate(page))
53 do_invalidatepage(page, partial); 53 do_invalidatepage(page, partial);
54} 54}
@@ -84,7 +84,7 @@ EXPORT_SYMBOL(cancel_dirty_page);
84 84
85/* 85/*
86 * If truncate cannot remove the fs-private metadata from the page, the page 86 * If truncate cannot remove the fs-private metadata from the page, the page
87 * becomes anonymous. It will be left on the LRU and may even be mapped into 87 * becomes orphaned. It will be left on the LRU and may even be mapped into
88 * user pagetables if we're racing with filemap_fault(). 88 * user pagetables if we're racing with filemap_fault().
89 * 89 *
90 * We need to bale out if page->mapping is no longer equal to the original 90 * We need to bale out if page->mapping is no longer equal to the original
@@ -98,11 +98,11 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
98 if (page->mapping != mapping) 98 if (page->mapping != mapping)
99 return; 99 return;
100 100
101 cancel_dirty_page(page, PAGE_CACHE_SIZE);
102
103 if (PagePrivate(page)) 101 if (PagePrivate(page))
104 do_invalidatepage(page, 0); 102 do_invalidatepage(page, 0);
105 103
104 cancel_dirty_page(page, PAGE_CACHE_SIZE);
105
106 remove_from_page_cache(page); 106 remove_from_page_cache(page);
107 ClearPageUptodate(page); 107 ClearPageUptodate(page);
108 ClearPageMappedToDisk(page); 108 ClearPageMappedToDisk(page);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index af77e171e339..0536dde139d1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -166,6 +166,44 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
166} 166}
167EXPORT_SYMBOL_GPL(map_vm_area); 167EXPORT_SYMBOL_GPL(map_vm_area);
168 168
169/*
170 * Map a vmalloc()-space virtual address to the physical page.
171 */
172struct page *vmalloc_to_page(const void *vmalloc_addr)
173{
174 unsigned long addr = (unsigned long) vmalloc_addr;
175 struct page *page = NULL;
176 pgd_t *pgd = pgd_offset_k(addr);
177 pud_t *pud;
178 pmd_t *pmd;
179 pte_t *ptep, pte;
180
181 if (!pgd_none(*pgd)) {
182 pud = pud_offset(pgd, addr);
183 if (!pud_none(*pud)) {
184 pmd = pmd_offset(pud, addr);
185 if (!pmd_none(*pmd)) {
186 ptep = pte_offset_map(pmd, addr);
187 pte = *ptep;
188 if (pte_present(pte))
189 page = pte_page(pte);
190 pte_unmap(ptep);
191 }
192 }
193 }
194 return page;
195}
196EXPORT_SYMBOL(vmalloc_to_page);
197
198/*
199 * Map a vmalloc()-space virtual address to the physical page frame number.
200 */
201unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
202{
203 return page_to_pfn(vmalloc_to_page(vmalloc_addr));
204}
205EXPORT_SYMBOL(vmalloc_to_pfn);
206
169static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, 207static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
170 unsigned long start, unsigned long end, 208 unsigned long start, unsigned long end,
171 int node, gfp_t gfp_mask) 209 int node, gfp_t gfp_mask)
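
The vmalloc_to_page()/vmalloc_to_pfn() helpers added above walk the kernel page tables (pgd -> pud -> pmd -> pte) to find the page backing a vmalloc address. A minimal kernel-style sketch of how a caller might use the two exports; the buffer and its size are purely illustrative, not taken from this patch:

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void dump_vmalloc_backing(void)
{
	/* Illustrative four-page buffer; any vmalloc()'ed region works. */
	void *buf = vmalloc(4 * PAGE_SIZE);
	unsigned long off;

	if (!buf)
		return;

	/* Each PAGE_SIZE chunk of a vmalloc area has its own backing page. */
	for (off = 0; off < 4 * PAGE_SIZE; off += PAGE_SIZE)
		printk(KERN_DEBUG "offset %lu -> pfn %lu\n",
		       off, vmalloc_to_pfn(buf + off));

	vfree(buf);
}
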
@@ -216,6 +254,10 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl
216 if (addr > end - size) 254 if (addr > end - size)
217 goto out; 255 goto out;
218 } 256 }
257 if ((size + addr) < addr)
258 goto out;
259 if (addr > end - size)
260 goto out;
219 261
220found: 262found:
221 area->next = *p; 263 area->next = *p;
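
The two checks added after the loop guard against the candidate range overflowing the address space: if size + addr wraps past the top of the unsigned range, the sum compares less than addr. A small standalone C illustration of the idiom (the numbers are arbitrary):

#include <stdio.h>

int main(void)
{
	unsigned long end  = ~0UL;		/* top of the address space */
	unsigned long addr = end - 4096 + 1;	/* the very last page */
	unsigned long size = 2 * 4096;		/* two pages: cannot fit */

	if ((size + addr) < addr)
		printf("size + addr wrapped: reject this placement\n");
	else if (addr > end - size)
		printf("range would run past the end: reject\n");
	else
		printf("range [%#lx, %#lx) fits\n", addr, addr + size);
	return 0;
}
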
@@ -268,7 +310,7 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
268} 310}
269 311
270/* Caller must hold vmlist_lock */ 312/* Caller must hold vmlist_lock */
271static struct vm_struct *__find_vm_area(void *addr) 313static struct vm_struct *__find_vm_area(const void *addr)
272{ 314{
273 struct vm_struct *tmp; 315 struct vm_struct *tmp;
274 316
@@ -281,7 +323,7 @@ static struct vm_struct *__find_vm_area(void *addr)
281} 323}
282 324
283/* Caller must hold vmlist_lock */ 325/* Caller must hold vmlist_lock */
284static struct vm_struct *__remove_vm_area(void *addr) 326static struct vm_struct *__remove_vm_area(const void *addr)
285{ 327{
286 struct vm_struct **p, *tmp; 328 struct vm_struct **p, *tmp;
287 329
@@ -310,7 +352,7 @@ found:
310 * This function returns the found VM area, but using it is NOT safe 352 * This function returns the found VM area, but using it is NOT safe
311 * on SMP machines, except for its size or flags. 353 * on SMP machines, except for its size or flags.
312 */ 354 */
313struct vm_struct *remove_vm_area(void *addr) 355struct vm_struct *remove_vm_area(const void *addr)
314{ 356{
315 struct vm_struct *v; 357 struct vm_struct *v;
316 write_lock(&vmlist_lock); 358 write_lock(&vmlist_lock);
@@ -319,7 +361,7 @@ struct vm_struct *remove_vm_area(void *addr)
319 return v; 361 return v;
320} 362}
321 363
322static void __vunmap(void *addr, int deallocate_pages) 364static void __vunmap(const void *addr, int deallocate_pages)
323{ 365{
324 struct vm_struct *area; 366 struct vm_struct *area;
325 367
@@ -346,8 +388,10 @@ static void __vunmap(void *addr, int deallocate_pages)
346 int i; 388 int i;
347 389
348 for (i = 0; i < area->nr_pages; i++) { 390 for (i = 0; i < area->nr_pages; i++) {
349 BUG_ON(!area->pages[i]); 391 struct page *page = area->pages[i];
350 __free_page(area->pages[i]); 392
393 BUG_ON(!page);
394 __free_page(page);
351 } 395 }
352 396
353 if (area->flags & VM_VPAGES) 397 if (area->flags & VM_VPAGES)
@@ -370,7 +414,7 @@ static void __vunmap(void *addr, int deallocate_pages)
370 * 414 *
371 * Must not be called in interrupt context. 415 * Must not be called in interrupt context.
372 */ 416 */
373void vfree(void *addr) 417void vfree(const void *addr)
374{ 418{
375 BUG_ON(in_interrupt()); 419 BUG_ON(in_interrupt());
376 __vunmap(addr, 1); 420 __vunmap(addr, 1);
@@ -386,7 +430,7 @@ EXPORT_SYMBOL(vfree);
386 * 430 *
387 * Must not be called in interrupt context. 431 * Must not be called in interrupt context.
388 */ 432 */
389void vunmap(void *addr) 433void vunmap(const void *addr)
390{ 434{
391 BUG_ON(in_interrupt()); 435 BUG_ON(in_interrupt());
392 __vunmap(addr, 0); 436 __vunmap(addr, 0);
@@ -423,8 +467,8 @@ void *vmap(struct page **pages, unsigned int count,
423} 467}
424EXPORT_SYMBOL(vmap); 468EXPORT_SYMBOL(vmap);
425 469
426void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 470static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
427 pgprot_t prot, int node) 471 pgprot_t prot, int node)
428{ 472{
429 struct page **pages; 473 struct page **pages;
430 unsigned int nr_pages, array_size, i; 474 unsigned int nr_pages, array_size, i;
@@ -451,15 +495,19 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
451 } 495 }
452 496
453 for (i = 0; i < area->nr_pages; i++) { 497 for (i = 0; i < area->nr_pages; i++) {
498 struct page *page;
499
454 if (node < 0) 500 if (node < 0)
455 area->pages[i] = alloc_page(gfp_mask); 501 page = alloc_page(gfp_mask);
456 else 502 else
457 area->pages[i] = alloc_pages_node(node, gfp_mask, 0); 503 page = alloc_pages_node(node, gfp_mask, 0);
458 if (unlikely(!area->pages[i])) { 504
505 if (unlikely(!page)) {
459 /* Successfully allocated i pages, free them in __vunmap() */ 506 /* Successfully allocated i pages, free them in __vunmap() */
460 area->nr_pages = i; 507 area->nr_pages = i;
461 goto fail; 508 goto fail;
462 } 509 }
510 area->pages[i] = page;
463 } 511 }
464 512
465 if (map_vm_area(area, prot, &pages)) 513 if (map_vm_area(area, prot, &pages))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5a9597e3bbc..a26dabd62fed 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -37,6 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/kthread.h> 38#include <linux/kthread.h>
39#include <linux/freezer.h> 39#include <linux/freezer.h>
40#include <linux/memcontrol.h>
40 41
41#include <asm/tlbflush.h> 42#include <asm/tlbflush.h>
42#include <asm/div64.h> 43#include <asm/div64.h>
@@ -68,6 +69,22 @@ struct scan_control {
68 int all_unreclaimable; 69 int all_unreclaimable;
69 70
70 int order; 71 int order;
72
73 /*
74 * Pages that have (or should have) IO pending. If we run into
75 * a lot of these, we're better off waiting a little for IO to
76 * finish rather than scanning more pages in the VM.
77 */
78 int nr_io_pages;
79
80 /* Which cgroup do we reclaim from */
81 struct mem_cgroup *mem_cgroup;
82
83 /* Pluggable isolate pages callback */
84 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
85 unsigned long *scanned, int order, int mode,
86 struct zone *z, struct mem_cgroup *mem_cont,
87 int active);
71}; 88};
72 89
73#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 90#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
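
struct scan_control now records which cgroup (if any) is being reclaimed and which page-isolation routine to use, so the same shrink_* code can serve both the global LRU and a per-cgroup LRU. A sketch, inside vmscan.c, of how a caller fills in the new fields; my_isolate_pages and example_scan_setup are hypothetical names used only for illustration (the real global path uses isolate_pages_global(), added further down):

/* Hypothetical callback with the signature scan_control expects. */
static unsigned long my_isolate_pages(unsigned long nr, struct list_head *dst,
				      unsigned long *scanned, int order,
				      int mode, struct zone *z,
				      struct mem_cgroup *mem_cont, int active)
{
	/* A real implementation moves up to 'nr' pages from an LRU onto
	 * 'dst' and reports how many were looked at through 'scanned'. */
	*scanned = 0;
	return 0;
}

static void example_scan_setup(void)
{
	struct scan_control sc = {
		.gfp_mask	  = GFP_KERNEL,
		.may_writepage	  = !laptop_mode,
		.may_swap	  = 1,
		.swap_cluster_max = SWAP_CLUSTER_MAX,
		.swappiness	  = vm_swappiness,
		.order		  = 0,
		.mem_cgroup	  = NULL,		/* NULL: scan the global LRU */
		.isolate_pages	  = my_isolate_pages,	/* normally isolate_pages_global */
	};

	/* 'sc' would then be handed to the shrink_*() machinery below. */
	(void)sc;
}
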
@@ -109,6 +126,12 @@ long vm_total_pages; /* The total number of pages which the VM controls */
109static LIST_HEAD(shrinker_list); 126static LIST_HEAD(shrinker_list);
110static DECLARE_RWSEM(shrinker_rwsem); 127static DECLARE_RWSEM(shrinker_rwsem);
111 128
129#ifdef CONFIG_CGROUP_MEM_CONT
130#define scan_global_lru(sc) (!(sc)->mem_cgroup)
131#else
132#define scan_global_lru(sc) (1)
133#endif
134
112/* 135/*
113 * Add a shrinker callback to be called from the vm 136 * Add a shrinker callback to be called from the vm
114 */ 137 */
@@ -489,11 +512,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
489 */ 512 */
490 if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) 513 if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
491 wait_on_page_writeback(page); 514 wait_on_page_writeback(page);
492 else 515 else {
516 sc->nr_io_pages++;
493 goto keep_locked; 517 goto keep_locked;
518 }
494 } 519 }
495 520
496 referenced = page_referenced(page, 1); 521 referenced = page_referenced(page, 1, sc->mem_cgroup);
497 /* In active use or really unfreeable? Activate it. */ 522 /* In active use or really unfreeable? Activate it. */
498 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && 523 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
499 referenced && page_mapping_inuse(page)) 524 referenced && page_mapping_inuse(page))
@@ -529,8 +554,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
529 if (PageDirty(page)) { 554 if (PageDirty(page)) {
530 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) 555 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
531 goto keep_locked; 556 goto keep_locked;
532 if (!may_enter_fs) 557 if (!may_enter_fs) {
558 sc->nr_io_pages++;
533 goto keep_locked; 559 goto keep_locked;
560 }
534 if (!sc->may_writepage) 561 if (!sc->may_writepage)
535 goto keep_locked; 562 goto keep_locked;
536 563
@@ -541,8 +568,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
541 case PAGE_ACTIVATE: 568 case PAGE_ACTIVATE:
542 goto activate_locked; 569 goto activate_locked;
543 case PAGE_SUCCESS: 570 case PAGE_SUCCESS:
544 if (PageWriteback(page) || PageDirty(page)) 571 if (PageWriteback(page) || PageDirty(page)) {
572 sc->nr_io_pages++;
545 goto keep; 573 goto keep;
574 }
546 /* 575 /*
547 * A synchronous write - probably a ramdisk. Go 576 * A synchronous write - probably a ramdisk. Go
548 * ahead and try to reclaim the page. 577 * ahead and try to reclaim the page.
@@ -626,7 +655,7 @@ keep:
626 * 655 *
627 * returns 0 on success, -ve errno on failure. 656 * returns 0 on success, -ve errno on failure.
628 */ 657 */
629static int __isolate_lru_page(struct page *page, int mode) 658int __isolate_lru_page(struct page *page, int mode)
630{ 659{
631 int ret = -EINVAL; 660 int ret = -EINVAL;
632 661
@@ -760,6 +789,21 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
760 return nr_taken; 789 return nr_taken;
761} 790}
762 791
792static unsigned long isolate_pages_global(unsigned long nr,
793 struct list_head *dst,
794 unsigned long *scanned, int order,
795 int mode, struct zone *z,
796 struct mem_cgroup *mem_cont,
797 int active)
798{
799 if (active)
800 return isolate_lru_pages(nr, &z->active_list, dst,
801 scanned, order, mode);
802 else
803 return isolate_lru_pages(nr, &z->inactive_list, dst,
804 scanned, order, mode);
805}
806
763/* 807/*
764 * clear_active_flags() is a helper for shrink_active_list(), clearing 808 * clear_active_flags() is a helper for shrink_active_list(), clearing
765 * any active bits from the pages in the list. 809 * any active bits from the pages in the list.
@@ -801,18 +845,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
801 unsigned long nr_freed; 845 unsigned long nr_freed;
802 unsigned long nr_active; 846 unsigned long nr_active;
803 847
804 nr_taken = isolate_lru_pages(sc->swap_cluster_max, 848 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
805 &zone->inactive_list,
806 &page_list, &nr_scan, sc->order, 849 &page_list, &nr_scan, sc->order,
807 (sc->order > PAGE_ALLOC_COSTLY_ORDER)? 850 (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
808 ISOLATE_BOTH : ISOLATE_INACTIVE); 851 ISOLATE_BOTH : ISOLATE_INACTIVE,
852 zone, sc->mem_cgroup, 0);
809 nr_active = clear_active_flags(&page_list); 853 nr_active = clear_active_flags(&page_list);
810 __count_vm_events(PGDEACTIVATE, nr_active); 854 __count_vm_events(PGDEACTIVATE, nr_active);
811 855
812 __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); 856 __mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
813 __mod_zone_page_state(zone, NR_INACTIVE, 857 __mod_zone_page_state(zone, NR_INACTIVE,
814 -(nr_taken - nr_active)); 858 -(nr_taken - nr_active));
815 zone->pages_scanned += nr_scan; 859 if (scan_global_lru(sc))
860 zone->pages_scanned += nr_scan;
816 spin_unlock_irq(&zone->lru_lock); 861 spin_unlock_irq(&zone->lru_lock);
817 862
818 nr_scanned += nr_scan; 863 nr_scanned += nr_scan;
@@ -844,8 +889,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
844 if (current_is_kswapd()) { 889 if (current_is_kswapd()) {
845 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); 890 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
846 __count_vm_events(KSWAPD_STEAL, nr_freed); 891 __count_vm_events(KSWAPD_STEAL, nr_freed);
847 } else 892 } else if (scan_global_lru(sc))
848 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); 893 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
894
849 __count_zone_vm_events(PGSTEAL, zone, nr_freed); 895 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
850 896
851 if (nr_taken == 0) 897 if (nr_taken == 0)
@@ -899,6 +945,113 @@ static inline int zone_is_near_oom(struct zone *zone)
899} 945}
900 946
901/* 947/*
 948 * Determine whether we should try to reclaim mapped pages.
949 * This is called only when sc->mem_cgroup is NULL.
950 */
951static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
952 int priority)
953{
954 long mapped_ratio;
955 long distress;
956 long swap_tendency;
957 long imbalance;
958 int reclaim_mapped = 0;
959 int prev_priority;
960
961 if (scan_global_lru(sc) && zone_is_near_oom(zone))
962 return 1;
963 /*
964 * `distress' is a measure of how much trouble we're having
965 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
966 */
967 if (scan_global_lru(sc))
968 prev_priority = zone->prev_priority;
969 else
970 prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
971
972 distress = 100 >> min(prev_priority, priority);
973
974 /*
975 * The point of this algorithm is to decide when to start
976 * reclaiming mapped memory instead of just pagecache. Work out
977 * how much memory
978 * is mapped.
979 */
980 if (scan_global_lru(sc))
981 mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
982 global_page_state(NR_ANON_PAGES)) * 100) /
983 vm_total_pages;
984 else
985 mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);
986
987 /*
988 * Now decide how much we really want to unmap some pages. The
989 * mapped ratio is downgraded - just because there's a lot of
990 * mapped memory doesn't necessarily mean that page reclaim
991 * isn't succeeding.
992 *
993 * The distress ratio is important - we don't want to start
994 * going oom.
995 *
996 * A 100% value of vm_swappiness overrides this algorithm
997 * altogether.
998 */
999 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
1000
1001 /*
1002 * If there's huge imbalance between active and inactive
1003 * (think active 100 times larger than inactive) we should
1004 * become more permissive, or the system will take too much
1005 * cpu before it start swapping during memory pressure.
1006 * Distress is about avoiding early-oom, this is about
1007 * making swappiness graceful despite setting it to low
1008 * values.
1009 *
1010 * Avoid div by zero with nr_inactive+1, and max resulting
1011 * value is vm_total_pages.
1012 */
1013 if (scan_global_lru(sc)) {
1014 imbalance = zone_page_state(zone, NR_ACTIVE);
1015 imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
1016 } else
1017 imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);
1018
1019 /*
1020 * Reduce the effect of imbalance if swappiness is low,
1021 * this means for a swappiness very low, the imbalance
1022 * must be much higher than 100 for this logic to make
1023 * the difference.
1024 *
1025 * Max temporary value is vm_total_pages*100.
1026 */
1027 imbalance *= (vm_swappiness + 1);
1028 imbalance /= 100;
1029
1030 /*
1031 * If not much of the ram is mapped, makes the imbalance
1032 * less relevant, it's high priority we refill the inactive
1033 * list with mapped pages only in presence of high ratio of
1034 * mapped pages.
1035 *
1036 * Max temporary value is vm_total_pages*100.
1037 */
1038 imbalance *= mapped_ratio;
1039 imbalance /= 100;
1040
1041 /* apply imbalance feedback to swap_tendency */
1042 swap_tendency += imbalance;
1043
1044 /*
1045 * Now use this metric to decide whether to start moving mapped
1046 * memory onto the inactive list.
1047 */
1048 if (swap_tendency >= 100)
1049 reclaim_mapped = 1;
1050
1051 return reclaim_mapped;
1052}
1053
1054/*
902 * This moves pages from the active list to the inactive list. 1055 * This moves pages from the active list to the inactive list.
903 * 1056 *
904 * We move them the other way if the page is referenced by one or more 1057 * We move them the other way if the page is referenced by one or more
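
A worked example of the heuristic above, using made-up but plausible numbers: with 40% of RAM mapped (mapped_ratio = 40), the default vm_swappiness of 60, and reclaim at priority 10 with prev_priority 12, distress is 100 >> 10 = 0 and swap_tendency = 40/2 + 0 + 60 = 80, below the threshold of 100, so only unmapped pagecache is reclaimed. If the active list grows to 100 times the inactive list, imbalance starts at 100, is scaled to 100 * 61 / 100 = 61 and then to 61 * 40 / 100 = 24, lifting the total to 104 and enabling reclaim_mapped. Deepening pressure has the same effect: at priority 1, distress becomes 50 and the sum crosses 100 even with balanced lists.
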
@@ -915,6 +1068,8 @@ static inline int zone_is_near_oom(struct zone *zone)
915 * The downside is that we have to touch page->_count against each page. 1068 * The downside is that we have to touch page->_count against each page.
916 * But we had to alter page->flags anyway. 1069 * But we had to alter page->flags anyway.
917 */ 1070 */
1071
1072
918static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1073static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
919 struct scan_control *sc, int priority) 1074 struct scan_control *sc, int priority)
920{ 1075{
@@ -928,99 +1083,21 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
928 struct pagevec pvec; 1083 struct pagevec pvec;
929 int reclaim_mapped = 0; 1084 int reclaim_mapped = 0;
930 1085
931 if (sc->may_swap) { 1086 if (sc->may_swap)
932 long mapped_ratio; 1087 reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
933 long distress;
934 long swap_tendency;
935 long imbalance;
936
937 if (zone_is_near_oom(zone))
938 goto force_reclaim_mapped;
939
940 /*
941 * `distress' is a measure of how much trouble we're having
942 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
943 */
944 distress = 100 >> min(zone->prev_priority, priority);
945
946 /*
947 * The point of this algorithm is to decide when to start
948 * reclaiming mapped memory instead of just pagecache. Work out
949 * how much memory
950 * is mapped.
951 */
952 mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
953 global_page_state(NR_ANON_PAGES)) * 100) /
954 vm_total_pages;
955
956 /*
957 * Now decide how much we really want to unmap some pages. The
958 * mapped ratio is downgraded - just because there's a lot of
959 * mapped memory doesn't necessarily mean that page reclaim
960 * isn't succeeding.
961 *
962 * The distress ratio is important - we don't want to start
963 * going oom.
964 *
965 * A 100% value of vm_swappiness overrides this algorithm
966 * altogether.
967 */
968 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
969
970 /*
971 * If there's huge imbalance between active and inactive
972 * (think active 100 times larger than inactive) we should
973 * become more permissive, or the system will take too much
974 * cpu before it start swapping during memory pressure.
975 * Distress is about avoiding early-oom, this is about
976 * making swappiness graceful despite setting it to low
977 * values.
978 *
979 * Avoid div by zero with nr_inactive+1, and max resulting
980 * value is vm_total_pages.
981 */
982 imbalance = zone_page_state(zone, NR_ACTIVE);
983 imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
984
985 /*
986 * Reduce the effect of imbalance if swappiness is low,
987 * this means for a swappiness very low, the imbalance
988 * must be much higher than 100 for this logic to make
989 * the difference.
990 *
991 * Max temporary value is vm_total_pages*100.
992 */
993 imbalance *= (vm_swappiness + 1);
994 imbalance /= 100;
995
996 /*
997 * If not much of the ram is mapped, makes the imbalance
998 * less relevant, it's high priority we refill the inactive
999 * list with mapped pages only in presence of high ratio of
1000 * mapped pages.
1001 *
1002 * Max temporary value is vm_total_pages*100.
1003 */
1004 imbalance *= mapped_ratio;
1005 imbalance /= 100;
1006
1007 /* apply imbalance feedback to swap_tendency */
1008 swap_tendency += imbalance;
1009
1010 /*
1011 * Now use this metric to decide whether to start moving mapped
1012 * memory onto the inactive list.
1013 */
1014 if (swap_tendency >= 100)
1015force_reclaim_mapped:
1016 reclaim_mapped = 1;
1017 }
1018 1088
1019 lru_add_drain(); 1089 lru_add_drain();
1020 spin_lock_irq(&zone->lru_lock); 1090 spin_lock_irq(&zone->lru_lock);
1021 pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, 1091 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1022 &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE); 1092 ISOLATE_ACTIVE, zone,
1023 zone->pages_scanned += pgscanned; 1093 sc->mem_cgroup, 1);
1094 /*
 1095 * zone->pages_scanned is used to detect a zone's OOM state;
 1096 * mem_cgroup keeps track of nr_scan by itself.
1097 */
1098 if (scan_global_lru(sc))
1099 zone->pages_scanned += pgscanned;
1100
1024 __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); 1101 __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
1025 spin_unlock_irq(&zone->lru_lock); 1102 spin_unlock_irq(&zone->lru_lock);
1026 1103
@@ -1031,7 +1108,7 @@ force_reclaim_mapped:
1031 if (page_mapped(page)) { 1108 if (page_mapped(page)) {
1032 if (!reclaim_mapped || 1109 if (!reclaim_mapped ||
1033 (total_swap_pages == 0 && PageAnon(page)) || 1110 (total_swap_pages == 0 && PageAnon(page)) ||
1034 page_referenced(page, 0)) { 1111 page_referenced(page, 0, sc->mem_cgroup)) {
1035 list_add(&page->lru, &l_active); 1112 list_add(&page->lru, &l_active);
1036 continue; 1113 continue;
1037 } 1114 }
@@ -1051,6 +1128,7 @@ force_reclaim_mapped:
1051 ClearPageActive(page); 1128 ClearPageActive(page);
1052 1129
1053 list_move(&page->lru, &zone->inactive_list); 1130 list_move(&page->lru, &zone->inactive_list);
1131 mem_cgroup_move_lists(page_get_page_cgroup(page), false);
1054 pgmoved++; 1132 pgmoved++;
1055 if (!pagevec_add(&pvec, page)) { 1133 if (!pagevec_add(&pvec, page)) {
1056 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1134 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
@@ -1079,6 +1157,7 @@ force_reclaim_mapped:
1079 SetPageLRU(page); 1157 SetPageLRU(page);
1080 VM_BUG_ON(!PageActive(page)); 1158 VM_BUG_ON(!PageActive(page));
1081 list_move(&page->lru, &zone->active_list); 1159 list_move(&page->lru, &zone->active_list);
1160 mem_cgroup_move_lists(page_get_page_cgroup(page), true);
1082 pgmoved++; 1161 pgmoved++;
1083 if (!pagevec_add(&pvec, page)) { 1162 if (!pagevec_add(&pvec, page)) {
1084 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); 1163 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
@@ -1108,25 +1187,39 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1108 unsigned long nr_to_scan; 1187 unsigned long nr_to_scan;
1109 unsigned long nr_reclaimed = 0; 1188 unsigned long nr_reclaimed = 0;
1110 1189
1111 /* 1190 if (scan_global_lru(sc)) {
1112 * Add one to `nr_to_scan' just to make sure that the kernel will 1191 /*
1113 * slowly sift through the active list. 1192 * Add one to nr_to_scan just to make sure that the kernel
1114 */ 1193 * will slowly sift through the active list.
1115 zone->nr_scan_active += 1194 */
1116 (zone_page_state(zone, NR_ACTIVE) >> priority) + 1; 1195 zone->nr_scan_active +=
1117 nr_active = zone->nr_scan_active; 1196 (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
1118 if (nr_active >= sc->swap_cluster_max) 1197 nr_active = zone->nr_scan_active;
1119 zone->nr_scan_active = 0; 1198 zone->nr_scan_inactive +=
1120 else 1199 (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
1121 nr_active = 0; 1200 nr_inactive = zone->nr_scan_inactive;
1201 if (nr_inactive >= sc->swap_cluster_max)
1202 zone->nr_scan_inactive = 0;
1203 else
1204 nr_inactive = 0;
1205
1206 if (nr_active >= sc->swap_cluster_max)
1207 zone->nr_scan_active = 0;
1208 else
1209 nr_active = 0;
1210 } else {
1211 /*
 1212 * This reclaim runs not because of a zone memory shortage but
 1213 * because the memory controller has hit its limit.
 1214 * In that case, don't modify the zone's reclaim-related data.
1215 */
1216 nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup,
1217 zone, priority);
1218
1219 nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup,
1220 zone, priority);
1221 }
1122 1222
1123 zone->nr_scan_inactive +=
1124 (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
1125 nr_inactive = zone->nr_scan_inactive;
1126 if (nr_inactive >= sc->swap_cluster_max)
1127 zone->nr_scan_inactive = 0;
1128 else
1129 nr_inactive = 0;
1130 1223
1131 while (nr_active || nr_inactive) { 1224 while (nr_active || nr_inactive) {
1132 if (nr_active) { 1225 if (nr_active) {
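
To make the global branch concrete with a hypothetical figure: a zone holding 2,000,000 active pages scanned at priority 12 adds (2000000 >> 12) + 1 = 489 to zone->nr_scan_active on this pass; since that already exceeds sc->swap_cluster_max (SWAP_CLUSTER_MAX, normally 32), the accumulator is reset and those 489 pages become this pass's active-list target, while a small or lightly pressured zone keeps accumulating across passes until it reaches the batch size. The memory-controller branch instead asks mem_cgroup_calc_reclaim_active()/mem_cgroup_calc_reclaim_inactive() for its targets, so the zone's own counters stay untouched.
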
@@ -1171,25 +1264,39 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
1171 unsigned long nr_reclaimed = 0; 1264 unsigned long nr_reclaimed = 0;
1172 int i; 1265 int i;
1173 1266
1267
1174 sc->all_unreclaimable = 1; 1268 sc->all_unreclaimable = 1;
1175 for (i = 0; zones[i] != NULL; i++) { 1269 for (i = 0; zones[i] != NULL; i++) {
1176 struct zone *zone = zones[i]; 1270 struct zone *zone = zones[i];
1177 1271
1178 if (!populated_zone(zone)) 1272 if (!populated_zone(zone))
1179 continue; 1273 continue;
1274 /*
 1275 * Make sure that memory controller reclaiming has only a small
 1276 * influence on the global LRU.
1277 */
1278 if (scan_global_lru(sc)) {
1279 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1280 continue;
1281 note_zone_scanning_priority(zone, priority);
1180 1282
1181 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1283 if (zone_is_all_unreclaimable(zone) &&
1182 continue; 1284 priority != DEF_PRIORITY)
1183 1285 continue; /* Let kswapd poll it */
1184 note_zone_scanning_priority(zone, priority); 1286 sc->all_unreclaimable = 0;
1185 1287 } else {
1186 if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY) 1288 /*
1187 continue; /* Let kswapd poll it */ 1289 * Ignore cpuset limitation here. We just want to reduce
1188 1290 * # of used pages by us regardless of memory shortage.
1189 sc->all_unreclaimable = 0; 1291 */
1292 sc->all_unreclaimable = 0;
1293 mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
1294 priority);
1295 }
1190 1296
1191 nr_reclaimed += shrink_zone(priority, zone, sc); 1297 nr_reclaimed += shrink_zone(priority, zone, sc);
1192 } 1298 }
1299
1193 return nr_reclaimed; 1300 return nr_reclaimed;
1194} 1301}
1195 1302
@@ -1206,7 +1313,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
1206 * holds filesystem locks which prevent writeout this might not work, and the 1313 * holds filesystem locks which prevent writeout this might not work, and the
1207 * allocation attempt will fail. 1314 * allocation attempt will fail.
1208 */ 1315 */
1209unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) 1316static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
1317 struct scan_control *sc)
1210{ 1318{
1211 int priority; 1319 int priority;
1212 int ret = 0; 1320 int ret = 0;
@@ -1215,39 +1323,43 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
1215 struct reclaim_state *reclaim_state = current->reclaim_state; 1323 struct reclaim_state *reclaim_state = current->reclaim_state;
1216 unsigned long lru_pages = 0; 1324 unsigned long lru_pages = 0;
1217 int i; 1325 int i;
1218 struct scan_control sc = {
1219 .gfp_mask = gfp_mask,
1220 .may_writepage = !laptop_mode,
1221 .swap_cluster_max = SWAP_CLUSTER_MAX,
1222 .may_swap = 1,
1223 .swappiness = vm_swappiness,
1224 .order = order,
1225 };
1226
1227 count_vm_event(ALLOCSTALL);
1228 1326
1229 for (i = 0; zones[i] != NULL; i++) { 1327 if (scan_global_lru(sc))
1230 struct zone *zone = zones[i]; 1328 count_vm_event(ALLOCSTALL);
1329 /*
1330 * mem_cgroup will not do shrink_slab.
1331 */
1332 if (scan_global_lru(sc)) {
1333 for (i = 0; zones[i] != NULL; i++) {
1334 struct zone *zone = zones[i];
1231 1335
1232 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1336 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1233 continue; 1337 continue;
1234 1338
1235 lru_pages += zone_page_state(zone, NR_ACTIVE) 1339 lru_pages += zone_page_state(zone, NR_ACTIVE)
1236 + zone_page_state(zone, NR_INACTIVE); 1340 + zone_page_state(zone, NR_INACTIVE);
1341 }
1237 } 1342 }
1238 1343
1239 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1344 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1240 sc.nr_scanned = 0; 1345 sc->nr_scanned = 0;
1346 sc->nr_io_pages = 0;
1241 if (!priority) 1347 if (!priority)
1242 disable_swap_token(); 1348 disable_swap_token();
1243 nr_reclaimed += shrink_zones(priority, zones, &sc); 1349 nr_reclaimed += shrink_zones(priority, zones, sc);
1244 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); 1350 /*
1245 if (reclaim_state) { 1351 * Don't shrink slabs when reclaiming memory from
1246 nr_reclaimed += reclaim_state->reclaimed_slab; 1352 * over limit cgroups
1247 reclaim_state->reclaimed_slab = 0; 1353 */
1354 if (scan_global_lru(sc)) {
1355 shrink_slab(sc->nr_scanned, gfp_mask, lru_pages);
1356 if (reclaim_state) {
1357 nr_reclaimed += reclaim_state->reclaimed_slab;
1358 reclaim_state->reclaimed_slab = 0;
1359 }
1248 } 1360 }
1249 total_scanned += sc.nr_scanned; 1361 total_scanned += sc->nr_scanned;
1250 if (nr_reclaimed >= sc.swap_cluster_max) { 1362 if (nr_reclaimed >= sc->swap_cluster_max) {
1251 ret = 1; 1363 ret = 1;
1252 goto out; 1364 goto out;
1253 } 1365 }
@@ -1259,18 +1371,19 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
1259 * that's undesirable in laptop mode, where we *want* lumpy 1371 * that's undesirable in laptop mode, where we *want* lumpy
1260 * writeout. So in laptop mode, write out the whole world. 1372 * writeout. So in laptop mode, write out the whole world.
1261 */ 1373 */
1262 if (total_scanned > sc.swap_cluster_max + 1374 if (total_scanned > sc->swap_cluster_max +
1263 sc.swap_cluster_max / 2) { 1375 sc->swap_cluster_max / 2) {
1264 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1376 wakeup_pdflush(laptop_mode ? 0 : total_scanned);
1265 sc.may_writepage = 1; 1377 sc->may_writepage = 1;
1266 } 1378 }
1267 1379
1268 /* Take a nap, wait for some writeback to complete */ 1380 /* Take a nap, wait for some writeback to complete */
1269 if (sc.nr_scanned && priority < DEF_PRIORITY - 2) 1381 if (sc->nr_scanned && priority < DEF_PRIORITY - 2 &&
1382 sc->nr_io_pages > sc->swap_cluster_max)
1270 congestion_wait(WRITE, HZ/10); 1383 congestion_wait(WRITE, HZ/10);
1271 } 1384 }
1272 /* top priority shrink_caches still had more to do? don't OOM, then */ 1385 /* top priority shrink_caches still had more to do? don't OOM, then */
1273 if (!sc.all_unreclaimable) 1386 if (!sc->all_unreclaimable && scan_global_lru(sc))
1274 ret = 1; 1387 ret = 1;
1275out: 1388out:
1276 /* 1389 /*
@@ -1282,17 +1395,63 @@ out:
1282 */ 1395 */
1283 if (priority < 0) 1396 if (priority < 0)
1284 priority = 0; 1397 priority = 0;
1285 for (i = 0; zones[i] != NULL; i++) {
1286 struct zone *zone = zones[i];
1287 1398
1288 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1399 if (scan_global_lru(sc)) {
1289 continue; 1400 for (i = 0; zones[i] != NULL; i++) {
1401 struct zone *zone = zones[i];
1402
1403 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1404 continue;
1405
1406 zone->prev_priority = priority;
1407 }
1408 } else
1409 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1290 1410
1291 zone->prev_priority = priority;
1292 }
1293 return ret; 1411 return ret;
1294} 1412}
1295 1413
1414unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
1415{
1416 struct scan_control sc = {
1417 .gfp_mask = gfp_mask,
1418 .may_writepage = !laptop_mode,
1419 .swap_cluster_max = SWAP_CLUSTER_MAX,
1420 .may_swap = 1,
1421 .swappiness = vm_swappiness,
1422 .order = order,
1423 .mem_cgroup = NULL,
1424 .isolate_pages = isolate_pages_global,
1425 };
1426
1427 return do_try_to_free_pages(zones, gfp_mask, &sc);
1428}
1429
1430#ifdef CONFIG_CGROUP_MEM_CONT
1431
1432unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1433 gfp_t gfp_mask)
1434{
1435 struct scan_control sc = {
1436 .gfp_mask = gfp_mask,
1437 .may_writepage = !laptop_mode,
1438 .may_swap = 1,
1439 .swap_cluster_max = SWAP_CLUSTER_MAX,
1440 .swappiness = vm_swappiness,
1441 .order = 0,
1442 .mem_cgroup = mem_cont,
1443 .isolate_pages = mem_cgroup_isolate_pages,
1444 };
1445 struct zone **zones;
1446 int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE);
1447
1448 zones = NODE_DATA(numa_node_id())->node_zonelists[target_zone].zones;
1449 if (do_try_to_free_pages(zones, sc.gfp_mask, &sc))
1450 return 1;
1451 return 0;
1452}
1453#endif
1454
1296/* 1455/*
1297 * For kswapd, balance_pgdat() will work across all this node's zones until 1456 * For kswapd, balance_pgdat() will work across all this node's zones until
1298 * they are all at pages_high. 1457 * they are all at pages_high.
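
try_to_free_mem_cgroup_pages() is intended to be called by the memory controller when charging a page would push a cgroup over its limit. A hedged sketch of that calling pattern, in the style of the memcontrol.c code in this series; the function name, the retry bound, and the surrounding error handling are simplified for illustration, not the exact controller code:

/* Simplified sketch: charge one page to 'mem', reclaiming from the
 * cgroup's own LRU while the resource counter is at its limit.
 */
static int example_charge(struct mem_cgroup *mem, gfp_t gfp_mask)
{
	int retries = 5;	/* arbitrary bound for the sketch */

	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
		if (!(gfp_mask & __GFP_WAIT))
			return -ENOMEM;
		/* Reclaim only from this cgroup; the global LRU and the
		 * slab caches are left alone (see scan_global_lru()). */
		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
			continue;
		if (!--retries)
			return -ENOMEM;
	}
	return 0;
}
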
@@ -1328,6 +1487,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1328 .swap_cluster_max = SWAP_CLUSTER_MAX, 1487 .swap_cluster_max = SWAP_CLUSTER_MAX,
1329 .swappiness = vm_swappiness, 1488 .swappiness = vm_swappiness,
1330 .order = order, 1489 .order = order,
1490 .mem_cgroup = NULL,
1491 .isolate_pages = isolate_pages_global,
1331 }; 1492 };
1332 /* 1493 /*
1333 * temp_priority is used to remember the scanning priority at which 1494 * temp_priority is used to remember the scanning priority at which
@@ -1352,6 +1513,7 @@ loop_again:
1352 if (!priority) 1513 if (!priority)
1353 disable_swap_token(); 1514 disable_swap_token();
1354 1515
1516 sc.nr_io_pages = 0;
1355 all_zones_ok = 1; 1517 all_zones_ok = 1;
1356 1518
1357 /* 1519 /*
@@ -1444,7 +1606,8 @@ loop_again:
1444 * OK, kswapd is getting into trouble. Take a nap, then take 1606 * OK, kswapd is getting into trouble. Take a nap, then take
1445 * another pass across the zones. 1607 * another pass across the zones.
1446 */ 1608 */
1447 if (total_scanned && priority < DEF_PRIORITY - 2) 1609 if (total_scanned && priority < DEF_PRIORITY - 2 &&
1610 sc.nr_io_pages > sc.swap_cluster_max)
1448 congestion_wait(WRITE, HZ/10); 1611 congestion_wait(WRITE, HZ/10);
1449 1612
1450 /* 1613 /*
@@ -1649,6 +1812,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1649 .swap_cluster_max = nr_pages, 1812 .swap_cluster_max = nr_pages,
1650 .may_writepage = 1, 1813 .may_writepage = 1,
1651 .swappiness = vm_swappiness, 1814 .swappiness = vm_swappiness,
1815 .isolate_pages = isolate_pages_global,
1652 }; 1816 };
1653 1817
1654 current->reclaim_state = &reclaim_state; 1818 current->reclaim_state = &reclaim_state;
@@ -1834,6 +1998,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1834 SWAP_CLUSTER_MAX), 1998 SWAP_CLUSTER_MAX),
1835 .gfp_mask = gfp_mask, 1999 .gfp_mask = gfp_mask,
1836 .swappiness = vm_swappiness, 2000 .swappiness = vm_swappiness,
2001 .isolate_pages = isolate_pages_global,
1837 }; 2002 };
1838 unsigned long slab_reclaimable; 2003 unsigned long slab_reclaimable;
1839 2004
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e8d846f57774..422d960ffcd8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -21,21 +21,14 @@ EXPORT_PER_CPU_SYMBOL(vm_event_states);
21 21
22static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) 22static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
23{ 23{
24 int cpu = 0; 24 int cpu;
25 int i; 25 int i;
26 26
27 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); 27 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
28 28
29 cpu = first_cpu(*cpumask); 29 for_each_cpu_mask(cpu, *cpumask) {
30 while (cpu < NR_CPUS) {
31 struct vm_event_state *this = &per_cpu(vm_event_states, cpu); 30 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
32 31
33 cpu = next_cpu(cpu, *cpumask);
34
35 if (cpu < NR_CPUS)
36 prefetch(&per_cpu(vm_event_states, cpu));
37
38
39 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 32 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
40 ret[i] += this->event[i]; 33 ret[i] += this->event[i];
41 } 34 }
@@ -284,6 +277,10 @@ EXPORT_SYMBOL(dec_zone_page_state);
284/* 277/*
285 * Update the zone counters for one cpu. 278 * Update the zone counters for one cpu.
286 * 279 *
280 * The cpu specified must be either the current cpu or a processor that
281 * is not online. If it is the current cpu then the execution thread must
282 * be pinned to the current cpu.
283 *
287 * Note that refresh_cpu_vm_stats strives to only access 284 * Note that refresh_cpu_vm_stats strives to only access
288 * node local memory. The per cpu pagesets on remote zones are placed 285 * node local memory. The per cpu pagesets on remote zones are placed
289 * in the memory local to the processor using that pageset. So the 286 * in the memory local to the processor using that pageset. So the
@@ -299,7 +296,7 @@ void refresh_cpu_vm_stats(int cpu)
299{ 296{
300 struct zone *zone; 297 struct zone *zone;
301 int i; 298 int i;
302 unsigned long flags; 299 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
303 300
304 for_each_zone(zone) { 301 for_each_zone(zone) {
305 struct per_cpu_pageset *p; 302 struct per_cpu_pageset *p;
@@ -311,15 +308,19 @@ void refresh_cpu_vm_stats(int cpu)
311 308
312 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 309 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
313 if (p->vm_stat_diff[i]) { 310 if (p->vm_stat_diff[i]) {
311 unsigned long flags;
312 int v;
313
314 local_irq_save(flags); 314 local_irq_save(flags);
315 zone_page_state_add(p->vm_stat_diff[i], 315 v = p->vm_stat_diff[i];
316 zone, i);
317 p->vm_stat_diff[i] = 0; 316 p->vm_stat_diff[i] = 0;
317 local_irq_restore(flags);
318 atomic_long_add(v, &zone->vm_stat[i]);
319 global_diff[i] += v;
318#ifdef CONFIG_NUMA 320#ifdef CONFIG_NUMA
319 /* 3 seconds idle till flush */ 321 /* 3 seconds idle till flush */
320 p->expire = 3; 322 p->expire = 3;
321#endif 323#endif
322 local_irq_restore(flags);
323 } 324 }
324#ifdef CONFIG_NUMA 325#ifdef CONFIG_NUMA
325 /* 326 /*
@@ -329,7 +330,7 @@ void refresh_cpu_vm_stats(int cpu)
329 * Check if there are pages remaining in this pageset 330 * Check if there are pages remaining in this pageset
330 * if not then there is nothing to expire. 331 * if not then there is nothing to expire.
331 */ 332 */
332 if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count)) 333 if (!p->expire || !p->pcp.count)
333 continue; 334 continue;
334 335
335 /* 336 /*
@@ -344,13 +345,14 @@ void refresh_cpu_vm_stats(int cpu)
344 if (p->expire) 345 if (p->expire)
345 continue; 346 continue;
346 347
347 if (p->pcp[0].count) 348 if (p->pcp.count)
348 drain_zone_pages(zone, p->pcp + 0); 349 drain_zone_pages(zone, &p->pcp);
349
350 if (p->pcp[1].count)
351 drain_zone_pages(zone, p->pcp + 1);
352#endif 350#endif
353 } 351 }
352
353 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
354 if (global_diff[i])
355 atomic_long_add(global_diff[i], &vm_stat[i]);
354} 356}
355 357
356#endif 358#endif
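
The reworked refresh_cpu_vm_stats() disables interrupts only long enough to read and clear the per-cpu delta, applies it to the zone counter immediately, and folds it into a local global_diff[] so that the global vm_stat[] array is touched just once per item at the end. A standalone C sketch of that folding pattern (all names here are illustrative, not kernel symbols):

#include <stdio.h>

#define NR_ITEMS   3
#define NR_SOURCES 4

static long source_counter[NR_SOURCES][NR_ITEMS];
static long source_diff[NR_SOURCES][NR_ITEMS];
static long global_counter[NR_ITEMS];

static void fold_diffs(void)
{
	long global_diff[NR_ITEMS] = { 0, };
	int s, i;

	for (s = 0; s < NR_SOURCES; s++) {
		for (i = 0; i < NR_ITEMS; i++) {
			long v = source_diff[s][i];

			if (!v)
				continue;
			/* In the kernel, this read-and-clear is the only
			 * part done with interrupts disabled. */
			source_diff[s][i] = 0;
			source_counter[s][i] += v;	/* per-source counter */
			global_diff[i] += v;		/* defer global part  */
		}
	}

	for (i = 0; i < NR_ITEMS; i++)
		if (global_diff[i])
			global_counter[i] += global_diff[i];
}

int main(void)
{
	source_diff[0][1] = 5;
	source_diff[3][1] = -2;
	fold_diffs();
	printf("item 1 global total: %ld\n", global_counter[1]);
	return 0;
}
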
@@ -681,20 +683,17 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
681 "\n pagesets"); 683 "\n pagesets");
682 for_each_online_cpu(i) { 684 for_each_online_cpu(i) {
683 struct per_cpu_pageset *pageset; 685 struct per_cpu_pageset *pageset;
684 int j;
685 686
686 pageset = zone_pcp(zone, i); 687 pageset = zone_pcp(zone, i);
687 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { 688 seq_printf(m,
688 seq_printf(m, 689 "\n cpu: %i"
689 "\n cpu: %i pcp: %i" 690 "\n count: %i"
690 "\n count: %i" 691 "\n high: %i"
691 "\n high: %i" 692 "\n batch: %i",
692 "\n batch: %i", 693 i,
693 i, j, 694 pageset->pcp.count,
694 pageset->pcp[j].count, 695 pageset->pcp.high,
695 pageset->pcp[j].high, 696 pageset->pcp.batch);
696 pageset->pcp[j].batch);
697 }
698#ifdef CONFIG_SMP 697#ifdef CONFIG_SMP
699 seq_printf(m, "\n vm stats threshold: %d", 698 seq_printf(m, "\n vm stats threshold: %d",
700 pageset->stat_threshold); 699 pageset->stat_threshold);