path: root/mm
author		James Morris <jmorris@namei.org>	2009-02-05 19:01:45 -0500
committer	James Morris <jmorris@namei.org>	2009-02-05 19:01:45 -0500
commit		cb5629b10d64a8006622ce3a52bc887d91057d69 (patch)
tree		7c06d8f30783115e3384721046258ce615b129c5 /mm
parent		8920d5ad6ba74ae8ab020e90cc4d976980e68701 (diff)
parent		f01d1d546abb2f4028b5299092f529eefb01253a (diff)
Merge branch 'master' into next
Conflicts:
	fs/namei.c

Manually merged per:

diff --cc fs/namei.c
index 734f2b5,bbc15c2..0000000
--- a/fs/namei.c
+++ b/fs/namei.c
@@@ -860,9 -848,8 +849,10 @@@ static int __link_path_walk(const char
 		nd->flags |= LOOKUP_CONTINUE;
 		err = exec_permission_lite(inode);
 		if (err == -EAGAIN)
-			err = vfs_permission(nd, MAY_EXEC);
+			err = inode_permission(nd->path.dentry->d_inode,
+					       MAY_EXEC);
+		if (!err)
+			err = ima_path_check(&nd->path, MAY_EXEC);
 		if (err)
 			break;
@@@ -1525,14 -1506,9 +1509,14 @@@ int may_open(struct path *path, int acc
 			flag &= ~O_TRUNC;
 	}

-	error = vfs_permission(nd, acc_mode);
+	error = inode_permission(inode, acc_mode);
 	if (error)
 		return error;
+
-	error = ima_path_check(&nd->path,
++	error = ima_path_check(path,
+			acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
+	if (error)
+		return error;
 	/*
 	 * An append-only file must be opened in append mode for writing.
 	 */

Signed-off-by: James Morris <jmorris@namei.org>
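For readers tracing the conflict resolution above: the merged result runs the ordinary inode permission check first and only then the IMA path check, with acc_mode masked down to MAY_READ | MAY_WRITE | MAY_EXEC, returning on the first error. Below is a minimal standalone C sketch of that ordering; the stub functions and flag values are illustrative stand-ins (not the kernel's real inode_permission()/ima_path_check() implementations), and only the call order and the mask are taken from the hunk above.

#include <stdio.h>

/* Illustrative permission bits; the kernel's MAY_* values differ in detail. */
#define MAY_EXEC  0x01
#define MAY_WRITE 0x02
#define MAY_READ  0x04

/* Hypothetical stub standing in for inode_permission(inode, acc_mode). */
static int inode_permission_stub(int acc_mode)
{
	(void)acc_mode;
	return 0;	/* pretend the filesystem/DAC permission check passed */
}

/* Hypothetical stub standing in for ima_path_check(path, mask). */
static int ima_path_check_stub(int mask)
{
	(void)mask;
	return 0;	/* pretend the IMA measurement check passed */
}

/*
 * Mirrors the ordering the merge resolution produces in may_open():
 * the ordinary permission check runs first, then the IMA check with
 * acc_mode masked to MAY_READ | MAY_WRITE | MAY_EXEC; the first error wins.
 */
static int may_open_sketch(int acc_mode)
{
	int error;

	error = inode_permission_stub(acc_mode);
	if (error)
		return error;

	error = ima_path_check_stub(acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
	if (error)
		return error;

	return 0;
}

int main(void)
{
	printf("may_open_sketch(MAY_READ|MAY_WRITE) = %d\n",
	       may_open_sketch(MAY_READ | MAY_WRITE));
	return 0;
}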
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |    6
-rw-r--r--  mm/Makefile         |    4
-rw-r--r--  mm/backing-dev.c    |    8
-rw-r--r--  mm/bootmem.c        |    8
-rw-r--r--  mm/fadvise.c        |   18
-rw-r--r--  mm/filemap.c        |   56
-rw-r--r--  mm/filemap_xip.c    |    2
-rw-r--r--  mm/fremap.c         |    6
-rw-r--r--  mm/hugetlb.c        |   46
-rw-r--r--  mm/internal.h       |    2
-rw-r--r--  mm/madvise.c        |    2
-rw-r--r--  mm/memcontrol.c     | 1898
-rw-r--r--  mm/memory.c         |  234
-rw-r--r--  mm/memory_hotplug.c |   20
-rw-r--r--  mm/mempolicy.c      |   24
-rw-r--r--  mm/migrate.c        |  139
-rw-r--r--  mm/mincore.c        |    4
-rw-r--r--  mm/mlock.c          |   64
-rw-r--r--  mm/mmap.c           |  117
-rw-r--r--  mm/mprotect.c       |   12
-rw-r--r--  mm/mremap.c         |    8
-rw-r--r--  mm/msync.c          |    4
-rw-r--r--  mm/nommu.c          | 1054
-rw-r--r--  mm/oom_kill.c       |  119
-rw-r--r--  mm/page-writeback.c |  254
-rw-r--r--  mm/page_alloc.c     |  143
-rw-r--r--  mm/page_cgroup.c    |  209
-rw-r--r--  mm/page_io.c        |    6
-rw-r--r--  mm/pdflush.c        |   16
-rw-r--r--  mm/rmap.c           |   60
-rw-r--r--  mm/shmem.c          |  104
-rw-r--r--  mm/slab.c           |    2
-rw-r--r--  mm/slub.c           |   24
-rw-r--r--  mm/swap.c           |   77
-rw-r--r--  mm/swap_state.c     |   35
-rw-r--r--  mm/swapfile.c       |  607
-rw-r--r--  mm/tiny-shmem.c     |  134
-rw-r--r--  mm/vmalloc.c        |   57
-rw-r--r--  mm/vmscan.c         |  328
-rw-r--r--  mm/vmstat.c         |    4
40 files changed, 4061 insertions, 1854 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 5b5790f8a816..a5b77811fdf2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -181,12 +181,6 @@ config MIGRATION
 	  example on NUMA systems to put pages nearer to the processors accessing
 	  the page.

-config RESOURCES_64BIT
-	bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
-	default 64BIT
-	help
-	  This option allows memory and IO resources to be 64 bit.
-
 config PHYS_ADDR_T_64BIT
 	def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT

diff --git a/mm/Makefile b/mm/Makefile
index 51c27709cc7c..72255be57f89 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \

 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o pdflush.o \
-			   readahead.o swap.o truncate.o vmscan.o \
+			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 			   page_isolation.o mm_init.o $(mmu-y)

@@ -21,9 +21,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
 obj-$(CONFIG_NUMA) += mempolicy.o
 obj-$(CONFIG_SPARSEMEM) += sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
-obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
-obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_SLAB) += slab.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 801c08b046e6..8e8587444132 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -24,9 +24,9 @@ static void bdi_debug_init(void)
 static int bdi_debug_stats_show(struct seq_file *m, void *v)
 {
 	struct backing_dev_info *bdi = m->private;
-	long background_thresh;
-	long dirty_thresh;
-	long bdi_thresh;
+	unsigned long background_thresh;
+	unsigned long dirty_thresh;
+	unsigned long bdi_thresh;

 	get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);

@@ -223,7 +223,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->max_prop_frac = PROP_FRAC_BASE;

 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
-		err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
+		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
 		if (err)
 			goto err;
 	}
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ac5a891f142a..51a0ccf61e0e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -435,6 +435,10 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 	unsigned long fallback = 0;
 	unsigned long min, max, start, sidx, midx, step;

+	bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
+		bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
+		align, goal, limit);
+
 	BUG_ON(!size);
 	BUG_ON(align & (align - 1));
 	BUG_ON(limit && goal + size > limit);
@@ -442,10 +446,6 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 	if (!bdata->node_bootmem_map)
 		return NULL;

-	bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
-		bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
-		align, goal, limit);
-
 	min = bdata->node_min_pfn;
 	max = bdata->node_low_pfn;

diff --git a/mm/fadvise.c b/mm/fadvise.c
index a1da969bd980..54a0f8040afa 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -24,7 +24,7 @@
  * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
  * deactivate the pages and clear PG_Referenced.
  */
-asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
+SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 {
 	struct file *file = fget(fd);
 	struct address_space *mapping;
@@ -126,12 +126,26 @@ out:
 	fput(file);
 	return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice)
+{
+	return SYSC_fadvise64_64((int) fd, offset, len, (int) advice);
+}
+SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64);
+#endif

 #ifdef __ARCH_WANT_SYS_FADVISE64

-asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice)
+SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice)
 {
 	return sys_fadvise64_64(fd, offset, len, advice);
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice)
+{
+	return SYSC_fadvise64((int) fd, offset, (size_t)len, (int)advice);
+}
+SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64);
+#endif

 #endif
diff --git a/mm/filemap.c b/mm/filemap.c
index f3e5f8944d17..23acefe51808 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -210,7 +210,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
210 int ret; 210 int ret;
211 struct writeback_control wbc = { 211 struct writeback_control wbc = {
212 .sync_mode = sync_mode, 212 .sync_mode = sync_mode,
213 .nr_to_write = mapping->nrpages * 2, 213 .nr_to_write = LONG_MAX,
214 .range_start = start, 214 .range_start = start,
215 .range_end = end, 215 .range_end = end,
216 }; 216 };
@@ -460,7 +460,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
460 VM_BUG_ON(!PageLocked(page)); 460 VM_BUG_ON(!PageLocked(page));
461 461
462 error = mem_cgroup_cache_charge(page, current->mm, 462 error = mem_cgroup_cache_charge(page, current->mm,
463 gfp_mask & ~__GFP_HIGHMEM); 463 gfp_mask & GFP_RECLAIM_MASK);
464 if (error) 464 if (error)
465 goto out; 465 goto out;
466 466
@@ -741,7 +741,14 @@ repeat:
741 page = __page_cache_alloc(gfp_mask); 741 page = __page_cache_alloc(gfp_mask);
742 if (!page) 742 if (!page)
743 return NULL; 743 return NULL;
744 err = add_to_page_cache_lru(page, mapping, index, gfp_mask); 744 /*
745 * We want a regular kernel memory (not highmem or DMA etc)
746 * allocation for the radix tree nodes, but we need to honour
747 * the context-specific requirements the caller has asked for.
748 * GFP_RECLAIM_MASK collects those requirements.
749 */
750 err = add_to_page_cache_lru(page, mapping, index,
751 (gfp_mask & GFP_RECLAIM_MASK));
745 if (unlikely(err)) { 752 if (unlikely(err)) {
746 page_cache_release(page); 753 page_cache_release(page);
747 page = NULL; 754 page = NULL;
@@ -950,7 +957,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
950 return NULL; 957 return NULL;
951 } 958 }
952 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); 959 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
953 if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { 960 if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
954 page_cache_release(page); 961 page_cache_release(page);
955 page = NULL; 962 page = NULL;
956 } 963 }
@@ -1317,7 +1324,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1317 goto out; /* skip atime */ 1324 goto out; /* skip atime */
1318 size = i_size_read(inode); 1325 size = i_size_read(inode);
1319 if (pos < size) { 1326 if (pos < size) {
1320 retval = filemap_write_and_wait(mapping); 1327 retval = filemap_write_and_wait_range(mapping, pos,
1328 pos + iov_length(iov, nr_segs) - 1);
1321 if (!retval) { 1329 if (!retval) {
1322 retval = mapping->a_ops->direct_IO(READ, iocb, 1330 retval = mapping->a_ops->direct_IO(READ, iocb,
1323 iov, pos, nr_segs); 1331 iov, pos, nr_segs);
@@ -1366,7 +1374,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
1366 return 0; 1374 return 0;
1367} 1375}
1368 1376
1369asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) 1377SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
1370{ 1378{
1371 ssize_t ret; 1379 ssize_t ret;
1372 struct file *file; 1380 struct file *file;
@@ -1385,6 +1393,13 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1385 } 1393 }
1386 return ret; 1394 return ret;
1387} 1395}
1396#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
1397asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
1398{
1399 return SYSC_readahead((int) fd, offset, (size_t) count);
1400}
1401SYSCALL_ALIAS(sys_readahead, SyS_readahead);
1402#endif
1388 1403
1389#ifdef CONFIG_MMU 1404#ifdef CONFIG_MMU
1390/** 1405/**
@@ -1530,7 +1545,6 @@ retry_find:
1530 /* 1545 /*
1531 * Found the page and have a reference on it. 1546 * Found the page and have a reference on it.
1532 */ 1547 */
1533 mark_page_accessed(page);
1534 ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; 1548 ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
1535 vmf->page = page; 1549 vmf->page = page;
1536 return ret | VM_FAULT_LOCKED; 1550 return ret | VM_FAULT_LOCKED;
@@ -1766,7 +1780,7 @@ int should_remove_suid(struct dentry *dentry)
1766 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) 1780 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1767 kill |= ATTR_KILL_SGID; 1781 kill |= ATTR_KILL_SGID;
1768 1782
1769 if (unlikely(kill && !capable(CAP_FSETID))) 1783 if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
1770 return kill; 1784 return kill;
1771 1785
1772 return 0; 1786 return 0;
@@ -2060,18 +2074,10 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2060 if (count != ocount) 2074 if (count != ocount)
2061 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); 2075 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2062 2076
2063 /*
2064 * Unmap all mmappings of the file up-front.
2065 *
2066 * This will cause any pte dirty bits to be propagated into the
2067 * pageframes for the subsequent filemap_write_and_wait().
2068 */
2069 write_len = iov_length(iov, *nr_segs); 2077 write_len = iov_length(iov, *nr_segs);
2070 end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; 2078 end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
2071 if (mapping_mapped(mapping))
2072 unmap_mapping_range(mapping, pos, write_len, 0);
2073 2079
2074 written = filemap_write_and_wait(mapping); 2080 written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
2075 if (written) 2081 if (written)
2076 goto out; 2082 goto out;
2077 2083
@@ -2140,19 +2146,24 @@ EXPORT_SYMBOL(generic_file_direct_write);
2140 * Find or create a page at the given pagecache position. Return the locked 2146 * Find or create a page at the given pagecache position. Return the locked
2141 * page. This function is specifically for buffered writes. 2147 * page. This function is specifically for buffered writes.
2142 */ 2148 */
2143struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) 2149struct page *grab_cache_page_write_begin(struct address_space *mapping,
2150 pgoff_t index, unsigned flags)
2144{ 2151{
2145 int status; 2152 int status;
2146 struct page *page; 2153 struct page *page;
2154 gfp_t gfp_notmask = 0;
2155 if (flags & AOP_FLAG_NOFS)
2156 gfp_notmask = __GFP_FS;
2147repeat: 2157repeat:
2148 page = find_lock_page(mapping, index); 2158 page = find_lock_page(mapping, index);
2149 if (likely(page)) 2159 if (likely(page))
2150 return page; 2160 return page;
2151 2161
2152 page = page_cache_alloc(mapping); 2162 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
2153 if (!page) 2163 if (!page)
2154 return NULL; 2164 return NULL;
2155 status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); 2165 status = add_to_page_cache_lru(page, mapping, index,
2166 GFP_KERNEL & ~gfp_notmask);
2156 if (unlikely(status)) { 2167 if (unlikely(status)) {
2157 page_cache_release(page); 2168 page_cache_release(page);
2158 if (status == -EEXIST) 2169 if (status == -EEXIST)
@@ -2161,7 +2172,7 @@ repeat:
2161 } 2172 }
2162 return page; 2173 return page;
2163} 2174}
2164EXPORT_SYMBOL(__grab_cache_page); 2175EXPORT_SYMBOL(grab_cache_page_write_begin);
2165 2176
2166static ssize_t generic_perform_write(struct file *file, 2177static ssize_t generic_perform_write(struct file *file,
2167 struct iov_iter *i, loff_t pos) 2178 struct iov_iter *i, loff_t pos)
@@ -2286,7 +2297,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2286 * the file data here, to try to honour O_DIRECT expectations. 2297 * the file data here, to try to honour O_DIRECT expectations.
2287 */ 2298 */
2288 if (unlikely(file->f_flags & O_DIRECT) && written) 2299 if (unlikely(file->f_flags & O_DIRECT) && written)
2289 status = filemap_write_and_wait(mapping); 2300 status = filemap_write_and_wait_range(mapping,
2301 pos, pos + written - 1);
2290 2302
2291 return written ? written : status; 2303 return written ? written : status;
2292} 2304}
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b5167dfb2f2d..0c04615651b7 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -193,7 +193,7 @@ retry:
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
 			pteval = ptep_clear_flush_notify(vma, address, pte);
-			page_remove_rmap(page, vma);
+			page_remove_rmap(page);
 			dec_mm_counter(mm, file_rss);
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap_unlock(pte, ptl);
diff --git a/mm/fremap.c b/mm/fremap.c
index 7d12ca70ef7b..736ba7f3306a 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (page) {
 			if (pte_dirty(pte))
 				set_page_dirty(page);
-			page_remove_rmap(page, vma);
+			page_remove_rmap(page);
 			page_cache_release(page);
 			update_hiwater_rss(mm);
 			dec_mm_counter(mm, file_rss);
@@ -120,8 +120,8 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
  * and the vma's default protection is used. Arbitrary protections
  * might be implemented in the future.
  */
-asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
-	unsigned long prot, unsigned long pgoff, unsigned long flags)
+SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
+	unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
 {
 	struct mm_struct *mm = current->mm;
 	struct address_space *mapping;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6058b53dcb89..618e98304080 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -220,6 +220,35 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
 }

 /*
+ * Return the size of the pages allocated when backing a VMA. In the majority
+ * cases this will be same size as used by the page table entries.
+ */
+unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
+{
+	struct hstate *hstate;
+
+	if (!is_vm_hugetlb_page(vma))
+		return PAGE_SIZE;
+
+	hstate = hstate_vma(vma);
+
+	return 1UL << (hstate->order + PAGE_SHIFT);
+}
+
+/*
+ * Return the page size being used by the MMU to back a VMA. In the majority
+ * of cases, the page size used by the kernel matches the MMU size. On
+ * architectures where it differs, an architecture-specific version of this
+ * function is required.
+ */
+#ifndef vma_mmu_pagesize
+unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
+{
+	return vma_kernel_pagesize(vma);
+}
+#endif
+
+/*
  * Flags for MAP_PRIVATE reservations. These are stored in the bottom
  * bits of the reservation map pointer, which are always clear due to
  * alignment.
@@ -371,8 +400,10 @@ static void clear_huge_page(struct page *page,
 {
 	int i;

-	if (unlikely(sz > MAX_ORDER_NR_PAGES))
-		return clear_gigantic_page(page, addr, sz);
+	if (unlikely(sz > MAX_ORDER_NR_PAGES)) {
+		clear_gigantic_page(page, addr, sz);
+		return;
+	}

 	might_sleep();
 	for (i = 0; i < sz/PAGE_SIZE; i++) {
@@ -404,8 +435,10 @@ static void copy_huge_page(struct page *dst, struct page *src,
 	int i;
 	struct hstate *h = hstate_vma(vma);

-	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES))
-		return copy_gigantic_page(dst, src, addr, vma);
+	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+		copy_gigantic_page(dst, src, addr, vma);
+		return;
+	}

 	might_sleep();
 	for (i = 0; i < pages_per_huge_page(h); i++) {
@@ -972,7 +1005,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	return page;
 }

-__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
+int __weak alloc_bootmem_huge_page(struct hstate *h)
 {
 	struct huge_bootmem_page *m;
 	int nr_nodes = nodes_weight(node_online_map);
@@ -991,8 +1024,7 @@ __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
 			 * puts them into the mem_map).
 			 */
 			m = addr;
-			if (m)
-				goto found;
+			goto found;
 		}
 		hstate_next_node(h);
 		nr_nodes--;
diff --git a/mm/internal.h b/mm/internal.h
index 13333bc2eb68..478223b73a2a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -49,6 +49,7 @@ extern void putback_lru_page(struct page *page);
 /*
  * in mm/page_alloc.c
  */
+extern unsigned long highest_memmap_pfn;
 extern void __free_pages_bootmem(struct page *page, unsigned int order);

 /*
@@ -275,6 +276,7 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 #define GUP_FLAGS_WRITE 0x1
 #define GUP_FLAGS_FORCE 0x2
 #define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
+#define GUP_FLAGS_IGNORE_SIGKILL 0x8

 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		     unsigned long start, int len, int flags,
diff --git a/mm/madvise.c b/mm/madvise.c
index f9349c18a1b5..b9ce574827c8 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -281,7 +281,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
  * -EBADF - map exists, but area maps something that isn't a file.
  * -EAGAIN - a kernel resource was temporarily unavailable.
  */
-asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
+SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 {
 	unsigned long end, tmp;
 	struct vm_area_struct * vma, *prev;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 866dcc7eeb0c..8e4be9cb2a6a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,11 +21,13 @@
21#include <linux/memcontrol.h> 21#include <linux/memcontrol.h>
22#include <linux/cgroup.h> 22#include <linux/cgroup.h>
23#include <linux/mm.h> 23#include <linux/mm.h>
24#include <linux/pagemap.h>
24#include <linux/smp.h> 25#include <linux/smp.h>
25#include <linux/page-flags.h> 26#include <linux/page-flags.h>
26#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
27#include <linux/bit_spinlock.h> 28#include <linux/bit_spinlock.h>
28#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30#include <linux/mutex.h>
29#include <linux/slab.h> 31#include <linux/slab.h>
30#include <linux/swap.h> 32#include <linux/swap.h>
31#include <linux/spinlock.h> 33#include <linux/spinlock.h>
@@ -34,12 +36,23 @@
34#include <linux/vmalloc.h> 36#include <linux/vmalloc.h>
35#include <linux/mm_inline.h> 37#include <linux/mm_inline.h>
36#include <linux/page_cgroup.h> 38#include <linux/page_cgroup.h>
39#include "internal.h"
37 40
38#include <asm/uaccess.h> 41#include <asm/uaccess.h>
39 42
40struct cgroup_subsys mem_cgroup_subsys __read_mostly; 43struct cgroup_subsys mem_cgroup_subsys __read_mostly;
41#define MEM_CGROUP_RECLAIM_RETRIES 5 44#define MEM_CGROUP_RECLAIM_RETRIES 5
42 45
46#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
47/* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */
48int do_swap_account __read_mostly;
49static int really_do_swap_account __initdata = 1; /* for remember boot option*/
50#else
51#define do_swap_account (0)
52#endif
53
54static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */
55
43/* 56/*
44 * Statistics for memory cgroup. 57 * Statistics for memory cgroup.
45 */ 58 */
@@ -60,7 +73,7 @@ struct mem_cgroup_stat_cpu {
60} ____cacheline_aligned_in_smp; 73} ____cacheline_aligned_in_smp;
61 74
62struct mem_cgroup_stat { 75struct mem_cgroup_stat {
63 struct mem_cgroup_stat_cpu cpustat[NR_CPUS]; 76 struct mem_cgroup_stat_cpu cpustat[0];
64}; 77};
65 78
66/* 79/*
@@ -89,9 +102,10 @@ struct mem_cgroup_per_zone {
89 /* 102 /*
90 * spin_lock to protect the per cgroup LRU 103 * spin_lock to protect the per cgroup LRU
91 */ 104 */
92 spinlock_t lru_lock;
93 struct list_head lists[NR_LRU_LISTS]; 105 struct list_head lists[NR_LRU_LISTS];
94 unsigned long count[NR_LRU_LISTS]; 106 unsigned long count[NR_LRU_LISTS];
107
108 struct zone_reclaim_stat reclaim_stat;
95}; 109};
96/* Macro for accessing counter */ 110/* Macro for accessing counter */
97#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 111#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -122,44 +136,74 @@ struct mem_cgroup {
122 */ 136 */
123 struct res_counter res; 137 struct res_counter res;
124 /* 138 /*
139 * the counter to account for mem+swap usage.
140 */
141 struct res_counter memsw;
142 /*
125 * Per cgroup active and inactive list, similar to the 143 * Per cgroup active and inactive list, similar to the
126 * per zone LRU lists. 144 * per zone LRU lists.
127 */ 145 */
128 struct mem_cgroup_lru_info info; 146 struct mem_cgroup_lru_info info;
129 147
148 /*
149 protect against reclaim related member.
150 */
151 spinlock_t reclaim_param_lock;
152
130 int prev_priority; /* for recording reclaim priority */ 153 int prev_priority; /* for recording reclaim priority */
154
155 /*
156 * While reclaiming in a hiearchy, we cache the last child we
157 * reclaimed from. Protected by hierarchy_mutex
158 */
159 struct mem_cgroup *last_scanned_child;
131 /* 160 /*
132 * statistics. 161 * Should the accounting and control be hierarchical, per subtree?
162 */
163 bool use_hierarchy;
164 unsigned long last_oom_jiffies;
165 atomic_t refcnt;
166
167 unsigned int swappiness;
168
169 /*
170 * statistics. This must be placed at the end of memcg.
133 */ 171 */
134 struct mem_cgroup_stat stat; 172 struct mem_cgroup_stat stat;
135}; 173};
136static struct mem_cgroup init_mem_cgroup;
137 174
138enum charge_type { 175enum charge_type {
139 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 176 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
140 MEM_CGROUP_CHARGE_TYPE_MAPPED, 177 MEM_CGROUP_CHARGE_TYPE_MAPPED,
141 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 178 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
142 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 179 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
180 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
143 NR_CHARGE_TYPE, 181 NR_CHARGE_TYPE,
144}; 182};
145 183
146/* only for here (for easy reading.) */ 184/* only for here (for easy reading.) */
147#define PCGF_CACHE (1UL << PCG_CACHE) 185#define PCGF_CACHE (1UL << PCG_CACHE)
148#define PCGF_USED (1UL << PCG_USED) 186#define PCGF_USED (1UL << PCG_USED)
149#define PCGF_ACTIVE (1UL << PCG_ACTIVE)
150#define PCGF_LOCK (1UL << PCG_LOCK) 187#define PCGF_LOCK (1UL << PCG_LOCK)
151#define PCGF_FILE (1UL << PCG_FILE)
152static const unsigned long 188static const unsigned long
153pcg_default_flags[NR_CHARGE_TYPE] = { 189pcg_default_flags[NR_CHARGE_TYPE] = {
154 PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ 190 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
155 PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ 191 PCGF_USED | PCGF_LOCK, /* Anon */
156 PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ 192 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
157 0, /* FORCE */ 193 0, /* FORCE */
158}; 194};
159 195
160/* 196/* for encoding cft->private value on file */
161 * Always modified under lru lock. Then, not necessary to preempt_disable() 197#define _MEM (0)
162 */ 198#define _MEMSWAP (1)
199#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
200#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
201#define MEMFILE_ATTR(val) ((val) & 0xffff)
202
203static void mem_cgroup_get(struct mem_cgroup *mem);
204static void mem_cgroup_put(struct mem_cgroup *mem);
205static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
206
163static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 207static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
164 struct page_cgroup *pc, 208 struct page_cgroup *pc,
165 bool charge) 209 bool charge)
@@ -167,10 +211,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
167 int val = (charge)? 1 : -1; 211 int val = (charge)? 1 : -1;
168 struct mem_cgroup_stat *stat = &mem->stat; 212 struct mem_cgroup_stat *stat = &mem->stat;
169 struct mem_cgroup_stat_cpu *cpustat; 213 struct mem_cgroup_stat_cpu *cpustat;
214 int cpu = get_cpu();
170 215
171 VM_BUG_ON(!irqs_disabled()); 216 cpustat = &stat->cpustat[cpu];
172
173 cpustat = &stat->cpustat[smp_processor_id()];
174 if (PageCgroupCache(pc)) 217 if (PageCgroupCache(pc))
175 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); 218 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
176 else 219 else
@@ -182,6 +225,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
182 else 225 else
183 __mem_cgroup_stat_add_safe(cpustat, 226 __mem_cgroup_stat_add_safe(cpustat,
184 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 227 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
228 put_cpu();
185} 229}
186 230
187static struct mem_cgroup_per_zone * 231static struct mem_cgroup_per_zone *
@@ -197,6 +241,9 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
197 int nid = page_cgroup_nid(pc); 241 int nid = page_cgroup_nid(pc);
198 int zid = page_cgroup_zid(pc); 242 int zid = page_cgroup_zid(pc);
199 243
244 if (!mem)
245 return NULL;
246
200 return mem_cgroup_zoneinfo(mem, nid, zid); 247 return mem_cgroup_zoneinfo(mem, nid, zid);
201} 248}
202 249
@@ -236,118 +283,169 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
236 struct mem_cgroup, css); 283 struct mem_cgroup, css);
237} 284}
238 285
239static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, 286static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
240 struct page_cgroup *pc)
241{ 287{
242 int lru = LRU_BASE; 288 struct mem_cgroup *mem = NULL;
243 289 /*
244 if (PageCgroupUnevictable(pc)) 290 * Because we have no locks, mm->owner's may be being moved to other
245 lru = LRU_UNEVICTABLE; 291 * cgroup. We use css_tryget() here even if this looks
246 else { 292 * pessimistic (rather than adding locks here).
247 if (PageCgroupActive(pc)) 293 */
248 lru += LRU_ACTIVE; 294 rcu_read_lock();
249 if (PageCgroupFile(pc)) 295 do {
250 lru += LRU_FILE; 296 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
251 } 297 if (unlikely(!mem))
252 298 break;
253 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 299 } while (!css_tryget(&mem->css));
254 300 rcu_read_unlock();
255 mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); 301 return mem;
256 list_del(&pc->lru);
257} 302}
258 303
259static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, 304static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
260 struct page_cgroup *pc)
261{ 305{
262 int lru = LRU_BASE; 306 if (!mem)
307 return true;
308 return css_is_removed(&mem->css);
309}
263 310
264 if (PageCgroupUnevictable(pc)) 311/*
265 lru = LRU_UNEVICTABLE; 312 * Following LRU functions are allowed to be used without PCG_LOCK.
266 else { 313 * Operations are called by routine of global LRU independently from memcg.
267 if (PageCgroupActive(pc)) 314 * What we have to take care of here is validness of pc->mem_cgroup.
268 lru += LRU_ACTIVE; 315 *
269 if (PageCgroupFile(pc)) 316 * Changes to pc->mem_cgroup happens when
270 lru += LRU_FILE; 317 * 1. charge
271 } 318 * 2. moving account
319 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
320 * It is added to LRU before charge.
321 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
322 * When moving account, the page is not on LRU. It's isolated.
323 */
272 324
273 MEM_CGROUP_ZSTAT(mz, lru) += 1; 325void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
274 list_add(&pc->lru, &mz->lists[lru]); 326{
327 struct page_cgroup *pc;
328 struct mem_cgroup *mem;
329 struct mem_cgroup_per_zone *mz;
275 330
276 mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); 331 if (mem_cgroup_disabled())
332 return;
333 pc = lookup_page_cgroup(page);
334 /* can happen while we handle swapcache. */
335 if (list_empty(&pc->lru) || !pc->mem_cgroup)
336 return;
337 /*
338 * We don't check PCG_USED bit. It's cleared when the "page" is finally
339 * removed from global LRU.
340 */
341 mz = page_cgroup_zoneinfo(pc);
342 mem = pc->mem_cgroup;
343 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
344 list_del_init(&pc->lru);
345 return;
277} 346}
278 347
279static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) 348void mem_cgroup_del_lru(struct page *page)
280{ 349{
281 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); 350 mem_cgroup_del_lru_list(page, page_lru(page));
282 int active = PageCgroupActive(pc); 351}
283 int file = PageCgroupFile(pc);
284 int unevictable = PageCgroupUnevictable(pc);
285 enum lru_list from = unevictable ? LRU_UNEVICTABLE :
286 (LRU_FILE * !!file + !!active);
287 352
288 if (lru == from) 353void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
354{
355 struct mem_cgroup_per_zone *mz;
356 struct page_cgroup *pc;
357
358 if (mem_cgroup_disabled())
289 return; 359 return;
290 360
291 MEM_CGROUP_ZSTAT(mz, from) -= 1; 361 pc = lookup_page_cgroup(page);
292 /* 362 /*
293 * However this is done under mz->lru_lock, another flags, which 363 * Used bit is set without atomic ops but after smp_wmb().
294 * are not related to LRU, will be modified from out-of-lock. 364 * For making pc->mem_cgroup visible, insert smp_rmb() here.
295 * We have to use atomic set/clear flags.
296 */ 365 */
297 if (is_unevictable_lru(lru)) { 366 smp_rmb();
298 ClearPageCgroupActive(pc); 367 /* unused page is not rotated. */
299 SetPageCgroupUnevictable(pc); 368 if (!PageCgroupUsed(pc))
300 } else { 369 return;
301 if (is_active_lru(lru)) 370 mz = page_cgroup_zoneinfo(pc);
302 SetPageCgroupActive(pc);
303 else
304 ClearPageCgroupActive(pc);
305 ClearPageCgroupUnevictable(pc);
306 }
307
308 MEM_CGROUP_ZSTAT(mz, lru) += 1;
309 list_move(&pc->lru, &mz->lists[lru]); 371 list_move(&pc->lru, &mz->lists[lru]);
310} 372}
311 373
312int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 374void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
313{ 375{
314 int ret; 376 struct page_cgroup *pc;
377 struct mem_cgroup_per_zone *mz;
315 378
316 task_lock(task); 379 if (mem_cgroup_disabled())
317 ret = task->mm && mm_match_cgroup(task->mm, mem); 380 return;
318 task_unlock(task); 381 pc = lookup_page_cgroup(page);
319 return ret; 382 /*
383 * Used bit is set without atomic ops but after smp_wmb().
384 * For making pc->mem_cgroup visible, insert smp_rmb() here.
385 */
386 smp_rmb();
387 if (!PageCgroupUsed(pc))
388 return;
389
390 mz = page_cgroup_zoneinfo(pc);
391 MEM_CGROUP_ZSTAT(mz, lru) += 1;
392 list_add(&pc->lru, &mz->lists[lru]);
320} 393}
321 394
322/* 395/*
323 * This routine assumes that the appropriate zone's lru lock is already held 396 * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to
397 * lru because the page may.be reused after it's fully uncharged (because of
398 * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge
399 * it again. This function is only used to charge SwapCache. It's done under
400 * lock_page and expected that zone->lru_lock is never held.
324 */ 401 */
325void mem_cgroup_move_lists(struct page *page, enum lru_list lru) 402static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
326{ 403{
327 struct page_cgroup *pc;
328 struct mem_cgroup_per_zone *mz;
329 unsigned long flags; 404 unsigned long flags;
405 struct zone *zone = page_zone(page);
406 struct page_cgroup *pc = lookup_page_cgroup(page);
330 407
331 if (mem_cgroup_subsys.disabled) 408 spin_lock_irqsave(&zone->lru_lock, flags);
332 return;
333
334 /* 409 /*
335 * We cannot lock_page_cgroup while holding zone's lru_lock, 410 * Forget old LRU when this page_cgroup is *not* used. This Used bit
336 * because other holders of lock_page_cgroup can be interrupted 411 * is guarded by lock_page() because the page is SwapCache.
337 * with an attempt to rotate_reclaimable_page. But we cannot
338 * safely get to page_cgroup without it, so just try_lock it:
339 * mem_cgroup_isolate_pages allows for page left on wrong list.
340 */ 412 */
341 pc = lookup_page_cgroup(page); 413 if (!PageCgroupUsed(pc))
342 if (!trylock_page_cgroup(pc)) 414 mem_cgroup_del_lru_list(page, page_lru(page));
415 spin_unlock_irqrestore(&zone->lru_lock, flags);
416}
417
418static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
419{
420 unsigned long flags;
421 struct zone *zone = page_zone(page);
422 struct page_cgroup *pc = lookup_page_cgroup(page);
423
424 spin_lock_irqsave(&zone->lru_lock, flags);
425 /* link when the page is linked to LRU but page_cgroup isn't */
426 if (PageLRU(page) && list_empty(&pc->lru))
427 mem_cgroup_add_lru_list(page, page_lru(page));
428 spin_unlock_irqrestore(&zone->lru_lock, flags);
429}
430
431
432void mem_cgroup_move_lists(struct page *page,
433 enum lru_list from, enum lru_list to)
434{
435 if (mem_cgroup_disabled())
343 return; 436 return;
344 if (pc && PageCgroupUsed(pc)) { 437 mem_cgroup_del_lru_list(page, from);
345 mz = page_cgroup_zoneinfo(pc); 438 mem_cgroup_add_lru_list(page, to);
346 spin_lock_irqsave(&mz->lru_lock, flags); 439}
347 __mem_cgroup_move_lists(pc, lru); 440
348 spin_unlock_irqrestore(&mz->lru_lock, flags); 441int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
349 } 442{
350 unlock_page_cgroup(pc); 443 int ret;
444
445 task_lock(task);
446 ret = task->mm && mm_match_cgroup(task->mm, mem);
447 task_unlock(task);
448 return ret;
351} 449}
352 450
353/* 451/*
@@ -372,39 +470,116 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
372 */ 470 */
373int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 471int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
374{ 472{
375 return mem->prev_priority; 473 int prev_priority;
474
475 spin_lock(&mem->reclaim_param_lock);
476 prev_priority = mem->prev_priority;
477 spin_unlock(&mem->reclaim_param_lock);
478
479 return prev_priority;
376} 480}
377 481
378void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) 482void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
379{ 483{
484 spin_lock(&mem->reclaim_param_lock);
380 if (priority < mem->prev_priority) 485 if (priority < mem->prev_priority)
381 mem->prev_priority = priority; 486 mem->prev_priority = priority;
487 spin_unlock(&mem->reclaim_param_lock);
382} 488}
383 489
384void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) 490void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
385{ 491{
492 spin_lock(&mem->reclaim_param_lock);
386 mem->prev_priority = priority; 493 mem->prev_priority = priority;
494 spin_unlock(&mem->reclaim_param_lock);
387} 495}
388 496
389/* 497static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
390 * Calculate # of pages to be scanned in this priority/zone. 498{
391 * See also vmscan.c 499 unsigned long active;
392 * 500 unsigned long inactive;
393 * priority starts from "DEF_PRIORITY" and decremented in each loop. 501 unsigned long gb;
394 * (see include/linux/mmzone.h) 502 unsigned long inactive_ratio;
395 */ 503
504 inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
505 active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);
506
507 gb = (inactive + active) >> (30 - PAGE_SHIFT);
508 if (gb)
509 inactive_ratio = int_sqrt(10 * gb);
510 else
511 inactive_ratio = 1;
512
513 if (present_pages) {
514 present_pages[0] = inactive;
515 present_pages[1] = active;
516 }
517
518 return inactive_ratio;
519}
396 520
397long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, 521int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
398 int priority, enum lru_list lru) 522{
523 unsigned long active;
524 unsigned long inactive;
525 unsigned long present_pages[2];
526 unsigned long inactive_ratio;
527
528 inactive_ratio = calc_inactive_ratio(memcg, present_pages);
529
530 inactive = present_pages[0];
531 active = present_pages[1];
532
533 if (inactive * inactive_ratio < active)
534 return 1;
535
536 return 0;
537}
538
539unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
540 struct zone *zone,
541 enum lru_list lru)
399{ 542{
400 long nr_pages;
401 int nid = zone->zone_pgdat->node_id; 543 int nid = zone->zone_pgdat->node_id;
402 int zid = zone_idx(zone); 544 int zid = zone_idx(zone);
403 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); 545 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
404 546
405 nr_pages = MEM_CGROUP_ZSTAT(mz, lru); 547 return MEM_CGROUP_ZSTAT(mz, lru);
548}
549
550struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
551 struct zone *zone)
552{
553 int nid = zone->zone_pgdat->node_id;
554 int zid = zone_idx(zone);
555 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
406 556
407 return (nr_pages >> priority); 557 return &mz->reclaim_stat;
558}
559
560struct zone_reclaim_stat *
561mem_cgroup_get_reclaim_stat_from_page(struct page *page)
562{
563 struct page_cgroup *pc;
564 struct mem_cgroup_per_zone *mz;
565
566 if (mem_cgroup_disabled())
567 return NULL;
568
569 pc = lookup_page_cgroup(page);
570 /*
571 * Used bit is set without atomic ops but after smp_wmb().
572 * For making pc->mem_cgroup visible, insert smp_rmb() here.
573 */
574 smp_rmb();
575 if (!PageCgroupUsed(pc))
576 return NULL;
577
578 mz = page_cgroup_zoneinfo(pc);
579 if (!mz)
580 return NULL;
581
582 return &mz->reclaim_stat;
408} 583}
409 584
410unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 585unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
@@ -429,94 +604,279 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
429 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 604 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
430 src = &mz->lists[lru]; 605 src = &mz->lists[lru];
431 606
432 spin_lock(&mz->lru_lock);
433 scan = 0; 607 scan = 0;
434 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 608 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
435 if (scan >= nr_to_scan) 609 if (scan >= nr_to_scan)
436 break; 610 break;
611
612 page = pc->page;
437 if (unlikely(!PageCgroupUsed(pc))) 613 if (unlikely(!PageCgroupUsed(pc)))
438 continue; 614 continue;
439 page = pc->page;
440
441 if (unlikely(!PageLRU(page))) 615 if (unlikely(!PageLRU(page)))
442 continue; 616 continue;
443 617
444 /*
445 * TODO: play better with lumpy reclaim, grabbing anything.
446 */
447 if (PageUnevictable(page) ||
448 (PageActive(page) && !active) ||
449 (!PageActive(page) && active)) {
450 __mem_cgroup_move_lists(pc, page_lru(page));
451 continue;
452 }
453
454 scan++; 618 scan++;
455 list_move(&pc->lru, &pc_list);
456
457 if (__isolate_lru_page(page, mode, file) == 0) { 619 if (__isolate_lru_page(page, mode, file) == 0) {
458 list_move(&page->lru, dst); 620 list_move(&page->lru, dst);
459 nr_taken++; 621 nr_taken++;
460 } 622 }
461 } 623 }
462 624
463 list_splice(&pc_list, src);
464 spin_unlock(&mz->lru_lock);
465
466 *scanned = scan; 625 *scanned = scan;
467 return nr_taken; 626 return nr_taken;
468} 627}
469 628
629#define mem_cgroup_from_res_counter(counter, member) \
630 container_of(counter, struct mem_cgroup, member)
631
470/* 632/*
471 * Charge the memory controller for page usage. 633 * This routine finds the DFS walk successor. This routine should be
472 * Return 634 * called with hierarchy_mutex held
473 * 0 if the charge was successful
474 * < 0 if the cgroup is over its limit
475 */ 635 */
476static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 636static struct mem_cgroup *
477 gfp_t gfp_mask, enum charge_type ctype, 637__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
478 struct mem_cgroup *memcg)
479{ 638{
639 struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
640
641 curr_cgroup = curr->css.cgroup;
642 root_cgroup = root_mem->css.cgroup;
643
644 if (!list_empty(&curr_cgroup->children)) {
645 /*
646 * Walk down to children
647 */
648 cgroup = list_entry(curr_cgroup->children.next,
649 struct cgroup, sibling);
650 curr = mem_cgroup_from_cont(cgroup);
651 goto done;
652 }
653
654visit_parent:
655 if (curr_cgroup == root_cgroup) {
656 /* caller handles NULL case */
657 curr = NULL;
658 goto done;
659 }
660
661 /*
662 * Goto next sibling
663 */
664 if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
665 cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
666 sibling);
667 curr = mem_cgroup_from_cont(cgroup);
668 goto done;
669 }
670
671 /*
672 * Go up to next parent and next parent's sibling if need be
673 */
674 curr_cgroup = curr_cgroup->parent;
675 goto visit_parent;
676
677done:
678 return curr;
679}
680
681/*
682 * Visit the first child (need not be the first child as per the ordering
683 * of the cgroup list, since we track last_scanned_child) of @mem and use
684 * that to reclaim free pages from.
685 */
686static struct mem_cgroup *
687mem_cgroup_get_next_node(struct mem_cgroup *root_mem)
688{
689 struct cgroup *cgroup;
690 struct mem_cgroup *orig, *next;
691 bool obsolete;
692
693 /*
694 * Scan all children under the mem_cgroup mem
695 */
696 mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
697
698 orig = root_mem->last_scanned_child;
699 obsolete = mem_cgroup_is_obsolete(orig);
700
701 if (list_empty(&root_mem->css.cgroup->children)) {
702 /*
703 * root_mem might have children before and last_scanned_child
704 * may point to one of them. We put it later.
705 */
706 if (orig)
707 VM_BUG_ON(!obsolete);
708 next = NULL;
709 goto done;
710 }
711
712 if (!orig || obsolete) {
713 cgroup = list_first_entry(&root_mem->css.cgroup->children,
714 struct cgroup, sibling);
715 next = mem_cgroup_from_cont(cgroup);
716 } else
717 next = __mem_cgroup_get_next_node(orig, root_mem);
718
719done:
720 if (next)
721 mem_cgroup_get(next);
722 root_mem->last_scanned_child = next;
723 if (orig)
724 mem_cgroup_put(orig);
725 mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
726 return (next) ? next : root_mem;
727}
728
729static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
730{
731 if (do_swap_account) {
732 if (res_counter_check_under_limit(&mem->res) &&
733 res_counter_check_under_limit(&mem->memsw))
734 return true;
735 } else
736 if (res_counter_check_under_limit(&mem->res))
737 return true;
738 return false;
739}
740
741static unsigned int get_swappiness(struct mem_cgroup *memcg)
742{
743 struct cgroup *cgrp = memcg->css.cgroup;
744 unsigned int swappiness;
745
746 /* root ? */
747 if (cgrp->parent == NULL)
748 return vm_swappiness;
749
750 spin_lock(&memcg->reclaim_param_lock);
751 swappiness = memcg->swappiness;
752 spin_unlock(&memcg->reclaim_param_lock);
753
754 return swappiness;
755}
756
757/*
758 * Dance down the hierarchy if needed to reclaim memory. We remember the
759 * last child we reclaimed from, so that we don't end up penalizing
760 * one child extensively based on its position in the children list.
761 *
762 * root_mem is the original ancestor that we've been reclaim from.
763 */
764static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
765 gfp_t gfp_mask, bool noswap)
766{
767 struct mem_cgroup *next_mem;
768 int ret = 0;
769
770 /*
771 * Reclaim unconditionally and don't check for return value.
772 * We need to reclaim in the current group and down the tree.
773 * One might think about checking for children before reclaiming,
774 * but there might be left over accounting, even after children
775 * have left.
776 */
777 ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
778 get_swappiness(root_mem));
779 if (mem_cgroup_check_under_limit(root_mem))
780 return 1; /* indicate reclaim has succeeded */
781 if (!root_mem->use_hierarchy)
782 return ret;
783
784 next_mem = mem_cgroup_get_next_node(root_mem);
785
786 while (next_mem != root_mem) {
787 if (mem_cgroup_is_obsolete(next_mem)) {
788 next_mem = mem_cgroup_get_next_node(root_mem);
789 continue;
790 }
791 ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
792 get_swappiness(next_mem));
793 if (mem_cgroup_check_under_limit(root_mem))
794 return 1; /* indicate reclaim has succeeded */
795 next_mem = mem_cgroup_get_next_node(root_mem);
796 }
797 return ret;
798}
799
800bool mem_cgroup_oom_called(struct task_struct *task)
801{
802 bool ret = false;
480 struct mem_cgroup *mem; 803 struct mem_cgroup *mem;
481 struct page_cgroup *pc; 804 struct mm_struct *mm;
482 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
483 struct mem_cgroup_per_zone *mz;
484 unsigned long flags;
485 805
486 pc = lookup_page_cgroup(page); 806 rcu_read_lock();
487 /* can happen at boot */ 807 mm = task->mm;
488 if (unlikely(!pc)) 808 if (!mm)
809 mm = &init_mm;
810 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
811 if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
812 ret = true;
813 rcu_read_unlock();
814 return ret;
815}
816/*
817 * Unlike exported interface, "oom" parameter is added. if oom==true,
818 * oom-killer can be invoked.
819 */
820static int __mem_cgroup_try_charge(struct mm_struct *mm,
821 gfp_t gfp_mask, struct mem_cgroup **memcg,
822 bool oom)
823{
824 struct mem_cgroup *mem, *mem_over_limit;
825 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
826 struct res_counter *fail_res;
827
828 if (unlikely(test_thread_flag(TIF_MEMDIE))) {
829 /* Don't account this! */
830 *memcg = NULL;
489 return 0; 831 return 0;
490 prefetchw(pc); 832 }
833
491 /* 834 /*
492 * We always charge the cgroup the mm_struct belongs to. 835 * We always charge the cgroup the mm_struct belongs to.
493 * The mm_struct's mem_cgroup changes on task migration if the 836 * The mm_struct's mem_cgroup changes on task migration if the
494 * thread group leader migrates. It's possible that mm is not 837 * thread group leader migrates. It's possible that mm is not
495 * set, if so charge the init_mm (happens for pagecache usage). 838 * set, if so charge the init_mm (happens for pagecache usage).
496 */ 839 */
497 840 mem = *memcg;
498 if (likely(!memcg)) { 841 if (likely(!mem)) {
499 rcu_read_lock(); 842 mem = try_get_mem_cgroup_from_mm(mm);
500 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 843 *memcg = mem;
501 if (unlikely(!mem)) {
502 rcu_read_unlock();
503 return 0;
504 }
505 /*
506 * For every charge from the cgroup, increment reference count
507 */
508 css_get(&mem->css);
509 rcu_read_unlock();
510 } else { 844 } else {
511 mem = memcg; 845 css_get(&mem->css);
512 css_get(&memcg->css);
513 } 846 }
847 if (unlikely(!mem))
848 return 0;
849
850 VM_BUG_ON(mem_cgroup_is_obsolete(mem));
851
852 while (1) {
853 int ret;
854 bool noswap = false;
855
856 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
857 if (likely(!ret)) {
858 if (!do_swap_account)
859 break;
860 ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
861 &fail_res);
862 if (likely(!ret))
863 break;
864 /* mem+swap counter fails */
865 res_counter_uncharge(&mem->res, PAGE_SIZE);
866 noswap = true;
867 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
868 memsw);
869 } else
870 /* mem counter fails */
871 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
872 res);
514 873
515 while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
516 if (!(gfp_mask & __GFP_WAIT)) 874 if (!(gfp_mask & __GFP_WAIT))
517 goto out; 875 goto nomem;
518 876
519 if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) 877 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
878 noswap);
879 if (ret)
520 continue; 880 continue;
521 881
522 /* 882 /*
@@ -525,49 +885,221 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
525 * moved to swap cache or just unmapped from the cgroup. 885 * moved to swap cache or just unmapped from the cgroup.
526 * Check the limit again to see if the reclaim reduced the 886 * Check the limit again to see if the reclaim reduced the
527 * current usage of the cgroup before giving up 887 * current usage of the cgroup before giving up
888 *
528 */ 889 */
529 if (res_counter_check_under_limit(&mem->res)) 890 if (mem_cgroup_check_under_limit(mem_over_limit))
530 continue; 891 continue;
531 892
532 if (!nr_retries--) { 893 if (!nr_retries--) {
533 mem_cgroup_out_of_memory(mem, gfp_mask); 894 if (oom) {
534 goto out; 895 mutex_lock(&memcg_tasklist);
896 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
897 mutex_unlock(&memcg_tasklist);
898 mem_over_limit->last_oom_jiffies = jiffies;
899 }
900 goto nomem;
535 } 901 }
536 } 902 }
903 return 0;
904nomem:
905 css_put(&mem->css);
906 return -ENOMEM;
907}
908
909static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
910{
911 struct mem_cgroup *mem;
912 swp_entry_t ent;
913
914 if (!PageSwapCache(page))
915 return NULL;
537 916
917 ent.val = page_private(page);
918 mem = lookup_swap_cgroup(ent);
919 if (!mem)
920 return NULL;
921 if (!css_tryget(&mem->css))
922 return NULL;
923 return mem;
924}
925
926/*
927 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
928 * USED state. If already USED, uncharge and return.
929 */
930
931static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
932 struct page_cgroup *pc,
933 enum charge_type ctype)
934{
935 /* try_charge() can return NULL to *memcg, taking care of it. */
936 if (!mem)
937 return;
538 938
539 lock_page_cgroup(pc); 939 lock_page_cgroup(pc);
540 if (unlikely(PageCgroupUsed(pc))) { 940 if (unlikely(PageCgroupUsed(pc))) {
541 unlock_page_cgroup(pc); 941 unlock_page_cgroup(pc);
542 res_counter_uncharge(&mem->res, PAGE_SIZE); 942 res_counter_uncharge(&mem->res, PAGE_SIZE);
943 if (do_swap_account)
944 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
543 css_put(&mem->css); 945 css_put(&mem->css);
544 946 return;
545 goto done;
546 } 947 }
547 pc->mem_cgroup = mem; 948 pc->mem_cgroup = mem;
548 /* 949 smp_wmb();
549 * If a page is accounted as a page cache, insert to inactive list.
550 * If anon, insert to active list.
551 */
552 pc->flags = pcg_default_flags[ctype]; 950 pc->flags = pcg_default_flags[ctype];
553 951
554 mz = page_cgroup_zoneinfo(pc); 952 mem_cgroup_charge_statistics(mem, pc, true);
555 953
556 spin_lock_irqsave(&mz->lru_lock, flags);
557 __mem_cgroup_add_list(mz, pc);
558 spin_unlock_irqrestore(&mz->lru_lock, flags);
559 unlock_page_cgroup(pc); 954 unlock_page_cgroup(pc);
955}
560 956
561done: 957/**
562 return 0; 958 * mem_cgroup_move_account - move account of the page
959 * @pc: page_cgroup of the page.
960 * @from: mem_cgroup which the page is moved from.
961 * @to: mem_cgroup which the page is moved to. @from != @to.
962 *
 963 * The caller must confirm the following.
 964 * - page is not on LRU (isolate_page() is useful.)
 965 *
 966 * returns 0 on success,
 967 * returns -EBUSY when the lock is busy or "pc" is unstable.
 968 *
 969 * This function does "uncharge" from the old cgroup but doesn't do "charge"
 970 * to the new cgroup; that is left to the caller.
971 */
972
973static int mem_cgroup_move_account(struct page_cgroup *pc,
974 struct mem_cgroup *from, struct mem_cgroup *to)
975{
976 struct mem_cgroup_per_zone *from_mz, *to_mz;
977 int nid, zid;
978 int ret = -EBUSY;
979
980 VM_BUG_ON(from == to);
981 VM_BUG_ON(PageLRU(pc->page));
982
983 nid = page_cgroup_nid(pc);
984 zid = page_cgroup_zid(pc);
985 from_mz = mem_cgroup_zoneinfo(from, nid, zid);
986 to_mz = mem_cgroup_zoneinfo(to, nid, zid);
987
988 if (!trylock_page_cgroup(pc))
989 return ret;
990
991 if (!PageCgroupUsed(pc))
992 goto out;
993
994 if (pc->mem_cgroup != from)
995 goto out;
996
997 res_counter_uncharge(&from->res, PAGE_SIZE);
998 mem_cgroup_charge_statistics(from, pc, false);
999 if (do_swap_account)
1000 res_counter_uncharge(&from->memsw, PAGE_SIZE);
1001 css_put(&from->css);
1002
1003 css_get(&to->css);
1004 pc->mem_cgroup = to;
1005 mem_cgroup_charge_statistics(to, pc, true);
1006 ret = 0;
563out: 1007out:
564 css_put(&mem->css); 1008 unlock_page_cgroup(pc);
565 return -ENOMEM; 1009 return ret;
1010}
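
mem_cgroup_move_account() never blocks on the page_cgroup lock: it uses a trylock and reports -EBUSY so the caller can retry or give up. Below is a toy pthreads illustration of that trylock-or-EBUSY pattern only; the account structure and names are made up, and unlike the kernel function it also credits the target itself.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct account {
        pthread_mutex_t lock;
        unsigned long pages;
};

/*
 * Move one page of accounting from @from to @to, but only if @from's
 * lock can be taken without blocking; otherwise report -EBUSY and let
 * the caller decide whether to retry.
 */
static int move_one_page(struct account *from, struct account *to)
{
        int ret = -EBUSY;

        if (pthread_mutex_trylock(&from->lock))
                return ret;

        if (from->pages) {
                from->pages--;  /* "uncharge" the source */
                to->pages++;    /* "charge" the target (done by the caller in the kernel) */
                ret = 0;
        }
        pthread_mutex_unlock(&from->lock);
        return ret;
}

int main(void)
{
        struct account a = { PTHREAD_MUTEX_INITIALIZER, 4 };
        struct account b = { PTHREAD_MUTEX_INITIALIZER, 0 };
        int ret = move_one_page(&a, &b);

        printf("move: %d (a=%lu b=%lu)\n", ret, a.pages, b.pages);
        return 0;
}
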
1011
1012/*
1013 * move charges to its parent.
1014 */
1015
1016static int mem_cgroup_move_parent(struct page_cgroup *pc,
1017 struct mem_cgroup *child,
1018 gfp_t gfp_mask)
1019{
1020 struct page *page = pc->page;
1021 struct cgroup *cg = child->css.cgroup;
1022 struct cgroup *pcg = cg->parent;
1023 struct mem_cgroup *parent;
1024 int ret;
1025
1026 /* Is ROOT ? */
1027 if (!pcg)
1028 return -EINVAL;
1029
1030
1031 parent = mem_cgroup_from_cont(pcg);
1032
1033
1034 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
1035 if (ret || !parent)
1036 return ret;
1037
1038 if (!get_page_unless_zero(page)) {
1039 ret = -EBUSY;
1040 goto uncharge;
1041 }
1042
1043 ret = isolate_lru_page(page);
1044
1045 if (ret)
1046 goto cancel;
1047
1048 ret = mem_cgroup_move_account(pc, child, parent);
1049
1050 putback_lru_page(page);
1051 if (!ret) {
1052 put_page(page);
1053 /* drop extra refcnt by try_charge() */
1054 css_put(&parent->css);
1055 return 0;
1056 }
1057
1058cancel:
1059 put_page(page);
1060uncharge:
1061 /* drop extra refcnt by try_charge() */
1062 css_put(&parent->css);
1063 /* uncharge if move fails */
1064 res_counter_uncharge(&parent->res, PAGE_SIZE);
1065 if (do_swap_account)
1066 res_counter_uncharge(&parent->memsw, PAGE_SIZE);
1067 return ret;
1068}
1069
1070/*
1071 * Charge the memory controller for page usage.
1072 * Return
1073 * 0 if the charge was successful
1074 * < 0 if the cgroup is over its limit
1075 */
1076static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1077 gfp_t gfp_mask, enum charge_type ctype,
1078 struct mem_cgroup *memcg)
1079{
1080 struct mem_cgroup *mem;
1081 struct page_cgroup *pc;
1082 int ret;
1083
1084 pc = lookup_page_cgroup(page);
1085 /* can happen at boot */
1086 if (unlikely(!pc))
1087 return 0;
1088 prefetchw(pc);
1089
1090 mem = memcg;
1091 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1092 if (ret || !mem)
1093 return ret;
1094
1095 __mem_cgroup_commit_charge(mem, pc, ctype);
1096 return 0;
566} 1097}
567 1098
568int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) 1099int mem_cgroup_newpage_charge(struct page *page,
1100 struct mm_struct *mm, gfp_t gfp_mask)
569{ 1101{
570 if (mem_cgroup_subsys.disabled) 1102 if (mem_cgroup_disabled())
571 return 0; 1103 return 0;
572 if (PageCompound(page)) 1104 if (PageCompound(page))
573 return 0; 1105 return 0;
@@ -589,7 +1121,10 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
589int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 1121int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
590 gfp_t gfp_mask) 1122 gfp_t gfp_mask)
591{ 1123{
592 if (mem_cgroup_subsys.disabled) 1124 struct mem_cgroup *mem = NULL;
1125 int ret;
1126
1127 if (mem_cgroup_disabled())
593 return 0; 1128 return 0;
594 if (PageCompound(page)) 1129 if (PageCompound(page))
595 return 0; 1130 return 0;
@@ -601,6 +1136,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
601 * For GFP_NOWAIT case, the page may be pre-charged before calling 1136 * For GFP_NOWAIT case, the page may be pre-charged before calling
602 * add_to_page_cache(). (See shmem.c) check it here and avoid to call 1137 * add_to_page_cache(). (See shmem.c) check it here and avoid to call
603 * charge twice. (It works but has to pay a bit larger cost.) 1138 * charge twice. (It works but has to pay a bit larger cost.)
1139 * And when the page is SwapCache, it should take swap information
1140 * into account. This is under lock_page() now.
604 */ 1141 */
605 if (!(gfp_mask & __GFP_WAIT)) { 1142 if (!(gfp_mask & __GFP_WAIT)) {
606 struct page_cgroup *pc; 1143 struct page_cgroup *pc;
@@ -617,58 +1154,198 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
617 unlock_page_cgroup(pc); 1154 unlock_page_cgroup(pc);
618 } 1155 }
619 1156
620 if (unlikely(!mm)) 1157 if (do_swap_account && PageSwapCache(page)) {
1158 mem = try_get_mem_cgroup_from_swapcache(page);
1159 if (mem)
1160 mm = NULL;
1161 else
1162 mem = NULL;
1163 /* SwapCache may be still linked to LRU now. */
1164 mem_cgroup_lru_del_before_commit_swapcache(page);
1165 }
1166
1167 if (unlikely(!mm && !mem))
621 mm = &init_mm; 1168 mm = &init_mm;
622 1169
623 if (page_is_file_cache(page)) 1170 if (page_is_file_cache(page))
624 return mem_cgroup_charge_common(page, mm, gfp_mask, 1171 return mem_cgroup_charge_common(page, mm, gfp_mask,
625 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 1172 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
626 else 1173
627 return mem_cgroup_charge_common(page, mm, gfp_mask, 1174 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
628 MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); 1175 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
1176 if (mem)
1177 css_put(&mem->css);
1178 if (PageSwapCache(page))
1179 mem_cgroup_lru_add_after_commit_swapcache(page);
1180
1181 if (do_swap_account && !ret && PageSwapCache(page)) {
1182 swp_entry_t ent = {.val = page_private(page)};
1183 /* avoid double counting */
1184 mem = swap_cgroup_record(ent, NULL);
1185 if (mem) {
1186 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1187 mem_cgroup_put(mem);
1188 }
1189 }
1190 return ret;
1191}
1192
1193/*
 1194 * During swap-in (try_charge -> commit or cancel) the page is locked.
 1195 * And when try_charge() successfully returns, one refcnt to memcg without
 1196 * struct page_cgroup is acquired. This refcnt will be consumed by
 1197 * "commit()" or removed by "cancel()".
1198 */
1199int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1200 struct page *page,
1201 gfp_t mask, struct mem_cgroup **ptr)
1202{
1203 struct mem_cgroup *mem;
1204 int ret;
1205
1206 if (mem_cgroup_disabled())
1207 return 0;
1208
1209 if (!do_swap_account)
1210 goto charge_cur_mm;
1211 /*
1212 * A racing thread's fault, or swapoff, may have already updated
1213 * the pte, and even removed page from swap cache: return success
1214 * to go on to do_swap_page()'s pte_same() test, which should fail.
1215 */
1216 if (!PageSwapCache(page))
1217 return 0;
1218 mem = try_get_mem_cgroup_from_swapcache(page);
1219 if (!mem)
1220 goto charge_cur_mm;
1221 *ptr = mem;
1222 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
1223 /* drop extra refcnt from tryget */
1224 css_put(&mem->css);
1225 return ret;
1226charge_cur_mm:
1227 if (unlikely(!mm))
1228 mm = &init_mm;
1229 return __mem_cgroup_try_charge(mm, mask, ptr, true);
1230}
1231
1232void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1233{
1234 struct page_cgroup *pc;
1235
1236 if (mem_cgroup_disabled())
1237 return;
1238 if (!ptr)
1239 return;
1240 pc = lookup_page_cgroup(page);
1241 mem_cgroup_lru_del_before_commit_swapcache(page);
1242 __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
1243 mem_cgroup_lru_add_after_commit_swapcache(page);
1244 /*
 1245 * Now the swapped-in page is in memory, so it may be counted both
 1246 * as mem and swap: a double count.
 1247 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
 1248 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
 1249 * may call delete_from_swap_cache() before we reach here.
1250 */
1251 if (do_swap_account && PageSwapCache(page)) {
1252 swp_entry_t ent = {.val = page_private(page)};
1253 struct mem_cgroup *memcg;
1254 memcg = swap_cgroup_record(ent, NULL);
1255 if (memcg) {
1256 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1257 mem_cgroup_put(memcg);
1258 }
1259
1260 }
1261 /* add this page(page_cgroup) to the LRU we want. */
1262
1263}
1264
1265void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1266{
1267 if (mem_cgroup_disabled())
1268 return;
1269 if (!mem)
1270 return;
1271 res_counter_uncharge(&mem->res, PAGE_SIZE);
1272 if (do_swap_account)
1273 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1274 css_put(&mem->css);
629} 1275}
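
The swap-in path above is a two-phase protocol: try_charge reserves budget and takes a reference, and exactly one of commit or cancel later resolves that reservation. A compact sketch of the same shape in plain C; the names and the boolean standing in for the PCG_USED bit are illustrative, not the kernel API.

#include <stdbool.h>
#include <stdio.h>

struct group {
        int refcnt;             /* stands in for the css reference count */
        unsigned long usage;    /* pages charged */
        unsigned long limit;
};

/* Phase 1: reserve one page of budget and take a reference. */
static int try_charge(struct group *g)
{
        if (g->usage + 1 > g->limit)
                return -1;
        g->usage++;
        g->refcnt++;            /* consumed later by commit or cancel */
        return 0;
}

/* Phase 2a: commit. If someone already committed this page, undo our reservation. */
static void commit_charge(struct group *g, bool *page_used)
{
        if (*page_used) {
                g->usage--;
                g->refcnt--;
                return;
        }
        *page_used = true;      /* the reference now belongs to the used page */
}

/* Phase 2b: cancel. Drop both the reservation and the reference. */
static void cancel_charge(struct group *g)
{
        g->usage--;
        g->refcnt--;
}

int main(void)
{
        struct group g = { .refcnt = 0, .usage = 0, .limit = 4 };
        bool page_used = false;

        if (!try_charge(&g))
                commit_charge(&g, &page_used);
        printf("usage=%lu refcnt=%d used=%d\n", g.usage, g.refcnt, page_used);

        if (!try_charge(&g))
                cancel_charge(&g);      /* e.g. the pte changed under us */
        printf("usage=%lu refcnt=%d\n", g.usage, g.refcnt);
        return 0;
}
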
630 1276
1277
631/* 1278/*
632 * uncharge if !page_mapped(page) 1279 * uncharge if !page_mapped(page)
633 */ 1280 */
634static void 1281static struct mem_cgroup *
635__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 1282__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
636{ 1283{
637 struct page_cgroup *pc; 1284 struct page_cgroup *pc;
638 struct mem_cgroup *mem; 1285 struct mem_cgroup *mem = NULL;
639 struct mem_cgroup_per_zone *mz; 1286 struct mem_cgroup_per_zone *mz;
640 unsigned long flags;
641 1287
642 if (mem_cgroup_subsys.disabled) 1288 if (mem_cgroup_disabled())
643 return; 1289 return NULL;
1290
1291 if (PageSwapCache(page))
1292 return NULL;
644 1293
645 /* 1294 /*
646 * Check if our page_cgroup is valid 1295 * Check if our page_cgroup is valid
647 */ 1296 */
648 pc = lookup_page_cgroup(page); 1297 pc = lookup_page_cgroup(page);
649 if (unlikely(!pc || !PageCgroupUsed(pc))) 1298 if (unlikely(!pc || !PageCgroupUsed(pc)))
650 return; 1299 return NULL;
651 1300
652 lock_page_cgroup(pc); 1301 lock_page_cgroup(pc);
653 if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page)) 1302
654 || !PageCgroupUsed(pc)) { 1303 mem = pc->mem_cgroup;
655 /* This happens at race in zap_pte_range() and do_swap_page()*/ 1304
656 unlock_page_cgroup(pc); 1305 if (!PageCgroupUsed(pc))
657 return; 1306 goto unlock_out;
1307
1308 switch (ctype) {
1309 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1310 if (page_mapped(page))
1311 goto unlock_out;
1312 break;
1313 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
1314 if (!PageAnon(page)) { /* Shared memory */
1315 if (page->mapping && !page_is_file_cache(page))
1316 goto unlock_out;
1317 } else if (page_mapped(page)) /* Anon */
1318 goto unlock_out;
1319 break;
1320 default:
1321 break;
658 } 1322 }
1323
1324 res_counter_uncharge(&mem->res, PAGE_SIZE);
1325 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1326 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1327
1328 mem_cgroup_charge_statistics(mem, pc, false);
659 ClearPageCgroupUsed(pc); 1329 ClearPageCgroupUsed(pc);
660 mem = pc->mem_cgroup; 1330 /*
1331 * pc->mem_cgroup is not cleared here. It will be accessed when it's
 1332 * freed from LRU. This is safe because an uncharged page is expected not
 1333 * to be reused (it is freed soon). The exception is SwapCache, handled by
1334 * special functions.
1335 */
661 1336
662 mz = page_cgroup_zoneinfo(pc); 1337 mz = page_cgroup_zoneinfo(pc);
663 spin_lock_irqsave(&mz->lru_lock, flags);
664 __mem_cgroup_remove_list(mz, pc);
665 spin_unlock_irqrestore(&mz->lru_lock, flags);
666 unlock_page_cgroup(pc); 1338 unlock_page_cgroup(pc);
667 1339
668 res_counter_uncharge(&mem->res, PAGE_SIZE); 1340 /* at swapout, this memcg will be accessed to record to swap */
669 css_put(&mem->css); 1341 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1342 css_put(&mem->css);
670 1343
671 return; 1344 return mem;
1345
1346unlock_out:
1347 unlock_page_cgroup(pc);
1348 return NULL;
672} 1349}
673 1350
674void mem_cgroup_uncharge_page(struct page *page) 1351void mem_cgroup_uncharge_page(struct page *page)
@@ -689,16 +1366,55 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
689} 1366}
690 1367
691/* 1368/*
692 * Before starting migration, account against new page. 1369 * Called from __delete_from_swap_cache() to drop the "page" account.
 1370 * The memcg information is recorded in the swap_cgroup of "ent".
693 */ 1371 */
694int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) 1372void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
1373{
1374 struct mem_cgroup *memcg;
1375
1376 memcg = __mem_cgroup_uncharge_common(page,
1377 MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
1378 /* record memcg information */
1379 if (do_swap_account && memcg) {
1380 swap_cgroup_record(ent, memcg);
1381 mem_cgroup_get(memcg);
1382 }
1383 if (memcg)
1384 css_put(&memcg->css);
1385}
1386
1387#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1388/*
1389 * called from swap_entry_free(). remove record in swap_cgroup and
1390 * uncharge "memsw" account.
1391 */
1392void mem_cgroup_uncharge_swap(swp_entry_t ent)
1393{
1394 struct mem_cgroup *memcg;
1395
1396 if (!do_swap_account)
1397 return;
1398
1399 memcg = swap_cgroup_record(ent, NULL);
1400 if (memcg) {
1401 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1402 mem_cgroup_put(memcg);
1403 }
1404}
1405#endif
1406
1407/*
1408 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
1409 * page belongs to.
1410 */
1411int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
695{ 1412{
696 struct page_cgroup *pc; 1413 struct page_cgroup *pc;
697 struct mem_cgroup *mem = NULL; 1414 struct mem_cgroup *mem = NULL;
698 enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
699 int ret = 0; 1415 int ret = 0;
700 1416
701 if (mem_cgroup_subsys.disabled) 1417 if (mem_cgroup_disabled())
702 return 0; 1418 return 0;
703 1419
704 pc = lookup_page_cgroup(page); 1420 pc = lookup_page_cgroup(page);
@@ -706,41 +1422,67 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
706 if (PageCgroupUsed(pc)) { 1422 if (PageCgroupUsed(pc)) {
707 mem = pc->mem_cgroup; 1423 mem = pc->mem_cgroup;
708 css_get(&mem->css); 1424 css_get(&mem->css);
709 if (PageCgroupCache(pc)) {
710 if (page_is_file_cache(page))
711 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
712 else
713 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
714 }
715 } 1425 }
716 unlock_page_cgroup(pc); 1426 unlock_page_cgroup(pc);
1427
717 if (mem) { 1428 if (mem) {
718 ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, 1429 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
719 ctype, mem);
720 css_put(&mem->css); 1430 css_put(&mem->css);
721 } 1431 }
1432 *ptr = mem;
722 return ret; 1433 return ret;
723} 1434}
724 1435
725/* remove redundant charge if migration failed*/ 1436/* remove redundant charge if migration failed*/
726void mem_cgroup_end_migration(struct page *newpage) 1437void mem_cgroup_end_migration(struct mem_cgroup *mem,
1438 struct page *oldpage, struct page *newpage)
727{ 1439{
1440 struct page *target, *unused;
1441 struct page_cgroup *pc;
1442 enum charge_type ctype;
1443
1444 if (!mem)
1445 return;
1446
1447 /* at migration success, oldpage->mapping is NULL. */
1448 if (oldpage->mapping) {
1449 target = oldpage;
1450 unused = NULL;
1451 } else {
1452 target = newpage;
1453 unused = oldpage;
1454 }
1455
1456 if (PageAnon(target))
1457 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
1458 else if (page_is_file_cache(target))
1459 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
1460 else
1461 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
1462
1463 /* unused page is not on radix-tree now. */
1464 if (unused)
1465 __mem_cgroup_uncharge_common(unused, ctype);
1466
1467 pc = lookup_page_cgroup(target);
728 /* 1468 /*
729 * At success, page->mapping is not NULL. 1469 * __mem_cgroup_commit_charge() checks the PCG_USED bit of the page_cgroup.
730 * special rollback care is necessary when 1470 * So, double-counting is effectively avoided.
731 * 1. at migration failure. (newpage->mapping is cleared in this case)
732 * 2. the newpage was moved but not remapped again because the task
733 * exits and the newpage is obsolete. In this case, the new page
734 * may be a swapcache. So, we just call mem_cgroup_uncharge_page()
735 * always for avoiding mess. The page_cgroup will be removed if
736 * unnecessary. File cache pages is still on radix-tree. Don't
737 * care it.
738 */ 1471 */
739 if (!newpage->mapping) 1472 __mem_cgroup_commit_charge(mem, pc, ctype);
740 __mem_cgroup_uncharge_common(newpage, 1473
741 MEM_CGROUP_CHARGE_TYPE_FORCE); 1474 /*
742 else if (PageAnon(newpage)) 1475 * Both oldpage and newpage are still under lock_page(),
743 mem_cgroup_uncharge_page(newpage); 1476 * so we don't have to care about races in the radix-tree.
 1477 * But we do have to check whether this page is still mapped.
 1478 *
 1479 * There is a case for !page_mapped(): at the start of
 1480 * migration oldpage was mapped, but by now it may have been zapped.
 1481 * Still, we know the *target* page is not freed/reused under us.
1482 * mem_cgroup_uncharge_page() does all necessary checks.
1483 */
1484 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
1485 mem_cgroup_uncharge_page(target);
744} 1486}
745 1487
746/* 1488/*
@@ -748,29 +1490,26 @@ void mem_cgroup_end_migration(struct page *newpage)
748 * This is typically used for page reclaiming for shmem for reducing side 1490 * This is typically used for page reclaiming for shmem for reducing side
749 * effect of page allocation from shmem, which is used by some mem_cgroup. 1491 * effect of page allocation from shmem, which is used by some mem_cgroup.
750 */ 1492 */
751int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) 1493int mem_cgroup_shrink_usage(struct page *page,
1494 struct mm_struct *mm,
1495 gfp_t gfp_mask)
752{ 1496{
753 struct mem_cgroup *mem; 1497 struct mem_cgroup *mem = NULL;
754 int progress = 0; 1498 int progress = 0;
755 int retry = MEM_CGROUP_RECLAIM_RETRIES; 1499 int retry = MEM_CGROUP_RECLAIM_RETRIES;
756 1500
757 if (mem_cgroup_subsys.disabled) 1501 if (mem_cgroup_disabled())
758 return 0; 1502 return 0;
759 if (!mm) 1503 if (page)
1504 mem = try_get_mem_cgroup_from_swapcache(page);
1505 if (!mem && mm)
1506 mem = try_get_mem_cgroup_from_mm(mm);
1507 if (unlikely(!mem))
760 return 0; 1508 return 0;
761 1509
762 rcu_read_lock();
763 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
764 if (unlikely(!mem)) {
765 rcu_read_unlock();
766 return 0;
767 }
768 css_get(&mem->css);
769 rcu_read_unlock();
770
771 do { 1510 do {
772 progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); 1511 progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true);
773 progress += res_counter_check_under_limit(&mem->res); 1512 progress += mem_cgroup_check_under_limit(mem);
774 } while (!progress && --retry); 1513 } while (!progress && --retry);
775 1514
776 css_put(&mem->css); 1515 css_put(&mem->css);
@@ -779,116 +1518,295 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
779 return 0; 1518 return 0;
780} 1519}
781 1520
782int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) 1521static DEFINE_MUTEX(set_limit_mutex);
1522
1523static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1524 unsigned long long val)
783{ 1525{
784 1526
785 int retry_count = MEM_CGROUP_RECLAIM_RETRIES; 1527 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
786 int progress; 1528 int progress;
1529 u64 memswlimit;
787 int ret = 0; 1530 int ret = 0;
788 1531
789 while (res_counter_set_limit(&memcg->res, val)) { 1532 while (retry_count) {
790 if (signal_pending(current)) { 1533 if (signal_pending(current)) {
791 ret = -EINTR; 1534 ret = -EINTR;
792 break; 1535 break;
793 } 1536 }
794 if (!retry_count) { 1537 /*
795 ret = -EBUSY; 1538 * Rather than hiding this in a helper, do it open-coded
 1539 * so that what really happens here is obvious.
1540 * We have to guarantee mem->res.limit < mem->memsw.limit.
1541 */
1542 mutex_lock(&set_limit_mutex);
1543 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1544 if (memswlimit < val) {
1545 ret = -EINVAL;
1546 mutex_unlock(&set_limit_mutex);
796 break; 1547 break;
797 } 1548 }
798 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL); 1549 ret = res_counter_set_limit(&memcg->res, val);
799 if (!progress) 1550 mutex_unlock(&set_limit_mutex);
800 retry_count--; 1551
1552 if (!ret)
1553 break;
1554
1555 progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
1556 false);
1557 if (!progress) retry_count--;
801 } 1558 }
1559
802 return ret; 1560 return ret;
803} 1561}
804 1562
1563int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1564 unsigned long long val)
1565{
1566 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
1567 u64 memlimit, oldusage, curusage;
1568 int ret;
1569
1570 if (!do_swap_account)
1571 return -EINVAL;
1572
1573 while (retry_count) {
1574 if (signal_pending(current)) {
1575 ret = -EINTR;
1576 break;
1577 }
1578 /*
 1579 * Rather than hiding this in a helper, do it open-coded
 1580 * so that what really happens here is obvious.
1581 * We have to guarantee mem->res.limit < mem->memsw.limit.
1582 */
1583 mutex_lock(&set_limit_mutex);
1584 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1585 if (memlimit > val) {
1586 ret = -EINVAL;
1587 mutex_unlock(&set_limit_mutex);
1588 break;
1589 }
1590 ret = res_counter_set_limit(&memcg->memsw, val);
1591 mutex_unlock(&set_limit_mutex);
1592
1593 if (!ret)
1594 break;
1595
1596 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1597 mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true);
1598 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1599 if (curusage >= oldusage)
1600 retry_count--;
1601 }
1602 return ret;
1603}
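
Both resize paths guard the invariant that the memory limit never exceeds the memory+swap limit by re-checking the other counter under a single mutex before writing. A standalone sketch of that check-under-one-lock idea; the pthreads globals below are illustrative and not the res_counter API.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t set_limit_mutex = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long mem_limit   = 1ULL << 30;     /* 1 GiB */
static unsigned long long memsw_limit = 2ULL << 30;     /* 2 GiB */

/* Enforce mem_limit <= memsw_limit even when both can be written concurrently. */
static int set_mem_limit(unsigned long long val)
{
        int ret = 0;

        pthread_mutex_lock(&set_limit_mutex);
        if (val > memsw_limit)
                ret = -1;               /* would break the invariant */
        else
                mem_limit = val;
        pthread_mutex_unlock(&set_limit_mutex);
        return ret;
}

static int set_memsw_limit(unsigned long long val)
{
        int ret = 0;

        pthread_mutex_lock(&set_limit_mutex);
        if (val < mem_limit)
                ret = -1;
        else
                memsw_limit = val;
        pthread_mutex_unlock(&set_limit_mutex);
        return ret;
}

int main(void)
{
        int a = set_mem_limit(3ULL << 30);      /* fails: exceeds memsw limit */
        int b = set_memsw_limit(4ULL << 30);    /* ok */

        printf("%d %d\n", a, b);
        return 0;
}
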
805 1604
806/* 1605/*
807 * This routine traverses the page_cgroups on the given list and drops them all. 1606 * This routine traverses the page_cgroups on the given list and drops them all.
808 * *And* this routine doesn't reclaim the pages themselves, it just removes the page_cgroups. 1607 * *And* this routine doesn't reclaim the pages themselves, it just removes the page_cgroups.
809 */ 1608 */
810#define FORCE_UNCHARGE_BATCH (128) 1609static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
811static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, 1610 int node, int zid, enum lru_list lru)
812 struct mem_cgroup_per_zone *mz,
813 enum lru_list lru)
814{ 1611{
815 struct page_cgroup *pc; 1612 struct zone *zone;
816 struct page *page; 1613 struct mem_cgroup_per_zone *mz;
817 int count = FORCE_UNCHARGE_BATCH; 1614 struct page_cgroup *pc, *busy;
818 unsigned long flags; 1615 unsigned long flags, loop;
819 struct list_head *list; 1616 struct list_head *list;
1617 int ret = 0;
820 1618
1619 zone = &NODE_DATA(node)->node_zones[zid];
1620 mz = mem_cgroup_zoneinfo(mem, node, zid);
821 list = &mz->lists[lru]; 1621 list = &mz->lists[lru];
822 1622
823 spin_lock_irqsave(&mz->lru_lock, flags); 1623 loop = MEM_CGROUP_ZSTAT(mz, lru);
824 while (!list_empty(list)) { 1624 /* give some margin against EBUSY etc...*/
825 pc = list_entry(list->prev, struct page_cgroup, lru); 1625 loop += 256;
826 page = pc->page; 1626 busy = NULL;
827 if (!PageCgroupUsed(pc)) 1627 while (loop--) {
828 break; 1628 ret = 0;
829 get_page(page); 1629 spin_lock_irqsave(&zone->lru_lock, flags);
830 spin_unlock_irqrestore(&mz->lru_lock, flags); 1630 if (list_empty(list)) {
831 /* 1631 spin_unlock_irqrestore(&zone->lru_lock, flags);
832 * Check if this page is on LRU. !LRU page can be found
833 * if it's under page migration.
834 */
835 if (PageLRU(page)) {
836 __mem_cgroup_uncharge_common(page,
837 MEM_CGROUP_CHARGE_TYPE_FORCE);
838 put_page(page);
839 if (--count <= 0) {
840 count = FORCE_UNCHARGE_BATCH;
841 cond_resched();
842 }
843 } else {
844 spin_lock_irqsave(&mz->lru_lock, flags);
845 break; 1632 break;
846 } 1633 }
847 spin_lock_irqsave(&mz->lru_lock, flags); 1634 pc = list_entry(list->prev, struct page_cgroup, lru);
1635 if (busy == pc) {
1636 list_move(&pc->lru, list);
 1637 busy = NULL;
1638 spin_unlock_irqrestore(&zone->lru_lock, flags);
1639 continue;
1640 }
1641 spin_unlock_irqrestore(&zone->lru_lock, flags);
1642
1643 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
1644 if (ret == -ENOMEM)
1645 break;
1646
1647 if (ret == -EBUSY || ret == -EINVAL) {
1648 /* found lock contention or "pc" is obsolete. */
1649 busy = pc;
1650 cond_resched();
1651 } else
1652 busy = NULL;
848 } 1653 }
849 spin_unlock_irqrestore(&mz->lru_lock, flags); 1654
1655 if (!ret && !list_empty(list))
1656 return -EBUSY;
1657 return ret;
850} 1658}
851 1659
852/* 1660/*
853 * make the mem_cgroup's charge 0 if there is no task. 1661 * make the mem_cgroup's charge 0 if there is no task.
854 * This enables deleting this mem_cgroup. 1662 * This enables deleting this mem_cgroup.
855 */ 1663 */
856static int mem_cgroup_force_empty(struct mem_cgroup *mem) 1664static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
857{ 1665{
858 int ret = -EBUSY; 1666 int ret;
859 int node, zid; 1667 int node, zid, shrink;
1668 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1669 struct cgroup *cgrp = mem->css.cgroup;
860 1670
861 css_get(&mem->css); 1671 css_get(&mem->css);
862 /* 1672
863 * page reclaim code (kswapd etc..) will move pages between 1673 shrink = 0;
864 * active_list <-> inactive_list while we don't take a lock. 1674 /* should free all ? */
865 * So, we have to do loop here until all lists are empty. 1675 if (free_all)
866 */ 1676 goto try_to_free;
1677move_account:
867 while (mem->res.usage > 0) { 1678 while (mem->res.usage > 0) {
868 if (atomic_read(&mem->css.cgroup->count) > 0) 1679 ret = -EBUSY;
1680 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
1681 goto out;
1682 ret = -EINTR;
1683 if (signal_pending(current))
869 goto out; 1684 goto out;
870 /* This is for making all *used* pages to be on LRU. */ 1685 /* This is for making all *used* pages to be on LRU. */
871 lru_add_drain_all(); 1686 lru_add_drain_all();
872 for_each_node_state(node, N_POSSIBLE) 1687 ret = 0;
873 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 1688 for_each_node_state(node, N_HIGH_MEMORY) {
874 struct mem_cgroup_per_zone *mz; 1689 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
875 enum lru_list l; 1690 enum lru_list l;
876 mz = mem_cgroup_zoneinfo(mem, node, zid); 1691 for_each_lru(l) {
877 for_each_lru(l) 1692 ret = mem_cgroup_force_empty_list(mem,
878 mem_cgroup_force_empty_list(mem, mz, l); 1693 node, zid, l);
1694 if (ret)
1695 break;
1696 }
879 } 1697 }
1698 if (ret)
1699 break;
1700 }
1701 /* it seems parent cgroup doesn't have enough mem */
1702 if (ret == -ENOMEM)
1703 goto try_to_free;
880 cond_resched(); 1704 cond_resched();
881 } 1705 }
882 ret = 0; 1706 ret = 0;
883out: 1707out:
884 css_put(&mem->css); 1708 css_put(&mem->css);
885 return ret; 1709 return ret;
1710
1711try_to_free:
1712 /* returns EBUSY if there is a task or if we come here twice. */
1713 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
1714 ret = -EBUSY;
1715 goto out;
1716 }
 1717 /* we call try-to-free pages to make this cgroup empty */
1718 lru_add_drain_all();
1719 /* try to free all pages in this cgroup */
1720 shrink = 1;
1721 while (nr_retries && mem->res.usage > 0) {
1722 int progress;
1723
1724 if (signal_pending(current)) {
1725 ret = -EINTR;
1726 goto out;
1727 }
1728 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
1729 false, get_swappiness(mem));
1730 if (!progress) {
1731 nr_retries--;
1732 /* maybe some writeback is necessary */
1733 congestion_wait(WRITE, HZ/10);
1734 }
1735
1736 }
1737 lru_add_drain();
1738 /* try move_account...there may be some *locked* pages. */
1739 if (mem->res.usage)
1740 goto move_account;
1741 ret = 0;
1742 goto out;
1743}
1744
1745int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
1746{
1747 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
1748}
1749
1750
1751static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
1752{
1753 return mem_cgroup_from_cont(cont)->use_hierarchy;
1754}
1755
1756static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
1757 u64 val)
1758{
1759 int retval = 0;
1760 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1761 struct cgroup *parent = cont->parent;
1762 struct mem_cgroup *parent_mem = NULL;
1763
1764 if (parent)
1765 parent_mem = mem_cgroup_from_cont(parent);
1766
1767 cgroup_lock();
1768 /*
 1769 * If the parent's use_hierarchy is set, we can't make any modifications
1770 * in the child subtrees. If it is unset, then the change can
1771 * occur, provided the current cgroup has no children.
1772 *
1773 * For the root cgroup, parent_mem is NULL, we allow value to be
1774 * set if there are no children.
1775 */
1776 if ((!parent_mem || !parent_mem->use_hierarchy) &&
1777 (val == 1 || val == 0)) {
1778 if (list_empty(&cont->children))
1779 mem->use_hierarchy = val;
1780 else
1781 retval = -EBUSY;
1782 } else
1783 retval = -EINVAL;
1784 cgroup_unlock();
1785
1786 return retval;
886} 1787}
887 1788
888static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 1789static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
889{ 1790{
890 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, 1791 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
891 cft->private); 1792 u64 val = 0;
1793 int type, name;
1794
1795 type = MEMFILE_TYPE(cft->private);
1796 name = MEMFILE_ATTR(cft->private);
1797 switch (type) {
1798 case _MEM:
1799 val = res_counter_read_u64(&mem->res, name);
1800 break;
1801 case _MEMSWAP:
1802 if (do_swap_account)
1803 val = res_counter_read_u64(&mem->memsw, name);
1804 break;
1805 default:
1806 BUG();
1807 break;
1808 }
1809 return val;
892} 1810}
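
mem_cgroup_read() decodes cft->private with MEMFILE_TYPE()/MEMFILE_ATTR(), whose definitions do not appear in this hunk. A plausible sketch of that packing, assuming two 16-bit fields in one int (illustrative only):

#include <stdio.h>

#define MEMFILE_PRIVATE(type, attr)     (((type) << 16) | (attr))
#define MEMFILE_TYPE(val)               (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)               ((val) & 0xffff)

enum { _MEM, _MEMSWAP };
enum { RES_USAGE, RES_LIMIT, RES_MAX_USAGE, RES_FAILCNT };

int main(void)
{
        int priv = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT);

        printf("type=%d attr=%d\n", MEMFILE_TYPE(priv), MEMFILE_ATTR(priv));
        return 0;
}
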
893/* 1811/*
894 * The user of this function is... 1812 * The user of this function is...
@@ -898,15 +1816,22 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
898 const char *buffer) 1816 const char *buffer)
899{ 1817{
900 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 1818 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
1819 int type, name;
901 unsigned long long val; 1820 unsigned long long val;
902 int ret; 1821 int ret;
903 1822
904 switch (cft->private) { 1823 type = MEMFILE_TYPE(cft->private);
1824 name = MEMFILE_ATTR(cft->private);
1825 switch (name) {
905 case RES_LIMIT: 1826 case RES_LIMIT:
906 /* This function does all necessary parse...reuse it */ 1827 /* This function does all necessary parse...reuse it */
907 ret = res_counter_memparse_write_strategy(buffer, &val); 1828 ret = res_counter_memparse_write_strategy(buffer, &val);
908 if (!ret) 1829 if (ret)
1830 break;
1831 if (type == _MEM)
909 ret = mem_cgroup_resize_limit(memcg, val); 1832 ret = mem_cgroup_resize_limit(memcg, val);
1833 else
1834 ret = mem_cgroup_resize_memsw_limit(memcg, val);
910 break; 1835 break;
911 default: 1836 default:
912 ret = -EINVAL; /* should be BUG() ? */ 1837 ret = -EINVAL; /* should be BUG() ? */
@@ -915,27 +1840,59 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
915 return ret; 1840 return ret;
916} 1841}
917 1842
1843static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
1844 unsigned long long *mem_limit, unsigned long long *memsw_limit)
1845{
1846 struct cgroup *cgroup;
1847 unsigned long long min_limit, min_memsw_limit, tmp;
1848
1849 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1850 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1851 cgroup = memcg->css.cgroup;
1852 if (!memcg->use_hierarchy)
1853 goto out;
1854
1855 while (cgroup->parent) {
1856 cgroup = cgroup->parent;
1857 memcg = mem_cgroup_from_cont(cgroup);
1858 if (!memcg->use_hierarchy)
1859 break;
1860 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
1861 min_limit = min(min_limit, tmp);
1862 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1863 min_memsw_limit = min(min_memsw_limit, tmp);
1864 }
1865out:
1866 *mem_limit = min_limit;
1867 *memsw_limit = min_memsw_limit;
1868 return;
1869}
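
memcg_get_hierarchical_limit() walks up the cgroup chain keeping the minimum limit seen, stopping at the first ancestor that has hierarchy disabled. The same walk over a toy parent-linked structure, memory limit only and with illustrative types:

#include <stdio.h>

struct node {
        struct node *parent;
        int use_hierarchy;
        unsigned long long limit;
};

/* Smallest limit along the parent chain, as long as hierarchy stays enabled. */
static unsigned long long hierarchical_limit(struct node *n)
{
        unsigned long long min_limit = n->limit;

        if (!n->use_hierarchy)
                return min_limit;

        while (n->parent) {
                n = n->parent;
                if (!n->use_hierarchy)
                        break;          /* stop at a non-hierarchical ancestor */
                if (n->limit < min_limit)
                        min_limit = n->limit;
        }
        return min_limit;
}

int main(void)
{
        struct node root  = { .parent = NULL,  .use_hierarchy = 1, .limit = 400 };
        struct node child = { .parent = &root, .use_hierarchy = 1, .limit = 900 };

        printf("effective limit: %llu\n", hierarchical_limit(&child));
        return 0;
}
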
1870
918static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 1871static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
919{ 1872{
920 struct mem_cgroup *mem; 1873 struct mem_cgroup *mem;
1874 int type, name;
921 1875
922 mem = mem_cgroup_from_cont(cont); 1876 mem = mem_cgroup_from_cont(cont);
923 switch (event) { 1877 type = MEMFILE_TYPE(event);
1878 name = MEMFILE_ATTR(event);
1879 switch (name) {
924 case RES_MAX_USAGE: 1880 case RES_MAX_USAGE:
925 res_counter_reset_max(&mem->res); 1881 if (type == _MEM)
1882 res_counter_reset_max(&mem->res);
1883 else
1884 res_counter_reset_max(&mem->memsw);
926 break; 1885 break;
927 case RES_FAILCNT: 1886 case RES_FAILCNT:
928 res_counter_reset_failcnt(&mem->res); 1887 if (type == _MEM)
1888 res_counter_reset_failcnt(&mem->res);
1889 else
1890 res_counter_reset_failcnt(&mem->memsw);
929 break; 1891 break;
930 } 1892 }
931 return 0; 1893 return 0;
932} 1894}
933 1895
934static int mem_force_empty_write(struct cgroup *cont, unsigned int event)
935{
936 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
937}
938
939static const struct mem_cgroup_stat_desc { 1896static const struct mem_cgroup_stat_desc {
940 const char *msg; 1897 const char *msg;
941 u64 unit; 1898 u64 unit;
@@ -984,42 +1941,170 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
984 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); 1941 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
985 1942
986 } 1943 }
1944 {
1945 unsigned long long limit, memsw_limit;
1946 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
1947 cb->fill(cb, "hierarchical_memory_limit", limit);
1948 if (do_swap_account)
1949 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
1950 }
1951
1952#ifdef CONFIG_DEBUG_VM
1953 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
1954
1955 {
1956 int nid, zid;
1957 struct mem_cgroup_per_zone *mz;
1958 unsigned long recent_rotated[2] = {0, 0};
1959 unsigned long recent_scanned[2] = {0, 0};
1960
1961 for_each_online_node(nid)
1962 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1963 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1964
1965 recent_rotated[0] +=
1966 mz->reclaim_stat.recent_rotated[0];
1967 recent_rotated[1] +=
1968 mz->reclaim_stat.recent_rotated[1];
1969 recent_scanned[0] +=
1970 mz->reclaim_stat.recent_scanned[0];
1971 recent_scanned[1] +=
1972 mz->reclaim_stat.recent_scanned[1];
1973 }
1974 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
1975 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
1976 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
1977 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
1978 }
1979#endif
1980
987 return 0; 1981 return 0;
988} 1982}
989 1983
1984static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
1985{
1986 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
1987
1988 return get_swappiness(memcg);
1989}
1990
1991static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
1992 u64 val)
1993{
1994 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
1995 struct mem_cgroup *parent;
1996
1997 if (val > 100)
1998 return -EINVAL;
1999
2000 if (cgrp->parent == NULL)
2001 return -EINVAL;
2002
2003 parent = mem_cgroup_from_cont(cgrp->parent);
2004
2005 cgroup_lock();
2006
2007 /* If under hierarchy, only empty-root can set this value */
2008 if ((parent->use_hierarchy) ||
2009 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
2010 cgroup_unlock();
2011 return -EINVAL;
2012 }
2013
2014 spin_lock(&memcg->reclaim_param_lock);
2015 memcg->swappiness = val;
2016 spin_unlock(&memcg->reclaim_param_lock);
2017
2018 cgroup_unlock();
2019
2020 return 0;
2021}
2022
2023
990static struct cftype mem_cgroup_files[] = { 2024static struct cftype mem_cgroup_files[] = {
991 { 2025 {
992 .name = "usage_in_bytes", 2026 .name = "usage_in_bytes",
993 .private = RES_USAGE, 2027 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
994 .read_u64 = mem_cgroup_read, 2028 .read_u64 = mem_cgroup_read,
995 }, 2029 },
996 { 2030 {
997 .name = "max_usage_in_bytes", 2031 .name = "max_usage_in_bytes",
998 .private = RES_MAX_USAGE, 2032 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
999 .trigger = mem_cgroup_reset, 2033 .trigger = mem_cgroup_reset,
1000 .read_u64 = mem_cgroup_read, 2034 .read_u64 = mem_cgroup_read,
1001 }, 2035 },
1002 { 2036 {
1003 .name = "limit_in_bytes", 2037 .name = "limit_in_bytes",
1004 .private = RES_LIMIT, 2038 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
1005 .write_string = mem_cgroup_write, 2039 .write_string = mem_cgroup_write,
1006 .read_u64 = mem_cgroup_read, 2040 .read_u64 = mem_cgroup_read,
1007 }, 2041 },
1008 { 2042 {
1009 .name = "failcnt", 2043 .name = "failcnt",
1010 .private = RES_FAILCNT, 2044 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
1011 .trigger = mem_cgroup_reset, 2045 .trigger = mem_cgroup_reset,
1012 .read_u64 = mem_cgroup_read, 2046 .read_u64 = mem_cgroup_read,
1013 }, 2047 },
1014 { 2048 {
2049 .name = "stat",
2050 .read_map = mem_control_stat_show,
2051 },
2052 {
1015 .name = "force_empty", 2053 .name = "force_empty",
1016 .trigger = mem_force_empty_write, 2054 .trigger = mem_cgroup_force_empty_write,
1017 }, 2055 },
1018 { 2056 {
1019 .name = "stat", 2057 .name = "use_hierarchy",
1020 .read_map = mem_control_stat_show, 2058 .write_u64 = mem_cgroup_hierarchy_write,
2059 .read_u64 = mem_cgroup_hierarchy_read,
1021 }, 2060 },
2061 {
2062 .name = "swappiness",
2063 .read_u64 = mem_cgroup_swappiness_read,
2064 .write_u64 = mem_cgroup_swappiness_write,
2065 },
2066};
2067
2068#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2069static struct cftype memsw_cgroup_files[] = {
2070 {
2071 .name = "memsw.usage_in_bytes",
2072 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
2073 .read_u64 = mem_cgroup_read,
2074 },
2075 {
2076 .name = "memsw.max_usage_in_bytes",
2077 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
2078 .trigger = mem_cgroup_reset,
2079 .read_u64 = mem_cgroup_read,
2080 },
2081 {
2082 .name = "memsw.limit_in_bytes",
2083 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
2084 .write_string = mem_cgroup_write,
2085 .read_u64 = mem_cgroup_read,
2086 },
2087 {
2088 .name = "memsw.failcnt",
2089 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
2090 .trigger = mem_cgroup_reset,
2091 .read_u64 = mem_cgroup_read,
2092 },
2093};
2094
2095static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
2096{
2097 if (!do_swap_account)
2098 return 0;
2099 return cgroup_add_files(cont, ss, memsw_cgroup_files,
2100 ARRAY_SIZE(memsw_cgroup_files));
1022}; 2101};
2102#else
2103static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
2104{
2105 return 0;
2106}
2107#endif
1023 2108
1024static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 2109static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1025{ 2110{
@@ -1046,7 +2131,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1046 2131
1047 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 2132 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
1048 mz = &pn->zoneinfo[zone]; 2133 mz = &pn->zoneinfo[zone];
1049 spin_lock_init(&mz->lru_lock);
1050 for_each_lru(l) 2134 for_each_lru(l)
1051 INIT_LIST_HEAD(&mz->lists[l]); 2135 INIT_LIST_HEAD(&mz->lists[l]);
1052 } 2136 }
@@ -1058,55 +2142,133 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1058 kfree(mem->info.nodeinfo[node]); 2142 kfree(mem->info.nodeinfo[node]);
1059} 2143}
1060 2144
2145static int mem_cgroup_size(void)
2146{
2147 int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
2148 return sizeof(struct mem_cgroup) + cpustat_size;
2149}
2150
1061static struct mem_cgroup *mem_cgroup_alloc(void) 2151static struct mem_cgroup *mem_cgroup_alloc(void)
1062{ 2152{
1063 struct mem_cgroup *mem; 2153 struct mem_cgroup *mem;
2154 int size = mem_cgroup_size();
1064 2155
1065 if (sizeof(*mem) < PAGE_SIZE) 2156 if (size < PAGE_SIZE)
1066 mem = kmalloc(sizeof(*mem), GFP_KERNEL); 2157 mem = kmalloc(size, GFP_KERNEL);
1067 else 2158 else
1068 mem = vmalloc(sizeof(*mem)); 2159 mem = vmalloc(size);
1069 2160
1070 if (mem) 2161 if (mem)
1071 memset(mem, 0, sizeof(*mem)); 2162 memset(mem, 0, size);
1072 return mem; 2163 return mem;
1073} 2164}
1074 2165
1075static void mem_cgroup_free(struct mem_cgroup *mem) 2166/*
 2167 * When destroying a mem_cgroup, references from swap_cgroup can remain.
2168 * (scanning all at force_empty is too costly...)
2169 *
2170 * Instead of clearing all references at force_empty, we remember
 2171 * the number of references from swap_cgroup and free the mem_cgroup when
2172 * it goes down to 0.
2173 *
2174 * Removal of cgroup itself succeeds regardless of refs from swap.
2175 */
2176
2177static void __mem_cgroup_free(struct mem_cgroup *mem)
1076{ 2178{
1077 if (sizeof(*mem) < PAGE_SIZE) 2179 int node;
2180
2181 for_each_node_state(node, N_POSSIBLE)
2182 free_mem_cgroup_per_zone_info(mem, node);
2183
2184 if (mem_cgroup_size() < PAGE_SIZE)
1078 kfree(mem); 2185 kfree(mem);
1079 else 2186 else
1080 vfree(mem); 2187 vfree(mem);
1081} 2188}
1082 2189
2190static void mem_cgroup_get(struct mem_cgroup *mem)
2191{
2192 atomic_inc(&mem->refcnt);
2193}
1083 2194
1084static struct cgroup_subsys_state * 2195static void mem_cgroup_put(struct mem_cgroup *mem)
2196{
2197 if (atomic_dec_and_test(&mem->refcnt)) {
2198 struct mem_cgroup *parent = parent_mem_cgroup(mem);
2199 __mem_cgroup_free(mem);
2200 if (parent)
2201 mem_cgroup_put(parent);
2202 }
2203}
2204
2205/*
 2206 * Returns the parent mem_cgroup in the memcg hierarchy, when hierarchy is enabled.
2207 */
2208static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
2209{
2210 if (!mem->res.parent)
2211 return NULL;
2212 return mem_cgroup_from_res_counter(mem->res.parent, res);
2213}
2214
2215#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2216static void __init enable_swap_cgroup(void)
2217{
2218 if (!mem_cgroup_disabled() && really_do_swap_account)
2219 do_swap_account = 1;
2220}
2221#else
2222static void __init enable_swap_cgroup(void)
2223{
2224}
2225#endif
2226
2227static struct cgroup_subsys_state * __ref
1085mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 2228mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
1086{ 2229{
1087 struct mem_cgroup *mem; 2230 struct mem_cgroup *mem, *parent;
1088 int node; 2231 int node;
1089 2232
1090 if (unlikely((cont->parent) == NULL)) { 2233 mem = mem_cgroup_alloc();
1091 mem = &init_mem_cgroup; 2234 if (!mem)
1092 } else { 2235 return ERR_PTR(-ENOMEM);
1093 mem = mem_cgroup_alloc();
1094 if (!mem)
1095 return ERR_PTR(-ENOMEM);
1096 }
1097
1098 res_counter_init(&mem->res);
1099 2236
1100 for_each_node_state(node, N_POSSIBLE) 2237 for_each_node_state(node, N_POSSIBLE)
1101 if (alloc_mem_cgroup_per_zone_info(mem, node)) 2238 if (alloc_mem_cgroup_per_zone_info(mem, node))
1102 goto free_out; 2239 goto free_out;
2240 /* root ? */
2241 if (cont->parent == NULL) {
2242 enable_swap_cgroup();
2243 parent = NULL;
2244 } else {
2245 parent = mem_cgroup_from_cont(cont->parent);
2246 mem->use_hierarchy = parent->use_hierarchy;
2247 }
1103 2248
2249 if (parent && parent->use_hierarchy) {
2250 res_counter_init(&mem->res, &parent->res);
2251 res_counter_init(&mem->memsw, &parent->memsw);
2252 /*
2253 * We increment refcnt of the parent to ensure that we can
2254 * safely access it on res_counter_charge/uncharge.
2255 * This refcnt will be decremented when freeing this
 2256 * mem_cgroup (see mem_cgroup_put()).
2257 */
2258 mem_cgroup_get(parent);
2259 } else {
2260 res_counter_init(&mem->res, NULL);
2261 res_counter_init(&mem->memsw, NULL);
2262 }
2263 mem->last_scanned_child = NULL;
2264 spin_lock_init(&mem->reclaim_param_lock);
2265
2266 if (parent)
2267 mem->swappiness = get_swappiness(parent);
2268 atomic_set(&mem->refcnt, 1);
1104 return &mem->css; 2269 return &mem->css;
1105free_out: 2270free_out:
1106 for_each_node_state(node, N_POSSIBLE) 2271 __mem_cgroup_free(mem);
1107 free_mem_cgroup_per_zone_info(mem, node);
1108 if (cont->parent != NULL)
1109 mem_cgroup_free(mem);
1110 return ERR_PTR(-ENOMEM); 2272 return ERR_PTR(-ENOMEM);
1111} 2273}
1112 2274
@@ -1114,26 +2276,33 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
1114 struct cgroup *cont) 2276 struct cgroup *cont)
1115{ 2277{
1116 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2278 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1117 mem_cgroup_force_empty(mem); 2279 mem_cgroup_force_empty(mem, false);
1118} 2280}
1119 2281
1120static void mem_cgroup_destroy(struct cgroup_subsys *ss, 2282static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1121 struct cgroup *cont) 2283 struct cgroup *cont)
1122{ 2284{
1123 int node;
1124 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2285 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2286 struct mem_cgroup *last_scanned_child = mem->last_scanned_child;
1125 2287
1126 for_each_node_state(node, N_POSSIBLE) 2288 if (last_scanned_child) {
1127 free_mem_cgroup_per_zone_info(mem, node); 2289 VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child));
1128 2290 mem_cgroup_put(last_scanned_child);
1129 mem_cgroup_free(mem_cgroup_from_cont(cont)); 2291 }
2292 mem_cgroup_put(mem);
1130} 2293}
1131 2294
1132static int mem_cgroup_populate(struct cgroup_subsys *ss, 2295static int mem_cgroup_populate(struct cgroup_subsys *ss,
1133 struct cgroup *cont) 2296 struct cgroup *cont)
1134{ 2297{
1135 return cgroup_add_files(cont, ss, mem_cgroup_files, 2298 int ret;
1136 ARRAY_SIZE(mem_cgroup_files)); 2299
2300 ret = cgroup_add_files(cont, ss, mem_cgroup_files,
2301 ARRAY_SIZE(mem_cgroup_files));
2302
2303 if (!ret)
2304 ret = register_memsw_files(cont, ss);
2305 return ret;
1137} 2306}
1138 2307
1139static void mem_cgroup_move_task(struct cgroup_subsys *ss, 2308static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -1141,25 +2310,12 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1141 struct cgroup *old_cont, 2310 struct cgroup *old_cont,
1142 struct task_struct *p) 2311 struct task_struct *p)
1143{ 2312{
1144 struct mm_struct *mm; 2313 mutex_lock(&memcg_tasklist);
1145 struct mem_cgroup *mem, *old_mem;
1146
1147 mm = get_task_mm(p);
1148 if (mm == NULL)
1149 return;
1150
1151 mem = mem_cgroup_from_cont(cont);
1152 old_mem = mem_cgroup_from_cont(old_cont);
1153
1154 /* 2314 /*
1155 * Only thread group leaders are allowed to migrate, the mm_struct is 2315 * FIXME: It's better to move charges of this process from old
1156 * in effect owned by the leader 2316 * memcg to new memcg. But it's just on TODO-List now.
1157 */ 2317 */
1158 if (!thread_group_leader(p)) 2318 mutex_unlock(&memcg_tasklist);
1159 goto out;
1160
1161out:
1162 mmput(mm);
1163} 2319}
1164 2320
1165struct cgroup_subsys mem_cgroup_subsys = { 2321struct cgroup_subsys mem_cgroup_subsys = {
@@ -1172,3 +2328,13 @@ struct cgroup_subsys mem_cgroup_subsys = {
1172 .attach = mem_cgroup_move_task, 2328 .attach = mem_cgroup_move_task,
1173 .early_init = 0, 2329 .early_init = 0,
1174}; 2330};
2331
2332#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2333
2334static int __init disable_swap_account(char *s)
2335{
2336 really_do_swap_account = 0;
2337 return 1;
2338}
2339__setup("noswapaccount", disable_swap_account);
2340#endif
diff --git a/mm/memory.c b/mm/memory.c
index 0a2010a9518c..baa999e87cd2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -52,6 +52,9 @@
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/memcontrol.h> 53#include <linux/memcontrol.h>
54#include <linux/mmu_notifier.h> 54#include <linux/mmu_notifier.h>
55#include <linux/kallsyms.h>
56#include <linux/swapops.h>
57#include <linux/elf.h>
55 58
56#include <asm/pgalloc.h> 59#include <asm/pgalloc.h>
57#include <asm/uaccess.h> 60#include <asm/uaccess.h>
@@ -59,9 +62,6 @@
59#include <asm/tlbflush.h> 62#include <asm/tlbflush.h>
60#include <asm/pgtable.h> 63#include <asm/pgtable.h>
61 64
62#include <linux/swapops.h>
63#include <linux/elf.h>
64
65#include "internal.h" 65#include "internal.h"
66 66
67#ifndef CONFIG_NEED_MULTIPLE_NODES 67#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -375,15 +375,65 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
375 * 375 *
376 * The calling function must still handle the error. 376 * The calling function must still handle the error.
377 */ 377 */
378static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, 378static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
379 unsigned long vaddr) 379 pte_t pte, struct page *page)
380{ 380{
381 printk(KERN_ERR "Bad pte = %08llx, process = %s, " 381 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
382 "vm_flags = %lx, vaddr = %lx\n", 382 pud_t *pud = pud_offset(pgd, addr);
383 (long long)pte_val(pte), 383 pmd_t *pmd = pmd_offset(pud, addr);
384 (vma->vm_mm == current->mm ? current->comm : "???"), 384 struct address_space *mapping;
385 vma->vm_flags, vaddr); 385 pgoff_t index;
386 static unsigned long resume;
387 static unsigned long nr_shown;
388 static unsigned long nr_unshown;
389
390 /*
391 * Allow a burst of 60 reports, then keep quiet for that minute;
392 * or allow a steady drip of one report per second.
393 */
394 if (nr_shown == 60) {
395 if (time_before(jiffies, resume)) {
396 nr_unshown++;
397 return;
398 }
399 if (nr_unshown) {
400 printk(KERN_ALERT
401 "BUG: Bad page map: %lu messages suppressed\n",
402 nr_unshown);
403 nr_unshown = 0;
404 }
405 nr_shown = 0;
406 }
407 if (nr_shown++ == 0)
408 resume = jiffies + 60 * HZ;
409
410 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
411 index = linear_page_index(vma, addr);
412
413 printk(KERN_ALERT
414 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
415 current->comm,
416 (long long)pte_val(pte), (long long)pmd_val(*pmd));
417 if (page) {
418 printk(KERN_ALERT
419 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
420 page, (void *)page->flags, page_count(page),
421 page_mapcount(page), page->mapping, page->index);
422 }
423 printk(KERN_ALERT
424 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
425 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
426 /*
427 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
428 */
429 if (vma->vm_ops)
430 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
431 (unsigned long)vma->vm_ops->fault);
432 if (vma->vm_file && vma->vm_file->f_op)
433 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
434 (unsigned long)vma->vm_file->f_op->mmap);
386 dump_stack(); 435 dump_stack();
436 add_taint(TAINT_BAD_PAGE);
387} 437}
388 438
389static inline int is_cow_mapping(unsigned int flags) 439static inline int is_cow_mapping(unsigned int flags)
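
The new print_bad_pte() throttles its output: a burst of up to 60 reports, then silence (with a suppressed-message count) until a minute has passed since the burst began. Below is a self-contained approximation of that rate limiter using wall-clock seconds instead of jiffies; it is illustrative only and not the kernel code.

#include <stdio.h>
#include <time.h>

/* Return 1 if a report should be printed now, 0 if it should be suppressed. */
static int should_report(void)
{
        static time_t resume;
        static unsigned long nr_shown, nr_unshown;
        time_t now = time(NULL);

        if (nr_shown == 60) {
                if (now < resume) {
                        nr_unshown++;
                        return 0;
                }
                if (nr_unshown) {
                        printf("%lu messages suppressed\n", nr_unshown);
                        nr_unshown = 0;
                }
                nr_shown = 0;
        }
        if (nr_shown++ == 0)
                resume = now + 60;      /* quiet period starts with the burst */
        return 1;
}

int main(void)
{
        int i, shown = 0;

        for (i = 0; i < 200; i++)
                shown += should_report();
        printf("shown %d of 200\n", shown);
        return 0;
}
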
@@ -441,21 +491,18 @@ static inline int is_cow_mapping(unsigned int flags)
441struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, 491struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
442 pte_t pte) 492 pte_t pte)
443{ 493{
444 unsigned long pfn; 494 unsigned long pfn = pte_pfn(pte);
445 495
446 if (HAVE_PTE_SPECIAL) { 496 if (HAVE_PTE_SPECIAL) {
447 if (likely(!pte_special(pte))) { 497 if (likely(!pte_special(pte)))
448 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 498 goto check_pfn;
449 return pte_page(pte); 499 if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
450 } 500 print_bad_pte(vma, addr, pte, NULL);
451 VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
452 return NULL; 501 return NULL;
453 } 502 }
454 503
455 /* !HAVE_PTE_SPECIAL case follows: */ 504 /* !HAVE_PTE_SPECIAL case follows: */
456 505
457 pfn = pte_pfn(pte);
458
459 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { 506 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
460 if (vma->vm_flags & VM_MIXEDMAP) { 507 if (vma->vm_flags & VM_MIXEDMAP) {
461 if (!pfn_valid(pfn)) 508 if (!pfn_valid(pfn))
@@ -471,11 +518,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
471 } 518 }
472 } 519 }
473 520
474 VM_BUG_ON(!pfn_valid(pfn)); 521check_pfn:
522 if (unlikely(pfn > highest_memmap_pfn)) {
523 print_bad_pte(vma, addr, pte, NULL);
524 return NULL;
525 }
475 526
476 /* 527 /*
477 * NOTE! We still have PageReserved() pages in the page tables. 528 * NOTE! We still have PageReserved() pages in the page tables.
478 *
479 * eg. VDSO mappings can cause them to exist. 529 * eg. VDSO mappings can cause them to exist.
480 */ 530 */
481out: 531out:
@@ -767,11 +817,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
767 else { 817 else {
768 if (pte_dirty(ptent)) 818 if (pte_dirty(ptent))
769 set_page_dirty(page); 819 set_page_dirty(page);
770 if (pte_young(ptent)) 820 if (pte_young(ptent) &&
771 SetPageReferenced(page); 821 likely(!VM_SequentialReadHint(vma)))
822 mark_page_accessed(page);
772 file_rss--; 823 file_rss--;
773 } 824 }
774 page_remove_rmap(page, vma); 825 page_remove_rmap(page);
826 if (unlikely(page_mapcount(page) < 0))
827 print_bad_pte(vma, addr, ptent, page);
775 tlb_remove_page(tlb, page); 828 tlb_remove_page(tlb, page);
776 continue; 829 continue;
777 } 830 }
@@ -781,8 +834,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
781 */ 834 */
782 if (unlikely(details)) 835 if (unlikely(details))
783 continue; 836 continue;
784 if (!pte_file(ptent)) 837 if (pte_file(ptent)) {
785 free_swap_and_cache(pte_to_swp_entry(ptent)); 838 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
839 print_bad_pte(vma, addr, ptent, NULL);
840 } else if
841 (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
842 print_bad_pte(vma, addr, ptent, NULL);
786 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 843 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
787 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); 844 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
788 845
@@ -1153,6 +1210,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1153 int write = !!(flags & GUP_FLAGS_WRITE); 1210 int write = !!(flags & GUP_FLAGS_WRITE);
1154 int force = !!(flags & GUP_FLAGS_FORCE); 1211 int force = !!(flags & GUP_FLAGS_FORCE);
1155 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); 1212 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1213 int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
1156 1214
1157 if (len <= 0) 1215 if (len <= 0)
1158 return 0; 1216 return 0;
@@ -1231,12 +1289,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1231 struct page *page; 1289 struct page *page;
1232 1290
1233 /* 1291 /*
1234 * If tsk is ooming, cut off its access to large memory 1292 * If we have a pending SIGKILL, don't keep faulting
1235 * allocations. It has a pending SIGKILL, but it can't 1293 * pages and potentially allocating memory, unless
1236 * be processed until returning to user space. 1294 * current is handling munlock--e.g., on exit. In
1295 * that case, we are not allocating memory. Rather,
1296 * we're only unlocking already resident/mapped pages.
1237 */ 1297 */
1238 if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) 1298 if (unlikely(!ignore_sigkill &&
1239 return i ? i : -ENOMEM; 1299 fatal_signal_pending(current)))
1300 return i ? i : -ERESTARTSYS;
1240 1301
1241 if (write) 1302 if (write)
1242 foll_flags |= FOLL_WRITE; 1303 foll_flags |= FOLL_WRITE;
@@ -1263,9 +1324,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1263 * do_wp_page has broken COW when necessary, 1324 * do_wp_page has broken COW when necessary,
1264 * even if maybe_mkwrite decided not to set 1325 * even if maybe_mkwrite decided not to set
1265 * pte_write. We can thus safely do subsequent 1326 * pte_write. We can thus safely do subsequent
1266 * page lookups as if they were reads. 1327 * page lookups as if they were reads. But only
1328 * do so when looping for pte_write is futile:
1329 * in some cases userspace may also be wanting
1330 * to write to the gotten user page, which a
1331 * read fault here might prevent (a readonly
1332 * page might get reCOWed by userspace write).
1267 */ 1333 */
1268 if (ret & VM_FAULT_WRITE) 1334 if ((ret & VM_FAULT_WRITE) &&
1335 !(vma->vm_flags & VM_WRITE))
1269 foll_flags &= ~FOLL_WRITE; 1336 foll_flags &= ~FOLL_WRITE;
1270 1337
1271 cond_resched(); 1338 cond_resched();
@@ -1444,6 +1511,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1444 unsigned long pfn) 1511 unsigned long pfn)
1445{ 1512{
1446 int ret; 1513 int ret;
1514 pgprot_t pgprot = vma->vm_page_prot;
1447 /* 1515 /*
1448 * Technically, architectures with pte_special can avoid all these 1516 * Technically, architectures with pte_special can avoid all these
1449 * restrictions (same for remap_pfn_range). However we would like 1517 * restrictions (same for remap_pfn_range). However we would like
@@ -1458,10 +1526,10 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1458 1526
1459 if (addr < vma->vm_start || addr >= vma->vm_end) 1527 if (addr < vma->vm_start || addr >= vma->vm_end)
1460 return -EFAULT; 1528 return -EFAULT;
1461 if (track_pfn_vma_new(vma, vma->vm_page_prot, pfn, PAGE_SIZE)) 1529 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
1462 return -EINVAL; 1530 return -EINVAL;
1463 1531
1464 ret = insert_pfn(vma, addr, pfn, vma->vm_page_prot); 1532 ret = insert_pfn(vma, addr, pfn, pgprot);
1465 1533
1466 if (ret) 1534 if (ret)
1467 untrack_pfn_vma(vma, pfn, PAGE_SIZE); 1535 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
@@ -1604,9 +1672,15 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1604 1672
1605 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; 1673 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1606 1674
1607 err = track_pfn_vma_new(vma, prot, pfn, PAGE_ALIGN(size)); 1675 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
1608 if (err) 1676 if (err) {
1677 /*
1678 * To indicate that track_pfn related cleanup is not
1679 * needed from higher level routine calling unmap_vmas
1680 */
1681 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
1609 return -EINVAL; 1682 return -EINVAL;
1683 }
1610 1684
1611 BUG_ON(addr >= end); 1685 BUG_ON(addr >= end);
1612 pfn -= addr >> PAGE_SHIFT; 1686 pfn -= addr >> PAGE_SHIFT;
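[editorial sketch] The remap_pfn_range() hunk above passes the pgprot by reference to track_pfn_vma_new() and, if that reservation fails, clears the VM flags it had just set. For context, a minimal sketch of the usual caller, a character driver's ->mmap handler; the driver name and physical base are illustrative, not from this patch:

#include <linux/fs.h>
#include <linux/mm.h>

static const unsigned long mydrv_phys_base = 0xd0000000;	/* illustrative */

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	/* if PAT reserve tracking fails, remap_pfn_range() now clears the
	 * VM_IO | VM_RESERVED | VM_PFNMAP flags it set before returning,
	 * so the driver needs no extra cleanup of its own */
	if (remap_pfn_range(vma, vma->vm_start,
			    mydrv_phys_base >> PAGE_SHIFT,
			    size, vma->vm_page_prot))
		return -EAGAIN;
	return 0;
}
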
@@ -1644,6 +1718,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1644 1718
1645 BUG_ON(pmd_huge(*pmd)); 1719 BUG_ON(pmd_huge(*pmd));
1646 1720
1721 arch_enter_lazy_mmu_mode();
1722
1647 token = pmd_pgtable(*pmd); 1723 token = pmd_pgtable(*pmd);
1648 1724
1649 do { 1725 do {
@@ -1652,6 +1728,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1652 break; 1728 break;
1653 } while (pte++, addr += PAGE_SIZE, addr != end); 1729 } while (pte++, addr += PAGE_SIZE, addr != end);
1654 1730
1731 arch_leave_lazy_mmu_mode();
1732
1655 if (mm != &init_mm) 1733 if (mm != &init_mm)
1656 pte_unmap_unlock(pte-1, ptl); 1734 pte_unmap_unlock(pte-1, ptl);
1657 return err; 1735 return err;
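[editorial sketch] The two hunks above bracket the whole PTE walk in apply_to_pte_range() with arch_enter/leave_lazy_mmu_mode(). A minimal sketch of the kind of caller that benefits, assuming the apply_to_page_range()/pte_fn_t interface of this kernel; the write-protect callback is illustrative (a real user would also flush the TLB afterwards):

#include <linux/mm.h>
#include <asm/pgtable.h>

static int wrprotect_pte(pte_t *ptep, pgtable_t token,
			 unsigned long addr, void *data)
{
	struct mm_struct *mm = data;

	/* runs under the PTE lock; with this change the whole loop is also
	 * inside one lazy-MMU section, so paravirt guests can batch updates */
	set_pte_at(mm, addr, ptep, pte_wrprotect(*ptep));
	return 0;
}

static int wrprotect_range(struct mm_struct *mm, unsigned long addr,
			   unsigned long size)
{
	return apply_to_page_range(mm, addr, size, wrprotect_pte, mm);
}
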
@@ -1837,10 +1915,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1837 * not dirty accountable. 1915 * not dirty accountable.
1838 */ 1916 */
1839 if (PageAnon(old_page)) { 1917 if (PageAnon(old_page)) {
1840 if (trylock_page(old_page)) { 1918 if (!trylock_page(old_page)) {
1841 reuse = can_share_swap_page(old_page); 1919 page_cache_get(old_page);
1842 unlock_page(old_page); 1920 pte_unmap_unlock(page_table, ptl);
1921 lock_page(old_page);
1922 page_table = pte_offset_map_lock(mm, pmd, address,
1923 &ptl);
1924 if (!pte_same(*page_table, orig_pte)) {
1925 unlock_page(old_page);
1926 page_cache_release(old_page);
1927 goto unlock;
1928 }
1929 page_cache_release(old_page);
1843 } 1930 }
1931 reuse = reuse_swap_page(old_page);
1932 unlock_page(old_page);
1844 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 1933 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
1845 (VM_WRITE|VM_SHARED))) { 1934 (VM_WRITE|VM_SHARED))) {
1846 /* 1935 /*
@@ -1910,7 +1999,7 @@ gotten:
1910 * Don't let another task, with possibly unlocked vma, 1999 * Don't let another task, with possibly unlocked vma,
1911 * keep the mlocked page. 2000 * keep the mlocked page.
1912 */ 2001 */
1913 if (vma->vm_flags & VM_LOCKED) { 2002 if ((vma->vm_flags & VM_LOCKED) && old_page) {
1914 lock_page(old_page); /* for LRU manipulation */ 2003 lock_page(old_page); /* for LRU manipulation */
1915 clear_page_mlock(old_page); 2004 clear_page_mlock(old_page);
1916 unlock_page(old_page); 2005 unlock_page(old_page);
@@ -1918,7 +2007,7 @@ gotten:
1918 cow_user_page(new_page, old_page, address, vma); 2007 cow_user_page(new_page, old_page, address, vma);
1919 __SetPageUptodate(new_page); 2008 __SetPageUptodate(new_page);
1920 2009
1921 if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) 2010 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
1922 goto oom_free_new; 2011 goto oom_free_new;
1923 2012
1924 /* 2013 /*
@@ -1943,11 +2032,7 @@ gotten:
1943 * thread doing COW. 2032 * thread doing COW.
1944 */ 2033 */
1945 ptep_clear_flush_notify(vma, address, page_table); 2034 ptep_clear_flush_notify(vma, address, page_table);
1946 SetPageSwapBacked(new_page);
1947 lru_cache_add_active_or_unevictable(new_page, vma);
1948 page_add_new_anon_rmap(new_page, vma, address); 2035 page_add_new_anon_rmap(new_page, vma, address);
1949
1950//TODO: is this safe? do_anonymous_page() does it this way.
1951 set_pte_at(mm, address, page_table, entry); 2036 set_pte_at(mm, address, page_table, entry);
1952 update_mmu_cache(vma, address, entry); 2037 update_mmu_cache(vma, address, entry);
1953 if (old_page) { 2038 if (old_page) {
@@ -1973,7 +2058,7 @@ gotten:
1973 * mapcount is visible. So transitively, TLBs to 2058 * mapcount is visible. So transitively, TLBs to
1974 * old page will be flushed before it can be reused. 2059 * old page will be flushed before it can be reused.
1975 */ 2060 */
1976 page_remove_rmap(old_page, vma); 2061 page_remove_rmap(old_page);
1977 } 2062 }
1978 2063
1979 /* Free the old page.. */ 2064 /* Free the old page.. */
@@ -2266,7 +2351,7 @@ int vmtruncate(struct inode * inode, loff_t offset)
2266 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); 2351 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2267 } 2352 }
2268 2353
2269 if (inode->i_op && inode->i_op->truncate) 2354 if (inode->i_op->truncate)
2270 inode->i_op->truncate(inode); 2355 inode->i_op->truncate(inode);
2271 return 0; 2356 return 0;
2272 2357
@@ -2286,7 +2371,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2286 * a way to truncate a range of blocks (punch a hole) - 2371 * a way to truncate a range of blocks (punch a hole) -
2287 * we should return failure right now. 2372 * we should return failure right now.
2288 */ 2373 */
2289 if (!inode->i_op || !inode->i_op->truncate_range) 2374 if (!inode->i_op->truncate_range)
2290 return -ENOSYS; 2375 return -ENOSYS;
2291 2376
2292 mutex_lock(&inode->i_mutex); 2377 mutex_lock(&inode->i_mutex);
@@ -2314,6 +2399,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2314 struct page *page; 2399 struct page *page;
2315 swp_entry_t entry; 2400 swp_entry_t entry;
2316 pte_t pte; 2401 pte_t pte;
2402 struct mem_cgroup *ptr = NULL;
2317 int ret = 0; 2403 int ret = 0;
2318 2404
2319 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2405 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@@ -2352,7 +2438,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2352 lock_page(page); 2438 lock_page(page);
2353 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2439 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2354 2440
2355 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { 2441 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
2356 ret = VM_FAULT_OOM; 2442 ret = VM_FAULT_OOM;
2357 unlock_page(page); 2443 unlock_page(page);
2358 goto out; 2444 goto out;
@@ -2370,22 +2456,35 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2370 goto out_nomap; 2456 goto out_nomap;
2371 } 2457 }
2372 2458
2373 /* The page isn't present yet, go ahead with the fault. */ 2459 /*
2460 * The page isn't present yet, go ahead with the fault.
2461 *
2462 * Be careful about the sequence of operations here.
2463 * To get its accounting right, reuse_swap_page() must be called
2464 * while the page is counted on swap but not yet in mapcount i.e.
2465 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
2466 * must be called after the swap_free(), or it will never succeed.
2467 * Because delete_from_swap_page() may be called by reuse_swap_page(),
2468 * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
2469 * in page->private. In this case, a record in swap_cgroup is silently
2470 * discarded at swap_free().
2471 */
2374 2472
2375 inc_mm_counter(mm, anon_rss); 2473 inc_mm_counter(mm, anon_rss);
2376 pte = mk_pte(page, vma->vm_page_prot); 2474 pte = mk_pte(page, vma->vm_page_prot);
2377 if (write_access && can_share_swap_page(page)) { 2475 if (write_access && reuse_swap_page(page)) {
2378 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2476 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2379 write_access = 0; 2477 write_access = 0;
2380 } 2478 }
2381
2382 flush_icache_page(vma, page); 2479 flush_icache_page(vma, page);
2383 set_pte_at(mm, address, page_table, pte); 2480 set_pte_at(mm, address, page_table, pte);
2384 page_add_anon_rmap(page, vma, address); 2481 page_add_anon_rmap(page, vma, address);
2482 /* It's better to call commit-charge after rmap is established */
2483 mem_cgroup_commit_charge_swapin(page, ptr);
2385 2484
2386 swap_free(entry); 2485 swap_free(entry);
2387 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 2486 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2388 remove_exclusive_swap_page(page); 2487 try_to_free_swap(page);
2389 unlock_page(page); 2488 unlock_page(page);
2390 2489
2391 if (write_access) { 2490 if (write_access) {
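[editorial sketch] The ordering constraint spelled out in the new comment is easy to lose in the diff noise; condensed from the hunk above, the swap-in path now has to sequence its calls like this:

	if (write_access && reuse_swap_page(page))	/* while counted on swap, before rmap */
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
	set_pte_at(mm, address, page_table, pte);
	page_add_anon_rmap(page, vma, address);
	mem_cgroup_commit_charge_swapin(page, ptr);	/* only once rmap is established */
	swap_free(entry);				/* drops the swap reference ... */
	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);			/* ... which this relies on to succeed */
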
@@ -2402,7 +2501,7 @@ unlock:
2402out: 2501out:
2403 return ret; 2502 return ret;
2404out_nomap: 2503out_nomap:
2405 mem_cgroup_uncharge_page(page); 2504 mem_cgroup_cancel_charge_swapin(ptr);
2406 pte_unmap_unlock(page_table, ptl); 2505 pte_unmap_unlock(page_table, ptl);
2407 unlock_page(page); 2506 unlock_page(page);
2408 page_cache_release(page); 2507 page_cache_release(page);
@@ -2432,7 +2531,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2432 goto oom; 2531 goto oom;
2433 __SetPageUptodate(page); 2532 __SetPageUptodate(page);
2434 2533
2435 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) 2534 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
2436 goto oom_free_page; 2535 goto oom_free_page;
2437 2536
2438 entry = mk_pte(page, vma->vm_page_prot); 2537 entry = mk_pte(page, vma->vm_page_prot);
@@ -2442,8 +2541,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2442 if (!pte_none(*page_table)) 2541 if (!pte_none(*page_table))
2443 goto release; 2542 goto release;
2444 inc_mm_counter(mm, anon_rss); 2543 inc_mm_counter(mm, anon_rss);
2445 SetPageSwapBacked(page);
2446 lru_cache_add_active_or_unevictable(page, vma);
2447 page_add_new_anon_rmap(page, vma, address); 2544 page_add_new_anon_rmap(page, vma, address);
2448 set_pte_at(mm, address, page_table, entry); 2545 set_pte_at(mm, address, page_table, entry);
2449 2546
@@ -2525,7 +2622,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2525 ret = VM_FAULT_OOM; 2622 ret = VM_FAULT_OOM;
2526 goto out; 2623 goto out;
2527 } 2624 }
2528 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { 2625 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
2529 ret = VM_FAULT_OOM; 2626 ret = VM_FAULT_OOM;
2530 page_cache_release(page); 2627 page_cache_release(page);
2531 goto out; 2628 goto out;
@@ -2591,8 +2688,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2591 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2688 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2592 if (anon) { 2689 if (anon) {
2593 inc_mm_counter(mm, anon_rss); 2690 inc_mm_counter(mm, anon_rss);
2594 SetPageSwapBacked(page);
2595 lru_cache_add_active_or_unevictable(page, vma);
2596 page_add_new_anon_rmap(page, vma, address); 2691 page_add_new_anon_rmap(page, vma, address);
2597 } else { 2692 } else {
2598 inc_mm_counter(mm, file_rss); 2693 inc_mm_counter(mm, file_rss);
@@ -2602,7 +2697,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2602 get_page(dirty_page); 2697 get_page(dirty_page);
2603 } 2698 }
2604 } 2699 }
2605//TODO: is this safe? do_anonymous_page() does it this way.
2606 set_pte_at(mm, address, page_table, entry); 2700 set_pte_at(mm, address, page_table, entry);
2607 2701
2608 /* no need to invalidate: a not-present page won't be cached */ 2702 /* no need to invalidate: a not-present page won't be cached */
@@ -2666,12 +2760,11 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2666 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2760 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2667 return 0; 2761 return 0;
2668 2762
2669 if (unlikely(!(vma->vm_flags & VM_NONLINEAR) || 2763 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
2670 !(vma->vm_flags & VM_CAN_NONLINEAR))) {
2671 /* 2764 /*
2672 * Page table corrupted: show pte and kill process. 2765 * Page table corrupted: show pte and kill process.
2673 */ 2766 */
2674 print_bad_pte(vma, orig_pte, address); 2767 print_bad_pte(vma, address, orig_pte, NULL);
2675 return VM_FAULT_OOM; 2768 return VM_FAULT_OOM;
2676 } 2769 }
2677 2770
@@ -2953,7 +3046,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
2953{ 3046{
2954 resource_size_t phys_addr; 3047 resource_size_t phys_addr;
2955 unsigned long prot = 0; 3048 unsigned long prot = 0;
2956 void *maddr; 3049 void __iomem *maddr;
2957 int offset = addr & (PAGE_SIZE-1); 3050 int offset = addr & (PAGE_SIZE-1);
2958 3051
2959 if (follow_phys(vma, addr, write, &prot, &phys_addr)) 3052 if (follow_phys(vma, addr, write, &prot, &phys_addr))
@@ -3079,6 +3172,15 @@ void print_vma_addr(char *prefix, unsigned long ip)
3079#ifdef CONFIG_PROVE_LOCKING 3172#ifdef CONFIG_PROVE_LOCKING
3080void might_fault(void) 3173void might_fault(void)
3081{ 3174{
3175 /*
3176 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
3177 * holding the mmap_sem, this is safe because kernel memory doesn't
3178 * get paged out, therefore we'll never actually fault, and the
3179 * below annotations will generate false positives.
3180 */
3181 if (segment_eq(get_fs(), KERNEL_DS))
3182 return;
3183
3082 might_sleep(); 3184 might_sleep();
3083 /* 3185 /*
3084 * it would be nicer only to annotate paths which are not under 3186 * it would be nicer only to annotate paths which are not under
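[editorial sketch] The new KERNEL_DS test in might_fault() is aimed at callers such as nfs/sunrpc that widen the address limit and then run "user" accessors on kernel buffers while mmap_sem may be held. A rough sketch of that caller pattern, assuming a hypothetical send helper (illustrative, not from this patch):

#include <asm/uaccess.h>

static int send_from_kernel_buffer(struct socket *sock, void *buf, size_t len)
{
	mm_segment_t oldfs = get_fs();
	int err;

	set_fs(KERNEL_DS);	/* "user" accessors now accept kernel addresses */
	err = illustrative_sendmsg(sock, buf, len);	/* hypothetical helper that
							 * ends up in copy_from_user()
							 * and hence in might_fault() */
	set_fs(oldfs);
	return err;
}
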
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b17371185468..c083cf5fd6df 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -216,7 +216,8 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
216 return 0; 216 return 0;
217} 217}
218 218
219static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn) 219static int __meminit __add_section(int nid, struct zone *zone,
220 unsigned long phys_start_pfn)
220{ 221{
221 int nr_pages = PAGES_PER_SECTION; 222 int nr_pages = PAGES_PER_SECTION;
222 int ret; 223 int ret;
@@ -234,7 +235,7 @@ static int __meminit __add_section(struct zone *zone, unsigned long phys_start_p
234 if (ret < 0) 235 if (ret < 0)
235 return ret; 236 return ret;
236 237
237 return register_new_memory(__pfn_to_section(phys_start_pfn)); 238 return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
238} 239}
239 240
240#ifdef CONFIG_SPARSEMEM_VMEMMAP 241#ifdef CONFIG_SPARSEMEM_VMEMMAP
@@ -273,8 +274,8 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
273 * call this function after deciding the zone to which to 274 * call this function after deciding the zone to which to
274 * add the new pages. 275 * add the new pages.
275 */ 276 */
276int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, 277int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
277 unsigned long nr_pages) 278 unsigned long nr_pages)
278{ 279{
279 unsigned long i; 280 unsigned long i;
280 int err = 0; 281 int err = 0;
@@ -284,7 +285,7 @@ int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn,
284 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 285 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
285 286
286 for (i = start_sec; i <= end_sec; i++) { 287 for (i = start_sec; i <= end_sec; i++) {
287 err = __add_section(zone, i << PFN_SECTION_SHIFT); 288 err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
288 289
289 /* 290 /*
290 * EEXIST is finally dealt with by ioresource collision 291 * EEXIST is finally dealt with by ioresource collision
@@ -626,15 +627,12 @@ int scan_lru_pages(unsigned long start, unsigned long end)
626} 627}
627 628
628static struct page * 629static struct page *
629hotremove_migrate_alloc(struct page *page, 630hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
630 unsigned long private,
631 int **x)
632{ 631{
633 /* This should be improoooooved!! */ 632 /* This should be improooooved!! */
634 return alloc_page(GFP_HIGHUSER_PAGECACHE); 633 return alloc_page(GFP_HIGHUSER_MOVABLE);
635} 634}
636 635
637
638#define NR_OFFLINE_AT_ONCE_PAGES (256) 636#define NR_OFFLINE_AT_ONCE_PAGES (256)
639static int 637static int
640do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 638do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
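[editorial sketch] With __add_pages() and __add_section() now taking a node id, the arch hook is expected to pass it through so the new memory sections can be registered under the right node in sysfs. A sketch of what an arch_add_memory() implementation looks like after this change (zone selection simplified; not taken from this patch):

#include <linux/memory_hotplug.h>
#include <linux/mmzone.h>

int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	return __add_pages(nid, zone, start_pfn, nr_pages);
}
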
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e412ffa8e52e..3eb4a6fdc043 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1068,10 +1068,9 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1068 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; 1068 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1069} 1069}
1070 1070
1071asmlinkage long sys_mbind(unsigned long start, unsigned long len, 1071SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1072 unsigned long mode, 1072 unsigned long, mode, unsigned long __user *, nmask,
1073 unsigned long __user *nmask, unsigned long maxnode, 1073 unsigned long, maxnode, unsigned, flags)
1074 unsigned flags)
1075{ 1074{
1076 nodemask_t nodes; 1075 nodemask_t nodes;
1077 int err; 1076 int err;
@@ -1091,8 +1090,8 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
1091} 1090}
1092 1091
1093/* Set the process memory policy */ 1092/* Set the process memory policy */
1094asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, 1093SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1095 unsigned long maxnode) 1094 unsigned long, maxnode)
1096{ 1095{
1097 int err; 1096 int err;
1098 nodemask_t nodes; 1097 nodemask_t nodes;
@@ -1110,9 +1109,9 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
1110 return do_set_mempolicy(mode, flags, &nodes); 1109 return do_set_mempolicy(mode, flags, &nodes);
1111} 1110}
1112 1111
1113asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, 1112SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1114 const unsigned long __user *old_nodes, 1113 const unsigned long __user *, old_nodes,
1115 const unsigned long __user *new_nodes) 1114 const unsigned long __user *, new_nodes)
1116{ 1115{
1117 const struct cred *cred = current_cred(), *tcred; 1116 const struct cred *cred = current_cred(), *tcred;
1118 struct mm_struct *mm; 1117 struct mm_struct *mm;
@@ -1185,10 +1184,9 @@ out:
1185 1184
1186 1185
1187/* Retrieve NUMA policy */ 1186/* Retrieve NUMA policy */
1188asmlinkage long sys_get_mempolicy(int __user *policy, 1187SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1189 unsigned long __user *nmask, 1188 unsigned long __user *, nmask, unsigned long, maxnode,
1190 unsigned long maxnode, 1189 unsigned long, addr, unsigned long, flags)
1191 unsigned long addr, unsigned long flags)
1192{ 1190{
1193 int err; 1191 int err;
1194 int uninitialized_var(pval); 1192 int uninitialized_var(pval);
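[editorial sketch] The SYSCALL_DEFINEn conversions above only change how the entry points are declared; the user-visible ABI is unchanged. For reference, a small userspace program driving the converted mbind() directly via syscall(2), assuming node 0 is online and taking the syscall number from <sys/syscall.h>:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define MPOL_BIND 2	/* value from <numaif.h>, repeated here to avoid libnuma */

int main(void)
{
	size_t len = 16 * 4096;
	unsigned long nodemask = 1UL;		/* node 0 only */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	if (syscall(SYS_mbind, p, len, MPOL_BIND, &nodemask,
		    8 * sizeof(nodemask) + 1, 0))
		perror("mbind");
	memset(p, 0, len);	/* subsequent faults allocate from node 0 */
	return 0;
}
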
diff --git a/mm/migrate.c b/mm/migrate.c
index 21631ab8c08b..2bb4e1d63520 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -121,20 +121,6 @@ static void remove_migration_pte(struct vm_area_struct *vma,
121 if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) 121 if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
122 goto out; 122 goto out;
123 123
124 /*
125 * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge.
126 * Failure is not an option here: we're now expected to remove every
127 * migration pte, and will cause crashes otherwise. Normally this
128 * is not an issue: mem_cgroup_prepare_migration bumped up the old
129 * page_cgroup count for safety, that's now attached to the new page,
130 * so this charge should just be another incrementation of the count,
131 * to keep in balance with rmap.c's mem_cgroup_uncharging. But if
132 * there's been a force_empty, those reference counts may no longer
133 * be reliable, and this charge can actually fail: oh well, we don't
134 * make the situation any worse by proceeding as if it had succeeded.
135 */
136 mem_cgroup_charge(new, mm, GFP_ATOMIC);
137
138 get_page(new); 124 get_page(new);
139 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 125 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
140 if (is_write_migration_entry(entry)) 126 if (is_write_migration_entry(entry))
@@ -300,12 +286,10 @@ static int migrate_page_move_mapping(struct address_space *mapping,
300 * Now we know that no one else is looking at the page. 286 * Now we know that no one else is looking at the page.
301 */ 287 */
302 get_page(newpage); /* add cache reference */ 288 get_page(newpage); /* add cache reference */
303#ifdef CONFIG_SWAP
304 if (PageSwapCache(page)) { 289 if (PageSwapCache(page)) {
305 SetPageSwapCache(newpage); 290 SetPageSwapCache(newpage);
306 set_page_private(newpage, page_private(page)); 291 set_page_private(newpage, page_private(page));
307 } 292 }
308#endif
309 293
310 radix_tree_replace_slot(pslot, newpage); 294 radix_tree_replace_slot(pslot, newpage);
311 295
@@ -373,18 +357,13 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
373 357
374 mlock_migrate_page(newpage, page); 358 mlock_migrate_page(newpage, page);
375 359
376#ifdef CONFIG_SWAP
377 ClearPageSwapCache(page); 360 ClearPageSwapCache(page);
378#endif
379 ClearPagePrivate(page); 361 ClearPagePrivate(page);
380 set_page_private(page, 0); 362 set_page_private(page, 0);
381 /* page->mapping contains a flag for PageAnon() */ 363 /* page->mapping contains a flag for PageAnon() */
382 anon = PageAnon(page); 364 anon = PageAnon(page);
383 page->mapping = NULL; 365 page->mapping = NULL;
384 366
385 if (!anon) /* This page was removed from radix-tree. */
386 mem_cgroup_uncharge_cache_page(page);
387
388 /* 367 /*
389 * If any waiters have accumulated on the new page then 368 * If any waiters have accumulated on the new page then
390 * wake them up. 369 * wake them up.
@@ -618,6 +597,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
618 struct page *newpage = get_new_page(page, private, &result); 597 struct page *newpage = get_new_page(page, private, &result);
619 int rcu_locked = 0; 598 int rcu_locked = 0;
620 int charge = 0; 599 int charge = 0;
600 struct mem_cgroup *mem;
621 601
622 if (!newpage) 602 if (!newpage)
623 return -ENOMEM; 603 return -ENOMEM;
@@ -627,24 +607,26 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
627 goto move_newpage; 607 goto move_newpage;
628 } 608 }
629 609
630 charge = mem_cgroup_prepare_migration(page, newpage);
631 if (charge == -ENOMEM) {
632 rc = -ENOMEM;
633 goto move_newpage;
634 }
635 /* prepare cgroup just returns 0 or -ENOMEM */ 610 /* prepare cgroup just returns 0 or -ENOMEM */
636 BUG_ON(charge);
637
638 rc = -EAGAIN; 611 rc = -EAGAIN;
612
639 if (!trylock_page(page)) { 613 if (!trylock_page(page)) {
640 if (!force) 614 if (!force)
641 goto move_newpage; 615 goto move_newpage;
642 lock_page(page); 616 lock_page(page);
643 } 617 }
644 618
619 /* charge against new page */
620 charge = mem_cgroup_prepare_migration(page, &mem);
621 if (charge == -ENOMEM) {
622 rc = -ENOMEM;
623 goto unlock;
624 }
625 BUG_ON(charge);
626
645 if (PageWriteback(page)) { 627 if (PageWriteback(page)) {
646 if (!force) 628 if (!force)
647 goto unlock; 629 goto uncharge;
648 wait_on_page_writeback(page); 630 wait_on_page_writeback(page);
649 } 631 }
650 /* 632 /*
@@ -697,7 +679,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
697rcu_unlock: 679rcu_unlock:
698 if (rcu_locked) 680 if (rcu_locked)
699 rcu_read_unlock(); 681 rcu_read_unlock();
700 682uncharge:
683 if (!charge)
684 mem_cgroup_end_migration(mem, page, newpage);
701unlock: 685unlock:
702 unlock_page(page); 686 unlock_page(page);
703 687
@@ -713,8 +697,6 @@ unlock:
713 } 697 }
714 698
715move_newpage: 699move_newpage:
716 if (!charge)
717 mem_cgroup_end_migration(newpage);
718 700
719 /* 701 /*
720 * Move the new page to the LRU. If migration was not successful 702 * Move the new page to the LRU. If migration was not successful
@@ -848,12 +830,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
848 struct vm_area_struct *vma; 830 struct vm_area_struct *vma;
849 struct page *page; 831 struct page *page;
850 832
851 /*
852 * A valid page pointer that will not match any of the
853 * pages that will be moved.
854 */
855 pp->page = ZERO_PAGE(0);
856
857 err = -EFAULT; 833 err = -EFAULT;
858 vma = find_vma(mm, pp->addr); 834 vma = find_vma(mm, pp->addr);
859 if (!vma || !vma_migratable(vma)) 835 if (!vma || !vma_migratable(vma))
@@ -919,41 +895,43 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
919 const int __user *nodes, 895 const int __user *nodes,
920 int __user *status, int flags) 896 int __user *status, int flags)
921{ 897{
922 struct page_to_node *pm = NULL; 898 struct page_to_node *pm;
923 nodemask_t task_nodes; 899 nodemask_t task_nodes;
924 int err = 0; 900 unsigned long chunk_nr_pages;
925 int i; 901 unsigned long chunk_start;
902 int err;
926 903
927 task_nodes = cpuset_mems_allowed(task); 904 task_nodes = cpuset_mems_allowed(task);
928 905
929 /* Limit nr_pages so that the multiplication may not overflow */ 906 err = -ENOMEM;
930 if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { 907 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
931 err = -E2BIG; 908 if (!pm)
932 goto out;
933 }
934
935 pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
936 if (!pm) {
937 err = -ENOMEM;
938 goto out; 909 goto out;
939 }
940
941 /* 910 /*
942 * Get parameters from user space and initialize the pm 911 * Store a chunk of page_to_node array in a page,
943 * array. Return various errors if the user did something wrong. 912 * but keep the last one as a marker
944 */ 913 */
945 for (i = 0; i < nr_pages; i++) { 914 chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
946 const void __user *p;
947 915
948 err = -EFAULT; 916 for (chunk_start = 0;
949 if (get_user(p, pages + i)) 917 chunk_start < nr_pages;
950 goto out_pm; 918 chunk_start += chunk_nr_pages) {
919 int j;
920
921 if (chunk_start + chunk_nr_pages > nr_pages)
922 chunk_nr_pages = nr_pages - chunk_start;
951 923
952 pm[i].addr = (unsigned long)p; 924 /* fill the chunk pm with addrs and nodes from user-space */
953 if (nodes) { 925 for (j = 0; j < chunk_nr_pages; j++) {
926 const void __user *p;
954 int node; 927 int node;
955 928
956 if (get_user(node, nodes + i)) 929 err = -EFAULT;
930 if (get_user(p, pages + j + chunk_start))
931 goto out_pm;
932 pm[j].addr = (unsigned long) p;
933
934 if (get_user(node, nodes + j + chunk_start))
957 goto out_pm; 935 goto out_pm;
958 936
959 err = -ENODEV; 937 err = -ENODEV;
@@ -964,22 +942,29 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
964 if (!node_isset(node, task_nodes)) 942 if (!node_isset(node, task_nodes))
965 goto out_pm; 943 goto out_pm;
966 944
967 pm[i].node = node; 945 pm[j].node = node;
968 } else 946 }
969 pm[i].node = 0; /* anything to not match MAX_NUMNODES */ 947
970 } 948 /* End marker for this chunk */
971 /* End marker */ 949 pm[chunk_nr_pages].node = MAX_NUMNODES;
972 pm[nr_pages].node = MAX_NUMNODES; 950
951 /* Migrate this chunk */
952 err = do_move_page_to_node_array(mm, pm,
953 flags & MPOL_MF_MOVE_ALL);
954 if (err < 0)
955 goto out_pm;
973 956
974 err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL);
975 if (err >= 0)
976 /* Return status information */ 957 /* Return status information */
977 for (i = 0; i < nr_pages; i++) 958 for (j = 0; j < chunk_nr_pages; j++)
978 if (put_user(pm[i].status, status + i)) 959 if (put_user(pm[j].status, status + j + chunk_start)) {
979 err = -EFAULT; 960 err = -EFAULT;
961 goto out_pm;
962 }
963 }
964 err = 0;
980 965
981out_pm: 966out_pm:
982 vfree(pm); 967 free_page((unsigned long)pm);
983out: 968out:
984 return err; 969 return err;
985} 970}
@@ -1070,10 +1055,10 @@ out:
1070 * Move a list of pages in the address space of the currently executing 1055 * Move a list of pages in the address space of the currently executing
1071 * process. 1056 * process.
1072 */ 1057 */
1073asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, 1058SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1074 const void __user * __user *pages, 1059 const void __user * __user *, pages,
1075 const int __user *nodes, 1060 const int __user *, nodes,
1076 int __user *status, int flags) 1061 int __user *, status, int, flags)
1077{ 1062{
1078 const struct cred *cred = current_cred(), *tcred; 1063 const struct cred *cred = current_cred(), *tcred;
1079 struct task_struct *task; 1064 struct task_struct *task;
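[editorial sketch] do_pages_move() now walks the request in page-sized chunks of the page_to_node array instead of one large vmalloc(); again nothing changes for userspace. A small program exercising the converted move_pages() syscall by moving one of its own pages to node 0 (node assumed online):

#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define MPOL_MF_MOVE (1 << 1)	/* value from <numaif.h>, repeated to avoid libnuma */

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	void *page = mmap(NULL, psz, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *pages[1];
	int nodes[1] = { 0 };		/* target node 0 */
	int status[1] = { -1 };

	if (page == MAP_FAILED)
		return 1;
	*(volatile char *)page = 0;	/* fault the page in first */
	pages[0] = page;
	if (syscall(SYS_move_pages, 0, 1UL, pages, nodes, status, MPOL_MF_MOVE))
		perror("move_pages");
	else
		printf("page 0: status %d\n", status[0]);
	return 0;
}
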
diff --git a/mm/mincore.c b/mm/mincore.c
index 5178800bc129..8cb508f84ea4 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -177,8 +177,8 @@ none_mapped:
177 * mapped 177 * mapped
178 * -EAGAIN - A kernel resource was temporarily unavailable. 178 * -EAGAIN - A kernel resource was temporarily unavailable.
179 */ 179 */
180asmlinkage long sys_mincore(unsigned long start, size_t len, 180SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
181 unsigned char __user * vec) 181 unsigned char __user *, vec)
182{ 182{
183 long retval; 183 long retval;
184 unsigned long pages; 184 unsigned long pages;
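[editorial sketch] The mincore() conversion is mechanical; from userspace the call is unchanged. A tiny example of its typical use, probing which pages of an anonymous mapping are resident after touching only the first one:

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	size_t len = 8 * psz;
	unsigned char vec[8];
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int i;

	if (p == MAP_FAILED)
		return 1;
	p[0] = 1;			/* touch only the first page */
	if (mincore(p, len, vec) == 0)
		for (i = 0; i < 8; i++)
			printf("page %d: %s\n", i,
			       vec[i] & 1 ? "resident" : "not resident");
	return 0;
}
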
diff --git a/mm/mlock.c b/mm/mlock.c
index 3035a56e7616..028ec482fdd4 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -173,12 +173,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
173 (atomic_read(&mm->mm_users) != 0)); 173 (atomic_read(&mm->mm_users) != 0));
174 174
175 /* 175 /*
176 * mlock: don't page populate if page has PROT_NONE permission. 176 * mlock: don't page populate if vma has PROT_NONE permission.
177 * munlock: the pages always do munlock althrough 177 * munlock: always do munlock although the vma has PROT_NONE
178 * its has PROT_NONE permission. 178 * permission, or SIGKILL is pending.
179 */ 179 */
180 if (!mlock) 180 if (!mlock)
181 gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS; 181 gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS |
182 GUP_FLAGS_IGNORE_SIGKILL;
182 183
183 if (vma->vm_flags & VM_WRITE) 184 if (vma->vm_flags & VM_WRITE)
184 gup_flags |= GUP_FLAGS_WRITE; 185 gup_flags |= GUP_FLAGS_WRITE;
@@ -293,14 +294,10 @@ static inline int __mlock_posix_error_return(long retval)
293 * 294 *
294 * return number of pages [> 0] to be removed from locked_vm on success 295 * return number of pages [> 0] to be removed from locked_vm on success
295 * of "special" vmas. 296 * of "special" vmas.
296 *
297 * return negative error if vma spanning @start-@range disappears while
298 * mmap semaphore is dropped. Unlikely?
299 */ 297 */
300long mlock_vma_pages_range(struct vm_area_struct *vma, 298long mlock_vma_pages_range(struct vm_area_struct *vma,
301 unsigned long start, unsigned long end) 299 unsigned long start, unsigned long end)
302{ 300{
303 struct mm_struct *mm = vma->vm_mm;
304 int nr_pages = (end - start) / PAGE_SIZE; 301 int nr_pages = (end - start) / PAGE_SIZE;
305 BUG_ON(!(vma->vm_flags & VM_LOCKED)); 302 BUG_ON(!(vma->vm_flags & VM_LOCKED));
306 303
@@ -313,20 +310,8 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
313 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || 310 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
314 is_vm_hugetlb_page(vma) || 311 is_vm_hugetlb_page(vma) ||
315 vma == get_gate_vma(current))) { 312 vma == get_gate_vma(current))) {
316 long error;
317 downgrade_write(&mm->mmap_sem);
318
319 error = __mlock_vma_pages_range(vma, start, end, 1);
320 313
321 up_read(&mm->mmap_sem); 314 return __mlock_vma_pages_range(vma, start, end, 1);
322 /* vma can change or disappear */
323 down_write(&mm->mmap_sem);
324 vma = find_vma(mm, start);
325 /* non-NULL vma must contain @start, but need to check @end */
326 if (!vma || end > vma->vm_end)
327 return -ENOMEM;
328
329 return 0; /* hide other errors from mmap(), et al */
330 } 315 }
331 316
332 /* 317 /*
@@ -437,41 +422,14 @@ success:
437 vma->vm_flags = newflags; 422 vma->vm_flags = newflags;
438 423
439 if (lock) { 424 if (lock) {
440 /*
441 * mmap_sem is currently held for write. Downgrade the write
442 * lock to a read lock so that other faults, mmap scans, ...
443 * while we fault in all pages.
444 */
445 downgrade_write(&mm->mmap_sem);
446
447 ret = __mlock_vma_pages_range(vma, start, end, 1); 425 ret = __mlock_vma_pages_range(vma, start, end, 1);
448 426
449 /* 427 if (ret > 0) {
450 * Need to reacquire mmap sem in write mode, as our callers
451 * expect this. We have no support for atomically upgrading
452 * a sem to write, so we need to check for ranges while sem
453 * is unlocked.
454 */
455 up_read(&mm->mmap_sem);
456 /* vma can change or disappear */
457 down_write(&mm->mmap_sem);
458 *prev = find_vma(mm, start);
459 /* non-NULL *prev must contain @start, but need to check @end */
460 if (!(*prev) || end > (*prev)->vm_end)
461 ret = -ENOMEM;
462 else if (ret > 0) {
463 mm->locked_vm -= ret; 428 mm->locked_vm -= ret;
464 ret = 0; 429 ret = 0;
465 } else 430 } else
466 ret = __mlock_posix_error_return(ret); /* translate if needed */ 431 ret = __mlock_posix_error_return(ret); /* translate if needed */
467 } else { 432 } else {
468 /*
469 * TODO: for unlocking, pages will already be resident, so
470 * we don't need to wait for allocations/reclaim/pagein, ...
471 * However, unlocking a very large region can still take a
472 * while. Should we downgrade the semaphore for both lock
473 * AND unlock ?
474 */
475 __mlock_vma_pages_range(vma, start, end, 0); 433 __mlock_vma_pages_range(vma, start, end, 0);
476 } 434 }
477 435
@@ -529,7 +487,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
529 return error; 487 return error;
530} 488}
531 489
532asmlinkage long sys_mlock(unsigned long start, size_t len) 490SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
533{ 491{
534 unsigned long locked; 492 unsigned long locked;
535 unsigned long lock_limit; 493 unsigned long lock_limit;
@@ -557,7 +515,7 @@ asmlinkage long sys_mlock(unsigned long start, size_t len)
557 return error; 515 return error;
558} 516}
559 517
560asmlinkage long sys_munlock(unsigned long start, size_t len) 518SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
561{ 519{
562 int ret; 520 int ret;
563 521
@@ -594,7 +552,7 @@ out:
594 return 0; 552 return 0;
595} 553}
596 554
597asmlinkage long sys_mlockall(int flags) 555SYSCALL_DEFINE1(mlockall, int, flags)
598{ 556{
599 unsigned long lock_limit; 557 unsigned long lock_limit;
600 int ret = -EINVAL; 558 int ret = -EINVAL;
@@ -622,7 +580,7 @@ out:
622 return ret; 580 return ret;
623} 581}
624 582
625asmlinkage long sys_munlockall(void) 583SYSCALL_DEFINE0(munlockall)
626{ 584{
627 int ret; 585 int ret;
628 586
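[editorial sketch] The mlock.c changes above simplify the kernel side (no more mmap_sem downgrade dance) without touching the user ABI. For reference, the classic mlock()/munlock() usage pattern those entry points serve:

#include <stdio.h>
#include <sys/mman.h>

static char buf[1 << 16];

int main(void)
{
	if (mlock(buf, sizeof(buf))) {
		perror("mlock");	/* commonly RLIMIT_MEMLOCK: ENOMEM or EPERM */
		return 1;
	}
	/* ... work on memory that must never be paged out ... */
	if (munlock(buf, sizeof(buf)))
		perror("munlock");
	return 0;
}
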
diff --git a/mm/mmap.c b/mm/mmap.c
index c3647f3b0621..3b3ed0bb9fdb 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Written by obz. 4 * Written by obz.
5 * 5 *
6 * Address space accounting code <alan@redhat.com> 6 * Address space accounting code <alan@lxorguk.ukuu.org.uk>
7 */ 7 */
8 8
9#include <linux/slab.h> 9#include <linux/slab.h>
@@ -246,7 +246,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
246 return next; 246 return next;
247} 247}
248 248
249asmlinkage unsigned long sys_brk(unsigned long brk) 249SYSCALL_DEFINE1(brk, unsigned long, brk)
250{ 250{
251 unsigned long rlim, retval; 251 unsigned long rlim, retval;
252 unsigned long newbrk, oldbrk; 252 unsigned long newbrk, oldbrk;
@@ -414,7 +414,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
414 414
415static void __vma_link_file(struct vm_area_struct *vma) 415static void __vma_link_file(struct vm_area_struct *vma)
416{ 416{
417 struct file * file; 417 struct file *file;
418 418
419 file = vma->vm_file; 419 file = vma->vm_file;
420 if (file) { 420 if (file) {
@@ -475,11 +475,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
475 * insert vm structure into list and rbtree and anon_vma, 475 * insert vm structure into list and rbtree and anon_vma,
476 * but it has already been inserted into prio_tree earlier. 476 * but it has already been inserted into prio_tree earlier.
477 */ 477 */
478static void 478static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
479__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
480{ 479{
481 struct vm_area_struct * __vma, * prev; 480 struct vm_area_struct *__vma, *prev;
482 struct rb_node ** rb_link, * rb_parent; 481 struct rb_node **rb_link, *rb_parent;
483 482
484 __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); 483 __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
485 BUG_ON(__vma && __vma->vm_start < vma->vm_end); 484 BUG_ON(__vma && __vma->vm_start < vma->vm_end);
@@ -660,6 +659,9 @@ again: remove_next = 1 + (end > next->vm_end);
660 validate_mm(mm); 659 validate_mm(mm);
661} 660}
662 661
662/* Flags that can be inherited from an existing mapping when merging */
663#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR)
664
663/* 665/*
664 * If the vma has a ->close operation then the driver probably needs to release 666 * If the vma has a ->close operation then the driver probably needs to release
665 * per-vma resources, so we don't attempt to merge those. 667 * per-vma resources, so we don't attempt to merge those.
@@ -667,7 +669,7 @@ again: remove_next = 1 + (end > next->vm_end);
667static inline int is_mergeable_vma(struct vm_area_struct *vma, 669static inline int is_mergeable_vma(struct vm_area_struct *vma,
668 struct file *file, unsigned long vm_flags) 670 struct file *file, unsigned long vm_flags)
669{ 671{
670 if (vma->vm_flags != vm_flags) 672 if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS)
671 return 0; 673 return 0;
672 if (vma->vm_file != file) 674 if (vma->vm_file != file)
673 return 0; 675 return 0;
@@ -909,7 +911,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
909 * The caller must hold down_write(current->mm->mmap_sem). 911 * The caller must hold down_write(current->mm->mmap_sem).
910 */ 912 */
911 913
912unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, 914unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
913 unsigned long len, unsigned long prot, 915 unsigned long len, unsigned long prot,
914 unsigned long flags, unsigned long pgoff) 916 unsigned long flags, unsigned long pgoff)
915{ 917{
@@ -1092,6 +1094,15 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
1092 mapping_cap_account_dirty(vma->vm_file->f_mapping); 1094 mapping_cap_account_dirty(vma->vm_file->f_mapping);
1093} 1095}
1094 1096
1097/*
1098 * We account for memory if it's a private writeable mapping,
1099 * and VM_NORESERVE wasn't set.
1100 */
1101static inline int accountable_mapping(unsigned int vm_flags)
1102{
1103 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
1104}
1105
1095unsigned long mmap_region(struct file *file, unsigned long addr, 1106unsigned long mmap_region(struct file *file, unsigned long addr,
1096 unsigned long len, unsigned long flags, 1107 unsigned long len, unsigned long flags,
1097 unsigned int vm_flags, unsigned long pgoff, 1108 unsigned int vm_flags, unsigned long pgoff,
@@ -1119,36 +1130,32 @@ munmap_back:
1119 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) 1130 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1120 return -ENOMEM; 1131 return -ENOMEM;
1121 1132
1122 if (flags & MAP_NORESERVE) 1133 /*
1134 * Set 'VM_NORESERVE' if we should not account for the
1135 * memory use of this mapping. We only honor MAP_NORESERVE
1136 * if we're allowed to overcommit memory.
1137 */
1138 if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1139 vm_flags |= VM_NORESERVE;
1140 if (!accountable)
1123 vm_flags |= VM_NORESERVE; 1141 vm_flags |= VM_NORESERVE;
1124 1142
1125 if (accountable && (!(flags & MAP_NORESERVE) || 1143 /*
1126 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { 1144 * Private writable mapping: check memory availability
1127 if (vm_flags & VM_SHARED) { 1145 */
1128 /* Check memory availability in shmem_file_setup? */ 1146 if (accountable_mapping(vm_flags)) {
1129 vm_flags |= VM_ACCOUNT; 1147 charged = len >> PAGE_SHIFT;
1130 } else if (vm_flags & VM_WRITE) { 1148 if (security_vm_enough_memory(charged))
1131 /* 1149 return -ENOMEM;
1132 * Private writable mapping: check memory availability 1150 vm_flags |= VM_ACCOUNT;
1133 */
1134 charged = len >> PAGE_SHIFT;
1135 if (security_vm_enough_memory(charged))
1136 return -ENOMEM;
1137 vm_flags |= VM_ACCOUNT;
1138 }
1139 } 1151 }
1140 1152
1141 /* 1153 /*
1142 * Can we just expand an old private anonymous mapping? 1154 * Can we just expand an old mapping?
1143 * The VM_SHARED test is necessary because shmem_zero_setup
1144 * will create the file object for a shared anonymous map below.
1145 */ 1155 */
1146 if (!file && !(vm_flags & VM_SHARED)) { 1156 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
1147 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, 1157 if (vma)
1148 NULL, NULL, pgoff, NULL); 1158 goto out;
1149 if (vma)
1150 goto out;
1151 }
1152 1159
1153 /* 1160 /*
1154 * Determine the object being mapped and call the appropriate 1161 * Determine the object being mapped and call the appropriate
@@ -1191,14 +1198,6 @@ munmap_back:
1191 goto free_vma; 1198 goto free_vma;
1192 } 1199 }
1193 1200
1194 /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
1195 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
1196 * that memory reservation must be checked; but that reservation
1197 * belongs to shared memory object, not to vma: so now clear it.
1198 */
1199 if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
1200 vma->vm_flags &= ~VM_ACCOUNT;
1201
1202 /* Can addr have changed?? 1201 /* Can addr have changed??
1203 * 1202 *
1204 * Answer: Yes, several device drivers can do it in their 1203 * Answer: Yes, several device drivers can do it in their
@@ -1211,17 +1210,8 @@ munmap_back:
1211 if (vma_wants_writenotify(vma)) 1210 if (vma_wants_writenotify(vma))
1212 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); 1211 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
1213 1212
1214 if (file && vma_merge(mm, prev, addr, vma->vm_end, 1213 vma_link(mm, vma, prev, rb_link, rb_parent);
1215 vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { 1214 file = vma->vm_file;
1216 mpol_put(vma_policy(vma));
1217 kmem_cache_free(vm_area_cachep, vma);
1218 fput(file);
1219 if (vm_flags & VM_EXECUTABLE)
1220 removed_exe_file_vma(mm);
1221 } else {
1222 vma_link(mm, vma, prev, rb_link, rb_parent);
1223 file = vma->vm_file;
1224 }
1225 1215
1226 /* Once vma denies write, undo our temporary denial count */ 1216 /* Once vma denies write, undo our temporary denial count */
1227 if (correct_wcount) 1217 if (correct_wcount)
@@ -1468,7 +1458,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1468EXPORT_SYMBOL(get_unmapped_area); 1458EXPORT_SYMBOL(get_unmapped_area);
1469 1459
1470/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ 1460/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
1471struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) 1461struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1472{ 1462{
1473 struct vm_area_struct *vma = NULL; 1463 struct vm_area_struct *vma = NULL;
1474 1464
@@ -1511,7 +1501,7 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
1511 struct vm_area_struct **pprev) 1501 struct vm_area_struct **pprev)
1512{ 1502{
1513 struct vm_area_struct *vma = NULL, *prev = NULL; 1503 struct vm_area_struct *vma = NULL, *prev = NULL;
1514 struct rb_node * rb_node; 1504 struct rb_node *rb_node;
1515 if (!mm) 1505 if (!mm)
1516 goto out; 1506 goto out;
1517 1507
@@ -1545,7 +1535,7 @@ out:
1545 * update accounting. This is shared with both the 1535 * update accounting. This is shared with both the
1546 * grow-up and grow-down cases. 1536 * grow-up and grow-down cases.
1547 */ 1537 */
1548static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow) 1538static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
1549{ 1539{
1550 struct mm_struct *mm = vma->vm_mm; 1540 struct mm_struct *mm = vma->vm_mm;
1551 struct rlimit *rlim = current->signal->rlim; 1541 struct rlimit *rlim = current->signal->rlim;
@@ -1953,7 +1943,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1953 1943
1954EXPORT_SYMBOL(do_munmap); 1944EXPORT_SYMBOL(do_munmap);
1955 1945
1956asmlinkage long sys_munmap(unsigned long addr, size_t len) 1946SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
1957{ 1947{
1958 int ret; 1948 int ret;
1959 struct mm_struct *mm = current->mm; 1949 struct mm_struct *mm = current->mm;
@@ -2095,6 +2085,9 @@ void exit_mmap(struct mm_struct *mm)
2095 arch_exit_mmap(mm); 2085 arch_exit_mmap(mm);
2096 mmu_notifier_release(mm); 2086 mmu_notifier_release(mm);
2097 2087
2088 if (!mm->mmap) /* Can happen if dup_mmap() received an OOM */
2089 return;
2090
2098 if (mm->locked_vm) { 2091 if (mm->locked_vm) {
2099 vma = mm->mmap; 2092 vma = mm->mmap;
2100 while (vma) { 2093 while (vma) {
@@ -2107,7 +2100,7 @@ void exit_mmap(struct mm_struct *mm)
2107 lru_add_drain(); 2100 lru_add_drain();
2108 flush_cache_mm(mm); 2101 flush_cache_mm(mm);
2109 tlb = tlb_gather_mmu(mm, 1); 2102 tlb = tlb_gather_mmu(mm, 1);
2110 /* Don't update_hiwater_rss(mm) here, do_exit already did */ 2103 /* update_hiwater_rss(mm) here? but nobody should be looking */
2111 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2104 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2112 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2105 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2113 vm_unacct_memory(nr_accounted); 2106 vm_unacct_memory(nr_accounted);
@@ -2474,3 +2467,13 @@ void mm_drop_all_locks(struct mm_struct *mm)
2474 2467
2475 mutex_unlock(&mm_all_locks_mutex); 2468 mutex_unlock(&mm_all_locks_mutex);
2476} 2469}
2470
2471/*
2472 * initialise the VMA slab
2473 */
2474void __init mmap_init(void)
2475{
2476 vm_area_cachep = kmem_cache_create("vm_area_struct",
2477 sizeof(struct vm_area_struct), 0,
2478 SLAB_PANIC, NULL);
2479}
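[editorial sketch] The accountable_mapping()/VM_NORESERVE rework earlier in this file's diff decides the no-reserve behaviour once, up front: MAP_NORESERVE is honored unless overcommit is set to "never" (vm.overcommit_memory == 2). From userspace that looks like:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1UL << 30;		/* 1 GiB of address space, no commit charge */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* pages are only charged and allocated as they are first touched */
	munmap(p, len);
	return 0;
}
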
diff --git a/mm/mprotect.c b/mm/mprotect.c
index fded06f923f4..abe2694e13f4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -4,7 +4,7 @@
4 * (C) Copyright 1994 Linus Torvalds 4 * (C) Copyright 1994 Linus Torvalds
5 * (C) Copyright 2002 Christoph Hellwig 5 * (C) Copyright 2002 Christoph Hellwig
6 * 6 *
7 * Address space accounting code <alan@redhat.com> 7 * Address space accounting code <alan@lxorguk.ukuu.org.uk>
8 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved 8 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
9 */ 9 */
10 10
@@ -22,6 +22,7 @@
22#include <linux/swap.h> 22#include <linux/swap.h>
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h> 24#include <linux/mmu_notifier.h>
25#include <linux/migrate.h>
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26#include <asm/pgtable.h> 27#include <asm/pgtable.h>
27#include <asm/cacheflush.h> 28#include <asm/cacheflush.h>
@@ -59,8 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
59 ptent = pte_mkwrite(ptent); 60 ptent = pte_mkwrite(ptent);
60 61
61 ptep_modify_prot_commit(mm, addr, pte, ptent); 62 ptep_modify_prot_commit(mm, addr, pte, ptent);
62#ifdef CONFIG_MIGRATION 63 } else if (PAGE_MIGRATION && !pte_file(oldpte)) {
63 } else if (!pte_file(oldpte)) {
64 swp_entry_t entry = pte_to_swp_entry(oldpte); 64 swp_entry_t entry = pte_to_swp_entry(oldpte);
65 65
66 if (is_write_migration_entry(entry)) { 66 if (is_write_migration_entry(entry)) {
@@ -72,9 +72,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
72 set_pte_at(mm, addr, pte, 72 set_pte_at(mm, addr, pte,
73 swp_entry_to_pte(entry)); 73 swp_entry_to_pte(entry));
74 } 74 }
75#endif
76 } 75 }
77
78 } while (pte++, addr += PAGE_SIZE, addr != end); 76 } while (pte++, addr += PAGE_SIZE, addr != end);
79 arch_leave_lazy_mmu_mode(); 77 arch_leave_lazy_mmu_mode();
80 pte_unmap_unlock(pte - 1, ptl); 78 pte_unmap_unlock(pte - 1, ptl);
@@ -219,8 +217,8 @@ fail:
219 return error; 217 return error;
220} 218}
221 219
222asmlinkage long 220SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
223sys_mprotect(unsigned long start, size_t len, unsigned long prot) 221 unsigned long, prot)
224{ 222{
225 unsigned long vm_flags, nstart, end, tmp, reqprot; 223 unsigned long vm_flags, nstart, end, tmp, reqprot;
226 struct vm_area_struct *vma, *prev; 224 struct vm_area_struct *vma, *prev;
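[editorial sketch] The mprotect() conversion above is likewise ABI-neutral. A minimal example of the call it serves, turning the last page of a mapping into a guard page:

#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* last page becomes a guard page; touching it now raises SIGSEGV */
	return mprotect(p + 3 * psz, psz, PROT_NONE) != 0;
}
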
diff --git a/mm/mremap.c b/mm/mremap.c
index 58a2908f42f5..a39b7b91be46 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * (C) Copyright 1996 Linus Torvalds 4 * (C) Copyright 1996 Linus Torvalds
5 * 5 *
6 * Address space accounting code <alan@redhat.com> 6 * Address space accounting code <alan@lxorguk.ukuu.org.uk>
7 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved 7 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
8 */ 8 */
9 9
@@ -420,9 +420,9 @@ out_nc:
420 return ret; 420 return ret;
421} 421}
422 422
423asmlinkage unsigned long sys_mremap(unsigned long addr, 423SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
424 unsigned long old_len, unsigned long new_len, 424 unsigned long, new_len, unsigned long, flags,
425 unsigned long flags, unsigned long new_addr) 425 unsigned long, new_addr)
426{ 426{
427 unsigned long ret; 427 unsigned long ret;
428 428
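[editorial sketch] For the converted mremap() above, a minimal userspace example growing an anonymous mapping in place or by relocation:

#define _GNU_SOURCE		/* for mremap() */
#include <sys/mman.h>

int main(void)
{
	size_t old_len = 1UL << 20, new_len = 4UL << 20;
	void *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* grow the mapping, letting the kernel relocate it if necessary */
	p = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
	return p == MAP_FAILED;
}
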
diff --git a/mm/msync.c b/mm/msync.c
index 144a7570535d..4083209b7f02 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -28,7 +28,7 @@
28 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to 28 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
29 * applications. 29 * applications.
30 */ 30 */
31asmlinkage long sys_msync(unsigned long start, size_t len, int flags) 31SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
32{ 32{
33 unsigned long end; 33 unsigned long end;
34 struct mm_struct *mm = current->mm; 34 struct mm_struct *mm = current->mm;
@@ -82,7 +82,7 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
82 (vma->vm_flags & VM_SHARED)) { 82 (vma->vm_flags & VM_SHARED)) {
83 get_file(file); 83 get_file(file);
84 up_read(&mm->mmap_sem); 84 up_read(&mm->mmap_sem);
85 error = do_fsync(file, 0); 85 error = vfs_fsync(file, file->f_path.dentry, 0);
86 fput(file); 86 fput(file);
87 if (error || start >= end) 87 if (error || start >= end)
88 goto out; 88 goto out;
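[editorial sketch] The msync() hunk above switches the write-back path to vfs_fsync(); from userspace the semantics are the familiar ones. A small example flushing a shared file mapping (file name is illustrative):

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 4096;
	int fd = open("/tmp/msync-demo", O_RDWR | O_CREAT, 0600);
	char *p;

	if (fd < 0 || ftruncate(fd, len))
		return 1;
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	memcpy(p, "hello", 6);
	/* write back the dirty range and wait, much like fsync() on that range */
	return msync(p, len, MS_SYNC) != 0;
}
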
diff --git a/mm/nommu.c b/mm/nommu.c
index 7695dc850785..2fcf47d449b4 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -6,11 +6,11 @@
6 * 6 *
7 * See Documentation/nommu-mmap.txt 7 * See Documentation/nommu-mmap.txt
8 * 8 *
9 * Copyright (c) 2004-2005 David Howells <dhowells@redhat.com> 9 * Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> 10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> 11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> 12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
13 * Copyright (c) 2007 Paul Mundt <lethal@linux-sh.org> 13 * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org>
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/module.h>
@@ -33,6 +33,28 @@
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
34#include <asm/tlb.h> 34#include <asm/tlb.h>
35#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
36#include "internal.h"
37
38static inline __attribute__((format(printf, 1, 2)))
39void no_printk(const char *fmt, ...)
40{
41}
42
43#if 0
44#define kenter(FMT, ...) \
45 printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
46#define kleave(FMT, ...) \
47 printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
48#define kdebug(FMT, ...) \
49 printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__)
50#else
51#define kenter(FMT, ...) \
52 no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
53#define kleave(FMT, ...) \
54 no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
55#define kdebug(FMT, ...) \
56 no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
57#endif
36 58
37#include "internal.h" 59#include "internal.h"
38 60
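[editorial sketch] The kenter/kleave/kdebug macros added above follow the usual nommu tracing idiom: with the #if 0 branch selected they expand to no_printk(), which keeps printf-format checking but emits nothing. Typical use inside this file looks like the following (function body illustrative):

static int do_something_nommu(unsigned long addr, unsigned long len)
{
	kenter("%lx,%lx", addr, len);
	/* ... */
	kleave(" = 0");
	return 0;
}
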
@@ -40,19 +62,22 @@ void *high_memory;
40struct page *mem_map; 62struct page *mem_map;
41unsigned long max_mapnr; 63unsigned long max_mapnr;
42unsigned long num_physpages; 64unsigned long num_physpages;
43unsigned long askedalloc, realalloc;
44atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); 65atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
45int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 66int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
46int sysctl_overcommit_ratio = 50; /* default is 50% */ 67int sysctl_overcommit_ratio = 50; /* default is 50% */
47int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; 68int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
69int sysctl_nr_trim_pages = 1; /* page trimming behaviour */
48int heap_stack_gap = 0; 70int heap_stack_gap = 0;
49 71
72atomic_t mmap_pages_allocated;
73
50EXPORT_SYMBOL(mem_map); 74EXPORT_SYMBOL(mem_map);
51EXPORT_SYMBOL(num_physpages); 75EXPORT_SYMBOL(num_physpages);
52 76
53/* list of shareable VMAs */ 77/* list of mapped, potentially shareable regions */
54struct rb_root nommu_vma_tree = RB_ROOT; 78static struct kmem_cache *vm_region_jar;
55DECLARE_RWSEM(nommu_vma_sem); 79struct rb_root nommu_region_tree = RB_ROOT;
80DECLARE_RWSEM(nommu_region_sem);
56 81
57struct vm_operations_struct generic_file_vm_ops = { 82struct vm_operations_struct generic_file_vm_ops = {
58}; 83};
@@ -86,7 +111,7 @@ do_expand:
86 i_size_write(inode, offset); 111 i_size_write(inode, offset);
87 112
88out_truncate: 113out_truncate:
89 if (inode->i_op && inode->i_op->truncate) 114 if (inode->i_op->truncate)
90 inode->i_op->truncate(inode); 115 inode->i_op->truncate(inode);
91 return 0; 116 return 0;
92out_sig: 117out_sig:
@@ -124,6 +149,20 @@ unsigned int kobjsize(const void *objp)
124 return ksize(objp); 149 return ksize(objp);
125 150
126 /* 151 /*
152 * If it's not a compound page, see if we have a matching VMA
153 * region. This test is intentionally done in reverse order,
154 * so if there's no VMA, we still fall through and hand back
155 * PAGE_SIZE for 0-order pages.
156 */
157 if (!PageCompound(page)) {
158 struct vm_area_struct *vma;
159
160 vma = find_vma(current->mm, (unsigned long)objp);
161 if (vma)
162 return vma->vm_end - vma->vm_start;
163 }
164
165 /*
127 * The ksize() function is only guaranteed to work for pointers 166 * The ksize() function is only guaranteed to work for pointers
128 * returned by kmalloc(). So handle arbitrary pointers here. 167 * returned by kmalloc(). So handle arbitrary pointers here.
129 */ 168 */
@@ -355,6 +394,24 @@ void vunmap(const void *addr)
355} 394}
356EXPORT_SYMBOL(vunmap); 395EXPORT_SYMBOL(vunmap);
357 396
397void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
398{
399 BUG();
400 return NULL;
401}
402EXPORT_SYMBOL(vm_map_ram);
403
404void vm_unmap_ram(const void *mem, unsigned int count)
405{
406 BUG();
407}
408EXPORT_SYMBOL(vm_unmap_ram);
409
410void vm_unmap_aliases(void)
411{
412}
413EXPORT_SYMBOL_GPL(vm_unmap_aliases);
414
358/* 415/*
359 * Implement a stub for vmalloc_sync_all() if the architecture chose not to 416 * Implement a stub for vmalloc_sync_all() if the architecture chose not to
360 * have one. 417 * have one.
@@ -377,7 +434,7 @@ EXPORT_SYMBOL(vm_insert_page);
377 * to a regular file. in this case, the unmapping will need 434 * to a regular file. in this case, the unmapping will need
378 * to invoke file system routines that need the global lock. 435 * to invoke file system routines that need the global lock.
379 */ 436 */
380asmlinkage unsigned long sys_brk(unsigned long brk) 437SYSCALL_DEFINE1(brk, unsigned long, brk)
381{ 438{
382 struct mm_struct *mm = current->mm; 439 struct mm_struct *mm = current->mm;
383 440
@@ -401,129 +458,178 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
401 return mm->brk = brk; 458 return mm->brk = brk;
402} 459}
403 460
404#ifdef DEBUG 461/*
405static void show_process_blocks(void) 462 * initialise the VMA and region record slabs
463 */
464void __init mmap_init(void)
406{ 465{
407 struct vm_list_struct *vml; 466 vm_region_jar = kmem_cache_create("vm_region_jar",
408 467 sizeof(struct vm_region), 0,
409 printk("Process blocks %d:", current->pid); 468 SLAB_PANIC, NULL);
410 469 vm_area_cachep = kmem_cache_create("vm_area_struct",
411 for (vml = &current->mm->context.vmlist; vml; vml = vml->next) { 470 sizeof(struct vm_area_struct), 0,
412 printk(" %p: %p", vml, vml->vma); 471 SLAB_PANIC, NULL);
413 if (vml->vma)
414 printk(" (%d @%lx #%d)",
415 kobjsize((void *) vml->vma->vm_start),
416 vml->vma->vm_start,
417 atomic_read(&vml->vma->vm_usage));
418 printk(vml->next ? " ->" : ".\n");
419 }
420} 472}
421#endif /* DEBUG */
422 473
423/* 474/*
424 * add a VMA into a process's mm_struct in the appropriate place in the list 475 * validate the region tree
425 * - should be called with mm->mmap_sem held writelocked 476 * - the caller must hold the region lock
426 */ 477 */
427static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml) 478#ifdef CONFIG_DEBUG_NOMMU_REGIONS
479static noinline void validate_nommu_regions(void)
428{ 480{
429 struct vm_list_struct **ppv; 481 struct vm_region *region, *last;
430 482 struct rb_node *p, *lastp;
431 for (ppv = &current->mm->context.vmlist; *ppv; ppv = &(*ppv)->next) 483
432 if ((*ppv)->vma->vm_start > vml->vma->vm_start) 484 lastp = rb_first(&nommu_region_tree);
433 break; 485 if (!lastp)
434 486 return;
435 vml->next = *ppv; 487
436 *ppv = vml; 488 last = rb_entry(lastp, struct vm_region, vm_rb);
489 if (unlikely(last->vm_end <= last->vm_start))
490 BUG();
491 if (unlikely(last->vm_top < last->vm_end))
492 BUG();
493
494 while ((p = rb_next(lastp))) {
495 region = rb_entry(p, struct vm_region, vm_rb);
496 last = rb_entry(lastp, struct vm_region, vm_rb);
497
498 if (unlikely(region->vm_end <= region->vm_start))
499 BUG();
500 if (unlikely(region->vm_top < region->vm_end))
501 BUG();
502 if (unlikely(region->vm_start < last->vm_top))
503 BUG();
504
505 lastp = p;
506 }
437} 507}
508#else
509#define validate_nommu_regions() do {} while(0)
510#endif
438 511
439/* 512/*
440 * look up the first VMA in which addr resides, NULL if none 513 * add a region into the global tree
441 * - should be called with mm->mmap_sem at least held readlocked
442 */ 514 */
443struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) 515static void add_nommu_region(struct vm_region *region)
444{ 516{
445 struct vm_list_struct *loop, *vml; 517 struct vm_region *pregion;
518 struct rb_node **p, *parent;
446 519
447 /* search the vm_start ordered list */ 520 validate_nommu_regions();
448 vml = NULL; 521
449 for (loop = mm->context.vmlist; loop; loop = loop->next) { 522 BUG_ON(region->vm_start & ~PAGE_MASK);
450 if (loop->vma->vm_start > addr) 523
451 break; 524 parent = NULL;
452 vml = loop; 525 p = &nommu_region_tree.rb_node;
526 while (*p) {
527 parent = *p;
528 pregion = rb_entry(parent, struct vm_region, vm_rb);
529 if (region->vm_start < pregion->vm_start)
530 p = &(*p)->rb_left;
531 else if (region->vm_start > pregion->vm_start)
532 p = &(*p)->rb_right;
533 else if (pregion == region)
534 return;
535 else
536 BUG();
453 } 537 }
454 538
455 if (vml && vml->vma->vm_end > addr) 539 rb_link_node(&region->vm_rb, parent, p);
456 return vml->vma; 540 rb_insert_color(&region->vm_rb, &nommu_region_tree);
457 541
458 return NULL; 542 validate_nommu_regions();
459} 543}
460EXPORT_SYMBOL(find_vma);
461 544
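The new add_nommu_region() above keeps every backing region in a single global tree ordered by vm_start, descending left or right at each node and treating a second, distinct region with the same start address as a bug. The following is a minimal userspace sketch of that ordered insert, using a plain unbalanced binary search tree in place of the kernel rbtree; the struct and function names here are illustrative, not the kernel's.

#include <assert.h>
#include <stdio.h>

/* toy stand-in for struct vm_region: only the fields the insert needs */
struct region {
	unsigned long vm_start, vm_end;
	struct region *left, *right;
};

/* insert keyed on vm_start; two distinct regions sharing a start address
 * is treated as a bug, mirroring the BUG() in add_nommu_region() */
static void add_region(struct region **root, struct region *new)
{
	struct region **p = root;

	while (*p) {
		if (new->vm_start < (*p)->vm_start)
			p = &(*p)->left;
		else if (new->vm_start > (*p)->vm_start)
			p = &(*p)->right;
		else if (*p == new)
			return;		/* already in the tree */
		else
			assert(0);	/* duplicate start address */
	}
	*p = new;
}

static void dump(struct region *r)
{
	if (!r)
		return;
	dump(r->left);
	printf("%#lx-%#lx\n", r->vm_start, r->vm_end);
	dump(r->right);
}

int main(void)
{
	struct region a = { 0x30000, 0x34000 }, b = { 0x10000, 0x18000 },
		      c = { 0x20000, 0x21000 };
	struct region *root = NULL;

	add_region(&root, &a);
	add_region(&root, &b);
	add_region(&root, &c);
	dump(root);	/* prints the regions in ascending vm_start order */
	return 0;
}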
462/* 545/*
463 * find a VMA 546 * delete a region from the global tree
464 * - we don't extend stack VMAs under NOMMU conditions
465 */ 547 */
466struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) 548static void delete_nommu_region(struct vm_region *region)
467{ 549{
468 return find_vma(mm, addr); 550 BUG_ON(!nommu_region_tree.rb_node);
469}
470 551
471int expand_stack(struct vm_area_struct *vma, unsigned long address) 552 validate_nommu_regions();
472{ 553 rb_erase(&region->vm_rb, &nommu_region_tree);
473 return -ENOMEM; 554 validate_nommu_regions();
474} 555}
475 556
476/* 557/*
477 * look up the first VMA that exactly matches addr 558 * free a contiguous series of pages
478 * - should be called with mm->mmap_sem at least held readlocked
479 */ 559 */
480static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm, 560static void free_page_series(unsigned long from, unsigned long to)
481 unsigned long addr)
482{ 561{
483 struct vm_list_struct *vml; 562 for (; from < to; from += PAGE_SIZE) {
484 563 struct page *page = virt_to_page(from);
485 /* search the vm_start ordered list */ 564
486 for (vml = mm->context.vmlist; vml; vml = vml->next) { 565 kdebug("- free %lx", from);
487 if (vml->vma->vm_start == addr) 566 atomic_dec(&mmap_pages_allocated);
488 return vml->vma; 567 if (page_count(page) != 1)
489 if (vml->vma->vm_start > addr) 568 kdebug("free page %p [%d]", page, page_count(page));
490 break; 569 put_page(page);
491 } 570 }
492
493 return NULL;
494} 571}
495 572
496/* 573/*
497 * find a VMA in the global tree 574 * release a reference to a region
575 * - the caller must hold the region semaphore, which this releases
576 * - the region may not have been added to the tree yet, in which case vm_top
577 * will equal vm_start
498 */ 578 */
499static inline struct vm_area_struct *find_nommu_vma(unsigned long start) 579static void __put_nommu_region(struct vm_region *region)
580 __releases(nommu_region_sem)
500{ 581{
501 struct vm_area_struct *vma; 582 kenter("%p{%d}", region, atomic_read(&region->vm_usage));
502 struct rb_node *n = nommu_vma_tree.rb_node;
503 583
504 while (n) { 584 BUG_ON(!nommu_region_tree.rb_node);
505 vma = rb_entry(n, struct vm_area_struct, vm_rb);
506 585
507 if (start < vma->vm_start) 586 if (atomic_dec_and_test(&region->vm_usage)) {
508 n = n->rb_left; 587 if (region->vm_top > region->vm_start)
509 else if (start > vma->vm_start) 588 delete_nommu_region(region);
510 n = n->rb_right; 589 up_write(&nommu_region_sem);
511 else 590
512 return vma; 591 if (region->vm_file)
592 fput(region->vm_file);
593
594 /* IO memory and memory shared directly out of the pagecache
595 * from ramfs/tmpfs mustn't be released here */
596 if (region->vm_flags & VM_MAPPED_COPY) {
597 kdebug("free series");
598 free_page_series(region->vm_start, region->vm_top);
599 }
600 kmem_cache_free(vm_region_jar, region);
601 } else {
602 up_write(&nommu_region_sem);
513 } 603 }
604}
514 605
515 return NULL; 606/*
607 * release a reference to a region
608 */
609static void put_nommu_region(struct vm_region *region)
610{
611 down_write(&nommu_region_sem);
612 __put_nommu_region(region);
516} 613}
517 614
518/* 615/*
519 * add a VMA in the global tree 616 * add a VMA into a process's mm_struct in the appropriate place in the list
617 * and tree and add to the address space's page tree also if not an anonymous
618 * page
619 * - should be called with mm->mmap_sem held writelocked
520 */ 620 */
521static void add_nommu_vma(struct vm_area_struct *vma) 621static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
522{ 622{
523 struct vm_area_struct *pvma; 623 struct vm_area_struct *pvma, **pp;
524 struct address_space *mapping; 624 struct address_space *mapping;
525 struct rb_node **p = &nommu_vma_tree.rb_node; 625 struct rb_node **p, *parent;
526 struct rb_node *parent = NULL; 626
627 kenter(",%p", vma);
628
629 BUG_ON(!vma->vm_region);
630
631 mm->map_count++;
632 vma->vm_mm = mm;
527 633
528 /* add the VMA to the mapping */ 634 /* add the VMA to the mapping */
529 if (vma->vm_file) { 635 if (vma->vm_file) {
@@ -534,42 +640,62 @@ static void add_nommu_vma(struct vm_area_struct *vma)
534 flush_dcache_mmap_unlock(mapping); 640 flush_dcache_mmap_unlock(mapping);
535 } 641 }
536 642
537 /* add the VMA to the master list */ 643 /* add the VMA to the tree */
644 parent = NULL;
645 p = &mm->mm_rb.rb_node;
538 while (*p) { 646 while (*p) {
539 parent = *p; 647 parent = *p;
540 pvma = rb_entry(parent, struct vm_area_struct, vm_rb); 648 pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
541 649
542 if (vma->vm_start < pvma->vm_start) { 650 /* sort by: start addr, end addr, VMA struct addr in that order
651 * (the latter is necessary as we may get identical VMAs) */
652 if (vma->vm_start < pvma->vm_start)
543 p = &(*p)->rb_left; 653 p = &(*p)->rb_left;
544 } 654 else if (vma->vm_start > pvma->vm_start)
545 else if (vma->vm_start > pvma->vm_start) {
546 p = &(*p)->rb_right; 655 p = &(*p)->rb_right;
547 } 656 else if (vma->vm_end < pvma->vm_end)
548 else { 657 p = &(*p)->rb_left;
549 /* mappings are at the same address - this can only 658 else if (vma->vm_end > pvma->vm_end)
550 * happen for shared-mem chardevs and shared file 659 p = &(*p)->rb_right;
551 * mappings backed by ramfs/tmpfs */ 660 else if (vma < pvma)
552 BUG_ON(!(pvma->vm_flags & VM_SHARED)); 661 p = &(*p)->rb_left;
553 662 else if (vma > pvma)
554 if (vma < pvma) 663 p = &(*p)->rb_right;
555 p = &(*p)->rb_left; 664 else
556 else if (vma > pvma) 665 BUG();
557 p = &(*p)->rb_right;
558 else
559 BUG();
560 }
561 } 666 }
562 667
563 rb_link_node(&vma->vm_rb, parent, p); 668 rb_link_node(&vma->vm_rb, parent, p);
564 rb_insert_color(&vma->vm_rb, &nommu_vma_tree); 669 rb_insert_color(&vma->vm_rb, &mm->mm_rb);
670
671 /* add VMA to the VMA list also */
672 for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) {
673 if (pvma->vm_start > vma->vm_start)
674 break;
675 if (pvma->vm_start < vma->vm_start)
676 continue;
677 if (pvma->vm_end < vma->vm_end)
678 break;
679 }
680
681 vma->vm_next = *pp;
682 *pp = vma;
565} 683}
566 684
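add_vma_to_mm() above keys the per-mm tree on a three-level comparison: start address, then end address, then the address of the VMA structure itself, so that two otherwise identical VMAs still get distinct slots. A small userspace sketch of that comparator, driven through qsort() rather than an rbtree; the struct vma here is a toy stand-in, not the kernel's vm_area_struct.

#include <stdio.h>
#include <stdlib.h>

struct vma {				/* toy stand-in for vm_area_struct */
	unsigned long vm_start, vm_end;
};

/* order by start, then end, then by the object's own address, mirroring
 * the descent rules in the new add_vma_to_mm() */
static int vma_cmp(const void *a, const void *b)
{
	const struct vma *x = *(const struct vma * const *)a;
	const struct vma *y = *(const struct vma * const *)b;

	if (x->vm_start != y->vm_start)
		return x->vm_start < y->vm_start ? -1 : 1;
	if (x->vm_end != y->vm_end)
		return x->vm_end < y->vm_end ? -1 : 1;
	if (x != y)
		return x < y ? -1 : 1;
	return 0;
}

int main(void)
{
	struct vma a = { 0x1000, 0x3000 }, b = { 0x1000, 0x2000 };
	struct vma c = { 0x1000, 0x2000 };	/* byte-identical to b */
	struct vma *v[] = { &a, &b, &c };

	qsort(v, 3, sizeof(v[0]), vma_cmp);
	for (int i = 0; i < 3; i++)
		printf("%#lx-%#lx (%p)\n", v[i]->vm_start, v[i]->vm_end,
		       (void *)v[i]);
	return 0;
}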
567/* 685/*
568 * delete a VMA from the global list 686 * delete a VMA from its owning mm_struct and address space
569 */ 687 */
570static void delete_nommu_vma(struct vm_area_struct *vma) 688static void delete_vma_from_mm(struct vm_area_struct *vma)
571{ 689{
690 struct vm_area_struct **pp;
572 struct address_space *mapping; 691 struct address_space *mapping;
692 struct mm_struct *mm = vma->vm_mm;
693
694 kenter("%p", vma);
695
696 mm->map_count--;
697 if (mm->mmap_cache == vma)
698 mm->mmap_cache = NULL;
573 699
574 /* remove the VMA from the mapping */ 700 /* remove the VMA from the mapping */
575 if (vma->vm_file) { 701 if (vma->vm_file) {
@@ -580,8 +706,115 @@ static void delete_nommu_vma(struct vm_area_struct *vma)
580 flush_dcache_mmap_unlock(mapping); 706 flush_dcache_mmap_unlock(mapping);
581 } 707 }
582 708
583 /* remove from the master list */ 709 /* remove from the MM's tree and list */
584 rb_erase(&vma->vm_rb, &nommu_vma_tree); 710 rb_erase(&vma->vm_rb, &mm->mm_rb);
711 for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) {
712 if (*pp == vma) {
713 *pp = vma->vm_next;
714 break;
715 }
716 }
717
718 vma->vm_mm = NULL;
719}
720
721/*
722 * destroy a VMA record
723 */
724static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
725{
726 kenter("%p", vma);
727 if (vma->vm_ops && vma->vm_ops->close)
728 vma->vm_ops->close(vma);
729 if (vma->vm_file) {
730 fput(vma->vm_file);
731 if (vma->vm_flags & VM_EXECUTABLE)
732 removed_exe_file_vma(mm);
733 }
734 put_nommu_region(vma->vm_region);
735 kmem_cache_free(vm_area_cachep, vma);
736}
737
738/*
739 * look up the first VMA in which addr resides, NULL if none
740 * - should be called with mm->mmap_sem at least held readlocked
741 */
742struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
743{
744 struct vm_area_struct *vma;
745 struct rb_node *n = mm->mm_rb.rb_node;
746
747 /* check the cache first */
748 vma = mm->mmap_cache;
749 if (vma && vma->vm_start <= addr && vma->vm_end > addr)
750 return vma;
751
752 /* trawl the tree (there may be multiple mappings in which addr
753 * resides) */
754 for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) {
755 vma = rb_entry(n, struct vm_area_struct, vm_rb);
756 if (vma->vm_start > addr)
757 return NULL;
758 if (vma->vm_end > addr) {
759 mm->mmap_cache = vma;
760 return vma;
761 }
762 }
763
764 return NULL;
765}
766EXPORT_SYMBOL(find_vma);
767
768/*
769 * find a VMA
770 * - we don't extend stack VMAs under NOMMU conditions
771 */
772struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
773{
774 return find_vma(mm, addr);
775}
776
777/*
778 * expand a stack to a given address
779 * - not supported under NOMMU conditions
780 */
781int expand_stack(struct vm_area_struct *vma, unsigned long address)
782{
783 return -ENOMEM;
784}
785
786/*
787 * look up the first VMA that exactly matches addr
788 * - should be called with mm->mmap_sem at least held readlocked
789 */
790static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
791 unsigned long addr,
792 unsigned long len)
793{
794 struct vm_area_struct *vma;
795 struct rb_node *n = mm->mm_rb.rb_node;
796 unsigned long end = addr + len;
797
798 /* check the cache first */
799 vma = mm->mmap_cache;
800 if (vma && vma->vm_start == addr && vma->vm_end == end)
801 return vma;
802
803 /* trawl the tree (there may be multiple mappings in which addr
804 * resides) */
805 for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) {
806 vma = rb_entry(n, struct vm_area_struct, vm_rb);
807 if (vma->vm_start < addr)
808 continue;
809 if (vma->vm_start > addr)
810 return NULL;
811 if (vma->vm_end == end) {
812 mm->mmap_cache = vma;
813 return vma;
814 }
815 }
816
817 return NULL;
585} 818}
586 819
587/* 820/*
@@ -596,7 +829,7 @@ static int validate_mmap_request(struct file *file,
596 unsigned long pgoff, 829 unsigned long pgoff,
597 unsigned long *_capabilities) 830 unsigned long *_capabilities)
598{ 831{
599 unsigned long capabilities; 832 unsigned long capabilities, rlen;
600 unsigned long reqprot = prot; 833 unsigned long reqprot = prot;
601 int ret; 834 int ret;
602 835
@@ -616,12 +849,12 @@ static int validate_mmap_request(struct file *file,
616 return -EINVAL; 849 return -EINVAL;
617 850
618 /* Careful about overflows.. */ 851 /* Careful about overflows.. */
619 len = PAGE_ALIGN(len); 852 rlen = PAGE_ALIGN(len);
620 if (!len || len > TASK_SIZE) 853 if (!rlen || rlen > TASK_SIZE)
621 return -ENOMEM; 854 return -ENOMEM;
622 855
623 /* offset overflow? */ 856 /* offset overflow? */
624 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) 857 if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff)
625 return -EOVERFLOW; 858 return -EOVERFLOW;
626 859
627 if (file) { 860 if (file) {
@@ -795,13 +1028,18 @@ static unsigned long determine_vm_flags(struct file *file,
795} 1028}
796 1029
797/* 1030/*
798 * set up a shared mapping on a file 1031 * set up a shared mapping on a file (the driver or filesystem provides and
1032 * pins the storage)
799 */ 1033 */
800static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) 1034static int do_mmap_shared_file(struct vm_area_struct *vma)
801{ 1035{
802 int ret; 1036 int ret;
803 1037
804 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1038 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1039 if (ret == 0) {
1040 vma->vm_region->vm_top = vma->vm_region->vm_end;
1041 return ret;
1042 }
805 if (ret != -ENOSYS) 1043 if (ret != -ENOSYS)
806 return ret; 1044 return ret;
807 1045
@@ -815,10 +1053,14 @@ static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len)
815/* 1053/*
816 * set up a private mapping or an anonymous shared mapping 1054 * set up a private mapping or an anonymous shared mapping
817 */ 1055 */
818static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) 1056static int do_mmap_private(struct vm_area_struct *vma,
1057 struct vm_region *region,
1058 unsigned long len)
819{ 1059{
1060 struct page *pages;
1061 unsigned long total, point, n, rlen;
820 void *base; 1062 void *base;
821 int ret; 1063 int ret, order;
822 1064
823 /* invoke the file's mapping function so that it can keep track of 1065 /* invoke the file's mapping function so that it can keep track of
824 * shared mappings on devices or memory 1066 * shared mappings on devices or memory
@@ -826,34 +1068,63 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
826 */ 1068 */
827 if (vma->vm_file) { 1069 if (vma->vm_file) {
828 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1070 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
829 if (ret != -ENOSYS) { 1071 if (ret == 0) {
830 /* shouldn't return success if we're not sharing */ 1072 /* shouldn't return success if we're not sharing */
831 BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE)); 1073 BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
832 return ret; /* success or a real error */ 1074 vma->vm_region->vm_top = vma->vm_region->vm_end;
1075 return ret;
833 } 1076 }
1077 if (ret != -ENOSYS)
1078 return ret;
834 1079
835 /* getting an ENOSYS error indicates that direct mmap isn't 1080 /* getting an ENOSYS error indicates that direct mmap isn't
836 * possible (as opposed to tried but failed) so we'll try to 1081 * possible (as opposed to tried but failed) so we'll try to
837 * make a private copy of the data and map that instead */ 1082 * make a private copy of the data and map that instead */
838 } 1083 }
839 1084
1085 rlen = PAGE_ALIGN(len);
1086
840 /* allocate some memory to hold the mapping 1087 /* allocate some memory to hold the mapping
841 * - note that this may not return a page-aligned address if the object 1088 * - note that this may not return a page-aligned address if the object
842 * we're allocating is smaller than a page 1089 * we're allocating is smaller than a page
843 */ 1090 */
844 base = kmalloc(len, GFP_KERNEL|__GFP_COMP); 1091 order = get_order(rlen);
845 if (!base) 1092 kdebug("alloc order %d for %lx", order, len);
1093
1094 pages = alloc_pages(GFP_KERNEL, order);
1095 if (!pages)
846 goto enomem; 1096 goto enomem;
847 1097
848 vma->vm_start = (unsigned long) base; 1098 total = 1 << order;
849 vma->vm_end = vma->vm_start + len; 1099 atomic_add(total, &mmap_pages_allocated);
850 vma->vm_flags |= VM_MAPPED_COPY; 1100
1101 point = rlen >> PAGE_SHIFT;
1102
1103 /* we allocated a power-of-2 sized page set, so we may want to trim off
1104 * the excess */
1105 if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
1106 while (total > point) {
1107 order = ilog2(total - point);
1108 n = 1 << order;
1109 kdebug("shave %lu/%lu @%lu", n, total - point, total);
1110 atomic_sub(n, &mmap_pages_allocated);
1111 total -= n;
1112 set_page_refcounted(pages + total);
1113 __free_pages(pages + total, order);
1114 }
1115 }
851 1116
852#ifdef WARN_ON_SLACK 1117 for (point = 1; point < total; point++)
853 if (len + WARN_ON_SLACK <= kobjsize(result)) 1118 set_page_refcounted(&pages[point]);
854 printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n", 1119
855 len, current->pid, kobjsize(result) - len); 1120 base = page_address(pages);
856#endif 1121 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
1122 region->vm_start = (unsigned long) base;
1123 region->vm_end = region->vm_start + rlen;
1124 region->vm_top = region->vm_start + (total << PAGE_SHIFT);
1125
1126 vma->vm_start = region->vm_start;
1127 vma->vm_end = region->vm_start + len;
857 1128
858 if (vma->vm_file) { 1129 if (vma->vm_file) {
859 /* read the contents of a file into the copy */ 1130 /* read the contents of a file into the copy */
@@ -865,31 +1136,33 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
865 1136
866 old_fs = get_fs(); 1137 old_fs = get_fs();
867 set_fs(KERNEL_DS); 1138 set_fs(KERNEL_DS);
868 ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); 1139 ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos);
869 set_fs(old_fs); 1140 set_fs(old_fs);
870 1141
871 if (ret < 0) 1142 if (ret < 0)
872 goto error_free; 1143 goto error_free;
873 1144
874 /* clear the last little bit */ 1145 /* clear the last little bit */
875 if (ret < len) 1146 if (ret < rlen)
876 memset(base + ret, 0, len - ret); 1147 memset(base + ret, 0, rlen - ret);
877 1148
878 } else { 1149 } else {
879 /* if it's an anonymous mapping, then just clear it */ 1150 /* if it's an anonymous mapping, then just clear it */
880 memset(base, 0, len); 1151 memset(base, 0, rlen);
881 } 1152 }
882 1153
883 return 0; 1154 return 0;
884 1155
885error_free: 1156error_free:
886 kfree(base); 1157 free_page_series(region->vm_start, region->vm_end);
887 vma->vm_start = 0; 1158 region->vm_start = vma->vm_start = 0;
1159 region->vm_end = vma->vm_end = 0;
1160 region->vm_top = 0;
888 return ret; 1161 return ret;
889 1162
890enomem: 1163enomem:
891 printk("Allocation of length %lu from process %d failed\n", 1164 printk("Allocation of length %lu from process %d (%s) failed\n",
892 len, current->pid); 1165 len, current->pid, current->comm);
893 show_free_areas(); 1166 show_free_areas();
894 return -ENOMEM; 1167 return -ENOMEM;
895} 1168}
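do_mmap_private() above now grabs a power-of-2 block of pages with alloc_pages() and, unless prevented by sysctl_nr_trim_pages, gives back the excess beyond the page-aligned length in power-of-2 chunks, largest first. A standalone sketch of just that arithmetic, assuming 4 KiB pages and ignoring the sysctl gate; get_order() and ilog2() are reimplemented locally for illustration.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* smallest order such that (1 << order) pages cover len bytes */
static int get_order(unsigned long len)
{
	int order = 0;

	len = (len - 1) >> PAGE_SHIFT;
	while (len) {
		order++;
		len >>= 1;
	}
	return order;
}

static int ilog2(unsigned long n)	/* floor(log2(n)), n > 0 */
{
	int l = -1;

	while (n) {
		l++;
		n >>= 1;
	}
	return l;
}

int main(void)
{
	unsigned long lens[] = { 3000, 5 * PAGE_SIZE, 9 * PAGE_SIZE + 100 };

	for (int i = 0; i < 3; i++) {
		unsigned long rlen = PAGE_ALIGN(lens[i]);
		unsigned long point = rlen >> PAGE_SHIFT;	/* pages needed */
		unsigned long total = 1UL << get_order(rlen);	/* pages grabbed */

		printf("len=%lu: need %lu page(s), alloc %lu\n",
		       lens[i], point, total);

		/* shave the excess off the tail in power-of-2 sized chunks,
		 * largest chunk first, as do_mmap_private() does */
		while (total > point) {
			unsigned long n = 1UL << ilog2(total - point);

			printf("  trim %lu page(s) at index %lu\n",
			       n, total - n);
			total -= n;
		}
	}
	return 0;
}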
@@ -904,13 +1177,14 @@ unsigned long do_mmap_pgoff(struct file *file,
904 unsigned long flags, 1177 unsigned long flags,
905 unsigned long pgoff) 1178 unsigned long pgoff)
906{ 1179{
907 struct vm_list_struct *vml = NULL; 1180 struct vm_area_struct *vma;
908 struct vm_area_struct *vma = NULL; 1181 struct vm_region *region;
909 struct rb_node *rb; 1182 struct rb_node *rb;
910 unsigned long capabilities, vm_flags; 1183 unsigned long capabilities, vm_flags, result;
911 void *result;
912 int ret; 1184 int ret;
913 1185
1186 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
1187
914 if (!(flags & MAP_FIXED)) 1188 if (!(flags & MAP_FIXED))
915 addr = round_hint_to_min(addr); 1189 addr = round_hint_to_min(addr);
916 1190
@@ -918,73 +1192,120 @@ unsigned long do_mmap_pgoff(struct file *file,
918 * mapping */ 1192 * mapping */
919 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, 1193 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
920 &capabilities); 1194 &capabilities);
921 if (ret < 0) 1195 if (ret < 0) {
1196 kleave(" = %d [val]", ret);
922 return ret; 1197 return ret;
1198 }
923 1199
924 /* we've determined that we can make the mapping, now translate what we 1200 /* we've determined that we can make the mapping, now translate what we
925 * now know into VMA flags */ 1201 * now know into VMA flags */
926 vm_flags = determine_vm_flags(file, prot, flags, capabilities); 1202 vm_flags = determine_vm_flags(file, prot, flags, capabilities);
927 1203
928 /* we're going to need to record the mapping if it works */ 1204 /* we're going to need to record the mapping */
929 vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL); 1205 region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
930 if (!vml) 1206 if (!region)
931 goto error_getting_vml; 1207 goto error_getting_region;
1208
1209 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1210 if (!vma)
1211 goto error_getting_vma;
1212
1213 atomic_set(&region->vm_usage, 1);
1214 region->vm_flags = vm_flags;
1215 region->vm_pgoff = pgoff;
1216
1217 INIT_LIST_HEAD(&vma->anon_vma_node);
1218 vma->vm_flags = vm_flags;
1219 vma->vm_pgoff = pgoff;
1220
1221 if (file) {
1222 region->vm_file = file;
1223 get_file(file);
1224 vma->vm_file = file;
1225 get_file(file);
1226 if (vm_flags & VM_EXECUTABLE) {
1227 added_exe_file_vma(current->mm);
1228 vma->vm_mm = current->mm;
1229 }
1230 }
932 1231
933 down_write(&nommu_vma_sem); 1232 down_write(&nommu_region_sem);
934 1233
935 /* if we want to share, we need to check for VMAs created by other 1234 /* if we want to share, we need to check for regions created by other
936 * mmap() calls that overlap with our proposed mapping 1235 * mmap() calls that overlap with our proposed mapping
937 * - we can only share with an exact match on most regular files 1236 * - we can only share with a superset match on most regular files
938 * - shared mappings on character devices and memory backed files are 1237 * - shared mappings on character devices and memory backed files are
939 * permitted to overlap inexactly as far as we are concerned for in 1238 * permitted to overlap inexactly as far as we are concerned for in
940 * these cases, sharing is handled in the driver or filesystem rather 1239 * these cases, sharing is handled in the driver or filesystem rather
941 * than here 1240 * than here
942 */ 1241 */
943 if (vm_flags & VM_MAYSHARE) { 1242 if (vm_flags & VM_MAYSHARE) {
944 unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1243 struct vm_region *pregion;
945 unsigned long vmpglen; 1244 unsigned long pglen, rpglen, pgend, rpgend, start;
946 1245
947 /* suppress VMA sharing for shared regions */ 1246 pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
948 if (vm_flags & VM_SHARED && 1247 pgend = pgoff + pglen;
949 capabilities & BDI_CAP_MAP_DIRECT)
950 goto dont_share_VMAs;
951 1248
952 for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) { 1249 for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
953 vma = rb_entry(rb, struct vm_area_struct, vm_rb); 1250 pregion = rb_entry(rb, struct vm_region, vm_rb);
954 1251
955 if (!(vma->vm_flags & VM_MAYSHARE)) 1252 if (!(pregion->vm_flags & VM_MAYSHARE))
956 continue; 1253 continue;
957 1254
958 /* search for overlapping mappings on the same file */ 1255 /* search for overlapping mappings on the same file */
959 if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode) 1256 if (pregion->vm_file->f_path.dentry->d_inode !=
1257 file->f_path.dentry->d_inode)
960 continue; 1258 continue;
961 1259
962 if (vma->vm_pgoff >= pgoff + pglen) 1260 if (pregion->vm_pgoff >= pgend)
963 continue; 1261 continue;
964 1262
965 vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1; 1263 rpglen = pregion->vm_end - pregion->vm_start;
966 vmpglen >>= PAGE_SHIFT; 1264 rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT;
967 if (pgoff >= vma->vm_pgoff + vmpglen) 1265 rpgend = pregion->vm_pgoff + rpglen;
1266 if (pgoff >= rpgend)
968 continue; 1267 continue;
969 1268
970 /* handle inexactly overlapping matches between mappings */ 1269 /* handle inexactly overlapping matches between
971 if (vma->vm_pgoff != pgoff || vmpglen != pglen) { 1270 * mappings */
1271 if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
1272 !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
1273 /* new mapping is not a subset of the region */
972 if (!(capabilities & BDI_CAP_MAP_DIRECT)) 1274 if (!(capabilities & BDI_CAP_MAP_DIRECT))
973 goto sharing_violation; 1275 goto sharing_violation;
974 continue; 1276 continue;
975 } 1277 }
976 1278
977 /* we've found a VMA we can share */ 1279 /* we've found a region we can share */
978 atomic_inc(&vma->vm_usage); 1280 atomic_inc(&pregion->vm_usage);
979 1281 vma->vm_region = pregion;
980 vml->vma = vma; 1282 start = pregion->vm_start;
981 result = (void *) vma->vm_start; 1283 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
982 goto shared; 1284 vma->vm_start = start;
1285 vma->vm_end = start + len;
1286
1287 if (pregion->vm_flags & VM_MAPPED_COPY) {
1288 kdebug("share copy");
1289 vma->vm_flags |= VM_MAPPED_COPY;
1290 } else {
1291 kdebug("share mmap");
1292 ret = do_mmap_shared_file(vma);
1293 if (ret < 0) {
1294 vma->vm_region = NULL;
1295 vma->vm_start = 0;
1296 vma->vm_end = 0;
1297 atomic_dec(&pregion->vm_usage);
1298 pregion = NULL;
1299 goto error_just_free;
1300 }
1301 }
1302 fput(region->vm_file);
1303 kmem_cache_free(vm_region_jar, region);
1304 region = pregion;
1305 result = start;
1306 goto share;
983 } 1307 }
984 1308
985 dont_share_VMAs:
986 vma = NULL;
987
988 /* obtain the address at which to make a shared mapping 1309 /* obtain the address at which to make a shared mapping
989 * - this is the hook for quasi-memory character devices to 1310 * - this is the hook for quasi-memory character devices to
990 * tell us the location of a shared mapping 1311 * tell us the location of a shared mapping
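The VM_MAYSHARE loop above only lets a new shared mapping reuse an existing region when the requested page range equals, or is a subset of, the range that region already covers on the same file; a partial overlap is either left to the driver (BDI_CAP_MAP_DIRECT) or rejected as a sharing violation. Below is a simplified model of just the page-offset test, with the driver escape hatch and the rejection path omitted; all values are in pages and the names are illustrative.

#include <stdbool.h>
#include <stdio.h>

struct req { unsigned long pgoff, pglen; };	/* all units are pages */

/* existing shared region on the same file, covering pages 8..23 */
static const struct req region = { .pgoff = 8, .pglen = 16 };

/* Can a new MAP_SHARED request reuse the region?  Per do_mmap_pgoff(),
 * only when its page range is a subset of (or equal to) the region's. */
static bool can_share(struct req r)
{
	unsigned long pgend  = r.pgoff + r.pglen;
	unsigned long rpgend = region.pgoff + region.pglen;

	if (r.pgoff >= rpgend || pgend <= region.pgoff)
		return false;			/* no overlap at all */

	/* exact match or subset shares; a partial overlap cannot */
	return r.pgoff >= region.pgoff && pgend <= rpgend;
}

int main(void)
{
	struct req tests[] = {
		{  8, 16 },	/* identical range: shareable */
		{ 12,  4 },	/* strict subset:   shareable */
		{  4,  8 },	/* straddles start: not shareable */
		{ 30,  2 },	/* disjoint:        not shareable */
	};

	for (int i = 0; i < 4; i++)
		printf("pgoff=%lu pglen=%lu -> %s\n",
		       tests[i].pgoff, tests[i].pglen,
		       can_share(tests[i]) ? "share" : "no share");
	return 0;
}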
@@ -995,113 +1316,93 @@ unsigned long do_mmap_pgoff(struct file *file,
995 if (IS_ERR((void *) addr)) { 1316 if (IS_ERR((void *) addr)) {
996 ret = addr; 1317 ret = addr;
997 if (ret != (unsigned long) -ENOSYS) 1318 if (ret != (unsigned long) -ENOSYS)
998 goto error; 1319 goto error_just_free;
999 1320
1000 /* the driver refused to tell us where to site 1321 /* the driver refused to tell us where to site
1001 * the mapping so we'll have to attempt to copy 1322 * the mapping so we'll have to attempt to copy
1002 * it */ 1323 * it */
1003 ret = (unsigned long) -ENODEV; 1324 ret = (unsigned long) -ENODEV;
1004 if (!(capabilities & BDI_CAP_MAP_COPY)) 1325 if (!(capabilities & BDI_CAP_MAP_COPY))
1005 goto error; 1326 goto error_just_free;
1006 1327
1007 capabilities &= ~BDI_CAP_MAP_DIRECT; 1328 capabilities &= ~BDI_CAP_MAP_DIRECT;
1329 } else {
1330 vma->vm_start = region->vm_start = addr;
1331 vma->vm_end = region->vm_end = addr + len;
1008 } 1332 }
1009 } 1333 }
1010 } 1334 }
1011 1335
1012 /* we're going to need a VMA struct as well */ 1336 vma->vm_region = region;
1013 vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
1014 if (!vma)
1015 goto error_getting_vma;
1016
1017 INIT_LIST_HEAD(&vma->anon_vma_node);
1018 atomic_set(&vma->vm_usage, 1);
1019 if (file) {
1020 get_file(file);
1021 if (vm_flags & VM_EXECUTABLE) {
1022 added_exe_file_vma(current->mm);
1023 vma->vm_mm = current->mm;
1024 }
1025 }
1026 vma->vm_file = file;
1027 vma->vm_flags = vm_flags;
1028 vma->vm_start = addr;
1029 vma->vm_end = addr + len;
1030 vma->vm_pgoff = pgoff;
1031
1032 vml->vma = vma;
1033 1337
1034 /* set up the mapping */ 1338 /* set up the mapping */
1035 if (file && vma->vm_flags & VM_SHARED) 1339 if (file && vma->vm_flags & VM_SHARED)
1036 ret = do_mmap_shared_file(vma, len); 1340 ret = do_mmap_shared_file(vma);
1037 else 1341 else
1038 ret = do_mmap_private(vma, len); 1342 ret = do_mmap_private(vma, region, len);
1039 if (ret < 0) 1343 if (ret < 0)
1040 goto error; 1344 goto error_put_region;
1041
1042 /* okay... we have a mapping; now we have to register it */
1043 result = (void *) vma->vm_start;
1044 1345
1045 if (vma->vm_flags & VM_MAPPED_COPY) { 1346 add_nommu_region(region);
1046 realalloc += kobjsize(result);
1047 askedalloc += len;
1048 }
1049 1347
1050 realalloc += kobjsize(vma); 1348 /* okay... we have a mapping; now we have to register it */
1051 askedalloc += sizeof(*vma); 1349 result = vma->vm_start;
1052 1350
1053 current->mm->total_vm += len >> PAGE_SHIFT; 1351 current->mm->total_vm += len >> PAGE_SHIFT;
1054 1352
1055 add_nommu_vma(vma); 1353share:
1056 1354 add_vma_to_mm(current->mm, vma);
1057 shared:
1058 realalloc += kobjsize(vml);
1059 askedalloc += sizeof(*vml);
1060 1355
1061 add_vma_to_mm(current->mm, vml); 1356 up_write(&nommu_region_sem);
1062
1063 up_write(&nommu_vma_sem);
1064 1357
1065 if (prot & PROT_EXEC) 1358 if (prot & PROT_EXEC)
1066 flush_icache_range((unsigned long) result, 1359 flush_icache_range(result, result + len);
1067 (unsigned long) result + len);
1068 1360
1069#ifdef DEBUG 1361 kleave(" = %lx", result);
1070 printk("do_mmap:\n"); 1362 return result;
1071 show_process_blocks();
1072#endif
1073 1363
1074 return (unsigned long) result; 1364error_put_region:
1075 1365 __put_nommu_region(region);
1076 error:
1077 up_write(&nommu_vma_sem);
1078 kfree(vml);
1079 if (vma) { 1366 if (vma) {
1080 if (vma->vm_file) { 1367 if (vma->vm_file) {
1081 fput(vma->vm_file); 1368 fput(vma->vm_file);
1082 if (vma->vm_flags & VM_EXECUTABLE) 1369 if (vma->vm_flags & VM_EXECUTABLE)
1083 removed_exe_file_vma(vma->vm_mm); 1370 removed_exe_file_vma(vma->vm_mm);
1084 } 1371 }
1085 kfree(vma); 1372 kmem_cache_free(vm_area_cachep, vma);
1086 } 1373 }
1374 kleave(" = %d [pr]", ret);
1087 return ret; 1375 return ret;
1088 1376
1089 sharing_violation: 1377error_just_free:
1090 up_write(&nommu_vma_sem); 1378 up_write(&nommu_region_sem);
1091 printk("Attempt to share mismatched mappings\n"); 1379error:
1092 kfree(vml); 1380 fput(region->vm_file);
1093 return -EINVAL; 1381 kmem_cache_free(vm_region_jar, region);
1382 fput(vma->vm_file);
1383 if (vma->vm_flags & VM_EXECUTABLE)
1384 removed_exe_file_vma(vma->vm_mm);
1385 kmem_cache_free(vm_area_cachep, vma);
1386 kleave(" = %d", ret);
1387 return ret;
1094 1388
1095 error_getting_vma: 1389sharing_violation:
1096 up_write(&nommu_vma_sem); 1390 up_write(&nommu_region_sem);
1097 kfree(vml); 1391 printk(KERN_WARNING "Attempt to share mismatched mappings\n");
1098 printk("Allocation of vma for %lu byte allocation from process %d failed\n", 1392 ret = -EINVAL;
1393 goto error;
1394
1395error_getting_vma:
1396 kmem_cache_free(vm_region_jar, region);
1397 printk(KERN_WARNING "Allocation of vma for %lu byte allocation"
1398 " from process %d failed\n",
1099 len, current->pid); 1399 len, current->pid);
1100 show_free_areas(); 1400 show_free_areas();
1101 return -ENOMEM; 1401 return -ENOMEM;
1102 1402
1103 error_getting_vml: 1403error_getting_region:
1104 printk("Allocation of vml for %lu byte allocation from process %d failed\n", 1404 printk(KERN_WARNING "Allocation of vm region for %lu byte allocation"
1405 " from process %d failed\n",
1105 len, current->pid); 1406 len, current->pid);
1106 show_free_areas(); 1407 show_free_areas();
1107 return -ENOMEM; 1408 return -ENOMEM;
@@ -1109,90 +1410,188 @@ unsigned long do_mmap_pgoff(struct file *file,
1109EXPORT_SYMBOL(do_mmap_pgoff); 1410EXPORT_SYMBOL(do_mmap_pgoff);
1110 1411
1111/* 1412/*
1112 * handle mapping disposal for uClinux 1413 * split a vma into two pieces at address 'addr', a new vma is allocated either
1414 * for the first part or the tail.
1113 */ 1415 */
1114static void put_vma(struct mm_struct *mm, struct vm_area_struct *vma) 1416int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1417 unsigned long addr, int new_below)
1115{ 1418{
1116 if (vma) { 1419 struct vm_area_struct *new;
1117 down_write(&nommu_vma_sem); 1420 struct vm_region *region;
1421 unsigned long npages;
1118 1422
1119 if (atomic_dec_and_test(&vma->vm_usage)) { 1423 kenter("");
1120 delete_nommu_vma(vma);
1121 1424
1122 if (vma->vm_ops && vma->vm_ops->close) 1425 /* we're only permitted to split anonymous regions that have a single
1123 vma->vm_ops->close(vma); 1426 * owner */
1427 if (vma->vm_file ||
1428 atomic_read(&vma->vm_region->vm_usage) != 1)
1429 return -ENOMEM;
1124 1430
1125 /* IO memory and memory shared directly out of the pagecache from 1431 if (mm->map_count >= sysctl_max_map_count)
1126 * ramfs/tmpfs mustn't be released here */ 1432 return -ENOMEM;
1127 if (vma->vm_flags & VM_MAPPED_COPY) {
1128 realalloc -= kobjsize((void *) vma->vm_start);
1129 askedalloc -= vma->vm_end - vma->vm_start;
1130 kfree((void *) vma->vm_start);
1131 }
1132 1433
1133 realalloc -= kobjsize(vma); 1434 region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL);
1134 askedalloc -= sizeof(*vma); 1435 if (!region)
1436 return -ENOMEM;
1135 1437
1136 if (vma->vm_file) { 1438 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
1137 fput(vma->vm_file); 1439 if (!new) {
1138 if (vma->vm_flags & VM_EXECUTABLE) 1440 kmem_cache_free(vm_region_jar, region);
1139 removed_exe_file_vma(mm); 1441 return -ENOMEM;
1140 } 1442 }
1141 kfree(vma); 1443
1142 } 1444 /* most fields are the same, copy all, and then fixup */
1445 *new = *vma;
1446 *region = *vma->vm_region;
1447 new->vm_region = region;
1448
1449 npages = (addr - vma->vm_start) >> PAGE_SHIFT;
1143 1450
1144 up_write(&nommu_vma_sem); 1451 if (new_below) {
1452 region->vm_top = region->vm_end = new->vm_end = addr;
1453 } else {
1454 region->vm_start = new->vm_start = addr;
1455 region->vm_pgoff = new->vm_pgoff += npages;
1145 } 1456 }
1457
1458 if (new->vm_ops && new->vm_ops->open)
1459 new->vm_ops->open(new);
1460
1461 delete_vma_from_mm(vma);
1462 down_write(&nommu_region_sem);
1463 delete_nommu_region(vma->vm_region);
1464 if (new_below) {
1465 vma->vm_region->vm_start = vma->vm_start = addr;
1466 vma->vm_region->vm_pgoff = vma->vm_pgoff += npages;
1467 } else {
1468 vma->vm_region->vm_end = vma->vm_end = addr;
1469 vma->vm_region->vm_top = addr;
1470 }
1471 add_nommu_region(vma->vm_region);
1472 add_nommu_region(new->vm_region);
1473 up_write(&nommu_region_sem);
1474 add_vma_to_mm(mm, vma);
1475 add_vma_to_mm(mm, new);
1476 return 0;
1146} 1477}
1147 1478
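split_vma() above cuts an anonymous, single-owner VMA (and its cloned region) in two at addr, and whichever piece starts at addr advances its page offset by the number of pages in front of it. A small worked example of that offset arithmetic, assuming 4 KiB pages; the tree bookkeeping and the region cloning are left out.

#include <stdio.h>

#define PAGE_SHIFT 12

struct part {			/* toy VMA/region pair after a split */
	unsigned long start, end, pgoff;
};

/* split [start,end) at addr; the piece that begins at addr advances its
 * page offset by the number of pages preceding it, as split_vma() does */
static void split(struct part v, unsigned long addr,
		  struct part *head, struct part *tail)
{
	unsigned long npages = (addr - v.start) >> PAGE_SHIFT;

	*head = v;
	head->end = addr;

	*tail = v;
	tail->start = addr;
	tail->pgoff += npages;
}

int main(void)
{
	/* anonymous mapping of 8 pages starting at 0x100000, pgoff 0 */
	struct part vma = { 0x100000, 0x108000, 0 }, head, tail;

	split(vma, 0x103000, &head, &tail);
	printf("head: %#lx-%#lx pgoff=%lu\n", head.start, head.end, head.pgoff);
	printf("tail: %#lx-%#lx pgoff=%lu\n", tail.start, tail.end, tail.pgoff);
	return 0;
}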
1148/* 1479/*
1149 * release a mapping 1480 * shrink a VMA by removing the specified chunk from either the beginning or
1150 * - under NOMMU conditions the parameters must match exactly to the mapping to 1481 * the end
1151 * be removed
1152 */ 1482 */
1153int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) 1483static int shrink_vma(struct mm_struct *mm,
1484 struct vm_area_struct *vma,
1485 unsigned long from, unsigned long to)
1154{ 1486{
1155 struct vm_list_struct *vml, **parent; 1487 struct vm_region *region;
1156 unsigned long end = addr + len;
1157 1488
1158#ifdef DEBUG 1489 kenter("");
1159 printk("do_munmap:\n");
1160#endif
1161 1490
1162 for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) { 1491 /* adjust the VMA's pointers, which may reposition it in the MM's tree
1163 if ((*parent)->vma->vm_start > addr) 1492 * and list */
1164 break; 1493 delete_vma_from_mm(vma);
1165 if ((*parent)->vma->vm_start == addr && 1494 if (from > vma->vm_start)
1166 ((len == 0) || ((*parent)->vma->vm_end == end))) 1495 vma->vm_end = from;
1167 goto found; 1496 else
1497 vma->vm_start = to;
1498 add_vma_to_mm(mm, vma);
1499
1500 /* cut the backing region down to size */
1501 region = vma->vm_region;
1502 BUG_ON(atomic_read(&region->vm_usage) != 1);
1503
1504 down_write(&nommu_region_sem);
1505 delete_nommu_region(region);
1506 if (from > region->vm_start) {
1507 to = region->vm_top;
1508 region->vm_top = region->vm_end = from;
1509 } else {
1510 region->vm_start = to;
1168 } 1511 }
1512 add_nommu_region(region);
1513 up_write(&nommu_region_sem);
1169 1514
1170 printk("munmap of non-mmaped memory by process %d (%s): %p\n", 1515 free_page_series(from, to);
1171 current->pid, current->comm, (void *) addr); 1516 return 0;
1172 return -EINVAL; 1517}
1173 1518
1174 found: 1519/*
1175 vml = *parent; 1520 * release a mapping
1521 * - under NOMMU conditions the chunk to be unmapped must be backed by a single
1522 * VMA, though it need not cover the whole VMA
1523 */
1524int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1525{
1526 struct vm_area_struct *vma;
1527 struct rb_node *rb;
1528 unsigned long end = start + len;
1529 int ret;
1176 1530
1177 put_vma(mm, vml->vma); 1531 kenter(",%lx,%zx", start, len);
1178 1532
1179 *parent = vml->next; 1533 if (len == 0)
1180 realalloc -= kobjsize(vml); 1534 return -EINVAL;
1181 askedalloc -= sizeof(*vml);
1182 kfree(vml);
1183 1535
1184 update_hiwater_vm(mm); 1536 /* find the first potentially overlapping VMA */
1185 mm->total_vm -= len >> PAGE_SHIFT; 1537 vma = find_vma(mm, start);
1538 if (!vma) {
1539 printk(KERN_WARNING
1540 "munmap of memory not mmapped by process %d (%s):"
1541 " 0x%lx-0x%lx\n",
1542 current->pid, current->comm, start, start + len - 1);
1543 return -EINVAL;
1544 }
1186 1545
1187#ifdef DEBUG 1546 /* we're allowed to split an anonymous VMA but not a file-backed one */
1188 show_process_blocks(); 1547 if (vma->vm_file) {
1189#endif 1548 do {
1549 if (start > vma->vm_start) {
1550 kleave(" = -EINVAL [miss]");
1551 return -EINVAL;
1552 }
1553 if (end == vma->vm_end)
1554 goto erase_whole_vma;
1555 rb = rb_next(&vma->vm_rb);
1556 vma = rb_entry(rb, struct vm_area_struct, vm_rb);
1557 } while (rb);
1558 kleave(" = -EINVAL [split file]");
1559 return -EINVAL;
1560 } else {
1561 /* the chunk must be a subset of the VMA found */
1562 if (start == vma->vm_start && end == vma->vm_end)
1563 goto erase_whole_vma;
1564 if (start < vma->vm_start || end > vma->vm_end) {
1565 kleave(" = -EINVAL [superset]");
1566 return -EINVAL;
1567 }
1568 if (start & ~PAGE_MASK) {
1569 kleave(" = -EINVAL [unaligned start]");
1570 return -EINVAL;
1571 }
1572 if (end != vma->vm_end && end & ~PAGE_MASK) {
1573 kleave(" = -EINVAL [unaligned split]");
1574 return -EINVAL;
1575 }
1576 if (start != vma->vm_start && end != vma->vm_end) {
1577 ret = split_vma(mm, vma, start, 1);
1578 if (ret < 0) {
1579 kleave(" = %d [split]", ret);
1580 return ret;
1581 }
1582 }
1583 return shrink_vma(mm, vma, start, end);
1584 }
1190 1585
1586erase_whole_vma:
1587 delete_vma_from_mm(vma);
1588 delete_vma(mm, vma);
1589 kleave(" = 0");
1191 return 0; 1590 return 0;
1192} 1591}
1193EXPORT_SYMBOL(do_munmap); 1592EXPORT_SYMBOL(do_munmap);
1194 1593
1195asmlinkage long sys_munmap(unsigned long addr, size_t len) 1594SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
1196{ 1595{
1197 int ret; 1596 int ret;
1198 struct mm_struct *mm = current->mm; 1597 struct mm_struct *mm = current->mm;
@@ -1204,32 +1603,26 @@ asmlinkage long sys_munmap(unsigned long addr, size_t len)
1204} 1603}
1205 1604
1206/* 1605/*
1207 * Release all mappings 1606 * release all the mappings made in a process's VM space
1208 */ 1607 */
1209void exit_mmap(struct mm_struct * mm) 1608void exit_mmap(struct mm_struct *mm)
1210{ 1609{
1211 struct vm_list_struct *tmp; 1610 struct vm_area_struct *vma;
1212
1213 if (mm) {
1214#ifdef DEBUG
1215 printk("Exit_mmap:\n");
1216#endif
1217 1611
1218 mm->total_vm = 0; 1612 if (!mm)
1613 return;
1219 1614
1220 while ((tmp = mm->context.vmlist)) { 1615 kenter("");
1221 mm->context.vmlist = tmp->next;
1222 put_vma(mm, tmp->vma);
1223 1616
1224 realalloc -= kobjsize(tmp); 1617 mm->total_vm = 0;
1225 askedalloc -= sizeof(*tmp);
1226 kfree(tmp);
1227 }
1228 1618
1229#ifdef DEBUG 1619 while ((vma = mm->mmap)) {
1230 show_process_blocks(); 1620 mm->mmap = vma->vm_next;
1231#endif 1621 delete_vma_from_mm(vma);
1622 delete_vma(mm, vma);
1232 } 1623 }
1624
1625 kleave("");
1233} 1626}
1234 1627
1235unsigned long do_brk(unsigned long addr, unsigned long len) 1628unsigned long do_brk(unsigned long addr, unsigned long len)
@@ -1242,8 +1635,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
1242 * time (controlled by the MREMAP_MAYMOVE flag and available VM space) 1635 * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
1243 * 1636 *
1244 * under NOMMU conditions, we only permit changing a mapping's size, and only 1637 * under NOMMU conditions, we only permit changing a mapping's size, and only
1245 * as long as it stays within the hole allocated by the kmalloc() call in 1638 * as long as it stays within the region allocated by do_mmap_private() and the
1246 * do_mmap_pgoff() and the block is not shareable 1639 * block is not shareable
1247 * 1640 *
1248 * MREMAP_FIXED is not supported under NOMMU conditions 1641 * MREMAP_FIXED is not supported under NOMMU conditions
1249 */ 1642 */
@@ -1254,13 +1647,16 @@ unsigned long do_mremap(unsigned long addr,
1254 struct vm_area_struct *vma; 1647 struct vm_area_struct *vma;
1255 1648
1256 /* insanity checks first */ 1649 /* insanity checks first */
1257 if (new_len == 0) 1650 if (old_len == 0 || new_len == 0)
1258 return (unsigned long) -EINVAL; 1651 return (unsigned long) -EINVAL;
1259 1652
1653 if (addr & ~PAGE_MASK)
1654 return -EINVAL;
1655
1260 if (flags & MREMAP_FIXED && new_addr != addr) 1656 if (flags & MREMAP_FIXED && new_addr != addr)
1261 return (unsigned long) -EINVAL; 1657 return (unsigned long) -EINVAL;
1262 1658
1263 vma = find_vma_exact(current->mm, addr); 1659 vma = find_vma_exact(current->mm, addr, old_len);
1264 if (!vma) 1660 if (!vma)
1265 return (unsigned long) -EINVAL; 1661 return (unsigned long) -EINVAL;
1266 1662
@@ -1270,22 +1666,18 @@ unsigned long do_mremap(unsigned long addr,
1270 if (vma->vm_flags & VM_MAYSHARE) 1666 if (vma->vm_flags & VM_MAYSHARE)
1271 return (unsigned long) -EPERM; 1667 return (unsigned long) -EPERM;
1272 1668
1273 if (new_len > kobjsize((void *) addr)) 1669 if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
1274 return (unsigned long) -ENOMEM; 1670 return (unsigned long) -ENOMEM;
1275 1671
1276 /* all checks complete - do it */ 1672 /* all checks complete - do it */
1277 vma->vm_end = vma->vm_start + new_len; 1673 vma->vm_end = vma->vm_start + new_len;
1278
1279 askedalloc -= old_len;
1280 askedalloc += new_len;
1281
1282 return vma->vm_start; 1674 return vma->vm_start;
1283} 1675}
1284EXPORT_SYMBOL(do_mremap); 1676EXPORT_SYMBOL(do_mremap);
1285 1677
1286asmlinkage unsigned long sys_mremap(unsigned long addr, 1678SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
1287 unsigned long old_len, unsigned long new_len, 1679 unsigned long, new_len, unsigned long, flags,
1288 unsigned long flags, unsigned long new_addr) 1680 unsigned long, new_addr)
1289{ 1681{
1290 unsigned long ret; 1682 unsigned long ret;
1291 1683
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 558f9afe6e4e..40ba05061a4f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -31,7 +31,7 @@
31int sysctl_panic_on_oom; 31int sysctl_panic_on_oom;
32int sysctl_oom_kill_allocating_task; 32int sysctl_oom_kill_allocating_task;
33int sysctl_oom_dump_tasks; 33int sysctl_oom_dump_tasks;
34static DEFINE_SPINLOCK(zone_scan_mutex); 34static DEFINE_SPINLOCK(zone_scan_lock);
35/* #define DEBUG */ 35/* #define DEBUG */
36 36
37/** 37/**
@@ -392,6 +392,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
392 printk(KERN_WARNING "%s invoked oom-killer: " 392 printk(KERN_WARNING "%s invoked oom-killer: "
393 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", 393 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
394 current->comm, gfp_mask, order, current->oomkilladj); 394 current->comm, gfp_mask, order, current->oomkilladj);
395 task_lock(current);
396 cpuset_print_task_mems_allowed(current);
397 task_unlock(current);
395 dump_stack(); 398 dump_stack();
396 show_mem(); 399 show_mem();
397 if (sysctl_oom_dump_tasks) 400 if (sysctl_oom_dump_tasks)
@@ -426,7 +429,6 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
426 unsigned long points = 0; 429 unsigned long points = 0;
427 struct task_struct *p; 430 struct task_struct *p;
428 431
429 cgroup_lock();
430 read_lock(&tasklist_lock); 432 read_lock(&tasklist_lock);
431retry: 433retry:
432 p = select_bad_process(&points, mem); 434 p = select_bad_process(&points, mem);
@@ -441,7 +443,6 @@ retry:
441 goto retry; 443 goto retry;
442out: 444out:
443 read_unlock(&tasklist_lock); 445 read_unlock(&tasklist_lock);
444 cgroup_unlock();
445} 446}
446#endif 447#endif
447 448
@@ -470,7 +471,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
470 struct zone *zone; 471 struct zone *zone;
471 int ret = 1; 472 int ret = 1;
472 473
473 spin_lock(&zone_scan_mutex); 474 spin_lock(&zone_scan_lock);
474 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 475 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
475 if (zone_is_oom_locked(zone)) { 476 if (zone_is_oom_locked(zone)) {
476 ret = 0; 477 ret = 0;
@@ -480,7 +481,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
480 481
481 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 482 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
482 /* 483 /*
483 * Lock each zone in the zonelist under zone_scan_mutex so a 484 * Lock each zone in the zonelist under zone_scan_lock so a
484 * parallel invocation of try_set_zone_oom() doesn't succeed 485 * parallel invocation of try_set_zone_oom() doesn't succeed
485 * when it shouldn't. 486 * when it shouldn't.
486 */ 487 */
@@ -488,7 +489,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
488 } 489 }
489 490
490out: 491out:
491 spin_unlock(&zone_scan_mutex); 492 spin_unlock(&zone_scan_lock);
492 return ret; 493 return ret;
493} 494}
494 495
@@ -502,11 +503,82 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
502 struct zoneref *z; 503 struct zoneref *z;
503 struct zone *zone; 504 struct zone *zone;
504 505
505 spin_lock(&zone_scan_mutex); 506 spin_lock(&zone_scan_lock);
506 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 507 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
507 zone_clear_flag(zone, ZONE_OOM_LOCKED); 508 zone_clear_flag(zone, ZONE_OOM_LOCKED);
508 } 509 }
509 spin_unlock(&zone_scan_mutex); 510 spin_unlock(&zone_scan_lock);
511}
512
513/*
514 * Must be called with tasklist_lock held for read.
515 */
516static void __out_of_memory(gfp_t gfp_mask, int order)
517{
518 if (sysctl_oom_kill_allocating_task) {
519 oom_kill_process(current, gfp_mask, order, 0, NULL,
520 "Out of memory (oom_kill_allocating_task)");
521
522 } else {
523 unsigned long points;
524 struct task_struct *p;
525
526retry:
527 /*
528 * Rambo mode: Shoot down a process and hope it solves whatever
529 * issues we may have.
530 */
531 p = select_bad_process(&points, NULL);
532
533 if (PTR_ERR(p) == -1UL)
534 return;
535
536 /* Found nothing?!?! Either we hang forever, or we panic. */
537 if (!p) {
538 read_unlock(&tasklist_lock);
539 panic("Out of memory and no killable processes...\n");
540 }
541
542 if (oom_kill_process(p, gfp_mask, order, points, NULL,
543 "Out of memory"))
544 goto retry;
545 }
546}
547
548/*
549 * pagefault handler calls into here because it is out of memory but
550 * doesn't know exactly how or why.
551 */
552void pagefault_out_of_memory(void)
553{
554 unsigned long freed = 0;
555
556 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
557 if (freed > 0)
558 /* Got some memory back in the last second. */
559 return;
560
561 /*
562 * If this is from memcg, oom-killer is already invoked.
563 * and not worth to go system-wide-oom.
564 */
565 if (mem_cgroup_oom_called(current))
566 goto rest_and_return;
567
568 if (sysctl_panic_on_oom)
569 panic("out of memory from page fault. panic_on_oom is selected.\n");
570
571 read_lock(&tasklist_lock);
572 __out_of_memory(0, 0); /* unknown gfp_mask and order */
573 read_unlock(&tasklist_lock);
574
575 /*
576 * Give "p" a good chance of killing itself before we
577 * retry to allocate memory.
578 */
579rest_and_return:
580 if (!test_thread_flag(TIF_MEMDIE))
581 schedule_timeout_uninterruptible(1);
510} 582}
511 583
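__out_of_memory() above factors out the existing select-and-retry loop: pick the task with the highest badness score, and if it cannot be killed (or the attempt fails), go round again; if nothing killable remains, the kernel panics. A schematic of that control flow with a toy badness score standing in for select_bad_process(); the threshold used to fake a failed kill is purely illustrative.

#include <stdbool.h>
#include <stdio.h>

struct task {
	const char *comm;
	unsigned long badness;	/* stand-in for the badness() score */
	bool unkillable;	/* already exiting, or a kill attempt failed */
};

/* schematic of the select-and-retry loop in __out_of_memory(): choose the
 * "worst" killable task and, if killing it does not work out, try again */
static struct task *select_victim(struct task *tasks, int n)
{
	struct task *victim;

retry:
	victim = NULL;
	for (int i = 0; i < n; i++) {
		if (tasks[i].unkillable || tasks[i].badness == 0)
			continue;
		if (!victim || tasks[i].badness > victim->badness)
			victim = &tasks[i];
	}
	if (!victim)
		return NULL;	/* the kernel would panic at this point */

	if (victim->badness > 1000) {	/* pretend this kill attempt failed */
		victim->unkillable = true;
		goto retry;
	}
	return victim;
}

int main(void)
{
	struct task tasks[] = {
		{ "init",      0, false },	/* score 0: never selected */
		{ "hog",    5000, false },	/* picked first, "fails" */
		{ "editor",  300, false },	/* picked on the retry */
	};
	struct task *v = select_victim(tasks, 3);

	printf("victim: %s\n", v ? v->comm : "(none: would panic)");
	return 0;
}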
512/** 584/**
@@ -522,8 +594,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
522 */ 594 */
523void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) 595void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
524{ 596{
525 struct task_struct *p;
526 unsigned long points = 0;
527 unsigned long freed = 0; 597 unsigned long freed = 0;
528 enum oom_constraint constraint; 598 enum oom_constraint constraint;
529 599
@@ -544,7 +614,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
544 614
545 switch (constraint) { 615 switch (constraint) {
546 case CONSTRAINT_MEMORY_POLICY: 616 case CONSTRAINT_MEMORY_POLICY:
547 oom_kill_process(current, gfp_mask, order, points, NULL, 617 oom_kill_process(current, gfp_mask, order, 0, NULL,
548 "No available memory (MPOL_BIND)"); 618 "No available memory (MPOL_BIND)");
549 break; 619 break;
550 620
@@ -553,35 +623,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
553 panic("out of memory. panic_on_oom is selected\n"); 623 panic("out of memory. panic_on_oom is selected\n");
554 /* Fall-through */ 624 /* Fall-through */
555 case CONSTRAINT_CPUSET: 625 case CONSTRAINT_CPUSET:
556 if (sysctl_oom_kill_allocating_task) { 626 __out_of_memory(gfp_mask, order);
557 oom_kill_process(current, gfp_mask, order, points, NULL,
558 "Out of memory (oom_kill_allocating_task)");
559 break;
560 }
561retry:
562 /*
563 * Rambo mode: Shoot down a process and hope it solves whatever
564 * issues we may have.
565 */
566 p = select_bad_process(&points, NULL);
567
568 if (PTR_ERR(p) == -1UL)
569 goto out;
570
571 /* Found nothing?!?! Either we hang forever, or we panic. */
572 if (!p) {
573 read_unlock(&tasklist_lock);
574 panic("Out of memory and no killable processes...\n");
575 }
576
577 if (oom_kill_process(p, gfp_mask, order, points, NULL,
578 "Out of memory"))
579 goto retry;
580
581 break; 627 break;
582 } 628 }
583 629
584out:
585 read_unlock(&tasklist_lock); 630 read_unlock(&tasklist_lock);
586 631
587 /* 632 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2970e35fd03f..dc32dae01e5f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void)
69int dirty_background_ratio = 5; 69int dirty_background_ratio = 5;
70 70
71/* 71/*
72 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
73 * dirty_background_ratio * the amount of dirtyable memory
74 */
75unsigned long dirty_background_bytes;
76
77/*
72 * free highmem will not be subtracted from the total free memory 78 * free highmem will not be subtracted from the total free memory
73 * for calculating free ratios if vm_highmem_is_dirtyable is true 79 * for calculating free ratios if vm_highmem_is_dirtyable is true
74 */ 80 */
@@ -80,6 +86,12 @@ int vm_highmem_is_dirtyable;
80int vm_dirty_ratio = 10; 86int vm_dirty_ratio = 10;
81 87
82/* 88/*
89 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
90 * vm_dirty_ratio * the amount of dirtyable memory
91 */
92unsigned long vm_dirty_bytes;
93
94/*
83 * The interval between `kupdate'-style writebacks, in jiffies 95 * The interval between `kupdate'-style writebacks, in jiffies
84 */ 96 */
85int dirty_writeback_interval = 5 * HZ; 97int dirty_writeback_interval = 5 * HZ;
@@ -135,23 +147,75 @@ static int calc_period_shift(void)
135{ 147{
136 unsigned long dirty_total; 148 unsigned long dirty_total;
137 149
138 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; 150 if (vm_dirty_bytes)
151 dirty_total = vm_dirty_bytes / PAGE_SIZE;
152 else
153 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
154 100;
139 return 2 + ilog2(dirty_total - 1); 155 return 2 + ilog2(dirty_total - 1);
140} 156}
141 157
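calc_period_shift() above now derives the completion-tracking shift from either vm_dirty_bytes (when set) or vm_dirty_ratio, as 2 + ilog2(dirty_total - 1). A worked example of that computation, assuming 4 KiB pages; the amount of dirtyable memory is a made-up figure and ilog2() is reimplemented locally.

#include <stdio.h>

#define PAGE_SIZE 4096UL

static int ilog2(unsigned long n)	/* floor(log2(n)), n > 0 */
{
	int l = -1;

	while (n) {
		l++;
		n >>= 1;
	}
	return l;
}

/* mirrors the new calc_period_shift(): the dirty total is taken from
 * vm_dirty_bytes when set, otherwise from vm_dirty_ratio */
static int calc_period_shift(unsigned long dirty_bytes, int dirty_ratio,
			     unsigned long dirtyable_pages)
{
	unsigned long dirty_total;

	if (dirty_bytes)
		dirty_total = dirty_bytes / PAGE_SIZE;
	else
		dirty_total = dirty_ratio * dirtyable_pages / 100;
	return 2 + ilog2(dirty_total - 1);
}

int main(void)
{
	unsigned long dirtyable = 2UL << 20;	/* ~2M pages of 4 KiB, i.e. 8 GiB */

	/* ratio-based: 10% of the dirtyable memory */
	printf("shift (ratio 10%%)  = %d\n",
	       calc_period_shift(0, 10, dirtyable));
	/* bytes-based: a hard 256 MiB limit overrides the ratio */
	printf("shift (256 MiB cap) = %d\n",
	       calc_period_shift(256UL << 20, 10, dirtyable));
	return 0;
}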
142/* 158/*
143 * update the period when the dirty ratio changes. 159 * update the period when the dirty threshold changes.
144 */ 160 */
161static void update_completion_period(void)
162{
163 int shift = calc_period_shift();
164 prop_change_shift(&vm_completions, shift);
165 prop_change_shift(&vm_dirties, shift);
166}
167
168int dirty_background_ratio_handler(struct ctl_table *table, int write,
169 struct file *filp, void __user *buffer, size_t *lenp,
170 loff_t *ppos)
171{
172 int ret;
173
174 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
175 if (ret == 0 && write)
176 dirty_background_bytes = 0;
177 return ret;
178}
179
180int dirty_background_bytes_handler(struct ctl_table *table, int write,
181 struct file *filp, void __user *buffer, size_t *lenp,
182 loff_t *ppos)
183{
184 int ret;
185
186 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
187 if (ret == 0 && write)
188 dirty_background_ratio = 0;
189 return ret;
190}
191
145int dirty_ratio_handler(struct ctl_table *table, int write, 192int dirty_ratio_handler(struct ctl_table *table, int write,
146 struct file *filp, void __user *buffer, size_t *lenp, 193 struct file *filp, void __user *buffer, size_t *lenp,
147 loff_t *ppos) 194 loff_t *ppos)
148{ 195{
149 int old_ratio = vm_dirty_ratio; 196 int old_ratio = vm_dirty_ratio;
150 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 197 int ret;
198
199 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
151 if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 200 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
152 int shift = calc_period_shift(); 201 update_completion_period();
153 prop_change_shift(&vm_completions, shift); 202 vm_dirty_bytes = 0;
154 prop_change_shift(&vm_dirties, shift); 203 }
204 return ret;
205}
206
207
208int dirty_bytes_handler(struct ctl_table *table, int write,
209 struct file *filp, void __user *buffer, size_t *lenp,
210 loff_t *ppos)
211{
212 int old_bytes = vm_dirty_bytes;
213 int ret;
214
215 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
216 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
217 update_completion_period();
218 vm_dirty_ratio = 0;
155 } 219 }
156 return ret; 220 return ret;
157} 221}
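The new sysctl handlers above keep the ratio and bytes knobs mutually exclusive: a successful write to vm_dirty_ratio zeroes vm_dirty_bytes and vice versa, and likewise for the background pair. A stripped-down sketch of that invariant with the proc-handler plumbing removed; only the core "last writer wins" behaviour is shown.

#include <stdio.h>

/* exactly one of the pair is authoritative at a time, as with
 * vm_dirty_ratio / vm_dirty_bytes after this change */
static int dirty_ratio = 10;		/* percent */
static unsigned long dirty_bytes;	/* 0 == disabled */

static void set_dirty_ratio(int ratio)
{
	dirty_ratio = ratio;
	dirty_bytes = 0;		/* the ratio now wins */
}

static void set_dirty_bytes(unsigned long bytes)
{
	dirty_bytes = bytes;
	dirty_ratio = 0;		/* the byte count now wins */
}

int main(void)
{
	set_dirty_bytes(64UL << 20);
	printf("ratio=%d bytes=%lu\n", dirty_ratio, dirty_bytes);
	set_dirty_ratio(20);
	printf("ratio=%d bytes=%lu\n", dirty_ratio, dirty_bytes);
	return 0;
}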
@@ -362,26 +426,32 @@ unsigned long determine_dirtyable_memory(void)
362} 426}
363 427
364void 428void
365get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, 429get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
366 struct backing_dev_info *bdi) 430 unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
367{ 431{
368 int background_ratio; /* Percentages */ 432 unsigned long background;
369 int dirty_ratio; 433 unsigned long dirty;
370 long background;
371 long dirty;
372 unsigned long available_memory = determine_dirtyable_memory(); 434 unsigned long available_memory = determine_dirtyable_memory();
373 struct task_struct *tsk; 435 struct task_struct *tsk;
374 436
375 dirty_ratio = vm_dirty_ratio; 437 if (vm_dirty_bytes)
376 if (dirty_ratio < 5) 438 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
377 dirty_ratio = 5; 439 else {
440 int dirty_ratio;
441
442 dirty_ratio = vm_dirty_ratio;
443 if (dirty_ratio < 5)
444 dirty_ratio = 5;
445 dirty = (dirty_ratio * available_memory) / 100;
446 }
378 447
379 background_ratio = dirty_background_ratio; 448 if (dirty_background_bytes)
380 if (background_ratio >= dirty_ratio) 449 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
381 background_ratio = dirty_ratio / 2; 450 else
451 background = (dirty_background_ratio * available_memory) / 100;
382 452
383 background = (background_ratio * available_memory) / 100; 453 if (background >= dirty)
384 dirty = (dirty_ratio * available_memory) / 100; 454 background = dirty / 2;
385 tsk = current; 455 tsk = current;
386 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { 456 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
387 background += background / 4; 457 background += background / 4;
@@ -423,9 +493,9 @@ static void balance_dirty_pages(struct address_space *mapping)
423{ 493{
424 long nr_reclaimable, bdi_nr_reclaimable; 494 long nr_reclaimable, bdi_nr_reclaimable;
425 long nr_writeback, bdi_nr_writeback; 495 long nr_writeback, bdi_nr_writeback;
426 long background_thresh; 496 unsigned long background_thresh;
427 long dirty_thresh; 497 unsigned long dirty_thresh;
428 long bdi_thresh; 498 unsigned long bdi_thresh;
429 unsigned long pages_written = 0; 499 unsigned long pages_written = 0;
430 unsigned long write_chunk = sync_writeback_pages(); 500 unsigned long write_chunk = sync_writeback_pages();
431 501
@@ -580,8 +650,8 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
580 650
581void throttle_vm_writeout(gfp_t gfp_mask) 651void throttle_vm_writeout(gfp_t gfp_mask)
582{ 652{
583 long background_thresh; 653 unsigned long background_thresh;
584 long dirty_thresh; 654 unsigned long dirty_thresh;
585 655
586 for ( ; ; ) { 656 for ( ; ; ) {
587 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); 657 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
@@ -624,8 +694,8 @@ static void background_writeout(unsigned long _min_pages)
624 }; 694 };
625 695
626 for ( ; ; ) { 696 for ( ; ; ) {
627 long background_thresh; 697 unsigned long background_thresh;
628 long dirty_thresh; 698 unsigned long dirty_thresh;
629 699
630 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); 700 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
631 if (global_page_state(NR_FILE_DIRTY) + 701 if (global_page_state(NR_FILE_DIRTY) +
@@ -868,9 +938,11 @@ int write_cache_pages(struct address_space *mapping,
868 int done = 0; 938 int done = 0;
869 struct pagevec pvec; 939 struct pagevec pvec;
870 int nr_pages; 940 int nr_pages;
941 pgoff_t uninitialized_var(writeback_index);
871 pgoff_t index; 942 pgoff_t index;
872 pgoff_t end; /* Inclusive */ 943 pgoff_t end; /* Inclusive */
873 int scanned = 0; 944 pgoff_t done_index;
945 int cycled;
874 int range_whole = 0; 946 int range_whole = 0;
875 long nr_to_write = wbc->nr_to_write; 947 long nr_to_write = wbc->nr_to_write;
876 948
@@ -881,83 +953,143 @@ int write_cache_pages(struct address_space *mapping,
881 953
882 pagevec_init(&pvec, 0); 954 pagevec_init(&pvec, 0);
883 if (wbc->range_cyclic) { 955 if (wbc->range_cyclic) {
884 index = mapping->writeback_index; /* Start from prev offset */ 956 writeback_index = mapping->writeback_index; /* prev offset */
957 index = writeback_index;
958 if (index == 0)
959 cycled = 1;
960 else
961 cycled = 0;
885 end = -1; 962 end = -1;
886 } else { 963 } else {
887 index = wbc->range_start >> PAGE_CACHE_SHIFT; 964 index = wbc->range_start >> PAGE_CACHE_SHIFT;
888 end = wbc->range_end >> PAGE_CACHE_SHIFT; 965 end = wbc->range_end >> PAGE_CACHE_SHIFT;
889 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 966 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
890 range_whole = 1; 967 range_whole = 1;
891 scanned = 1; 968 cycled = 1; /* ignore range_cyclic tests */
892 } 969 }
893retry: 970retry:
894 while (!done && (index <= end) && 971 done_index = index;
895 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 972 while (!done && (index <= end)) {
896 PAGECACHE_TAG_DIRTY, 973 int i;
897 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 974
898 unsigned i; 975 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
976 PAGECACHE_TAG_DIRTY,
977 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
978 if (nr_pages == 0)
979 break;
899 980
900 scanned = 1;
901 for (i = 0; i < nr_pages; i++) { 981 for (i = 0; i < nr_pages; i++) {
902 struct page *page = pvec.pages[i]; 982 struct page *page = pvec.pages[i];
903 983
904 /* 984 /*
905 * At this point we hold neither mapping->tree_lock nor 985 * At this point, the page may be truncated or
906 * lock on the page itself: the page may be truncated or 986 * invalidated (changing page->mapping to NULL), or
907 * invalidated (changing page->mapping to NULL), or even 987 * even swizzled back from swapper_space to tmpfs file
908 * swizzled back from swapper_space to tmpfs file 988 * mapping. However, page->index will not change
909 * mapping 989 * because we have a reference on the page.
910 */ 990 */
991 if (page->index > end) {
992 /*
993 * can't be range_cyclic (1st pass) because
994 * end == -1 in that case.
995 */
996 done = 1;
997 break;
998 }
999
1000 done_index = page->index + 1;
1001
911 lock_page(page); 1002 lock_page(page);
912 1003
1004 /*
1005 * Page truncated or invalidated. We can freely skip it
1006 * then, even for data integrity operations: the page
1007 * has disappeared concurrently, so there could be no
 1008 * real expectation of this data integrity operation
1009 * even if there is now a new, dirty page at the same
1010 * pagecache address.
1011 */
913 if (unlikely(page->mapping != mapping)) { 1012 if (unlikely(page->mapping != mapping)) {
1013continue_unlock:
914 unlock_page(page); 1014 unlock_page(page);
915 continue; 1015 continue;
916 } 1016 }
917 1017
918 if (!wbc->range_cyclic && page->index > end) { 1018 if (!PageDirty(page)) {
919 done = 1; 1019 /* someone wrote it for us */
920 unlock_page(page); 1020 goto continue_unlock;
921 continue;
922 } 1021 }
923 1022
924 if (wbc->sync_mode != WB_SYNC_NONE) 1023 if (PageWriteback(page)) {
925 wait_on_page_writeback(page); 1024 if (wbc->sync_mode != WB_SYNC_NONE)
926 1025 wait_on_page_writeback(page);
927 if (PageWriteback(page) || 1026 else
928 !clear_page_dirty_for_io(page)) { 1027 goto continue_unlock;
929 unlock_page(page);
930 continue;
931 } 1028 }
932 1029
933 ret = (*writepage)(page, wbc, data); 1030 BUG_ON(PageWriteback(page));
1031 if (!clear_page_dirty_for_io(page))
1032 goto continue_unlock;
934 1033
935 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 1034 ret = (*writepage)(page, wbc, data);
936 unlock_page(page); 1035 if (unlikely(ret)) {
937 ret = 0; 1036 if (ret == AOP_WRITEPAGE_ACTIVATE) {
938 } 1037 unlock_page(page);
939 if (ret || (--nr_to_write <= 0)) 1038 ret = 0;
1039 } else {
1040 /*
1041 * done_index is set past this page,
1042 * so media errors will not choke
1043 * background writeout for the entire
1044 * file. This has consequences for
1045 * range_cyclic semantics (ie. it may
1046 * not be suitable for data integrity
1047 * writeout).
1048 */
1049 done = 1;
1050 break;
1051 }
1052 }
1053
1054 if (nr_to_write > 0)
1055 nr_to_write--;
1056 else if (wbc->sync_mode == WB_SYNC_NONE) {
1057 /*
1058 * We stop writing back only if we are not
1059 * doing integrity sync. In case of integrity
1060 * sync we have to keep going because someone
1061 * may be concurrently dirtying pages, and we
1062 * might have synced a lot of newly appeared
1063 * dirty pages, but have not synced all of the
1064 * old dirty pages.
1065 */
940 done = 1; 1066 done = 1;
1067 break;
1068 }
1069
941 if (wbc->nonblocking && bdi_write_congested(bdi)) { 1070 if (wbc->nonblocking && bdi_write_congested(bdi)) {
942 wbc->encountered_congestion = 1; 1071 wbc->encountered_congestion = 1;
943 done = 1; 1072 done = 1;
1073 break;
944 } 1074 }
945 } 1075 }
946 pagevec_release(&pvec); 1076 pagevec_release(&pvec);
947 cond_resched(); 1077 cond_resched();
948 } 1078 }
949 if (!scanned && !done) { 1079 if (!cycled) {
950 /* 1080 /*
1081 * range_cyclic:
951 * We hit the last page and there is more work to be done: wrap 1082 * We hit the last page and there is more work to be done: wrap
952 * back to the start of the file 1083 * back to the start of the file
953 */ 1084 */
954 scanned = 1; 1085 cycled = 1;
955 index = 0; 1086 index = 0;
1087 end = writeback_index - 1;
956 goto retry; 1088 goto retry;
957 } 1089 }
958 if (!wbc->no_nrwrite_index_update) { 1090 if (!wbc->no_nrwrite_index_update) {
959 if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) 1091 if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
960 mapping->writeback_index = index; 1092 mapping->writeback_index = done_index;
961 wbc->nr_to_write = nr_to_write; 1093 wbc->nr_to_write = nr_to_write;
962 } 1094 }
963 1095
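
For reference, a minimal userspace sketch of the threshold arithmetic get_dirty_limits() uses after this change: a byte-based sysctl, when non-zero, overrides the corresponding ratio, and the background threshold is clamped below the dirty threshold. The page size and the sample figures below are assumptions for illustration, not kernel values.

#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumed page size for the example */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static void dirty_limits(unsigned long available_memory,	/* in pages */
			 unsigned long vm_dirty_bytes, int vm_dirty_ratio,
			 unsigned long dirty_background_bytes,
			 int dirty_background_ratio,
			 unsigned long *pdirty, unsigned long *pbackground)
{
	unsigned long dirty, background;

	if (vm_dirty_bytes)
		dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
	else {
		int ratio = vm_dirty_ratio < 5 ? 5 : vm_dirty_ratio;
		dirty = ratio * available_memory / 100;
	}

	if (dirty_background_bytes)
		background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
	else
		background = dirty_background_ratio * available_memory / 100;

	if (background >= dirty)	/* background must stay below dirty */
		background = dirty / 2;

	*pdirty = dirty;
	*pbackground = background;
}

int main(void)
{
	unsigned long dirty, background;

	/* 1 GiB of dirtyable memory, dirty limit pinned at 64 MiB by bytes */
	dirty_limits(262144, 64UL << 20, 10, 0, 5, &dirty, &background);
	printf("dirty=%lu pages, background=%lu pages\n", dirty, background);
	return 0;
}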
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d8ac01474563..5675b3073854 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,7 +69,7 @@ EXPORT_SYMBOL(node_states);
69 69
70unsigned long totalram_pages __read_mostly; 70unsigned long totalram_pages __read_mostly;
71unsigned long totalreserve_pages __read_mostly; 71unsigned long totalreserve_pages __read_mostly;
72long nr_swap_pages; 72unsigned long highest_memmap_pfn __read_mostly;
73int percpu_pagelist_fraction; 73int percpu_pagelist_fraction;
74 74
75#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 75#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -223,19 +223,41 @@ static inline int bad_range(struct zone *zone, struct page *page)
223 223
224static void bad_page(struct page *page) 224static void bad_page(struct page *page)
225{ 225{
226 printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG 226 static unsigned long resume;
227 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 227 static unsigned long nr_shown;
228 current->comm, page, (int)(2*sizeof(unsigned long)), 228 static unsigned long nr_unshown;
229 (unsigned long)page->flags, page->mapping, 229
230 page_mapcount(page), page_count(page)); 230 /*
231 * Allow a burst of 60 reports, then keep quiet for that minute;
232 * or allow a steady drip of one report per second.
233 */
234 if (nr_shown == 60) {
235 if (time_before(jiffies, resume)) {
236 nr_unshown++;
237 goto out;
238 }
239 if (nr_unshown) {
240 printk(KERN_ALERT
241 "BUG: Bad page state: %lu messages suppressed\n",
242 nr_unshown);
243 nr_unshown = 0;
244 }
245 nr_shown = 0;
246 }
247 if (nr_shown++ == 0)
248 resume = jiffies + 60 * HZ;
249
250 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
251 current->comm, page_to_pfn(page));
252 printk(KERN_ALERT
253 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
254 page, (void *)page->flags, page_count(page),
255 page_mapcount(page), page->mapping, page->index);
231 256
232 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
233 KERN_EMERG "Backtrace:\n");
234 dump_stack(); 257 dump_stack();
235 page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; 258out:
236 set_page_count(page, 0); 259 /* Leave bad fields for debug, except PageBuddy could make trouble */
237 reset_page_mapcount(page); 260 __ClearPageBuddy(page);
238 page->mapping = NULL;
239 add_taint(TAINT_BAD_PAGE); 261 add_taint(TAINT_BAD_PAGE);
240} 262}
241 263
@@ -292,25 +314,31 @@ void prep_compound_gigantic_page(struct page *page, unsigned long order)
292} 314}
293#endif 315#endif
294 316
295static void destroy_compound_page(struct page *page, unsigned long order) 317static int destroy_compound_page(struct page *page, unsigned long order)
296{ 318{
297 int i; 319 int i;
298 int nr_pages = 1 << order; 320 int nr_pages = 1 << order;
321 int bad = 0;
299 322
300 if (unlikely(compound_order(page) != order)) 323 if (unlikely(compound_order(page) != order) ||
324 unlikely(!PageHead(page))) {
301 bad_page(page); 325 bad_page(page);
326 bad++;
327 }
302 328
303 if (unlikely(!PageHead(page)))
304 bad_page(page);
305 __ClearPageHead(page); 329 __ClearPageHead(page);
330
306 for (i = 1; i < nr_pages; i++) { 331 for (i = 1; i < nr_pages; i++) {
307 struct page *p = page + i; 332 struct page *p = page + i;
308 333
309 if (unlikely(!PageTail(p) | 334 if (unlikely(!PageTail(p) | (p->first_page != page))) {
310 (p->first_page != page)))
311 bad_page(page); 335 bad_page(page);
336 bad++;
337 }
312 __ClearPageTail(p); 338 __ClearPageTail(p);
313 } 339 }
340
341 return bad;
314} 342}
315 343
316static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 344static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
@@ -430,7 +458,8 @@ static inline void __free_one_page(struct page *page,
430 int migratetype = get_pageblock_migratetype(page); 458 int migratetype = get_pageblock_migratetype(page);
431 459
432 if (unlikely(PageCompound(page))) 460 if (unlikely(PageCompound(page)))
433 destroy_compound_page(page, order); 461 if (unlikely(destroy_compound_page(page, order)))
462 return;
434 463
435 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 464 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
436 465
@@ -467,18 +496,13 @@ static inline int free_pages_check(struct page *page)
467 if (unlikely(page_mapcount(page) | 496 if (unlikely(page_mapcount(page) |
468 (page->mapping != NULL) | 497 (page->mapping != NULL) |
469 (page_count(page) != 0) | 498 (page_count(page) != 0) |
470 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) 499 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
471 bad_page(page); 500 bad_page(page);
472 if (PageDirty(page)) 501 return 1;
473 __ClearPageDirty(page); 502 }
474 if (PageSwapBacked(page)) 503 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
475 __ClearPageSwapBacked(page); 504 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
476 /* 505 return 0;
477 * For now, we report if PG_reserved was found set, but do not
478 * clear it, and do not free the page. But we shall soon need
479 * to do more, for when the ZERO_PAGE count wraps negative.
480 */
481 return PageReserved(page);
482} 506}
483 507
484/* 508/*
@@ -523,11 +547,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
523{ 547{
524 unsigned long flags; 548 unsigned long flags;
525 int i; 549 int i;
526 int reserved = 0; 550 int bad = 0;
527 551
528 for (i = 0 ; i < (1 << order) ; ++i) 552 for (i = 0 ; i < (1 << order) ; ++i)
529 reserved += free_pages_check(page + i); 553 bad += free_pages_check(page + i);
530 if (reserved) 554 if (bad)
531 return; 555 return;
532 556
533 if (!PageHighMem(page)) { 557 if (!PageHighMem(page)) {
@@ -612,23 +636,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
612 if (unlikely(page_mapcount(page) | 636 if (unlikely(page_mapcount(page) |
613 (page->mapping != NULL) | 637 (page->mapping != NULL) |
614 (page_count(page) != 0) | 638 (page_count(page) != 0) |
615 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) 639 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
616 bad_page(page); 640 bad_page(page);
617
618 /*
619 * For now, we report if PG_reserved was found set, but do not
620 * clear it, and do not allocate the page: as a safety net.
621 */
622 if (PageReserved(page))
623 return 1; 641 return 1;
642 }
624 643
625 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
626 1 << PG_referenced | 1 << PG_arch_1 |
627 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk
628#ifdef CONFIG_UNEVICTABLE_LRU
629 | 1 << PG_mlocked
630#endif
631 );
632 set_page_private(page, 0); 644 set_page_private(page, 0);
633 set_page_refcounted(page); 645 set_page_refcounted(page);
634 646
@@ -2609,6 +2621,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2609 unsigned long pfn; 2621 unsigned long pfn;
2610 struct zone *z; 2622 struct zone *z;
2611 2623
2624 if (highest_memmap_pfn < end_pfn - 1)
2625 highest_memmap_pfn = end_pfn - 1;
2626
2612 z = &NODE_DATA(nid)->node_zones[zone]; 2627 z = &NODE_DATA(nid)->node_zones[zone];
2613 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 2628 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
2614 /* 2629 /*
@@ -3381,10 +3396,8 @@ static void __init setup_usemap(struct pglist_data *pgdat,
3381{ 3396{
3382 unsigned long usemapsize = usemap_size(zonesize); 3397 unsigned long usemapsize = usemap_size(zonesize);
3383 zone->pageblock_flags = NULL; 3398 zone->pageblock_flags = NULL;
3384 if (usemapsize) { 3399 if (usemapsize)
3385 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); 3400 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
3386 memset(zone->pageblock_flags, 0, usemapsize);
3387 }
3388} 3401}
3389#else 3402#else
3390static void inline setup_usemap(struct pglist_data *pgdat, 3403static void inline setup_usemap(struct pglist_data *pgdat,
@@ -3469,9 +3482,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3469 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 3482 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
3470 if (realsize >= memmap_pages) { 3483 if (realsize >= memmap_pages) {
3471 realsize -= memmap_pages; 3484 realsize -= memmap_pages;
3472 printk(KERN_DEBUG 3485 if (memmap_pages)
3473 " %s zone: %lu pages used for memmap\n", 3486 printk(KERN_DEBUG
3474 zone_names[j], memmap_pages); 3487 " %s zone: %lu pages used for memmap\n",
3488 zone_names[j], memmap_pages);
3475 } else 3489 } else
3476 printk(KERN_WARNING 3490 printk(KERN_WARNING
3477 " %s zone: %lu pages exceeds realsize %lu\n", 3491 " %s zone: %lu pages exceeds realsize %lu\n",
@@ -3509,10 +3523,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3509 INIT_LIST_HEAD(&zone->lru[l].list); 3523 INIT_LIST_HEAD(&zone->lru[l].list);
3510 zone->lru[l].nr_scan = 0; 3524 zone->lru[l].nr_scan = 0;
3511 } 3525 }
3512 zone->recent_rotated[0] = 0; 3526 zone->reclaim_stat.recent_rotated[0] = 0;
3513 zone->recent_rotated[1] = 0; 3527 zone->reclaim_stat.recent_rotated[1] = 0;
3514 zone->recent_scanned[0] = 0; 3528 zone->reclaim_stat.recent_scanned[0] = 0;
3515 zone->recent_scanned[1] = 0; 3529 zone->reclaim_stat.recent_scanned[1] = 0;
3516 zap_zone_vm_stats(zone); 3530 zap_zone_vm_stats(zone);
3517 zone->flags = 0; 3531 zone->flags = 0;
3518 if (!size) 3532 if (!size)
@@ -4316,7 +4330,7 @@ void setup_per_zone_pages_min(void)
4316 * 1TB 101 10GB 4330 * 1TB 101 10GB
4317 * 10TB 320 32GB 4331 * 10TB 320 32GB
4318 */ 4332 */
4319void setup_per_zone_inactive_ratio(void) 4333static void setup_per_zone_inactive_ratio(void)
4320{ 4334{
4321 struct zone *zone; 4335 struct zone *zone;
4322 4336
@@ -4573,19 +4587,6 @@ void *__init alloc_large_system_hash(const char *tablename,
4573 return table; 4587 return table;
4574} 4588}
4575 4589
4576#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
4577struct page *pfn_to_page(unsigned long pfn)
4578{
4579 return __pfn_to_page(pfn);
4580}
4581unsigned long page_to_pfn(struct page *page)
4582{
4583 return __page_to_pfn(page);
4584}
4585EXPORT_SYMBOL(pfn_to_page);
4586EXPORT_SYMBOL(page_to_pfn);
4587#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
4588
4589/* Return a pointer to the bitmap storing bits affecting a block of pages */ 4590/* Return a pointer to the bitmap storing bits affecting a block of pages */
4590static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 4591static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
4591 unsigned long pfn) 4592 unsigned long pfn)
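
The reworked bad_page() throttles its reports: a burst of up to 60 messages, then silence (with a suppressed-message count) until the minute window expires. Below is a rough userspace sketch of that burst-then-drip pattern, using time() and printf() in place of jiffies and printk().

#include <stdio.h>
#include <time.h>

static void report_bad_page(unsigned long pfn)
{
	static time_t resume;
	static unsigned long nr_shown, nr_unshown;
	time_t now = time(NULL);

	if (nr_shown == 60) {
		if (now < resume) {		/* still inside the quiet window */
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			printf("BUG: Bad page state: %lu messages suppressed\n",
			       nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = now + 60;		/* open a fresh one-minute window */

	printf("BUG: Bad page state, pfn:%05lx\n", pfn);
}

int main(void)
{
	for (unsigned long pfn = 0; pfn < 200; pfn++)
		report_bad_page(pfn);		/* only the first 60 are printed */
	return 0;
}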
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index ab27ff750519..7006a11350c8 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -8,6 +8,7 @@
8#include <linux/memory.h> 8#include <linux/memory.h>
9#include <linux/vmalloc.h> 9#include <linux/vmalloc.h>
10#include <linux/cgroup.h> 10#include <linux/cgroup.h>
11#include <linux/swapops.h>
11 12
12static void __meminit 13static void __meminit
13__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) 14__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
@@ -15,6 +16,7 @@ __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
15 pc->flags = 0; 16 pc->flags = 0;
16 pc->mem_cgroup = NULL; 17 pc->mem_cgroup = NULL;
17 pc->page = pfn_to_page(pfn); 18 pc->page = pfn_to_page(pfn);
19 INIT_LIST_HEAD(&pc->lru);
18} 20}
19static unsigned long total_usage; 21static unsigned long total_usage;
20 22
@@ -72,7 +74,7 @@ void __init page_cgroup_init(void)
72 74
73 int nid, fail; 75 int nid, fail;
74 76
75 if (mem_cgroup_subsys.disabled) 77 if (mem_cgroup_disabled())
76 return; 78 return;
77 79
78 for_each_online_node(nid) { 80 for_each_online_node(nid) {
@@ -101,15 +103,13 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
101} 103}
102 104
103/* __alloc_bootmem...() is protected by !slab_available() */ 105/* __alloc_bootmem...() is protected by !slab_available() */
104int __init_refok init_section_page_cgroup(unsigned long pfn) 106static int __init_refok init_section_page_cgroup(unsigned long pfn)
105{ 107{
106 struct mem_section *section; 108 struct mem_section *section = __pfn_to_section(pfn);
107 struct page_cgroup *base, *pc; 109 struct page_cgroup *base, *pc;
108 unsigned long table_size; 110 unsigned long table_size;
109 int nid, index; 111 int nid, index;
110 112
111 section = __pfn_to_section(pfn);
112
113 if (!section->page_cgroup) { 113 if (!section->page_cgroup) {
114 nid = page_to_nid(pfn_to_page(pfn)); 114 nid = page_to_nid(pfn_to_page(pfn));
115 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; 115 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
@@ -145,7 +145,6 @@ int __init_refok init_section_page_cgroup(unsigned long pfn)
145 __init_page_cgroup(pc, pfn + index); 145 __init_page_cgroup(pc, pfn + index);
146 } 146 }
147 147
148 section = __pfn_to_section(pfn);
149 section->page_cgroup = base - pfn; 148 section->page_cgroup = base - pfn;
150 total_usage += table_size; 149 total_usage += table_size;
151 return 0; 150 return 0;
@@ -248,7 +247,7 @@ void __init page_cgroup_init(void)
248 unsigned long pfn; 247 unsigned long pfn;
249 int fail = 0; 248 int fail = 0;
250 249
251 if (mem_cgroup_subsys.disabled) 250 if (mem_cgroup_disabled())
252 return; 251 return;
253 252
254 for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { 253 for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
@@ -273,3 +272,199 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
273} 272}
274 273
275#endif 274#endif
275
276
277#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
278
279static DEFINE_MUTEX(swap_cgroup_mutex);
280struct swap_cgroup_ctrl {
281 struct page **map;
282 unsigned long length;
283};
284
285struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
286
287/*
 288 * These 8 bytes seem big; maybe we can reduce this once we can use an "id" for
 289 * the cgroup rather than a pointer.
290 */
291struct swap_cgroup {
292 struct mem_cgroup *val;
293};
294#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
295#define SC_POS_MASK (SC_PER_PAGE - 1)
296
297/*
298 * SwapCgroup implements "lookup" and "exchange" operations.
299 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
300 * against SwapCache. At swap_free(), this is accessed directly from swap.
301 *
302 * This means,
303 * - we have no race in "exchange" when we're accessed via SwapCache because
304 * SwapCache(and its swp_entry) is under lock.
305 * - When called via swap_free(), there is no user of this entry and no race.
306 * Then, we don't need lock around "exchange".
307 *
308 * TODO: we can push these buffers out to HIGHMEM.
309 */
310
311/*
312 * allocate buffer for swap_cgroup.
313 */
314static int swap_cgroup_prepare(int type)
315{
316 struct page *page;
317 struct swap_cgroup_ctrl *ctrl;
318 unsigned long idx, max;
319
320 if (!do_swap_account)
321 return 0;
322 ctrl = &swap_cgroup_ctrl[type];
323
324 for (idx = 0; idx < ctrl->length; idx++) {
325 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
326 if (!page)
327 goto not_enough_page;
328 ctrl->map[idx] = page;
329 }
330 return 0;
331not_enough_page:
332 max = idx;
333 for (idx = 0; idx < max; idx++)
334 __free_page(ctrl->map[idx]);
335
336 return -ENOMEM;
337}
338
339/**
340 * swap_cgroup_record - record mem_cgroup for this swp_entry.
341 * @ent: swap entry to be recorded into
342 * @mem: mem_cgroup to be recorded
343 *
 344 * Returns the old value on success, NULL on failure.
345 * (Of course, old value can be NULL.)
346 */
347struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
348{
349 int type = swp_type(ent);
350 unsigned long offset = swp_offset(ent);
351 unsigned long idx = offset / SC_PER_PAGE;
352 unsigned long pos = offset & SC_POS_MASK;
353 struct swap_cgroup_ctrl *ctrl;
354 struct page *mappage;
355 struct swap_cgroup *sc;
356 struct mem_cgroup *old;
357
358 if (!do_swap_account)
359 return NULL;
360
361 ctrl = &swap_cgroup_ctrl[type];
362
363 mappage = ctrl->map[idx];
364 sc = page_address(mappage);
365 sc += pos;
366 old = sc->val;
367 sc->val = mem;
368
369 return old;
370}
371
372/**
373 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
374 * @ent: swap entry to be looked up.
375 *
 376 * Returns a pointer to the mem_cgroup on success, NULL on failure.
377 */
378struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent)
379{
380 int type = swp_type(ent);
381 unsigned long offset = swp_offset(ent);
382 unsigned long idx = offset / SC_PER_PAGE;
383 unsigned long pos = offset & SC_POS_MASK;
384 struct swap_cgroup_ctrl *ctrl;
385 struct page *mappage;
386 struct swap_cgroup *sc;
387 struct mem_cgroup *ret;
388
389 if (!do_swap_account)
390 return NULL;
391
392 ctrl = &swap_cgroup_ctrl[type];
393 mappage = ctrl->map[idx];
394 sc = page_address(mappage);
395 sc += pos;
396 ret = sc->val;
397 return ret;
398}
399
400int swap_cgroup_swapon(int type, unsigned long max_pages)
401{
402 void *array;
403 unsigned long array_size;
404 unsigned long length;
405 struct swap_cgroup_ctrl *ctrl;
406
407 if (!do_swap_account)
408 return 0;
409
410 length = ((max_pages/SC_PER_PAGE) + 1);
411 array_size = length * sizeof(void *);
412
413 array = vmalloc(array_size);
414 if (!array)
415 goto nomem;
416
417 memset(array, 0, array_size);
418 ctrl = &swap_cgroup_ctrl[type];
419 mutex_lock(&swap_cgroup_mutex);
420 ctrl->length = length;
421 ctrl->map = array;
422 if (swap_cgroup_prepare(type)) {
423 /* memory shortage */
424 ctrl->map = NULL;
425 ctrl->length = 0;
426 vfree(array);
427 mutex_unlock(&swap_cgroup_mutex);
428 goto nomem;
429 }
430 mutex_unlock(&swap_cgroup_mutex);
431
432 printk(KERN_INFO
433 "swap_cgroup: uses %ld bytes of vmalloc for pointer array space"
434 " and %ld bytes to hold mem_cgroup pointers on swap\n",
435 array_size, length * PAGE_SIZE);
436 printk(KERN_INFO
437 "swap_cgroup can be disabled by noswapaccount boot option.\n");
438
439 return 0;
440nomem:
441 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
442 printk(KERN_INFO
443 "swap_cgroup can be disabled by noswapaccount boot option\n");
444 return -ENOMEM;
445}
446
447void swap_cgroup_swapoff(int type)
448{
449 int i;
450 struct swap_cgroup_ctrl *ctrl;
451
452 if (!do_swap_account)
453 return;
454
455 mutex_lock(&swap_cgroup_mutex);
456 ctrl = &swap_cgroup_ctrl[type];
457 if (ctrl->map) {
458 for (i = 0; i < ctrl->length; i++) {
459 struct page *page = ctrl->map[i];
460 if (page)
461 __free_page(page);
462 }
463 vfree(ctrl->map);
464 ctrl->map = NULL;
465 ctrl->length = 0;
466 }
467 mutex_unlock(&swap_cgroup_mutex);
468}
469
470#endif
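
The new swap_cgroup map is a simple two-level structure: an array of pages per swap type, each page holding a fixed number of per-entry slots. A small sketch of the index arithmetic follows, assuming a 4096-byte page; the mask form of the slot lookup only works because SC_PER_PAGE is a power of two.

#include <stdio.h>

struct swap_cgroup {
	void *val;			/* stands in for the mem_cgroup pointer */
};

#define PAGE_SIZE	4096UL
#define SC_PER_PAGE	(PAGE_SIZE / sizeof(struct swap_cgroup))
#define SC_POS_MASK	(SC_PER_PAGE - 1)

int main(void)
{
	unsigned long offset = 123456;			/* swp_offset() of some entry */
	unsigned long idx = offset / SC_PER_PAGE;	/* which page in ctrl->map[] */
	unsigned long pos = offset & SC_POS_MASK;	/* slot within that page */

	printf("offset %lu -> map page %lu, slot %lu of %lu\n",
	       offset, idx, pos, (unsigned long)SC_PER_PAGE);
	return 0;
}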
diff --git a/mm/page_io.c b/mm/page_io.c
index 065c4480eaf0..dc6ce0afbded 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -98,7 +98,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
98 struct bio *bio; 98 struct bio *bio;
99 int ret = 0, rw = WRITE; 99 int ret = 0, rw = WRITE;
100 100
101 if (remove_exclusive_swap_page(page)) { 101 if (try_to_free_swap(page)) {
102 unlock_page(page); 102 unlock_page(page);
103 goto out; 103 goto out;
104 } 104 }
@@ -125,8 +125,8 @@ int swap_readpage(struct file *file, struct page *page)
125 struct bio *bio; 125 struct bio *bio;
126 int ret = 0; 126 int ret = 0;
127 127
128 BUG_ON(!PageLocked(page)); 128 VM_BUG_ON(!PageLocked(page));
129 BUG_ON(PageUptodate(page)); 129 VM_BUG_ON(PageUptodate(page));
130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page, 130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
131 end_swap_bio_read); 131 end_swap_bio_read);
132 if (bio == NULL) { 132 if (bio == NULL) {
diff --git a/mm/pdflush.c b/mm/pdflush.c
index a0a14c4d5072..15de509b68fd 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -172,7 +172,16 @@ static int __pdflush(struct pdflush_work *my_work)
172static int pdflush(void *dummy) 172static int pdflush(void *dummy)
173{ 173{
174 struct pdflush_work my_work; 174 struct pdflush_work my_work;
175 cpumask_t cpus_allowed; 175 cpumask_var_t cpus_allowed;
176
177 /*
178 * Since the caller doesn't even check kthread_run() worked, let's not
179 * freak out too much if this fails.
180 */
181 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
182 printk(KERN_WARNING "pdflush failed to allocate cpumask\n");
183 return 0;
184 }
176 185
177 /* 186 /*
178 * pdflush can spend a lot of time doing encryption via dm-crypt. We 187 * pdflush can spend a lot of time doing encryption via dm-crypt. We
@@ -187,8 +196,9 @@ static int pdflush(void *dummy)
187 * This is needed as pdflush's are dynamically created and destroyed. 196 * This is needed as pdflush's are dynamically created and destroyed.
188 * The boottime pdflush's are easily placed w/o these 2 lines. 197 * The boottime pdflush's are easily placed w/o these 2 lines.
189 */ 198 */
190 cpuset_cpus_allowed(current, &cpus_allowed); 199 cpuset_cpus_allowed(current, cpus_allowed);
191 set_cpus_allowed_ptr(current, &cpus_allowed); 200 set_cpus_allowed_ptr(current, cpus_allowed);
201 free_cpumask_var(cpus_allowed);
192 202
193 return __pdflush(&my_work); 203 return __pdflush(&my_work);
194} 204}
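
The pdflush change moves the cpumask off the thread's stack onto the heap and tolerates allocation failure. Here is a hedged userspace analogue of that allocate-check-use-free pattern; NR_CPUS, MASK_BYTES and apply_affinity() are stand-ins, not kernel API.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_CPUS 4096
#define MASK_BYTES (NR_CPUS / 8)

static void apply_affinity(const unsigned char *mask)
{
	/* placeholder for a set_cpus_allowed_ptr()-style consumer of the mask */
	(void)mask;
}

int main(void)
{
	unsigned char *cpus_allowed = calloc(1, MASK_BYTES);

	if (!cpus_allowed) {
		fprintf(stderr, "failed to allocate cpumask\n");
		return 0;	/* degrade gracefully instead of crashing */
	}

	memset(cpus_allowed, 0xff, MASK_BYTES);	/* pretend all CPUs are allowed */
	apply_affinity(cpus_allowed);
	free(cpus_allowed);			/* mask is not needed past this point */
	return 0;
}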
diff --git a/mm/rmap.c b/mm/rmap.c
index 10993942d6c9..ac4af8cffbf9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -47,9 +47,9 @@
47#include <linux/rmap.h> 47#include <linux/rmap.h>
48#include <linux/rcupdate.h> 48#include <linux/rcupdate.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kallsyms.h>
51#include <linux/memcontrol.h> 50#include <linux/memcontrol.h>
52#include <linux/mmu_notifier.h> 51#include <linux/mmu_notifier.h>
52#include <linux/migrate.h>
53 53
54#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
55 55
@@ -191,7 +191,7 @@ void __init anon_vma_init(void)
191 * Getting a lock on a stable anon_vma from a page off the LRU is 191 * Getting a lock on a stable anon_vma from a page off the LRU is
192 * tricky: page_lock_anon_vma rely on RCU to guard against the races. 192 * tricky: page_lock_anon_vma rely on RCU to guard against the races.
193 */ 193 */
194struct anon_vma *page_lock_anon_vma(struct page *page) 194static struct anon_vma *page_lock_anon_vma(struct page *page)
195{ 195{
196 struct anon_vma *anon_vma; 196 struct anon_vma *anon_vma;
197 unsigned long anon_mapping; 197 unsigned long anon_mapping;
@@ -211,7 +211,7 @@ out:
211 return NULL; 211 return NULL;
212} 212}
213 213
214void page_unlock_anon_vma(struct anon_vma *anon_vma) 214static void page_unlock_anon_vma(struct anon_vma *anon_vma)
215{ 215{
216 spin_unlock(&anon_vma->lock); 216 spin_unlock(&anon_vma->lock);
217 rcu_read_unlock(); 217 rcu_read_unlock();
@@ -359,8 +359,17 @@ static int page_referenced_one(struct page *page,
359 goto out_unmap; 359 goto out_unmap;
360 } 360 }
361 361
362 if (ptep_clear_flush_young_notify(vma, address, pte)) 362 if (ptep_clear_flush_young_notify(vma, address, pte)) {
363 referenced++; 363 /*
364 * Don't treat a reference through a sequentially read
365 * mapping as such. If the page has been used in
366 * another mapping, we will catch it; if this other
367 * mapping is already gone, the unmap path will have
368 * set PG_referenced or activated the page.
369 */
370 if (likely(!VM_SequentialReadHint(vma)))
371 referenced++;
372 }
364 373
365 /* Pretend the page is referenced if the task has the 374 /* Pretend the page is referenced if the task has the
366 swap token and is in the middle of a page fault. */ 375 swap token and is in the middle of a page fault. */
@@ -661,9 +670,14 @@ void page_add_anon_rmap(struct page *page,
661void page_add_new_anon_rmap(struct page *page, 670void page_add_new_anon_rmap(struct page *page,
662 struct vm_area_struct *vma, unsigned long address) 671 struct vm_area_struct *vma, unsigned long address)
663{ 672{
664 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 673 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
665 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ 674 SetPageSwapBacked(page);
675 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
666 __page_set_anon_rmap(page, vma, address); 676 __page_set_anon_rmap(page, vma, address);
677 if (page_evictable(page, vma))
678 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
679 else
680 add_page_to_unevictable_list(page);
667} 681}
668 682
669/** 683/**
@@ -693,7 +707,6 @@ void page_add_file_rmap(struct page *page)
693 */ 707 */
694void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) 708void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
695{ 709{
696 BUG_ON(page_mapcount(page) == 0);
697 if (PageAnon(page)) 710 if (PageAnon(page))
698 __page_check_anon_rmap(page, vma, address); 711 __page_check_anon_rmap(page, vma, address);
699 atomic_inc(&page->_mapcount); 712 atomic_inc(&page->_mapcount);
@@ -703,28 +716,12 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long
703/** 716/**
704 * page_remove_rmap - take down pte mapping from a page 717 * page_remove_rmap - take down pte mapping from a page
705 * @page: page to remove mapping from 718 * @page: page to remove mapping from
706 * @vma: the vm area in which the mapping is removed
707 * 719 *
708 * The caller needs to hold the pte lock. 720 * The caller needs to hold the pte lock.
709 */ 721 */
710void page_remove_rmap(struct page *page, struct vm_area_struct *vma) 722void page_remove_rmap(struct page *page)
711{ 723{
712 if (atomic_add_negative(-1, &page->_mapcount)) { 724 if (atomic_add_negative(-1, &page->_mapcount)) {
713 if (unlikely(page_mapcount(page) < 0)) {
714 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
715 printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page));
716 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
717 printk (KERN_EMERG " page->count = %x\n", page_count(page));
718 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
719 print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
720 if (vma->vm_ops) {
721 print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
722 }
723 if (vma->vm_file && vma->vm_file->f_op)
724 print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
725 BUG();
726 }
727
728 /* 725 /*
729 * Now that the last pte has gone, s390 must transfer dirty 726 * Now that the last pte has gone, s390 must transfer dirty
730 * flag from storage key to struct page. We can usually skip 727 * flag from storage key to struct page. We can usually skip
@@ -818,8 +815,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
818 spin_unlock(&mmlist_lock); 815 spin_unlock(&mmlist_lock);
819 } 816 }
820 dec_mm_counter(mm, anon_rss); 817 dec_mm_counter(mm, anon_rss);
821#ifdef CONFIG_MIGRATION 818 } else if (PAGE_MIGRATION) {
822 } else {
823 /* 819 /*
824 * Store the pfn of the page in a special migration 820 * Store the pfn of the page in a special migration
825 * pte. do_swap_page() will wait until the migration 821 * pte. do_swap_page() will wait until the migration
@@ -827,23 +823,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
827 */ 823 */
828 BUG_ON(!migration); 824 BUG_ON(!migration);
829 entry = make_migration_entry(page, pte_write(pteval)); 825 entry = make_migration_entry(page, pte_write(pteval));
830#endif
831 } 826 }
832 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 827 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
833 BUG_ON(pte_file(*pte)); 828 BUG_ON(pte_file(*pte));
834 } else 829 } else if (PAGE_MIGRATION && migration) {
835#ifdef CONFIG_MIGRATION
836 if (migration) {
837 /* Establish migration entry for a file page */ 830 /* Establish migration entry for a file page */
838 swp_entry_t entry; 831 swp_entry_t entry;
839 entry = make_migration_entry(page, pte_write(pteval)); 832 entry = make_migration_entry(page, pte_write(pteval));
840 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 833 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
841 } else 834 } else
842#endif
843 dec_mm_counter(mm, file_rss); 835 dec_mm_counter(mm, file_rss);
844 836
845 837
846 page_remove_rmap(page, vma); 838 page_remove_rmap(page);
847 page_cache_release(page); 839 page_cache_release(page);
848 840
849out_unmap: 841out_unmap:
@@ -958,7 +950,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
958 if (pte_dirty(pteval)) 950 if (pte_dirty(pteval))
959 set_page_dirty(page); 951 set_page_dirty(page);
960 952
961 page_remove_rmap(page, vma); 953 page_remove_rmap(page);
962 page_cache_release(page); 954 page_cache_release(page);
963 dec_mm_counter(mm, file_rss); 955 dec_mm_counter(mm, file_rss);
964 (*mapcount)--; 956 (*mapcount)--;
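
In try_to_unmap_one() the #ifdef CONFIG_MIGRATION blocks are replaced by tests on the compile-time constant PAGE_MIGRATION, so the dead branch is still type-checked and then discarded by the compiler. A tiny sketch of the pattern; ENABLE_FEATURE is an illustrative stand-in for such a config-derived constant.

#include <stdio.h>

#define ENABLE_FEATURE 0	/* would come from the build configuration */

static void handle(int migration)
{
	if (ENABLE_FEATURE && migration)
		printf("install migration entry\n");	/* compiled out when 0 */
	else
		printf("drop mapping normally\n");
}

int main(void)
{
	handle(1);
	return 0;
}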
diff --git a/mm/shmem.c b/mm/shmem.c
index dd5588f5d939..75199888a6bd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -14,31 +14,39 @@
14 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> 14 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
15 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> 15 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
16 * 16 *
17 * tiny-shmem:
18 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
19 *
17 * This file is released under the GPL. 20 * This file is released under the GPL.
18 */ 21 */
19 22
23#include <linux/fs.h>
24#include <linux/init.h>
25#include <linux/vfs.h>
26#include <linux/mount.h>
27#include <linux/file.h>
28#include <linux/mm.h>
29#include <linux/module.h>
30#include <linux/swap.h>
31
32static struct vfsmount *shm_mnt;
33
34#ifdef CONFIG_SHMEM
20/* 35/*
21 * This virtual memory filesystem is heavily based on the ramfs. It 36 * This virtual memory filesystem is heavily based on the ramfs. It
22 * extends ramfs by the ability to use swap and honor resource limits 37 * extends ramfs by the ability to use swap and honor resource limits
23 * which makes it a completely usable filesystem. 38 * which makes it a completely usable filesystem.
24 */ 39 */
25 40
26#include <linux/module.h>
27#include <linux/init.h>
28#include <linux/fs.h>
29#include <linux/xattr.h> 41#include <linux/xattr.h>
30#include <linux/exportfs.h> 42#include <linux/exportfs.h>
31#include <linux/generic_acl.h> 43#include <linux/generic_acl.h>
32#include <linux/mm.h>
33#include <linux/mman.h> 44#include <linux/mman.h>
34#include <linux/file.h>
35#include <linux/swap.h>
36#include <linux/pagemap.h> 45#include <linux/pagemap.h>
37#include <linux/string.h> 46#include <linux/string.h>
38#include <linux/slab.h> 47#include <linux/slab.h>
39#include <linux/backing-dev.h> 48#include <linux/backing-dev.h>
40#include <linux/shmem_fs.h> 49#include <linux/shmem_fs.h>
41#include <linux/mount.h>
42#include <linux/writeback.h> 50#include <linux/writeback.h>
43#include <linux/vfs.h> 51#include <linux/vfs.h>
44#include <linux/blkdev.h> 52#include <linux/blkdev.h>
@@ -921,7 +929,11 @@ found:
921 error = 1; 929 error = 1;
922 if (!inode) 930 if (!inode)
923 goto out; 931 goto out;
924 /* Precharge page using GFP_KERNEL while we can wait */ 932 /*
933 * Charge page using GFP_KERNEL while we can wait.
 934 * Charged back to the user (not to the caller) when swap accounting is used.
935 * add_to_page_cache() will be called with GFP_NOWAIT.
936 */
925 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 937 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
926 if (error) 938 if (error)
927 goto out; 939 goto out;
@@ -1313,15 +1325,19 @@ repeat:
1313 } else { 1325 } else {
1314 shmem_swp_unmap(entry); 1326 shmem_swp_unmap(entry);
1315 spin_unlock(&info->lock); 1327 spin_unlock(&info->lock);
1316 unlock_page(swappage);
1317 page_cache_release(swappage);
1318 if (error == -ENOMEM) { 1328 if (error == -ENOMEM) {
1319 /* allow reclaim from this memory cgroup */ 1329 /* allow reclaim from this memory cgroup */
1320 error = mem_cgroup_shrink_usage(current->mm, 1330 error = mem_cgroup_shrink_usage(swappage,
1331 current->mm,
1321 gfp); 1332 gfp);
1322 if (error) 1333 if (error) {
1334 unlock_page(swappage);
1335 page_cache_release(swappage);
1323 goto failed; 1336 goto failed;
1337 }
1324 } 1338 }
1339 unlock_page(swappage);
1340 page_cache_release(swappage);
1325 goto repeat; 1341 goto repeat;
1326 } 1342 }
1327 } else if (sgp == SGP_READ && !filepage) { 1343 } else if (sgp == SGP_READ && !filepage) {
@@ -1372,7 +1388,7 @@ repeat:
1372 1388
1373 /* Precharge page while we can wait, compensate after */ 1389 /* Precharge page while we can wait, compensate after */
1374 error = mem_cgroup_cache_charge(filepage, current->mm, 1390 error = mem_cgroup_cache_charge(filepage, current->mm,
1375 gfp & ~__GFP_HIGHMEM); 1391 GFP_KERNEL);
1376 if (error) { 1392 if (error) {
1377 page_cache_release(filepage); 1393 page_cache_release(filepage);
1378 shmem_unacct_blocks(info->flags, 1); 1394 shmem_unacct_blocks(info->flags, 1);
@@ -1445,7 +1461,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1445 if (error) 1461 if (error)
1446 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1462 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1447 1463
1448 mark_page_accessed(vmf->page);
1449 return ret | VM_FAULT_LOCKED; 1464 return ret | VM_FAULT_LOCKED;
1450} 1465}
1451 1466
@@ -2487,7 +2502,6 @@ static struct file_system_type tmpfs_fs_type = {
2487 .get_sb = shmem_get_sb, 2502 .get_sb = shmem_get_sb,
2488 .kill_sb = kill_litter_super, 2503 .kill_sb = kill_litter_super,
2489}; 2504};
2490static struct vfsmount *shm_mnt;
2491 2505
2492static int __init init_tmpfs(void) 2506static int __init init_tmpfs(void)
2493{ 2507{
@@ -2526,7 +2540,51 @@ out4:
2526 shm_mnt = ERR_PTR(error); 2540 shm_mnt = ERR_PTR(error);
2527 return error; 2541 return error;
2528} 2542}
2529module_init(init_tmpfs) 2543
2544#else /* !CONFIG_SHMEM */
2545
2546/*
2547 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
2548 *
 2549 * This is intended for small systems where the benefits of the full
2550 * shmem code (swap-backed and resource-limited) are outweighed by
2551 * their complexity. On systems without swap this code should be
2552 * effectively equivalent, but much lighter weight.
2553 */
2554
2555#include <linux/ramfs.h>
2556
2557static struct file_system_type tmpfs_fs_type = {
2558 .name = "tmpfs",
2559 .get_sb = ramfs_get_sb,
2560 .kill_sb = kill_litter_super,
2561};
2562
2563static int __init init_tmpfs(void)
2564{
2565 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
2566
2567 shm_mnt = kern_mount(&tmpfs_fs_type);
2568 BUG_ON(IS_ERR(shm_mnt));
2569
2570 return 0;
2571}
2572
2573int shmem_unuse(swp_entry_t entry, struct page *page)
2574{
2575 return 0;
2576}
2577
2578#define shmem_file_operations ramfs_file_operations
2579#define shmem_vm_ops generic_file_vm_ops
2580#define shmem_get_inode ramfs_get_inode
2581#define shmem_acct_size(a, b) 0
2582#define shmem_unacct_size(a, b) do {} while (0)
2583#define SHMEM_MAX_BYTES LLONG_MAX
2584
2585#endif /* CONFIG_SHMEM */
2586
2587/* common code */
2530 2588
2531/** 2589/**
2532 * shmem_file_setup - get an unlinked file living in tmpfs 2590 * shmem_file_setup - get an unlinked file living in tmpfs
@@ -2570,12 +2628,20 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2570 if (!inode) 2628 if (!inode)
2571 goto close_file; 2629 goto close_file;
2572 2630
2573 SHMEM_I(inode)->flags = flags & VM_ACCOUNT; 2631#ifdef CONFIG_SHMEM
2632 SHMEM_I(inode)->flags = (flags & VM_NORESERVE) ? 0 : VM_ACCOUNT;
2633#endif
2574 d_instantiate(dentry, inode); 2634 d_instantiate(dentry, inode);
2575 inode->i_size = size; 2635 inode->i_size = size;
2576 inode->i_nlink = 0; /* It is unlinked */ 2636 inode->i_nlink = 0; /* It is unlinked */
2577 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, 2637 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
2578 &shmem_file_operations); 2638 &shmem_file_operations);
2639
2640#ifndef CONFIG_MMU
2641 error = ramfs_nommu_expand_for_mapping(inode, size);
2642 if (error)
2643 goto close_file;
2644#endif
2579 return file; 2645 return file;
2580 2646
2581close_file: 2647close_file:
@@ -2608,3 +2674,5 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2608 vma->vm_ops = &shmem_vm_ops; 2674 vma->vm_ops = &shmem_vm_ops;
2609 return 0; 2675 return 0;
2610} 2676}
2677
2678module_init(init_tmpfs)
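
With tiny-shmem folded into shmem.c, the !CONFIG_SHMEM build keeps the same entry points by #defining them to ramfs equivalents, so the common code at the bottom of the file compiles either way. A minimal sketch of that aliasing pattern, with hypothetical names rather than the real kernel symbols:

#include <stdio.h>

#define HAVE_FULL_SHMEM 0	/* flip to 1 to select the full implementation */

#if HAVE_FULL_SHMEM
static int full_setup(long size)  { printf("swap-backed file, %ld bytes\n", size); return 0; }
#define shmem_setup full_setup
#else
static int ramfs_setup(long size) { printf("ramfs-backed file, %ld bytes\n", size); return 0; }
#define shmem_setup ramfs_setup
#endif

/* common code: callers use one name regardless of the configuration */
int main(void)
{
	return shmem_setup(4096);
}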
diff --git a/mm/slab.c b/mm/slab.c
index f97e564bdf11..ddc41f337d58 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2157,7 +2157,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2157 2157
2158 /* 2158 /*
2159 * We use cache_chain_mutex to ensure a consistent view of 2159 * We use cache_chain_mutex to ensure a consistent view of
2160 * cpu_online_map as well. Please see cpuup_callback 2160 * cpu_online_mask as well. Please see cpuup_callback
2161 */ 2161 */
2162 get_online_cpus(); 2162 get_online_cpus();
2163 mutex_lock(&cache_chain_mutex); 2163 mutex_lock(&cache_chain_mutex);
diff --git a/mm/slub.c b/mm/slub.c
index 6cb7ad107852..bdc9abb08a23 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1970,7 +1970,7 @@ static DEFINE_PER_CPU(struct kmem_cache_cpu,
1970 kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; 1970 kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
1971 1971
1972static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); 1972static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
1973static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE; 1973static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
1974 1974
1975static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, 1975static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
1976 int cpu, gfp_t flags) 1976 int cpu, gfp_t flags)
@@ -1996,7 +1996,7 @@ static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
1996static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) 1996static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
1997{ 1997{
1998 if (c < per_cpu(kmem_cache_cpu, cpu) || 1998 if (c < per_cpu(kmem_cache_cpu, cpu) ||
1999 c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { 1999 c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
2000 kfree(c); 2000 kfree(c);
2001 return; 2001 return;
2002 } 2002 }
@@ -2045,13 +2045,13 @@ static void init_alloc_cpu_cpu(int cpu)
2045{ 2045{
2046 int i; 2046 int i;
2047 2047
2048 if (cpu_isset(cpu, kmem_cach_cpu_free_init_once)) 2048 if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)))
2049 return; 2049 return;
2050 2050
2051 for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) 2051 for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
2052 free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); 2052 free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
2053 2053
2054 cpu_set(cpu, kmem_cach_cpu_free_init_once); 2054 cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once));
2055} 2055}
2056 2056
2057static void __init init_alloc_cpu(void) 2057static void __init init_alloc_cpu(void)
@@ -2254,7 +2254,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2254 * Add some empty padding so that we can catch 2254 * Add some empty padding so that we can catch
2255 * overwrites from earlier objects rather than let 2255 * overwrites from earlier objects rather than let
2256 * tracking information or the free pointer be 2256 * tracking information or the free pointer be
2257 * corrupted if an user writes before the start 2257 * corrupted if a user writes before the start
2258 * of the object. 2258 * of the object.
2259 */ 2259 */
2260 size += sizeof(void *); 2260 size += sizeof(void *);
@@ -3451,7 +3451,7 @@ struct location {
3451 long max_time; 3451 long max_time;
3452 long min_pid; 3452 long min_pid;
3453 long max_pid; 3453 long max_pid;
3454 cpumask_t cpus; 3454 DECLARE_BITMAP(cpus, NR_CPUS);
3455 nodemask_t nodes; 3455 nodemask_t nodes;
3456}; 3456};
3457 3457
@@ -3526,7 +3526,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
3526 if (track->pid > l->max_pid) 3526 if (track->pid > l->max_pid)
3527 l->max_pid = track->pid; 3527 l->max_pid = track->pid;
3528 3528
3529 cpu_set(track->cpu, l->cpus); 3529 cpumask_set_cpu(track->cpu,
3530 to_cpumask(l->cpus));
3530 } 3531 }
3531 node_set(page_to_nid(virt_to_page(track)), l->nodes); 3532 node_set(page_to_nid(virt_to_page(track)), l->nodes);
3532 return 1; 3533 return 1;
@@ -3556,8 +3557,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
3556 l->max_time = age; 3557 l->max_time = age;
3557 l->min_pid = track->pid; 3558 l->min_pid = track->pid;
3558 l->max_pid = track->pid; 3559 l->max_pid = track->pid;
3559 cpus_clear(l->cpus); 3560 cpumask_clear(to_cpumask(l->cpus));
3560 cpu_set(track->cpu, l->cpus); 3561 cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
3561 nodes_clear(l->nodes); 3562 nodes_clear(l->nodes);
3562 node_set(page_to_nid(virt_to_page(track)), l->nodes); 3563 node_set(page_to_nid(virt_to_page(track)), l->nodes);
3563 return 1; 3564 return 1;
@@ -3638,11 +3639,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
3638 len += sprintf(buf + len, " pid=%ld", 3639 len += sprintf(buf + len, " pid=%ld",
3639 l->min_pid); 3640 l->min_pid);
3640 3641
3641 if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && 3642 if (num_online_cpus() > 1 &&
3643 !cpumask_empty(to_cpumask(l->cpus)) &&
3642 len < PAGE_SIZE - 60) { 3644 len < PAGE_SIZE - 60) {
3643 len += sprintf(buf + len, " cpus="); 3645 len += sprintf(buf + len, " cpus=");
3644 len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, 3646 len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
3645 l->cpus); 3647 to_cpumask(l->cpus));
3646 } 3648 }
3647 3649
3648 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && 3650 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) &&
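
The slub conversion replaces cpumask_t fields with DECLARE_BITMAP(..., NR_CPUS) plus to_cpumask(). A small userspace sketch of the underlying bitmap layout and bit arithmetic, assuming 64-bit longs; the macro names mirror the kernel's but are re-implemented here for illustration.

#include <stdio.h>
#include <limits.h>

#define NR_CPUS 64
#define BITS_PER_LONG (sizeof(long) * CHAR_BIT)
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)
#define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)]

static void bitmap_set_bit(unsigned long *map, unsigned int bit)
{
	map[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
}

static int bitmap_test_bit(const unsigned long *map, unsigned int bit)
{
	return (map[bit / BITS_PER_LONG] >> (bit % BITS_PER_LONG)) & 1;
}

int main(void)
{
	DECLARE_BITMAP(cpus, NR_CPUS) = { 0 };

	bitmap_set_bit(cpus, 3);
	printf("cpu 3 set: %d, cpu 4 set: %d\n",
	       bitmap_test_bit(cpus, 3), bitmap_test_bit(cpus, 4));
	return 0;
}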
diff --git a/mm/swap.c b/mm/swap.c
index b135ec90cdeb..8adb9feb61e1 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -151,6 +151,26 @@ void rotate_reclaimable_page(struct page *page)
151 } 151 }
152} 152}
153 153
154static void update_page_reclaim_stat(struct zone *zone, struct page *page,
155 int file, int rotated)
156{
157 struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat;
158 struct zone_reclaim_stat *memcg_reclaim_stat;
159
160 memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page);
161
162 reclaim_stat->recent_scanned[file]++;
163 if (rotated)
164 reclaim_stat->recent_rotated[file]++;
165
166 if (!memcg_reclaim_stat)
167 return;
168
169 memcg_reclaim_stat->recent_scanned[file]++;
170 if (rotated)
171 memcg_reclaim_stat->recent_rotated[file]++;
172}
173
154/* 174/*
155 * FIXME: speed this up? 175 * FIXME: speed this up?
156 */ 176 */
@@ -168,10 +188,8 @@ void activate_page(struct page *page)
168 lru += LRU_ACTIVE; 188 lru += LRU_ACTIVE;
169 add_page_to_lru_list(zone, page, lru); 189 add_page_to_lru_list(zone, page, lru);
170 __count_vm_event(PGACTIVATE); 190 __count_vm_event(PGACTIVATE);
171 mem_cgroup_move_lists(page, lru);
172 191
173 zone->recent_rotated[!!file]++; 192 update_page_reclaim_stat(zone, page, !!file, 1);
174 zone->recent_scanned[!!file]++;
175 } 193 }
176 spin_unlock_irq(&zone->lru_lock); 194 spin_unlock_irq(&zone->lru_lock);
177} 195}
@@ -246,25 +264,6 @@ void add_page_to_unevictable_list(struct page *page)
246 spin_unlock_irq(&zone->lru_lock); 264 spin_unlock_irq(&zone->lru_lock);
247} 265}
248 266
249/**
250 * lru_cache_add_active_or_unevictable
251 * @page: the page to be added to LRU
252 * @vma: vma in which page is mapped for determining reclaimability
253 *
254 * place @page on active or unevictable LRU list, depending on
255 * page_evictable(). Note that if the page is not evictable,
256 * it goes directly back onto it's zone's unevictable list. It does
257 * NOT use a per cpu pagevec.
258 */
259void lru_cache_add_active_or_unevictable(struct page *page,
260 struct vm_area_struct *vma)
261{
262 if (page_evictable(page, vma))
263 lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page));
264 else
265 add_page_to_unevictable_list(page);
266}
267
268/* 267/*
269 * Drain pages out of the cpu's pagevecs. 268 * Drain pages out of the cpu's pagevecs.
270 * Either "cpu" is the current CPU, and preemption has already been 269 * Either "cpu" is the current CPU, and preemption has already been
@@ -398,28 +397,6 @@ void __pagevec_release(struct pagevec *pvec)
398EXPORT_SYMBOL(__pagevec_release); 397EXPORT_SYMBOL(__pagevec_release);
399 398
400/* 399/*
401 * pagevec_release() for pages which are known to not be on the LRU
402 *
403 * This function reinitialises the caller's pagevec.
404 */
405void __pagevec_release_nonlru(struct pagevec *pvec)
406{
407 int i;
408 struct pagevec pages_to_free;
409
410 pagevec_init(&pages_to_free, pvec->cold);
411 for (i = 0; i < pagevec_count(pvec); i++) {
412 struct page *page = pvec->pages[i];
413
414 VM_BUG_ON(PageLRU(page));
415 if (put_page_testzero(page))
416 pagevec_add(&pages_to_free, page);
417 }
418 pagevec_free(&pages_to_free);
419 pagevec_reinit(pvec);
420}
421
422/*
423 * Add the passed pages to the LRU, then drop the caller's refcount 400 * Add the passed pages to the LRU, then drop the caller's refcount
424 * on them. Reinitialises the caller's pagevec. 401 * on them. Reinitialises the caller's pagevec.
425 */ 402 */
@@ -427,12 +404,14 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
427{ 404{
428 int i; 405 int i;
429 struct zone *zone = NULL; 406 struct zone *zone = NULL;
407
430 VM_BUG_ON(is_unevictable_lru(lru)); 408 VM_BUG_ON(is_unevictable_lru(lru));
431 409
432 for (i = 0; i < pagevec_count(pvec); i++) { 410 for (i = 0; i < pagevec_count(pvec); i++) {
433 struct page *page = pvec->pages[i]; 411 struct page *page = pvec->pages[i];
434 struct zone *pagezone = page_zone(page); 412 struct zone *pagezone = page_zone(page);
435 int file; 413 int file;
414 int active;
436 415
437 if (pagezone != zone) { 416 if (pagezone != zone) {
438 if (zone) 417 if (zone)
@@ -444,12 +423,11 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
444 VM_BUG_ON(PageUnevictable(page)); 423 VM_BUG_ON(PageUnevictable(page));
445 VM_BUG_ON(PageLRU(page)); 424 VM_BUG_ON(PageLRU(page));
446 SetPageLRU(page); 425 SetPageLRU(page);
426 active = is_active_lru(lru);
447 file = is_file_lru(lru); 427 file = is_file_lru(lru);
448 zone->recent_scanned[file]++; 428 if (active)
449 if (is_active_lru(lru)) {
450 SetPageActive(page); 429 SetPageActive(page);
451 zone->recent_rotated[file]++; 430 update_page_reclaim_stat(zone, page, file, active);
452 }
453 add_page_to_lru_list(zone, page, lru); 431 add_page_to_lru_list(zone, page, lru);
454 } 432 }
455 if (zone) 433 if (zone)
@@ -495,8 +473,7 @@ void pagevec_swap_free(struct pagevec *pvec)
495 struct page *page = pvec->pages[i]; 473 struct page *page = pvec->pages[i];
496 474
497 if (PageSwapCache(page) && trylock_page(page)) { 475 if (PageSwapCache(page) && trylock_page(page)) {
498 if (PageSwapCache(page)) 476 try_to_free_swap(page);
499 remove_exclusive_swap_page_ref(page);
500 unlock_page(page); 477 unlock_page(page);
501 } 478 }
502 } 479 }
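
update_page_reclaim_stat() centralises the recent_scanned/recent_rotated bookkeeping and mirrors it into an optional per-memcg copy. A compact sketch of that bookkeeping under assumed structure names (not the kernel's):

#include <stdio.h>

struct reclaim_stat {
	unsigned long recent_scanned[2];	/* [0] = anon, [1] = file */
	unsigned long recent_rotated[2];
};

static void update_reclaim_stat(struct reclaim_stat *zone_stat,
				struct reclaim_stat *memcg_stat,
				int file, int rotated)
{
	zone_stat->recent_scanned[file]++;
	if (rotated)
		zone_stat->recent_rotated[file]++;

	if (!memcg_stat)		/* memcg accounting may be disabled */
		return;

	memcg_stat->recent_scanned[file]++;
	if (rotated)
		memcg_stat->recent_rotated[file]++;
}

int main(void)
{
	struct reclaim_stat zone = { {0, 0}, {0, 0} };

	update_reclaim_stat(&zone, NULL, 1, 1);	/* a rotated file page */
	update_reclaim_stat(&zone, NULL, 1, 0);	/* a scanned-only file page */
	printf("file scanned=%lu rotated=%lu\n",
	       zone.recent_scanned[1], zone.recent_rotated[1]);
	return 0;
}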
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3353c9029cef..3ecea98ecb45 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,6 +17,7 @@
17#include <linux/backing-dev.h> 17#include <linux/backing-dev.h>
18#include <linux/pagevec.h> 18#include <linux/pagevec.h>
19#include <linux/migrate.h> 19#include <linux/migrate.h>
20#include <linux/page_cgroup.h>
20 21
21#include <asm/pgtable.h> 22#include <asm/pgtable.h>
22 23
@@ -72,10 +73,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
72{ 73{
73 int error; 74 int error;
74 75
75 BUG_ON(!PageLocked(page)); 76 VM_BUG_ON(!PageLocked(page));
76 BUG_ON(PageSwapCache(page)); 77 VM_BUG_ON(PageSwapCache(page));
77 BUG_ON(PagePrivate(page)); 78 VM_BUG_ON(!PageSwapBacked(page));
78 BUG_ON(!PageSwapBacked(page)); 79
79 error = radix_tree_preload(gfp_mask); 80 error = radix_tree_preload(gfp_mask);
80 if (!error) { 81 if (!error) {
81 page_cache_get(page); 82 page_cache_get(page);
@@ -108,10 +109,11 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
108 */ 109 */
109void __delete_from_swap_cache(struct page *page) 110void __delete_from_swap_cache(struct page *page)
110{ 111{
111 BUG_ON(!PageLocked(page)); 112 swp_entry_t ent = {.val = page_private(page)};
112 BUG_ON(!PageSwapCache(page)); 113
113 BUG_ON(PageWriteback(page)); 114 VM_BUG_ON(!PageLocked(page));
114 BUG_ON(PagePrivate(page)); 115 VM_BUG_ON(!PageSwapCache(page));
116 VM_BUG_ON(PageWriteback(page));
115 117
116 radix_tree_delete(&swapper_space.page_tree, page_private(page)); 118 radix_tree_delete(&swapper_space.page_tree, page_private(page));
117 set_page_private(page, 0); 119 set_page_private(page, 0);
@@ -119,6 +121,7 @@ void __delete_from_swap_cache(struct page *page)
119 total_swapcache_pages--; 121 total_swapcache_pages--;
120 __dec_zone_page_state(page, NR_FILE_PAGES); 122 __dec_zone_page_state(page, NR_FILE_PAGES);
121 INC_CACHE_INFO(del_total); 123 INC_CACHE_INFO(del_total);
124 mem_cgroup_uncharge_swapcache(page, ent);
122} 125}
123 126
124/** 127/**
@@ -129,13 +132,13 @@ void __delete_from_swap_cache(struct page *page)
129 * Allocate swap space for the page and add the page to the 132 * Allocate swap space for the page and add the page to the
130 * swap cache. Caller needs to hold the page lock. 133 * swap cache. Caller needs to hold the page lock.
131 */ 134 */
132int add_to_swap(struct page * page, gfp_t gfp_mask) 135int add_to_swap(struct page *page)
133{ 136{
134 swp_entry_t entry; 137 swp_entry_t entry;
135 int err; 138 int err;
136 139
137 BUG_ON(!PageLocked(page)); 140 VM_BUG_ON(!PageLocked(page));
138 BUG_ON(!PageUptodate(page)); 141 VM_BUG_ON(!PageUptodate(page));
139 142
140 for (;;) { 143 for (;;) {
141 entry = get_swap_page(); 144 entry = get_swap_page();
@@ -154,7 +157,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
154 * Add it to the swap cache and mark it dirty 157 * Add it to the swap cache and mark it dirty
155 */ 158 */
156 err = add_to_swap_cache(page, entry, 159 err = add_to_swap_cache(page, entry,
157 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); 160 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
158 161
159 switch (err) { 162 switch (err) {
160 case 0: /* Success */ 163 case 0: /* Success */
@@ -196,14 +199,14 @@ void delete_from_swap_cache(struct page *page)
196 * If we are the only user, then try to free up the swap cache. 199 * If we are the only user, then try to free up the swap cache.
197 * 200 *
 198 * It's ok to check for PageSwapCache without the page lock 201 * It's ok to check for PageSwapCache without the page lock
199 * here because we are going to recheck again inside 202 * here because we are going to recheck again inside
200 * exclusive_swap_page() _with_ the lock. 203 * try_to_free_swap() _with_ the lock.
201 * - Marcelo 204 * - Marcelo
202 */ 205 */
203static inline void free_swap_cache(struct page *page) 206static inline void free_swap_cache(struct page *page)
204{ 207{
205 if (PageSwapCache(page) && trylock_page(page)) { 208 if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
206 remove_exclusive_swap_page(page); 209 try_to_free_swap(page);
207 unlock_page(page); 210 unlock_page(page);
208 } 211 }
209} 212}
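
The add_to_swap() change above drops the caller-supplied gfp_mask and fixes the allocation flags internally at __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN; the matching caller update appears in the mm/vmscan.c hunks further down. Before and after, at the shrink_page_list() call site:

/* before this patch: the caller chose the allocation flags */
if (!add_to_swap(page, GFP_ATOMIC))
	goto activate_locked;

/* after this patch: add_to_swap() picks its own flags */
if (!add_to_swap(page))
	goto activate_locked;
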
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 54a9f87e5162..7e6304dfafab 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -16,6 +16,7 @@
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/shm.h> 17#include <linux/shm.h>
18#include <linux/blkdev.h> 18#include <linux/blkdev.h>
19#include <linux/random.h>
19#include <linux/writeback.h> 20#include <linux/writeback.h>
20#include <linux/proc_fs.h> 21#include <linux/proc_fs.h>
21#include <linux/seq_file.h> 22#include <linux/seq_file.h>
@@ -32,9 +33,11 @@
32#include <asm/pgtable.h> 33#include <asm/pgtable.h>
33#include <asm/tlbflush.h> 34#include <asm/tlbflush.h>
34#include <linux/swapops.h> 35#include <linux/swapops.h>
36#include <linux/page_cgroup.h>
35 37
36static DEFINE_SPINLOCK(swap_lock); 38static DEFINE_SPINLOCK(swap_lock);
37static unsigned int nr_swapfiles; 39static unsigned int nr_swapfiles;
40long nr_swap_pages;
38long total_swap_pages; 41long total_swap_pages;
39static int swap_overflow; 42static int swap_overflow;
40static int least_priority; 43static int least_priority;
@@ -83,15 +86,96 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
83 up_read(&swap_unplug_sem); 86 up_read(&swap_unplug_sem);
84} 87}
85 88
89/*
 90 * swapon tells the device that all the old swap contents can be discarded,
91 * to allow the swap device to optimize its wear-levelling.
92 */
93static int discard_swap(struct swap_info_struct *si)
94{
95 struct swap_extent *se;
96 int err = 0;
97
98 list_for_each_entry(se, &si->extent_list, list) {
99 sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
100 sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
101
102 if (se->start_page == 0) {
103 /* Do not discard the swap header page! */
104 start_block += 1 << (PAGE_SHIFT - 9);
105 nr_blocks -= 1 << (PAGE_SHIFT - 9);
106 if (!nr_blocks)
107 continue;
108 }
109
110 err = blkdev_issue_discard(si->bdev, start_block,
111 nr_blocks, GFP_KERNEL);
112 if (err)
113 break;
114
115 cond_resched();
116 }
117 return err; /* That will often be -EOPNOTSUPP */
118}
119
120/*
 121 * swap allocation tells the device that a cluster of swap can now be discarded,
122 * to allow the swap device to optimize its wear-levelling.
123 */
124static void discard_swap_cluster(struct swap_info_struct *si,
125 pgoff_t start_page, pgoff_t nr_pages)
126{
127 struct swap_extent *se = si->curr_swap_extent;
128 int found_extent = 0;
129
130 while (nr_pages) {
131 struct list_head *lh;
132
133 if (se->start_page <= start_page &&
134 start_page < se->start_page + se->nr_pages) {
135 pgoff_t offset = start_page - se->start_page;
136 sector_t start_block = se->start_block + offset;
137 sector_t nr_blocks = se->nr_pages - offset;
138
139 if (nr_blocks > nr_pages)
140 nr_blocks = nr_pages;
141 start_page += nr_blocks;
142 nr_pages -= nr_blocks;
143
144 if (!found_extent++)
145 si->curr_swap_extent = se;
146
147 start_block <<= PAGE_SHIFT - 9;
148 nr_blocks <<= PAGE_SHIFT - 9;
149 if (blkdev_issue_discard(si->bdev, start_block,
150 nr_blocks, GFP_NOIO))
151 break;
152 }
153
154 lh = se->list.next;
155 if (lh == &si->extent_list)
156 lh = lh->next;
157 se = list_entry(lh, struct swap_extent, list);
158 }
159}
160
161static int wait_for_discard(void *word)
162{
163 schedule();
164 return 0;
165}
166
86#define SWAPFILE_CLUSTER 256 167#define SWAPFILE_CLUSTER 256
87#define LATENCY_LIMIT 256 168#define LATENCY_LIMIT 256
88 169
89static inline unsigned long scan_swap_map(struct swap_info_struct *si) 170static inline unsigned long scan_swap_map(struct swap_info_struct *si)
90{ 171{
91 unsigned long offset, last_in_cluster; 172 unsigned long offset;
173 unsigned long scan_base;
174 unsigned long last_in_cluster = 0;
92 int latency_ration = LATENCY_LIMIT; 175 int latency_ration = LATENCY_LIMIT;
176 int found_free_cluster = 0;
93 177
94 /* 178 /*
95 * We try to cluster swap pages by allocating them sequentially 179 * We try to cluster swap pages by allocating them sequentially
96 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this 180 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this
97 * way, however, we resort to first-free allocation, starting 181 * way, however, we resort to first-free allocation, starting
@@ -99,16 +183,42 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
99 * all over the entire swap partition, so that we reduce 183 * all over the entire swap partition, so that we reduce
100 * overall disk seek times between swap pages. -- sct 184 * overall disk seek times between swap pages. -- sct
101 * But we do now try to find an empty cluster. -Andrea 185 * But we do now try to find an empty cluster. -Andrea
186 * And we let swap pages go all over an SSD partition. Hugh
102 */ 187 */
103 188
104 si->flags += SWP_SCANNING; 189 si->flags += SWP_SCANNING;
105 if (unlikely(!si->cluster_nr)) { 190 scan_base = offset = si->cluster_next;
106 si->cluster_nr = SWAPFILE_CLUSTER - 1; 191
107 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) 192 if (unlikely(!si->cluster_nr--)) {
108 goto lowest; 193 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
194 si->cluster_nr = SWAPFILE_CLUSTER - 1;
195 goto checks;
196 }
197 if (si->flags & SWP_DISCARDABLE) {
198 /*
199 * Start range check on racing allocations, in case
200 * they overlap the cluster we eventually decide on
201 * (we scan without swap_lock to allow preemption).
202 * It's hardly conceivable that cluster_nr could be
203 * wrapped during our scan, but don't depend on it.
204 */
205 if (si->lowest_alloc)
206 goto checks;
207 si->lowest_alloc = si->max;
208 si->highest_alloc = 0;
209 }
109 spin_unlock(&swap_lock); 210 spin_unlock(&swap_lock);
110 211
111 offset = si->lowest_bit; 212 /*
213 * If seek is expensive, start searching for new cluster from
214 * start of partition, to minimize the span of allocated swap.
215 * But if seek is cheap, search from our current position, so
216 * that swap is allocated from all over the partition: if the
217 * Flash Translation Layer only remaps within limited zones,
218 * we don't want to wear out the first zone too quickly.
219 */
220 if (!(si->flags & SWP_SOLIDSTATE))
221 scan_base = offset = si->lowest_bit;
112 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 222 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
113 223
114 /* Locate the first empty (unaligned) cluster */ 224 /* Locate the first empty (unaligned) cluster */
@@ -117,43 +227,124 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
117 last_in_cluster = offset + SWAPFILE_CLUSTER; 227 last_in_cluster = offset + SWAPFILE_CLUSTER;
118 else if (offset == last_in_cluster) { 228 else if (offset == last_in_cluster) {
119 spin_lock(&swap_lock); 229 spin_lock(&swap_lock);
120 si->cluster_next = offset-SWAPFILE_CLUSTER+1; 230 offset -= SWAPFILE_CLUSTER - 1;
121 goto cluster; 231 si->cluster_next = offset;
232 si->cluster_nr = SWAPFILE_CLUSTER - 1;
233 found_free_cluster = 1;
234 goto checks;
122 } 235 }
123 if (unlikely(--latency_ration < 0)) { 236 if (unlikely(--latency_ration < 0)) {
124 cond_resched(); 237 cond_resched();
125 latency_ration = LATENCY_LIMIT; 238 latency_ration = LATENCY_LIMIT;
126 } 239 }
127 } 240 }
241
242 offset = si->lowest_bit;
243 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
244
245 /* Locate the first empty (unaligned) cluster */
246 for (; last_in_cluster < scan_base; offset++) {
247 if (si->swap_map[offset])
248 last_in_cluster = offset + SWAPFILE_CLUSTER;
249 else if (offset == last_in_cluster) {
250 spin_lock(&swap_lock);
251 offset -= SWAPFILE_CLUSTER - 1;
252 si->cluster_next = offset;
253 si->cluster_nr = SWAPFILE_CLUSTER - 1;
254 found_free_cluster = 1;
255 goto checks;
256 }
257 if (unlikely(--latency_ration < 0)) {
258 cond_resched();
259 latency_ration = LATENCY_LIMIT;
260 }
261 }
262
263 offset = scan_base;
128 spin_lock(&swap_lock); 264 spin_lock(&swap_lock);
129 goto lowest; 265 si->cluster_nr = SWAPFILE_CLUSTER - 1;
266 si->lowest_alloc = 0;
130 } 267 }
131 268
132 si->cluster_nr--; 269checks:
133cluster: 270 if (!(si->flags & SWP_WRITEOK))
134 offset = si->cluster_next;
135 if (offset > si->highest_bit)
136lowest: offset = si->lowest_bit;
137checks: if (!(si->flags & SWP_WRITEOK))
138 goto no_page; 271 goto no_page;
139 if (!si->highest_bit) 272 if (!si->highest_bit)
140 goto no_page; 273 goto no_page;
141 if (!si->swap_map[offset]) { 274 if (offset > si->highest_bit)
142 if (offset == si->lowest_bit) 275 scan_base = offset = si->lowest_bit;
143 si->lowest_bit++; 276 if (si->swap_map[offset])
144 if (offset == si->highest_bit) 277 goto scan;
145 si->highest_bit--; 278
146 si->inuse_pages++; 279 if (offset == si->lowest_bit)
147 if (si->inuse_pages == si->pages) { 280 si->lowest_bit++;
148 si->lowest_bit = si->max; 281 if (offset == si->highest_bit)
149 si->highest_bit = 0; 282 si->highest_bit--;
283 si->inuse_pages++;
284 if (si->inuse_pages == si->pages) {
285 si->lowest_bit = si->max;
286 si->highest_bit = 0;
287 }
288 si->swap_map[offset] = 1;
289 si->cluster_next = offset + 1;
290 si->flags -= SWP_SCANNING;
291
292 if (si->lowest_alloc) {
293 /*
294 * Only set when SWP_DISCARDABLE, and there's a scan
295 * for a free cluster in progress or just completed.
296 */
297 if (found_free_cluster) {
298 /*
299 * To optimize wear-levelling, discard the
300 * old data of the cluster, taking care not to
301 * discard any of its pages that have already
302 * been allocated by racing tasks (offset has
303 * already stepped over any at the beginning).
304 */
305 if (offset < si->highest_alloc &&
306 si->lowest_alloc <= last_in_cluster)
307 last_in_cluster = si->lowest_alloc - 1;
308 si->flags |= SWP_DISCARDING;
309 spin_unlock(&swap_lock);
310
311 if (offset < last_in_cluster)
312 discard_swap_cluster(si, offset,
313 last_in_cluster - offset + 1);
314
315 spin_lock(&swap_lock);
316 si->lowest_alloc = 0;
317 si->flags &= ~SWP_DISCARDING;
318
319 smp_mb(); /* wake_up_bit advises this */
320 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
321
322 } else if (si->flags & SWP_DISCARDING) {
323 /*
324 * Delay using pages allocated by racing tasks
325 * until the whole discard has been issued. We
326 * could defer that delay until swap_writepage,
327 * but it's easier to keep this self-contained.
328 */
329 spin_unlock(&swap_lock);
330 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
331 wait_for_discard, TASK_UNINTERRUPTIBLE);
332 spin_lock(&swap_lock);
333 } else {
334 /*
335 * Note pages allocated by racing tasks while
 336 * the scan for a free cluster is in progress, so
337 * that its final discard can exclude them.
338 */
339 if (offset < si->lowest_alloc)
340 si->lowest_alloc = offset;
341 if (offset > si->highest_alloc)
342 si->highest_alloc = offset;
150 } 343 }
151 si->swap_map[offset] = 1;
152 si->cluster_next = offset + 1;
153 si->flags -= SWP_SCANNING;
154 return offset;
155 } 344 }
345 return offset;
156 346
347scan:
157 spin_unlock(&swap_lock); 348 spin_unlock(&swap_lock);
158 while (++offset <= si->highest_bit) { 349 while (++offset <= si->highest_bit) {
159 if (!si->swap_map[offset]) { 350 if (!si->swap_map[offset]) {
@@ -165,8 +356,18 @@ checks: if (!(si->flags & SWP_WRITEOK))
165 latency_ration = LATENCY_LIMIT; 356 latency_ration = LATENCY_LIMIT;
166 } 357 }
167 } 358 }
359 offset = si->lowest_bit;
360 while (++offset < scan_base) {
361 if (!si->swap_map[offset]) {
362 spin_lock(&swap_lock);
363 goto checks;
364 }
365 if (unlikely(--latency_ration < 0)) {
366 cond_resched();
367 latency_ration = LATENCY_LIMIT;
368 }
369 }
168 spin_lock(&swap_lock); 370 spin_lock(&swap_lock);
169 goto lowest;
170 371
171no_page: 372no_page:
172 si->flags -= SWP_SCANNING; 373 si->flags -= SWP_SCANNING;
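
The discard helpers introduced above (discard_swap() and discard_swap_cluster()) convert page ranges into 512-byte sectors with shifts of (PAGE_SHIFT - 9) before calling blkdev_issue_discard(). A standalone sketch of that arithmetic, assuming 4096-byte pages:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12	/* assumption for the sketch: 4 KiB pages */

int main(void)
{
	uint64_t start_page = 3;	/* skip the swap header area, say */
	uint64_t nr_pages = 256;	/* one SWAPFILE_CLUSTER worth of pages */

	/* each page spans 1 << (PAGE_SHIFT - 9) = 8 sectors of 512 bytes */
	uint64_t start_block = start_page << (PAGE_SHIFT - 9);
	uint64_t nr_blocks = nr_pages << (PAGE_SHIFT - 9);

	printf("discard sectors %llu to %llu\n",
	       (unsigned long long)start_block,
	       (unsigned long long)(start_block + nr_blocks - 1));
	return 0;	/* prints: discard sectors 24 to 2071 */
}
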
@@ -268,10 +469,11 @@ bad_nofile:
268 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); 469 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
269out: 470out:
270 return NULL; 471 return NULL;
271} 472}
272 473
273static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) 474static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent)
274{ 475{
476 unsigned long offset = swp_offset(ent);
275 int count = p->swap_map[offset]; 477 int count = p->swap_map[offset];
276 478
277 if (count < SWAP_MAP_MAX) { 479 if (count < SWAP_MAP_MAX) {
@@ -286,6 +488,7 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
286 swap_list.next = p - swap_info; 488 swap_list.next = p - swap_info;
287 nr_swap_pages++; 489 nr_swap_pages++;
288 p->inuse_pages--; 490 p->inuse_pages--;
491 mem_cgroup_uncharge_swap(ent);
289 } 492 }
290 } 493 }
291 return count; 494 return count;
@@ -301,7 +504,7 @@ void swap_free(swp_entry_t entry)
301 504
302 p = swap_info_get(entry); 505 p = swap_info_get(entry);
303 if (p) { 506 if (p) {
304 swap_entry_free(p, swp_offset(entry)); 507 swap_entry_free(p, entry);
305 spin_unlock(&swap_lock); 508 spin_unlock(&swap_lock);
306 } 509 }
307} 510}
@@ -326,101 +529,62 @@ static inline int page_swapcount(struct page *page)
326} 529}
327 530
328/* 531/*
329 * We can use this swap cache entry directly 532 * We can write to an anon page without COW if there are no other references
330 * if there are no other references to it. 533 * to it. And as a side-effect, free up its swap: because the old content
534 * on disk will never be read, and seeking back there to write new content
535 * later would only waste time away from clustering.
331 */ 536 */
332int can_share_swap_page(struct page *page) 537int reuse_swap_page(struct page *page)
333{ 538{
334 int count; 539 int count;
335 540
336 BUG_ON(!PageLocked(page)); 541 VM_BUG_ON(!PageLocked(page));
337 count = page_mapcount(page); 542 count = page_mapcount(page);
338 if (count <= 1 && PageSwapCache(page)) 543 if (count <= 1 && PageSwapCache(page)) {
339 count += page_swapcount(page); 544 count += page_swapcount(page);
545 if (count == 1 && !PageWriteback(page)) {
546 delete_from_swap_cache(page);
547 SetPageDirty(page);
548 }
549 }
340 return count == 1; 550 return count == 1;
341} 551}
342 552
343/* 553/*
344 * Work out if there are any other processes sharing this 554 * If swap is getting full, or if there are no more mappings of this page,
345 * swap cache page. Free it if you can. Return success. 555 * then try_to_free_swap is called to free its swap space.
346 */ 556 */
347static int remove_exclusive_swap_page_count(struct page *page, int count) 557int try_to_free_swap(struct page *page)
348{ 558{
349 int retval; 559 VM_BUG_ON(!PageLocked(page));
350 struct swap_info_struct * p;
351 swp_entry_t entry;
352
353 BUG_ON(PagePrivate(page));
354 BUG_ON(!PageLocked(page));
355 560
356 if (!PageSwapCache(page)) 561 if (!PageSwapCache(page))
357 return 0; 562 return 0;
358 if (PageWriteback(page)) 563 if (PageWriteback(page))
359 return 0; 564 return 0;
360 if (page_count(page) != count) /* us + cache + ptes */ 565 if (page_swapcount(page))
361 return 0; 566 return 0;
362 567
363 entry.val = page_private(page); 568 delete_from_swap_cache(page);
364 p = swap_info_get(entry); 569 SetPageDirty(page);
365 if (!p) 570 return 1;
366 return 0;
367
368 /* Is the only swap cache user the cache itself? */
369 retval = 0;
370 if (p->swap_map[swp_offset(entry)] == 1) {
371 /* Recheck the page count with the swapcache lock held.. */
372 spin_lock_irq(&swapper_space.tree_lock);
373 if ((page_count(page) == count) && !PageWriteback(page)) {
374 __delete_from_swap_cache(page);
375 SetPageDirty(page);
376 retval = 1;
377 }
378 spin_unlock_irq(&swapper_space.tree_lock);
379 }
380 spin_unlock(&swap_lock);
381
382 if (retval) {
383 swap_free(entry);
384 page_cache_release(page);
385 }
386
387 return retval;
388}
389
390/*
391 * Most of the time the page should have two references: one for the
392 * process and one for the swap cache.
393 */
394int remove_exclusive_swap_page(struct page *page)
395{
396 return remove_exclusive_swap_page_count(page, 2);
397}
398
399/*
400 * The pageout code holds an extra reference to the page. That raises
401 * the reference count to test for to 2 for a page that is only in the
402 * swap cache plus 1 for each process that maps the page.
403 */
404int remove_exclusive_swap_page_ref(struct page *page)
405{
406 return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page));
407} 571}
408 572
409/* 573/*
410 * Free the swap entry like above, but also try to 574 * Free the swap entry like above, but also try to
411 * free the page cache entry if it is the last user. 575 * free the page cache entry if it is the last user.
412 */ 576 */
413void free_swap_and_cache(swp_entry_t entry) 577int free_swap_and_cache(swp_entry_t entry)
414{ 578{
415 struct swap_info_struct * p; 579 struct swap_info_struct *p;
416 struct page *page = NULL; 580 struct page *page = NULL;
417 581
418 if (is_migration_entry(entry)) 582 if (is_migration_entry(entry))
419 return; 583 return 1;
420 584
421 p = swap_info_get(entry); 585 p = swap_info_get(entry);
422 if (p) { 586 if (p) {
423 if (swap_entry_free(p, swp_offset(entry)) == 1) { 587 if (swap_entry_free(p, entry) == 1) {
424 page = find_get_page(&swapper_space, entry.val); 588 page = find_get_page(&swapper_space, entry.val);
425 if (page && !trylock_page(page)) { 589 if (page && !trylock_page(page)) {
426 page_cache_release(page); 590 page_cache_release(page);
@@ -430,20 +594,19 @@ void free_swap_and_cache(swp_entry_t entry)
430 spin_unlock(&swap_lock); 594 spin_unlock(&swap_lock);
431 } 595 }
432 if (page) { 596 if (page) {
433 int one_user; 597 /*
434 598 * Not mapped elsewhere, or swap space full? Free it!
435 BUG_ON(PagePrivate(page)); 599 * Also recheck PageSwapCache now page is locked (above).
436 one_user = (page_count(page) == 2); 600 */
437 /* Only cache user (+us), or swap space full? Free it! */
438 /* Also recheck PageSwapCache after page is locked (above) */
439 if (PageSwapCache(page) && !PageWriteback(page) && 601 if (PageSwapCache(page) && !PageWriteback(page) &&
440 (one_user || vm_swap_full())) { 602 (!page_mapped(page) || vm_swap_full())) {
441 delete_from_swap_cache(page); 603 delete_from_swap_cache(page);
442 SetPageDirty(page); 604 SetPageDirty(page);
443 } 605 }
444 unlock_page(page); 606 unlock_page(page);
445 page_cache_release(page); 607 page_cache_release(page);
446 } 608 }
609 return p != NULL;
447} 610}
448 611
449#ifdef CONFIG_HIBERNATION 612#ifdef CONFIG_HIBERNATION
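
reuse_swap_page() above replaces can_share_swap_page(): it still asks whether the anon page is exclusive to the current user, but now also frees the swap slot as a side effect when it is. The decision reduces to a small predicate over the map count and swap count; a self-contained model of it, with plain fields standing in for the kernel's page state:

/* Fields are stand-ins for the kernel's page state, for illustration only. */
struct page_model {
	int mapcount;		/* processes mapping the page */
	int swapcount;		/* references held via the swap map */
	int in_swapcache;
	int under_writeback;
};

int reuse_swap_page_model(struct page_model *p)
{
	int count = p->mapcount;

	if (count <= 1 && p->in_swapcache) {
		count += p->swapcount;
		if (count == 1 && !p->under_writeback) {
			/* exclusive: the real code deletes the swap cache
			 * entry here and marks the page dirty */
			p->in_swapcache = 0;
			p->swapcount = 0;
		}
	}
	return count == 1;
}
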
@@ -530,17 +693,20 @@ unsigned int count_swap_pages(int type, int free)
530static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 693static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
531 unsigned long addr, swp_entry_t entry, struct page *page) 694 unsigned long addr, swp_entry_t entry, struct page *page)
532{ 695{
696 struct mem_cgroup *ptr = NULL;
533 spinlock_t *ptl; 697 spinlock_t *ptl;
534 pte_t *pte; 698 pte_t *pte;
535 int ret = 1; 699 int ret = 1;
536 700
537 if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) 701 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
538 ret = -ENOMEM; 702 ret = -ENOMEM;
703 goto out_nolock;
704 }
539 705
540 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 706 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
541 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 707 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
542 if (ret > 0) 708 if (ret > 0)
543 mem_cgroup_uncharge_page(page); 709 mem_cgroup_cancel_charge_swapin(ptr);
544 ret = 0; 710 ret = 0;
545 goto out; 711 goto out;
546 } 712 }
@@ -550,6 +716,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
550 set_pte_at(vma->vm_mm, addr, pte, 716 set_pte_at(vma->vm_mm, addr, pte,
551 pte_mkold(mk_pte(page, vma->vm_page_prot))); 717 pte_mkold(mk_pte(page, vma->vm_page_prot)));
552 page_add_anon_rmap(page, vma, addr); 718 page_add_anon_rmap(page, vma, addr);
719 mem_cgroup_commit_charge_swapin(page, ptr);
553 swap_free(entry); 720 swap_free(entry);
554 /* 721 /*
555 * Move the page to the active list so it is not 722 * Move the page to the active list so it is not
@@ -558,6 +725,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
558 activate_page(page); 725 activate_page(page);
559out: 726out:
560 pte_unmap_unlock(pte, ptl); 727 pte_unmap_unlock(pte, ptl);
728out_nolock:
561 return ret; 729 return ret;
562} 730}
563 731
@@ -776,10 +944,10 @@ static int try_to_unuse(unsigned int type)
776 break; 944 break;
777 } 945 }
778 946
779 /* 947 /*
780 * Get a page for the entry, using the existing swap 948 * Get a page for the entry, using the existing swap
781 * cache page if there is one. Otherwise, get a clean 949 * cache page if there is one. Otherwise, get a clean
782 * page and read the swap into it. 950 * page and read the swap into it.
783 */ 951 */
784 swap_map = &si->swap_map[i]; 952 swap_map = &si->swap_map[i];
785 entry = swp_entry(type, i); 953 entry = swp_entry(type, i);
@@ -930,7 +1098,16 @@ static int try_to_unuse(unsigned int type)
930 lock_page(page); 1098 lock_page(page);
931 wait_on_page_writeback(page); 1099 wait_on_page_writeback(page);
932 } 1100 }
933 if (PageSwapCache(page)) 1101
1102 /*
1103 * It is conceivable that a racing task removed this page from
1104 * swap cache just before we acquired the page lock at the top,
1105 * or while we dropped it in unuse_mm(). The page might even
1106 * be back in swap cache on another swap area: that we must not
1107 * delete, since it may not have been written out to swap yet.
1108 */
1109 if (PageSwapCache(page) &&
1110 likely(page_private(page) == entry.val))
934 delete_from_swap_cache(page); 1111 delete_from_swap_cache(page);
935 1112
936 /* 1113 /*
@@ -1203,27 +1380,7 @@ out:
1203 return ret; 1380 return ret;
1204} 1381}
1205 1382
1206#if 0 /* We don't need this yet */ 1383SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1207#include <linux/backing-dev.h>
1208int page_queue_congested(struct page *page)
1209{
1210 struct backing_dev_info *bdi;
1211
1212 BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
1213
1214 if (PageSwapCache(page)) {
1215 swp_entry_t entry = { .val = page_private(page) };
1216 struct swap_info_struct *sis;
1217
1218 sis = get_swap_info_struct(swp_type(entry));
1219 bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
1220 } else
1221 bdi = page->mapping->backing_dev_info;
1222 return bdi_write_congested(bdi);
1223}
1224#endif
1225
1226asmlinkage long sys_swapoff(const char __user * specialfile)
1227{ 1384{
1228 struct swap_info_struct * p = NULL; 1385 struct swap_info_struct * p = NULL;
1229 unsigned short *swap_map; 1386 unsigned short *swap_map;
@@ -1233,7 +1390,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1233 char * pathname; 1390 char * pathname;
1234 int i, type, prev; 1391 int i, type, prev;
1235 int err; 1392 int err;
1236 1393
1237 if (!capable(CAP_SYS_ADMIN)) 1394 if (!capable(CAP_SYS_ADMIN))
1238 return -EPERM; 1395 return -EPERM;
1239 1396
@@ -1253,7 +1410,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1253 spin_lock(&swap_lock); 1410 spin_lock(&swap_lock);
1254 for (type = swap_list.head; type >= 0; type = swap_info[type].next) { 1411 for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
1255 p = swap_info + type; 1412 p = swap_info + type;
1256 if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { 1413 if (p->flags & SWP_WRITEOK) {
1257 if (p->swap_file->f_mapping == mapping) 1414 if (p->swap_file->f_mapping == mapping)
1258 break; 1415 break;
1259 } 1416 }
@@ -1343,6 +1500,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1343 spin_unlock(&swap_lock); 1500 spin_unlock(&swap_lock);
1344 mutex_unlock(&swapon_mutex); 1501 mutex_unlock(&swapon_mutex);
1345 vfree(swap_map); 1502 vfree(swap_map);
 1503 /* Destroy swap account information */
1504 swap_cgroup_swapoff(type);
1505
1346 inode = mapping->host; 1506 inode = mapping->host;
1347 if (S_ISBLK(inode->i_mode)) { 1507 if (S_ISBLK(inode->i_mode)) {
1348 struct block_device *bdev = I_BDEV(inode); 1508 struct block_device *bdev = I_BDEV(inode);
@@ -1426,12 +1586,12 @@ static int swap_show(struct seq_file *swap, void *v)
1426 file = ptr->swap_file; 1586 file = ptr->swap_file;
1427 len = seq_path(swap, &file->f_path, " \t\n\\"); 1587 len = seq_path(swap, &file->f_path, " \t\n\\");
1428 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1588 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1429 len < 40 ? 40 - len : 1, " ", 1589 len < 40 ? 40 - len : 1, " ",
1430 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? 1590 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1431 "partition" : "file\t", 1591 "partition" : "file\t",
1432 ptr->pages << (PAGE_SHIFT - 10), 1592 ptr->pages << (PAGE_SHIFT - 10),
1433 ptr->inuse_pages << (PAGE_SHIFT - 10), 1593 ptr->inuse_pages << (PAGE_SHIFT - 10),
1434 ptr->prio); 1594 ptr->prio);
1435 return 0; 1595 return 0;
1436} 1596}
1437 1597
@@ -1476,7 +1636,7 @@ late_initcall(max_swapfiles_check);
1476 * 1636 *
1477 * The swapon system call 1637 * The swapon system call
1478 */ 1638 */
1479asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) 1639SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1480{ 1640{
1481 struct swap_info_struct * p; 1641 struct swap_info_struct * p;
1482 char *name = NULL; 1642 char *name = NULL;
@@ -1487,12 +1647,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1487 int i, prev; 1647 int i, prev;
1488 int error; 1648 int error;
1489 union swap_header *swap_header = NULL; 1649 union swap_header *swap_header = NULL;
1490 int swap_header_version;
1491 unsigned int nr_good_pages = 0; 1650 unsigned int nr_good_pages = 0;
1492 int nr_extents = 0; 1651 int nr_extents = 0;
1493 sector_t span; 1652 sector_t span;
1494 unsigned long maxpages = 1; 1653 unsigned long maxpages = 1;
1495 int swapfilesize; 1654 unsigned long swapfilepages;
1496 unsigned short *swap_map = NULL; 1655 unsigned short *swap_map = NULL;
1497 struct page *page = NULL; 1656 struct page *page = NULL;
1498 struct inode *inode = NULL; 1657 struct inode *inode = NULL;
@@ -1570,7 +1729,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1570 goto bad_swap; 1729 goto bad_swap;
1571 } 1730 }
1572 1731
1573 swapfilesize = i_size_read(inode) >> PAGE_SHIFT; 1732 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1574 1733
1575 /* 1734 /*
1576 * Read the swap header. 1735 * Read the swap header.
@@ -1584,102 +1743,92 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1584 error = PTR_ERR(page); 1743 error = PTR_ERR(page);
1585 goto bad_swap; 1744 goto bad_swap;
1586 } 1745 }
1587 kmap(page); 1746 swap_header = kmap(page);
1588 swap_header = page_address(page);
1589 1747
1590 if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) 1748 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1591 swap_header_version = 1;
1592 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
1593 swap_header_version = 2;
1594 else {
1595 printk(KERN_ERR "Unable to find swap-space signature\n"); 1749 printk(KERN_ERR "Unable to find swap-space signature\n");
1596 error = -EINVAL; 1750 error = -EINVAL;
1597 goto bad_swap; 1751 goto bad_swap;
1598 } 1752 }
1599 1753
 1600 switch (swap_header_version) { 1754 /* swap partition endianness hack... */
1601 case 1: 1755 if (swab32(swap_header->info.version) == 1) {
1602 printk(KERN_ERR "version 0 swap is no longer supported. " 1756 swab32s(&swap_header->info.version);
1603 "Use mkswap -v1 %s\n", name); 1757 swab32s(&swap_header->info.last_page);
1758 swab32s(&swap_header->info.nr_badpages);
1759 for (i = 0; i < swap_header->info.nr_badpages; i++)
1760 swab32s(&swap_header->info.badpages[i]);
1761 }
1762 /* Check the swap header's sub-version */
1763 if (swap_header->info.version != 1) {
1764 printk(KERN_WARNING
1765 "Unable to handle swap header version %d\n",
1766 swap_header->info.version);
1604 error = -EINVAL; 1767 error = -EINVAL;
1605 goto bad_swap; 1768 goto bad_swap;
1606 case 2: 1769 }
 1607 /* swap partition endianness hack... */
1608 if (swab32(swap_header->info.version) == 1) {
1609 swab32s(&swap_header->info.version);
1610 swab32s(&swap_header->info.last_page);
1611 swab32s(&swap_header->info.nr_badpages);
1612 for (i = 0; i < swap_header->info.nr_badpages; i++)
1613 swab32s(&swap_header->info.badpages[i]);
1614 }
1615 /* Check the swap header's sub-version and the size of
1616 the swap file and bad block lists */
1617 if (swap_header->info.version != 1) {
1618 printk(KERN_WARNING
1619 "Unable to handle swap header version %d\n",
1620 swap_header->info.version);
1621 error = -EINVAL;
1622 goto bad_swap;
1623 }
1624 1770
1625 p->lowest_bit = 1; 1771 p->lowest_bit = 1;
1626 p->cluster_next = 1; 1772 p->cluster_next = 1;
1627 1773
1628 /* 1774 /*
1629 * Find out how many pages are allowed for a single swap 1775 * Find out how many pages are allowed for a single swap
1630 * device. There are two limiting factors: 1) the number of 1776 * device. There are two limiting factors: 1) the number of
1631 * bits for the swap offset in the swp_entry_t type and 1777 * bits for the swap offset in the swp_entry_t type and
 1632 * 2) the number of bits in a swap pte as defined by 1778 * 2) the number of bits in a swap pte as defined by
1633 * the different architectures. In order to find the 1779 * the different architectures. In order to find the
1634 * largest possible bit mask a swap entry with swap type 0 1780 * largest possible bit mask a swap entry with swap type 0
1635 * and swap offset ~0UL is created, encoded to a swap pte, 1781 * and swap offset ~0UL is created, encoded to a swap pte,
1636 * decoded to a swp_entry_t again and finally the swap 1782 * decoded to a swp_entry_t again and finally the swap
1637 * offset is extracted. This will mask all the bits from 1783 * offset is extracted. This will mask all the bits from
1638 * the initial ~0UL mask that can't be encoded in either 1784 * the initial ~0UL mask that can't be encoded in either
1639 * the swp_entry_t or the architecture definition of a 1785 * the swp_entry_t or the architecture definition of a
1640 * swap pte. 1786 * swap pte.
1641 */ 1787 */
1642 maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1; 1788 maxpages = swp_offset(pte_to_swp_entry(
1643 if (maxpages > swap_header->info.last_page) 1789 swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1;
1644 maxpages = swap_header->info.last_page; 1790 if (maxpages > swap_header->info.last_page)
1645 p->highest_bit = maxpages - 1; 1791 maxpages = swap_header->info.last_page;
1792 p->highest_bit = maxpages - 1;
1646 1793
1647 error = -EINVAL; 1794 error = -EINVAL;
1648 if (!maxpages) 1795 if (!maxpages)
1649 goto bad_swap; 1796 goto bad_swap;
1650 if (swapfilesize && maxpages > swapfilesize) { 1797 if (swapfilepages && maxpages > swapfilepages) {
1651 printk(KERN_WARNING 1798 printk(KERN_WARNING
1652 "Swap area shorter than signature indicates\n"); 1799 "Swap area shorter than signature indicates\n");
1653 goto bad_swap; 1800 goto bad_swap;
1654 } 1801 }
1655 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 1802 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1656 goto bad_swap; 1803 goto bad_swap;
1657 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1804 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1658 goto bad_swap; 1805 goto bad_swap;
1659 1806
1660 /* OK, set up the swap map and apply the bad block list */ 1807 /* OK, set up the swap map and apply the bad block list */
1661 swap_map = vmalloc(maxpages * sizeof(short)); 1808 swap_map = vmalloc(maxpages * sizeof(short));
1662 if (!swap_map) { 1809 if (!swap_map) {
1663 error = -ENOMEM; 1810 error = -ENOMEM;
1664 goto bad_swap; 1811 goto bad_swap;
1665 } 1812 }
1666 1813
1667 error = 0; 1814 memset(swap_map, 0, maxpages * sizeof(short));
1668 memset(swap_map, 0, maxpages * sizeof(short)); 1815 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1669 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1816 int page_nr = swap_header->info.badpages[i];
1670 int page_nr = swap_header->info.badpages[i]; 1817 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
1671 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) 1818 error = -EINVAL;
1672 error = -EINVAL;
1673 else
1674 swap_map[page_nr] = SWAP_MAP_BAD;
1675 }
1676 nr_good_pages = swap_header->info.last_page -
1677 swap_header->info.nr_badpages -
1678 1 /* header page */;
1679 if (error)
1680 goto bad_swap; 1819 goto bad_swap;
1820 }
1821 swap_map[page_nr] = SWAP_MAP_BAD;
1681 } 1822 }
1682 1823
1824 error = swap_cgroup_swapon(type, maxpages);
1825 if (error)
1826 goto bad_swap;
1827
1828 nr_good_pages = swap_header->info.last_page -
1829 swap_header->info.nr_badpages -
1830 1 /* header page */;
1831
1683 if (nr_good_pages) { 1832 if (nr_good_pages) {
1684 swap_map[0] = SWAP_MAP_BAD; 1833 swap_map[0] = SWAP_MAP_BAD;
1685 p->max = maxpages; 1834 p->max = maxpages;
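
The swapon() rewrite above drops version-0 header support and keeps the endianness fix-up: a header written by mkswap on an opposite-endian machine reads back with a byte-swapped version field, which is detected by checking whether swab32() of it equals 1. A small user-space model of that check (__builtin_bswap32 stands in for the kernel's swab32()):

#include <stdint.h>
#include <stdio.h>

static uint32_t swab32(uint32_t x)
{
	return __builtin_bswap32(x);	/* gcc/clang byte-swap builtin */
}

int main(void)
{
	uint32_t version = 0x01000000;	/* "1" as seen from the wrong endianness */

	if (swab32(version) == 1)	/* only sane after swapping: fix it up */
		version = swab32(version);

	printf("swap header version %u\n", version);	/* prints 1 */
	return 0;
}
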
@@ -1697,6 +1846,13 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1697 goto bad_swap; 1846 goto bad_swap;
1698 } 1847 }
1699 1848
1849 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
1850 p->flags |= SWP_SOLIDSTATE;
1851 p->cluster_next = 1 + (random32() % p->highest_bit);
1852 }
1853 if (discard_swap(p) == 0)
1854 p->flags |= SWP_DISCARDABLE;
1855
1700 mutex_lock(&swapon_mutex); 1856 mutex_lock(&swapon_mutex);
1701 spin_lock(&swap_lock); 1857 spin_lock(&swap_lock);
1702 if (swap_flags & SWAP_FLAG_PREFER) 1858 if (swap_flags & SWAP_FLAG_PREFER)
@@ -1705,14 +1861,16 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1705 else 1861 else
1706 p->prio = --least_priority; 1862 p->prio = --least_priority;
1707 p->swap_map = swap_map; 1863 p->swap_map = swap_map;
1708 p->flags = SWP_ACTIVE; 1864 p->flags |= SWP_WRITEOK;
1709 nr_swap_pages += nr_good_pages; 1865 nr_swap_pages += nr_good_pages;
1710 total_swap_pages += nr_good_pages; 1866 total_swap_pages += nr_good_pages;
1711 1867
1712 printk(KERN_INFO "Adding %uk swap on %s. " 1868 printk(KERN_INFO "Adding %uk swap on %s. "
1713 "Priority:%d extents:%d across:%lluk\n", 1869 "Priority:%d extents:%d across:%lluk %s%s\n",
1714 nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, 1870 nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
1715 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10)); 1871 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
1872 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
1873 (p->flags & SWP_DISCARDABLE) ? "D" : "");
1716 1874
1717 /* insert swap space into swap_list: */ 1875 /* insert swap space into swap_list: */
1718 prev = -1; 1876 prev = -1;
@@ -1738,6 +1896,7 @@ bad_swap:
1738 bd_release(bdev); 1896 bd_release(bdev);
1739 } 1897 }
1740 destroy_swap_extents(p); 1898 destroy_swap_extents(p);
1899 swap_cgroup_swapoff(type);
1741bad_swap_2: 1900bad_swap_2:
1742 spin_lock(&swap_lock); 1901 spin_lock(&swap_lock);
1743 p->swap_file = NULL; 1902 p->swap_file = NULL;
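
The swapoff()/swapon() entry points are also converted from open-coded asmlinkage prototypes to the SYSCALL_DEFINEn() macros. Ignoring the argument-widening wrappers some architectures add, the macro boils down to generating the same prototype from a list of (type, name) pairs; a simplified sketch of the idea, not the kernel's actual definition:

/* simplified two-argument case; the real macro family lives in
 * <linux/syscalls.h> and may also emit per-architecture wrappers */
#define SYSCALL_DEFINE2(name, t1, a1, t2, a2) \
	asmlinkage long sys_##name(t1 a1, t2 a2)

SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
	/* body unchanged */
}
/* expands to:
 *   asmlinkage long sys_swapon(const char __user *specialfile, int swap_flags)
 */
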
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
deleted file mode 100644
index 3e67d575ee6e..000000000000
--- a/mm/tiny-shmem.c
+++ /dev/null
@@ -1,134 +0,0 @@
1/*
2 * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code
3 *
4 * Matt Mackall <mpm@selenic.com> January, 2004
5 * derived from mm/shmem.c and fs/ramfs/inode.c
6 *
7 * This is intended for small system where the benefits of the full
8 * shmem code (swap-backed and resource-limited) are outweighed by
9 * their complexity. On systems without swap this code should be
10 * effectively equivalent, but much lighter weight.
11 */
12
13#include <linux/fs.h>
14#include <linux/init.h>
15#include <linux/vfs.h>
16#include <linux/mount.h>
17#include <linux/file.h>
18#include <linux/mm.h>
19#include <linux/module.h>
20#include <linux/swap.h>
21#include <linux/ramfs.h>
22
23static struct file_system_type tmpfs_fs_type = {
24 .name = "tmpfs",
25 .get_sb = ramfs_get_sb,
26 .kill_sb = kill_litter_super,
27};
28
29static struct vfsmount *shm_mnt;
30
31static int __init init_tmpfs(void)
32{
33 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
34
35 shm_mnt = kern_mount(&tmpfs_fs_type);
36 BUG_ON(IS_ERR(shm_mnt));
37
38 return 0;
39}
40module_init(init_tmpfs)
41
42/**
43 * shmem_file_setup - get an unlinked file living in tmpfs
44 * @name: name for dentry (to be seen in /proc/<pid>/maps
45 * @size: size to be set for the file
46 * @flags: vm_flags
47 */
48struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
49{
50 int error;
51 struct file *file;
52 struct inode *inode;
53 struct dentry *dentry, *root;
54 struct qstr this;
55
56 if (IS_ERR(shm_mnt))
57 return (void *)shm_mnt;
58
59 error = -ENOMEM;
60 this.name = name;
61 this.len = strlen(name);
62 this.hash = 0; /* will go */
63 root = shm_mnt->mnt_root;
64 dentry = d_alloc(root, &this);
65 if (!dentry)
66 goto put_memory;
67
68 error = -ENFILE;
69 file = get_empty_filp();
70 if (!file)
71 goto put_dentry;
72
73 error = -ENOSPC;
74 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
75 if (!inode)
76 goto close_file;
77
78 d_instantiate(dentry, inode);
79 inode->i_size = size;
80 inode->i_nlink = 0; /* It is unlinked */
81 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
82 &ramfs_file_operations);
83
84#ifndef CONFIG_MMU
85 error = ramfs_nommu_expand_for_mapping(inode, size);
86 if (error)
87 goto close_file;
88#endif
89 return file;
90
91close_file:
92 put_filp(file);
93put_dentry:
94 dput(dentry);
95put_memory:
96 return ERR_PTR(error);
97}
98EXPORT_SYMBOL_GPL(shmem_file_setup);
99
100/**
101 * shmem_zero_setup - setup a shared anonymous mapping
102 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
103 */
104int shmem_zero_setup(struct vm_area_struct *vma)
105{
106 struct file *file;
107 loff_t size = vma->vm_end - vma->vm_start;
108
109 file = shmem_file_setup("dev/zero", size, vma->vm_flags);
110 if (IS_ERR(file))
111 return PTR_ERR(file);
112
113 if (vma->vm_file)
114 fput(vma->vm_file);
115 vma->vm_file = file;
116 vma->vm_ops = &generic_file_vm_ops;
117 return 0;
118}
119
120int shmem_unuse(swp_entry_t entry, struct page *page)
121{
122 return 0;
123}
124
125#ifndef CONFIG_MMU
126unsigned long shmem_get_unmapped_area(struct file *file,
127 unsigned long addr,
128 unsigned long len,
129 unsigned long pgoff,
130 unsigned long flags)
131{
132 return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
133}
134#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1ddb77ba3995..75f49d312e8c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -23,6 +23,7 @@
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/radix-tree.h> 24#include <linux/radix-tree.h>
25#include <linux/rcupdate.h> 25#include <linux/rcupdate.h>
26#include <linux/bootmem.h>
26 27
27#include <asm/atomic.h> 28#include <asm/atomic.h>
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
@@ -151,11 +152,12 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
151 * 152 *
152 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] 153 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
153 */ 154 */
154static int vmap_page_range(unsigned long addr, unsigned long end, 155static int vmap_page_range(unsigned long start, unsigned long end,
155 pgprot_t prot, struct page **pages) 156 pgprot_t prot, struct page **pages)
156{ 157{
157 pgd_t *pgd; 158 pgd_t *pgd;
158 unsigned long next; 159 unsigned long next;
160 unsigned long addr = start;
159 int err = 0; 161 int err = 0;
160 int nr = 0; 162 int nr = 0;
161 163
@@ -167,7 +169,7 @@ static int vmap_page_range(unsigned long addr, unsigned long end,
167 if (err) 169 if (err)
168 break; 170 break;
169 } while (pgd++, addr = next, addr != end); 171 } while (pgd++, addr = next, addr != end);
170 flush_cache_vmap(addr, end); 172 flush_cache_vmap(start, end);
171 173
172 if (unlikely(err)) 174 if (unlikely(err))
173 return err; 175 return err;
@@ -380,8 +382,9 @@ found:
380 goto retry; 382 goto retry;
381 } 383 }
382 if (printk_ratelimit()) 384 if (printk_ratelimit())
383 printk(KERN_WARNING "vmap allocation failed: " 385 printk(KERN_WARNING
384 "use vmalloc=<size> to increase size.\n"); 386 "vmap allocation for size %lu failed: "
387 "use vmalloc=<size> to increase size.\n", size);
385 return ERR_PTR(-EBUSY); 388 return ERR_PTR(-EBUSY);
386 } 389 }
387 390
@@ -431,6 +434,27 @@ static void unmap_vmap_area(struct vmap_area *va)
431 vunmap_page_range(va->va_start, va->va_end); 434 vunmap_page_range(va->va_start, va->va_end);
432} 435}
433 436
437static void vmap_debug_free_range(unsigned long start, unsigned long end)
438{
439 /*
440 * Unmap page tables and force a TLB flush immediately if
441 * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free
442 * bugs similarly to those in linear kernel virtual address
443 * space after a page has been freed.
444 *
445 * All the lazy freeing logic is still retained, in order to
446 * minimise intrusiveness of this debugging feature.
447 *
448 * This is going to be *slow* (linear kernel virtual address
449 * debugging doesn't do a broadcast TLB flush so it is a lot
450 * faster).
451 */
452#ifdef CONFIG_DEBUG_PAGEALLOC
453 vunmap_page_range(start, end);
454 flush_tlb_kernel_range(start, end);
455#endif
456}
457
434/* 458/*
435 * lazy_max_pages is the maximum amount of virtual address space we gather up 459 * lazy_max_pages is the maximum amount of virtual address space we gather up
436 * before attempting to purge with a TLB flush. 460 * before attempting to purge with a TLB flush.
@@ -911,6 +935,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
911 BUG_ON(addr & (PAGE_SIZE-1)); 935 BUG_ON(addr & (PAGE_SIZE-1));
912 936
913 debug_check_no_locks_freed(mem, size); 937 debug_check_no_locks_freed(mem, size);
938 vmap_debug_free_range(addr, addr+size);
914 939
915 if (likely(count <= VMAP_MAX_ALLOC)) 940 if (likely(count <= VMAP_MAX_ALLOC))
916 vb_free(mem, size); 941 vb_free(mem, size);
@@ -959,6 +984,8 @@ EXPORT_SYMBOL(vm_map_ram);
959 984
960void __init vmalloc_init(void) 985void __init vmalloc_init(void)
961{ 986{
987 struct vmap_area *va;
988 struct vm_struct *tmp;
962 int i; 989 int i;
963 990
964 for_each_possible_cpu(i) { 991 for_each_possible_cpu(i) {
@@ -971,6 +998,14 @@ void __init vmalloc_init(void)
971 vbq->nr_dirty = 0; 998 vbq->nr_dirty = 0;
972 } 999 }
973 1000
1001 /* Import existing vmlist entries. */
1002 for (tmp = vmlist; tmp; tmp = tmp->next) {
1003 va = alloc_bootmem(sizeof(struct vmap_area));
1004 va->flags = tmp->flags | VM_VM_AREA;
1005 va->va_start = (unsigned long)tmp->addr;
1006 va->va_end = va->va_start + tmp->size;
1007 __insert_vmap_area(va);
1008 }
974 vmap_initialized = true; 1009 vmap_initialized = true;
975} 1010}
976 1011
@@ -1127,6 +1162,8 @@ struct vm_struct *remove_vm_area(const void *addr)
1127 if (va && va->flags & VM_VM_AREA) { 1162 if (va && va->flags & VM_VM_AREA) {
1128 struct vm_struct *vm = va->private; 1163 struct vm_struct *vm = va->private;
1129 struct vm_struct *tmp, **p; 1164 struct vm_struct *tmp, **p;
1165
1166 vmap_debug_free_range(va->va_start, va->va_end);
1130 free_unmap_vmap_area(va); 1167 free_unmap_vmap_area(va);
1131 vm->size -= PAGE_SIZE; 1168 vm->size -= PAGE_SIZE;
1132 1169
@@ -1374,7 +1411,8 @@ void *vmalloc_user(unsigned long size)
1374 struct vm_struct *area; 1411 struct vm_struct *area;
1375 void *ret; 1412 void *ret;
1376 1413
1377 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); 1414 ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
1415 PAGE_KERNEL, -1, __builtin_return_address(0));
1378 if (ret) { 1416 if (ret) {
1379 area = find_vm_area(ret); 1417 area = find_vm_area(ret);
1380 area->flags |= VM_USERMAP; 1418 area->flags |= VM_USERMAP;
@@ -1419,7 +1457,8 @@ EXPORT_SYMBOL(vmalloc_node);
1419 1457
1420void *vmalloc_exec(unsigned long size) 1458void *vmalloc_exec(unsigned long size)
1421{ 1459{
1422 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); 1460 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
1461 -1, __builtin_return_address(0));
1423} 1462}
1424 1463
1425#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 1464#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
@@ -1439,7 +1478,8 @@ void *vmalloc_exec(unsigned long size)
1439 */ 1478 */
1440void *vmalloc_32(unsigned long size) 1479void *vmalloc_32(unsigned long size)
1441{ 1480{
1442 return __vmalloc(size, GFP_VMALLOC32, PAGE_KERNEL); 1481 return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL,
1482 -1, __builtin_return_address(0));
1443} 1483}
1444EXPORT_SYMBOL(vmalloc_32); 1484EXPORT_SYMBOL(vmalloc_32);
1445 1485
@@ -1455,7 +1495,8 @@ void *vmalloc_32_user(unsigned long size)
1455 struct vm_struct *area; 1495 struct vm_struct *area;
1456 void *ret; 1496 void *ret;
1457 1497
1458 ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); 1498 ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1499 -1, __builtin_return_address(0));
1459 if (ret) { 1500 if (ret) {
1460 area = find_vm_area(ret); 1501 area = find_vm_area(ret);
1461 area->flags |= VM_USERMAP; 1502 area->flags |= VM_USERMAP;
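
The vmalloc_user()/vmalloc_exec()/vmalloc_32()/vmalloc_32_user() wrappers above now go through __vmalloc_node() and pass __builtin_return_address(0), so the recorded caller is whoever invoked the wrapper rather than the wrapper itself. A minimal user-space illustration of that GCC builtin (names invented for the sketch):

#include <stdio.h>

static void record_alloc(unsigned long size, void *caller)
{
	printf("allocation of %lu bytes requested from %p\n", size, caller);
}

static void *my_vmalloc_wrapper(unsigned long size)
{
	/* attribute the allocation to the wrapper's caller, not the wrapper */
	record_alloc(size, __builtin_return_address(0));
	return 0;	/* the allocation itself is elided in this sketch */
}

int main(void)
{
	my_vmalloc_wrapper(128);
	return 0;
}
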
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 62e7f62fb559..9a27c44aa327 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,6 +52,9 @@ struct scan_control {
52 /* Incremented by the number of inactive pages that were scanned */ 52 /* Incremented by the number of inactive pages that were scanned */
53 unsigned long nr_scanned; 53 unsigned long nr_scanned;
54 54
55 /* Number of pages freed so far during a call to shrink_zones() */
56 unsigned long nr_reclaimed;
57
55 /* This context's GFP mask */ 58 /* This context's GFP mask */
56 gfp_t gfp_mask; 59 gfp_t gfp_mask;
57 60
@@ -122,11 +125,30 @@ static LIST_HEAD(shrinker_list);
122static DECLARE_RWSEM(shrinker_rwsem); 125static DECLARE_RWSEM(shrinker_rwsem);
123 126
124#ifdef CONFIG_CGROUP_MEM_RES_CTLR 127#ifdef CONFIG_CGROUP_MEM_RES_CTLR
125#define scan_global_lru(sc) (!(sc)->mem_cgroup) 128#define scanning_global_lru(sc) (!(sc)->mem_cgroup)
126#else 129#else
127#define scan_global_lru(sc) (1) 130#define scanning_global_lru(sc) (1)
128#endif 131#endif
129 132
133static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
134 struct scan_control *sc)
135{
136 if (!scanning_global_lru(sc))
137 return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);
138
139 return &zone->reclaim_stat;
140}
141
142static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc,
143 enum lru_list lru)
144{
145 if (!scanning_global_lru(sc))
146 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
147
148 return zone_page_state(zone, NR_LRU_BASE + lru);
149}
150
151
130/* 152/*
131 * Add a shrinker callback to be called from the vm 153 * Add a shrinker callback to be called from the vm
132 */ 154 */
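
get_reclaim_stat() and zone_nr_pages() above pick the statistics source once, based on scanning_global_lru(sc), so the rest of the reclaim path can update recent_scanned/recent_rotated without caring whether it is doing global or per-cgroup reclaim. The same dispatch shape, reduced to a self-contained sketch with invented types:

/* Invented stand-in types; this only models the "pick the stat source" step. */
struct reclaim_stat_model {
	unsigned long recent_scanned[2];
	unsigned long recent_rotated[2];
};

struct zone_model { struct reclaim_stat_model reclaim_stat; };
struct memcg_model { struct reclaim_stat_model reclaim_stat; };
struct scan_control_model { struct memcg_model *mem_cgroup; };

struct reclaim_stat_model *get_reclaim_stat_model(struct zone_model *zone,
						  struct scan_control_model *sc)
{
	if (sc->mem_cgroup)			/* per-cgroup reclaim */
		return &sc->mem_cgroup->reclaim_stat;
	return &zone->reclaim_stat;		/* global reclaim */
}
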
@@ -509,7 +531,6 @@ redo:
509 lru = LRU_UNEVICTABLE; 531 lru = LRU_UNEVICTABLE;
510 add_page_to_unevictable_list(page); 532 add_page_to_unevictable_list(page);
511 } 533 }
512 mem_cgroup_move_lists(page, lru);
513 534
514 /* 535 /*
515 * page's status can change while we move it among lru. If an evictable 536 * page's status can change while we move it among lru. If an evictable
@@ -544,7 +565,6 @@ void putback_lru_page(struct page *page)
544 565
545 lru = !!TestClearPageActive(page) + page_is_file_cache(page); 566 lru = !!TestClearPageActive(page) + page_is_file_cache(page);
546 lru_cache_add_lru(page, lru); 567 lru_cache_add_lru(page, lru);
547 mem_cgroup_move_lists(page, lru);
548 put_page(page); 568 put_page(page);
549} 569}
550#endif /* CONFIG_UNEVICTABLE_LRU */ 570#endif /* CONFIG_UNEVICTABLE_LRU */
@@ -617,7 +637,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
617 referenced && page_mapping_inuse(page)) 637 referenced && page_mapping_inuse(page))
618 goto activate_locked; 638 goto activate_locked;
619 639
620#ifdef CONFIG_SWAP
621 /* 640 /*
622 * Anonymous process memory has backing store? 641 * Anonymous process memory has backing store?
623 * Try to allocate it some swap space here. 642 * Try to allocate it some swap space here.
@@ -625,20 +644,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
625 if (PageAnon(page) && !PageSwapCache(page)) { 644 if (PageAnon(page) && !PageSwapCache(page)) {
626 if (!(sc->gfp_mask & __GFP_IO)) 645 if (!(sc->gfp_mask & __GFP_IO))
627 goto keep_locked; 646 goto keep_locked;
628 switch (try_to_munlock(page)) { 647 if (!add_to_swap(page))
629 case SWAP_FAIL: /* shouldn't happen */
630 case SWAP_AGAIN:
631 goto keep_locked;
632 case SWAP_MLOCK:
633 goto cull_mlocked;
634 case SWAP_SUCCESS:
635 ; /* fall thru'; add to swap cache */
636 }
637 if (!add_to_swap(page, GFP_ATOMIC))
638 goto activate_locked; 648 goto activate_locked;
639 may_enter_fs = 1; 649 may_enter_fs = 1;
640 } 650 }
641#endif /* CONFIG_SWAP */
642 651
643 mapping = page_mapping(page); 652 mapping = page_mapping(page);
644 653
@@ -752,6 +761,8 @@ free_it:
752 continue; 761 continue;
753 762
754cull_mlocked: 763cull_mlocked:
764 if (PageSwapCache(page))
765 try_to_free_swap(page);
755 unlock_page(page); 766 unlock_page(page);
756 putback_lru_page(page); 767 putback_lru_page(page);
757 continue; 768 continue;
@@ -759,7 +770,7 @@ cull_mlocked:
759activate_locked: 770activate_locked:
760 /* Not a candidate for swapping, so reclaim swap space. */ 771 /* Not a candidate for swapping, so reclaim swap space. */
761 if (PageSwapCache(page) && vm_swap_full()) 772 if (PageSwapCache(page) && vm_swap_full())
762 remove_exclusive_swap_page_ref(page); 773 try_to_free_swap(page);
763 VM_BUG_ON(PageActive(page)); 774 VM_BUG_ON(PageActive(page));
764 SetPageActive(page); 775 SetPageActive(page);
765 pgactivate++; 776 pgactivate++;
@@ -819,6 +830,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
819 return ret; 830 return ret;
820 831
821 ret = -EBUSY; 832 ret = -EBUSY;
833
822 if (likely(get_page_unless_zero(page))) { 834 if (likely(get_page_unless_zero(page))) {
823 /* 835 /*
824 * Be careful not to clear PageLRU until after we're 836 * Be careful not to clear PageLRU until after we're
@@ -827,6 +839,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
827 */ 839 */
828 ClearPageLRU(page); 840 ClearPageLRU(page);
829 ret = 0; 841 ret = 0;
842 mem_cgroup_del_lru(page);
830 } 843 }
831 844
832 return ret; 845 return ret;
@@ -1035,6 +1048,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1035 struct pagevec pvec; 1048 struct pagevec pvec;
1036 unsigned long nr_scanned = 0; 1049 unsigned long nr_scanned = 0;
1037 unsigned long nr_reclaimed = 0; 1050 unsigned long nr_reclaimed = 0;
1051 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1038 1052
1039 pagevec_init(&pvec, 1); 1053 pagevec_init(&pvec, 1);
1040 1054
@@ -1076,13 +1090,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1076 __mod_zone_page_state(zone, NR_INACTIVE_ANON, 1090 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1077 -count[LRU_INACTIVE_ANON]); 1091 -count[LRU_INACTIVE_ANON]);
1078 1092
1079 if (scan_global_lru(sc)) { 1093 if (scanning_global_lru(sc))
1080 zone->pages_scanned += nr_scan; 1094 zone->pages_scanned += nr_scan;
1081 zone->recent_scanned[0] += count[LRU_INACTIVE_ANON]; 1095
1082 zone->recent_scanned[0] += count[LRU_ACTIVE_ANON]; 1096 reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
1083 zone->recent_scanned[1] += count[LRU_INACTIVE_FILE]; 1097 reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
1084 zone->recent_scanned[1] += count[LRU_ACTIVE_FILE]; 1098 reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE];
1085 } 1099 reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE];
1100
1086 spin_unlock_irq(&zone->lru_lock); 1101 spin_unlock_irq(&zone->lru_lock);
1087 1102
1088 nr_scanned += nr_scan; 1103 nr_scanned += nr_scan;
@@ -1114,7 +1129,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1114 if (current_is_kswapd()) { 1129 if (current_is_kswapd()) {
1115 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); 1130 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
1116 __count_vm_events(KSWAPD_STEAL, nr_freed); 1131 __count_vm_events(KSWAPD_STEAL, nr_freed);
1117 } else if (scan_global_lru(sc)) 1132 } else if (scanning_global_lru(sc))
1118 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); 1133 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
1119 1134
1120 __count_zone_vm_events(PGSTEAL, zone, nr_freed); 1135 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
@@ -1140,10 +1155,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1140 SetPageLRU(page); 1155 SetPageLRU(page);
1141 lru = page_lru(page); 1156 lru = page_lru(page);
1142 add_page_to_lru_list(zone, page, lru); 1157 add_page_to_lru_list(zone, page, lru);
1143 mem_cgroup_move_lists(page, lru); 1158 if (PageActive(page)) {
1144 if (PageActive(page) && scan_global_lru(sc)) {
1145 int file = !!page_is_file_cache(page); 1159 int file = !!page_is_file_cache(page);
1146 zone->recent_rotated[file]++; 1160 reclaim_stat->recent_rotated[file]++;
1147 } 1161 }
1148 if (!pagevec_add(&pvec, page)) { 1162 if (!pagevec_add(&pvec, page)) {
1149 spin_unlock_irq(&zone->lru_lock); 1163 spin_unlock_irq(&zone->lru_lock);
@@ -1173,11 +1187,6 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1173 zone->prev_priority = priority; 1187 zone->prev_priority = priority;
1174} 1188}
1175 1189
1176static inline int zone_is_near_oom(struct zone *zone)
1177{
1178 return zone->pages_scanned >= (zone_lru_pages(zone) * 3);
1179}
1180
1181/* 1190/*
1182 * This moves pages from the active list to the inactive list. 1191 * This moves pages from the active list to the inactive list.
1183 * 1192 *
@@ -1208,6 +1217,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1208 struct page *page; 1217 struct page *page;
1209 struct pagevec pvec; 1218 struct pagevec pvec;
1210 enum lru_list lru; 1219 enum lru_list lru;
1220 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1211 1221
1212 lru_add_drain(); 1222 lru_add_drain();
1213 spin_lock_irq(&zone->lru_lock); 1223 spin_lock_irq(&zone->lru_lock);
@@ -1218,10 +1228,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1218 * zone->pages_scanned is used for detect zone's oom 1228 * zone->pages_scanned is used for detect zone's oom
1219 * mem_cgroup remembers nr_scan by itself. 1229 * mem_cgroup remembers nr_scan by itself.
1220 */ 1230 */
1221 if (scan_global_lru(sc)) { 1231 if (scanning_global_lru(sc)) {
1222 zone->pages_scanned += pgscanned; 1232 zone->pages_scanned += pgscanned;
1223 zone->recent_scanned[!!file] += pgmoved;
1224 } 1233 }
1234 reclaim_stat->recent_scanned[!!file] += pgmoved;
1225 1235
1226 if (file) 1236 if (file)
1227 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); 1237 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
@@ -1248,6 +1258,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1248 list_add(&page->lru, &l_inactive); 1258 list_add(&page->lru, &l_inactive);
1249 } 1259 }
1250 1260
1261 /*
1262 * Move the pages to the [file or anon] inactive list.
1263 */
1264 pagevec_init(&pvec, 1);
1265 pgmoved = 0;
1266 lru = LRU_BASE + file * LRU_FILE;
1267
1251 spin_lock_irq(&zone->lru_lock); 1268 spin_lock_irq(&zone->lru_lock);
1252 /* 1269 /*
1253 * Count referenced pages from currently used mappings as 1270 * Count referenced pages from currently used mappings as
@@ -1255,15 +1272,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1255 * This helps balance scan pressure between file and anonymous 1272 * This helps balance scan pressure between file and anonymous
1256 * pages in get_scan_ratio. 1273 * pages in get_scan_ratio.
1257 */ 1274 */
1258 zone->recent_rotated[!!file] += pgmoved; 1275 reclaim_stat->recent_rotated[!!file] += pgmoved;
1259 1276
1260 /*
1261 * Move the pages to the [file or anon] inactive list.
1262 */
1263 pagevec_init(&pvec, 1);
1264
1265 pgmoved = 0;
1266 lru = LRU_BASE + file * LRU_FILE;
1267 while (!list_empty(&l_inactive)) { 1277 while (!list_empty(&l_inactive)) {
1268 page = lru_to_page(&l_inactive); 1278 page = lru_to_page(&l_inactive);
1269 prefetchw_prev_lru_page(page, &l_inactive, flags); 1279 prefetchw_prev_lru_page(page, &l_inactive, flags);
@@ -1273,7 +1283,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1273 ClearPageActive(page); 1283 ClearPageActive(page);
1274 1284
1275 list_move(&page->lru, &zone->lru[lru].list); 1285 list_move(&page->lru, &zone->lru[lru].list);
1276 mem_cgroup_move_lists(page, lru); 1286 mem_cgroup_add_lru_list(page, lru);
1277 pgmoved++; 1287 pgmoved++;
1278 if (!pagevec_add(&pvec, page)) { 1288 if (!pagevec_add(&pvec, page)) {
1279 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); 1289 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
@@ -1302,6 +1312,38 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1302 pagevec_release(&pvec); 1312 pagevec_release(&pvec);
1303} 1313}
1304 1314
1315static int inactive_anon_is_low_global(struct zone *zone)
1316{
1317 unsigned long active, inactive;
1318
1319 active = zone_page_state(zone, NR_ACTIVE_ANON);
1320 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1321
1322 if (inactive * zone->inactive_ratio < active)
1323 return 1;
1324
1325 return 0;
1326}
1327
1328/**
1329 * inactive_anon_is_low - check if anonymous pages need to be deactivated
1330 * @zone: zone to check
1331 * @sc: scan control of this context
1332 *
1333 * Returns true if the zone does not have enough inactive anon pages,
1334 * meaning some active anon pages need to be deactivated.
1335 */
1336static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1337{
1338 int low;
1339
1340 if (scanning_global_lru(sc))
1341 low = inactive_anon_is_low_global(zone);
1342 else
1343 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1344 return low;
1345}
1346
1305static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1347static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1306 struct zone *zone, struct scan_control *sc, int priority) 1348 struct zone *zone, struct scan_control *sc, int priority)
1307{ 1349{
@@ -1312,8 +1354,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1312 return 0; 1354 return 0;
1313 } 1355 }
1314 1356
1315 if (lru == LRU_ACTIVE_ANON && 1357 if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) {
1316 (!scan_global_lru(sc) || inactive_anon_is_low(zone))) {
1317 shrink_active_list(nr_to_scan, zone, sc, priority, file); 1358 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1318 return 0; 1359 return 0;
1319 } 1360 }
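
inactive_anon_is_low_global(), added above, declares the zone short of inactive anonymous pages whenever inactive * zone->inactive_ratio < active, and the wrapper defers to the memory controller for cgroup reclaim. A small standalone check with the same arithmetic; the ratio value below is only an example, the kernel derives it from the zone size.

/* Standalone version of the "is the inactive anon list too small?" test. */
#include <stdio.h>

static int inactive_is_low(unsigned long active, unsigned long inactive,
			   unsigned int inactive_ratio)
{
	return inactive * inactive_ratio < active;
}

int main(void)
{
	/* 900 active vs. 200 inactive with ratio 3: 600 < 900, so "low" --
	 * some active anon pages should be deactivated. */
	printf("low=%d\n", inactive_is_low(900, 200, 3));
	/* 300 active vs. 200 inactive: 600 >= 300, so not low. */
	printf("low=%d\n", inactive_is_low(300, 200, 3));
	return 0;
}
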
@@ -1335,12 +1376,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1335 unsigned long anon, file, free; 1376 unsigned long anon, file, free;
1336 unsigned long anon_prio, file_prio; 1377 unsigned long anon_prio, file_prio;
1337 unsigned long ap, fp; 1378 unsigned long ap, fp;
1338 1379 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1339 anon = zone_page_state(zone, NR_ACTIVE_ANON) +
1340 zone_page_state(zone, NR_INACTIVE_ANON);
1341 file = zone_page_state(zone, NR_ACTIVE_FILE) +
1342 zone_page_state(zone, NR_INACTIVE_FILE);
1343 free = zone_page_state(zone, NR_FREE_PAGES);
1344 1380
1345 /* If we have no swap space, do not bother scanning anon pages. */ 1381 /* If we have no swap space, do not bother scanning anon pages. */
1346 if (nr_swap_pages <= 0) { 1382 if (nr_swap_pages <= 0) {
@@ -1349,11 +1385,20 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1349 return; 1385 return;
1350 } 1386 }
1351 1387
1352 /* If we have very few page cache pages, force-scan anon pages. */ 1388 anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
1353 if (unlikely(file + free <= zone->pages_high)) { 1389 zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
1354 percent[0] = 100; 1390 file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
1355 percent[1] = 0; 1391 zone_nr_pages(zone, sc, LRU_INACTIVE_FILE);
1356 return; 1392
1393 if (scanning_global_lru(sc)) {
1394 free = zone_page_state(zone, NR_FREE_PAGES);
1395 /* If we have very few page cache pages,
1396 force-scan anon pages. */
1397 if (unlikely(file + free <= zone->pages_high)) {
1398 percent[0] = 100;
1399 percent[1] = 0;
1400 return;
1401 }
1357 } 1402 }
1358 1403
1359 /* 1404 /*
@@ -1367,17 +1412,17 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1367 * 1412 *
1368 * anon in [0], file in [1] 1413 * anon in [0], file in [1]
1369 */ 1414 */
1370 if (unlikely(zone->recent_scanned[0] > anon / 4)) { 1415 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1371 spin_lock_irq(&zone->lru_lock); 1416 spin_lock_irq(&zone->lru_lock);
1372 zone->recent_scanned[0] /= 2; 1417 reclaim_stat->recent_scanned[0] /= 2;
1373 zone->recent_rotated[0] /= 2; 1418 reclaim_stat->recent_rotated[0] /= 2;
1374 spin_unlock_irq(&zone->lru_lock); 1419 spin_unlock_irq(&zone->lru_lock);
1375 } 1420 }
1376 1421
1377 if (unlikely(zone->recent_scanned[1] > file / 4)) { 1422 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1378 spin_lock_irq(&zone->lru_lock); 1423 spin_lock_irq(&zone->lru_lock);
1379 zone->recent_scanned[1] /= 2; 1424 reclaim_stat->recent_scanned[1] /= 2;
1380 zone->recent_rotated[1] /= 2; 1425 reclaim_stat->recent_rotated[1] /= 2;
1381 spin_unlock_irq(&zone->lru_lock); 1426 spin_unlock_irq(&zone->lru_lock);
1382 } 1427 }
1383 1428
@@ -1393,11 +1438,11 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1393 * proportional to the fraction of recently scanned pages on 1438 * proportional to the fraction of recently scanned pages on
1394 * each list that were recently referenced and in active use. 1439 * each list that were recently referenced and in active use.
1395 */ 1440 */
1396 ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1); 1441 ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
1397 ap /= zone->recent_rotated[0] + 1; 1442 ap /= reclaim_stat->recent_rotated[0] + 1;
1398 1443
1399 fp = (file_prio + 1) * (zone->recent_scanned[1] + 1); 1444 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1400 fp /= zone->recent_rotated[1] + 1; 1445 fp /= reclaim_stat->recent_rotated[1] + 1;
1401 1446
1402 /* Normalize to percentages */ 1447 /* Normalize to percentages */
1403 percent[0] = 100 * ap / (ap + fp + 1); 1448 percent[0] = 100 * ap / (ap + fp + 1);
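
The arithmetic above weighs anon against file pages: each side's pressure is (prio + 1) * (recent_scanned + 1) / (recent_rotated + 1), and the two pressures are then normalised to percentages; the decayed recent_* counters now come from the reclaim_stat picked earlier. A worked example with invented numbers, kept outside the kernel.

/* Worked example of the anon/file scan balancing in get_scan_ratio().
 * The priorities and recent_* counts below are made up for illustration. */
#include <stdio.h>

int main(void)
{
	unsigned long anon_prio = 60, file_prio = 140;       /* e.g. swappiness 60 */
	unsigned long recent_scanned[2] = { 4000, 12000 };   /* [0] anon, [1] file */
	unsigned long recent_rotated[2] = { 3000, 1000 };    /* pages found still active */

	unsigned long ap = (anon_prio + 1) * (recent_scanned[0] + 1);
	ap /= recent_rotated[0] + 1;
	unsigned long fp = (file_prio + 1) * (recent_scanned[1] + 1);
	fp /= recent_rotated[1] + 1;

	unsigned long percent[2];
	percent[0] = 100 * ap / (ap + fp + 1);   /* share of scanning given to anon */
	percent[1] = 100 - percent[0];           /* the remainder goes to file pages */

	printf("anon %lu%%, file %lu%%\n", percent[0], percent[1]);
	return 0;
}
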
@@ -1408,69 +1453,72 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1408/* 1453/*
1409 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1454 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1410 */ 1455 */
1411static unsigned long shrink_zone(int priority, struct zone *zone, 1456static void shrink_zone(int priority, struct zone *zone,
1412 struct scan_control *sc) 1457 struct scan_control *sc)
1413{ 1458{
1414 unsigned long nr[NR_LRU_LISTS]; 1459 unsigned long nr[NR_LRU_LISTS];
1415 unsigned long nr_to_scan; 1460 unsigned long nr_to_scan;
1416 unsigned long nr_reclaimed = 0;
1417 unsigned long percent[2]; /* anon @ 0; file @ 1 */ 1461 unsigned long percent[2]; /* anon @ 0; file @ 1 */
1418 enum lru_list l; 1462 enum lru_list l;
1463 unsigned long nr_reclaimed = sc->nr_reclaimed;
1464 unsigned long swap_cluster_max = sc->swap_cluster_max;
1419 1465
1420 get_scan_ratio(zone, sc, percent); 1466 get_scan_ratio(zone, sc, percent);
1421 1467
1422 for_each_evictable_lru(l) { 1468 for_each_evictable_lru(l) {
1423 if (scan_global_lru(sc)) { 1469 int file = is_file_lru(l);
1424 int file = is_file_lru(l); 1470 int scan;
1425 int scan; 1471
1426 1472 scan = zone_page_state(zone, NR_LRU_BASE + l);
1427 scan = zone_page_state(zone, NR_LRU_BASE + l); 1473 if (priority) {
1428 if (priority) { 1474 scan >>= priority;
1429 scan >>= priority; 1475 scan = (scan * percent[file]) / 100;
1430 scan = (scan * percent[file]) / 100; 1476 }
1431 } 1477 if (scanning_global_lru(sc)) {
1432 zone->lru[l].nr_scan += scan; 1478 zone->lru[l].nr_scan += scan;
1433 nr[l] = zone->lru[l].nr_scan; 1479 nr[l] = zone->lru[l].nr_scan;
1434 if (nr[l] >= sc->swap_cluster_max) 1480 if (nr[l] >= swap_cluster_max)
1435 zone->lru[l].nr_scan = 0; 1481 zone->lru[l].nr_scan = 0;
1436 else 1482 else
1437 nr[l] = 0; 1483 nr[l] = 0;
1438 } else { 1484 } else
1439 /* 1485 nr[l] = scan;
1440 * This reclaim occurs not because zone memory shortage
1441 * but because memory controller hits its limit.
1442 * Don't modify zone reclaim related data.
1443 */
1444 nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
1445 priority, l);
1446 }
1447 } 1486 }
1448 1487
1449 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1488 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1450 nr[LRU_INACTIVE_FILE]) { 1489 nr[LRU_INACTIVE_FILE]) {
1451 for_each_evictable_lru(l) { 1490 for_each_evictable_lru(l) {
1452 if (nr[l]) { 1491 if (nr[l]) {
1453 nr_to_scan = min(nr[l], 1492 nr_to_scan = min(nr[l], swap_cluster_max);
1454 (unsigned long)sc->swap_cluster_max);
1455 nr[l] -= nr_to_scan; 1493 nr[l] -= nr_to_scan;
1456 1494
1457 nr_reclaimed += shrink_list(l, nr_to_scan, 1495 nr_reclaimed += shrink_list(l, nr_to_scan,
1458 zone, sc, priority); 1496 zone, sc, priority);
1459 } 1497 }
1460 } 1498 }
1499 /*
1500 * On large memory systems, scan >> priority can become
1501 * really large. This is fine for the starting priority;
1502 * we want to put equal scanning pressure on each zone.
1503 * However, if the VM has a harder time of freeing pages,
1504 * with multiple processes reclaiming pages, the total
1505 * freeing target can get unreasonably large.
1506 */
1507 if (nr_reclaimed > swap_cluster_max &&
1508 priority < DEF_PRIORITY && !current_is_kswapd())
1509 break;
1461 } 1510 }
1462 1511
1512 sc->nr_reclaimed = nr_reclaimed;
1513
1463 /* 1514 /*
1464 * Even if we did not try to evict anon pages at all, we want to 1515 * Even if we did not try to evict anon pages at all, we want to
1465 * rebalance the anon lru active/inactive ratio. 1516 * rebalance the anon lru active/inactive ratio.
1466 */ 1517 */
1467 if (!scan_global_lru(sc) || inactive_anon_is_low(zone)) 1518 if (inactive_anon_is_low(zone, sc))
1468 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1469 else if (!scan_global_lru(sc))
1470 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1519 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1471 1520
1472 throttle_vm_writeout(sc->gfp_mask); 1521 throttle_vm_writeout(sc->gfp_mask);
1473 return nr_reclaimed;
1474} 1522}
1475 1523
1476/* 1524/*
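
shrink_zone() now derives every list's scan target from the list size shifted right by the scan priority and scaled by those percentages, accumulates progress in sc->nr_reclaimed instead of a return value, and the new early exit stops a direct reclaimer once it has freed more than swap_cluster_max pages at elevated priority. A compact sketch of that loop shape; all constants and names are stand-ins.

/* Sketch of the shrink_zone() scan-budget loop. Not kernel code. */
#include <stdio.h>

#define NR_LISTS          4
#define SWAP_CLUSTER_MAX  32UL
#define DEF_PRIORITY      12

int main(void)
{
	unsigned long list_size[NR_LISTS] = { 40000, 20000, 800000, 100000 };
	unsigned long percent[2] = { 10, 90 };    /* anon, file (from the ratio code) */
	int priority = 10;                        /* < DEF_PRIORITY: not the first pass */
	unsigned long nr[NR_LISTS], nr_reclaimed = 0;

	for (int l = 0; l < NR_LISTS; l++) {
		int file = l >= 2;                /* pretend lists 2 and 3 are file LRUs */
		unsigned long scan = list_size[l] >> priority;
		nr[l] = scan * percent[file] / 100;
	}

	while (nr[0] || nr[1] || nr[2] || nr[3]) {
		for (int l = 0; l < NR_LISTS; l++) {
			unsigned long chunk = nr[l] < SWAP_CLUSTER_MAX ? nr[l] : SWAP_CLUSTER_MAX;
			nr[l] -= chunk;
			nr_reclaimed += chunk / 2;   /* pretend half the scanned pages are freed */
		}
		/* The early exit added here: don't keep piling up work for one
		 * direct reclaimer once it has already made enough progress. */
		if (nr_reclaimed > SWAP_CLUSTER_MAX && priority < DEF_PRIORITY)
			break;
	}
	printf("reclaimed (modelled): %lu\n", nr_reclaimed);
	return 0;
}
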
@@ -1484,16 +1532,13 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1484 * b) The zones may be over pages_high but they must go *over* pages_high to 1532 * b) The zones may be over pages_high but they must go *over* pages_high to
1485 * satisfy the `incremental min' zone defense algorithm. 1533 * satisfy the `incremental min' zone defense algorithm.
1486 * 1534 *
1487 * Returns the number of reclaimed pages.
1488 *
1489 * If a zone is deemed to be full of pinned pages then just give it a light 1535 * If a zone is deemed to be full of pinned pages then just give it a light
1490 * scan then give up on it. 1536 * scan then give up on it.
1491 */ 1537 */
1492static unsigned long shrink_zones(int priority, struct zonelist *zonelist, 1538static void shrink_zones(int priority, struct zonelist *zonelist,
1493 struct scan_control *sc) 1539 struct scan_control *sc)
1494{ 1540{
1495 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1541 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1496 unsigned long nr_reclaimed = 0;
1497 struct zoneref *z; 1542 struct zoneref *z;
1498 struct zone *zone; 1543 struct zone *zone;
1499 1544
@@ -1505,7 +1550,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1505 * Take care memory controller reclaiming has small influence 1550 * Take care memory controller reclaiming has small influence
1506 * to global LRU. 1551 * to global LRU.
1507 */ 1552 */
1508 if (scan_global_lru(sc)) { 1553 if (scanning_global_lru(sc)) {
1509 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1554 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1510 continue; 1555 continue;
1511 note_zone_scanning_priority(zone, priority); 1556 note_zone_scanning_priority(zone, priority);
@@ -1524,10 +1569,8 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1524 priority); 1569 priority);
1525 } 1570 }
1526 1571
1527 nr_reclaimed += shrink_zone(priority, zone, sc); 1572 shrink_zone(priority, zone, sc);
1528 } 1573 }
1529
1530 return nr_reclaimed;
1531} 1574}
1532 1575
1533/* 1576/*
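
With nr_reclaimed folded into scan_control, shrink_zone() and shrink_zones() become void: progress accumulates in the control structure that already travels down the call chain, and do_try_to_free_pages() reads the total from there. A toy illustration of that accumulator-in-context pattern; names are invented.

/* Toy version of accumulating progress in a shared control structure
 * instead of summing return values. Illustrative only. */
#include <stdio.h>

struct ctl {
	unsigned long nr_scanned;
	unsigned long nr_reclaimed;
};

static void shrink_one_zone(struct ctl *c, unsigned long zone_pages)
{
	unsigned long scanned = zone_pages / 8;
	c->nr_scanned += scanned;
	c->nr_reclaimed += scanned / 4;   /* pretend a quarter was freed */
}

static void shrink_all_zones(struct ctl *c)
{
	unsigned long zones[] = { 4096, 16384, 65536 };
	for (unsigned int i = 0; i < sizeof(zones) / sizeof(zones[0]); i++)
		shrink_one_zone(c, zones[i]);   /* no return value to collect */
}

int main(void)
{
	struct ctl c = {0};
	shrink_all_zones(&c);
	printf("scanned %lu, reclaimed %lu\n", c.nr_scanned, c.nr_reclaimed);
	return 0;
}
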
@@ -1552,7 +1595,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1552 int priority; 1595 int priority;
1553 unsigned long ret = 0; 1596 unsigned long ret = 0;
1554 unsigned long total_scanned = 0; 1597 unsigned long total_scanned = 0;
1555 unsigned long nr_reclaimed = 0;
1556 struct reclaim_state *reclaim_state = current->reclaim_state; 1598 struct reclaim_state *reclaim_state = current->reclaim_state;
1557 unsigned long lru_pages = 0; 1599 unsigned long lru_pages = 0;
1558 struct zoneref *z; 1600 struct zoneref *z;
@@ -1561,12 +1603,12 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1561 1603
1562 delayacct_freepages_start(); 1604 delayacct_freepages_start();
1563 1605
1564 if (scan_global_lru(sc)) 1606 if (scanning_global_lru(sc))
1565 count_vm_event(ALLOCSTALL); 1607 count_vm_event(ALLOCSTALL);
1566 /* 1608 /*
1567 * mem_cgroup will not do shrink_slab. 1609 * mem_cgroup will not do shrink_slab.
1568 */ 1610 */
1569 if (scan_global_lru(sc)) { 1611 if (scanning_global_lru(sc)) {
1570 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1612 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1571 1613
1572 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1614 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
@@ -1580,21 +1622,21 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1580 sc->nr_scanned = 0; 1622 sc->nr_scanned = 0;
1581 if (!priority) 1623 if (!priority)
1582 disable_swap_token(); 1624 disable_swap_token();
1583 nr_reclaimed += shrink_zones(priority, zonelist, sc); 1625 shrink_zones(priority, zonelist, sc);
1584 /* 1626 /*
1585 * Don't shrink slabs when reclaiming memory from 1627 * Don't shrink slabs when reclaiming memory from
1586 * over limit cgroups 1628 * over limit cgroups
1587 */ 1629 */
1588 if (scan_global_lru(sc)) { 1630 if (scanning_global_lru(sc)) {
1589 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); 1631 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
1590 if (reclaim_state) { 1632 if (reclaim_state) {
1591 nr_reclaimed += reclaim_state->reclaimed_slab; 1633 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
1592 reclaim_state->reclaimed_slab = 0; 1634 reclaim_state->reclaimed_slab = 0;
1593 } 1635 }
1594 } 1636 }
1595 total_scanned += sc->nr_scanned; 1637 total_scanned += sc->nr_scanned;
1596 if (nr_reclaimed >= sc->swap_cluster_max) { 1638 if (sc->nr_reclaimed >= sc->swap_cluster_max) {
1597 ret = nr_reclaimed; 1639 ret = sc->nr_reclaimed;
1598 goto out; 1640 goto out;
1599 } 1641 }
1600 1642
@@ -1616,8 +1658,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1616 congestion_wait(WRITE, HZ/10); 1658 congestion_wait(WRITE, HZ/10);
1617 } 1659 }
1618 /* top priority shrink_zones still had more to do? don't OOM, then */ 1660 /* top priority shrink_zones still had more to do? don't OOM, then */
1619 if (!sc->all_unreclaimable && scan_global_lru(sc)) 1661 if (!sc->all_unreclaimable && scanning_global_lru(sc))
1620 ret = nr_reclaimed; 1662 ret = sc->nr_reclaimed;
1621out: 1663out:
1622 /* 1664 /*
1623 * Now that we've scanned all the zones at this priority level, note 1665 * Now that we've scanned all the zones at this priority level, note
@@ -1629,7 +1671,7 @@ out:
1629 if (priority < 0) 1671 if (priority < 0)
1630 priority = 0; 1672 priority = 0;
1631 1673
1632 if (scan_global_lru(sc)) { 1674 if (scanning_global_lru(sc)) {
1633 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1675 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1634 1676
1635 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1677 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
@@ -1665,19 +1707,24 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1665#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1707#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1666 1708
1667unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 1709unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1668 gfp_t gfp_mask) 1710 gfp_t gfp_mask,
1711 bool noswap,
1712 unsigned int swappiness)
1669{ 1713{
1670 struct scan_control sc = { 1714 struct scan_control sc = {
1671 .may_writepage = !laptop_mode, 1715 .may_writepage = !laptop_mode,
1672 .may_swap = 1, 1716 .may_swap = 1,
1673 .swap_cluster_max = SWAP_CLUSTER_MAX, 1717 .swap_cluster_max = SWAP_CLUSTER_MAX,
1674 .swappiness = vm_swappiness, 1718 .swappiness = swappiness,
1675 .order = 0, 1719 .order = 0,
1676 .mem_cgroup = mem_cont, 1720 .mem_cgroup = mem_cont,
1677 .isolate_pages = mem_cgroup_isolate_pages, 1721 .isolate_pages = mem_cgroup_isolate_pages,
1678 }; 1722 };
1679 struct zonelist *zonelist; 1723 struct zonelist *zonelist;
1680 1724
1725 if (noswap)
1726 sc.may_swap = 0;
1727
1681 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 1728 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1682 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 1729 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1683 zonelist = NODE_DATA(numa_node_id())->node_zonelists; 1730 zonelist = NODE_DATA(numa_node_id())->node_zonelists;
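
try_to_free_mem_cgroup_pages() now lets the memory controller pass its own swappiness plus a noswap flag, which simply clears sc.may_swap before reclaim starts. A small sketch of turning those arguments into a reclaim request; the struct below is an invented stand-in for scan_control, not the kernel type.

/* Sketch of building a reclaim request from (noswap, swappiness). */
#include <stdbool.h>
#include <stdio.h>

struct reclaim_request {
	bool may_writepage;
	bool may_swap;
	unsigned long swap_cluster_max;
	unsigned int swappiness;
	int order;
};

static struct reclaim_request make_request(bool noswap, unsigned int swappiness)
{
	struct reclaim_request rq = {
		.may_writepage = true,
		.may_swap = true,             /* default: swapping allowed */
		.swap_cluster_max = 32,
		.swappiness = swappiness,     /* caller-chosen, no longer vm_swappiness */
		.order = 0,
	};
	if (noswap)
		rq.may_swap = false;          /* mirrors "if (noswap) sc.may_swap = 0" */
	return rq;
}

int main(void)
{
	struct reclaim_request rq = make_request(true, 20);
	printf("may_swap=%d swappiness=%u\n", rq.may_swap, rq.swappiness);
	return 0;
}
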
@@ -1712,7 +1759,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1712 int priority; 1759 int priority;
1713 int i; 1760 int i;
1714 unsigned long total_scanned; 1761 unsigned long total_scanned;
1715 unsigned long nr_reclaimed;
1716 struct reclaim_state *reclaim_state = current->reclaim_state; 1762 struct reclaim_state *reclaim_state = current->reclaim_state;
1717 struct scan_control sc = { 1763 struct scan_control sc = {
1718 .gfp_mask = GFP_KERNEL, 1764 .gfp_mask = GFP_KERNEL,
@@ -1731,7 +1777,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1731 1777
1732loop_again: 1778loop_again:
1733 total_scanned = 0; 1779 total_scanned = 0;
1734 nr_reclaimed = 0; 1780 sc.nr_reclaimed = 0;
1735 sc.may_writepage = !laptop_mode; 1781 sc.may_writepage = !laptop_mode;
1736 count_vm_event(PAGEOUTRUN); 1782 count_vm_event(PAGEOUTRUN);
1737 1783
@@ -1766,7 +1812,7 @@ loop_again:
1766 * Do some background aging of the anon list, to give 1812 * Do some background aging of the anon list, to give
1767 * pages a chance to be referenced before reclaiming. 1813 * pages a chance to be referenced before reclaiming.
1768 */ 1814 */
1769 if (inactive_anon_is_low(zone)) 1815 if (inactive_anon_is_low(zone, &sc))
1770 shrink_active_list(SWAP_CLUSTER_MAX, zone, 1816 shrink_active_list(SWAP_CLUSTER_MAX, zone,
1771 &sc, priority, 0); 1817 &sc, priority, 0);
1772 1818
@@ -1817,11 +1863,11 @@ loop_again:
1817 */ 1863 */
1818 if (!zone_watermark_ok(zone, order, 8*zone->pages_high, 1864 if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
1819 end_zone, 0)) 1865 end_zone, 0))
1820 nr_reclaimed += shrink_zone(priority, zone, &sc); 1866 shrink_zone(priority, zone, &sc);
1821 reclaim_state->reclaimed_slab = 0; 1867 reclaim_state->reclaimed_slab = 0;
1822 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1868 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1823 lru_pages); 1869 lru_pages);
1824 nr_reclaimed += reclaim_state->reclaimed_slab; 1870 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
1825 total_scanned += sc.nr_scanned; 1871 total_scanned += sc.nr_scanned;
1826 if (zone_is_all_unreclaimable(zone)) 1872 if (zone_is_all_unreclaimable(zone))
1827 continue; 1873 continue;
@@ -1835,7 +1881,7 @@ loop_again:
1835 * even in laptop mode 1881 * even in laptop mode
1836 */ 1882 */
1837 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 1883 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
1838 total_scanned > nr_reclaimed + nr_reclaimed / 2) 1884 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
1839 sc.may_writepage = 1; 1885 sc.may_writepage = 1;
1840 } 1886 }
1841 if (all_zones_ok) 1887 if (all_zones_ok)
@@ -1853,7 +1899,7 @@ loop_again:
1853 * matches the direct reclaim path behaviour in terms of impact 1899 * matches the direct reclaim path behaviour in terms of impact
1854 * on zone->*_priority. 1900 * on zone->*_priority.
1855 */ 1901 */
1856 if (nr_reclaimed >= SWAP_CLUSTER_MAX) 1902 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
1857 break; 1903 break;
1858 } 1904 }
1859out: 1905out:
@@ -1872,10 +1918,27 @@ out:
1872 1918
1873 try_to_freeze(); 1919 try_to_freeze();
1874 1920
1921 /*
1922 * Fragmentation may mean that the system cannot be
1923 * rebalanced for high-order allocations in all zones.
1924 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
1925 * it means the zones have been fully scanned and are still
1926 * not balanced. For high-order allocations, there is
1927 * little point trying all over again as kswapd may
1928 * infinite loop.
1929 *
1930 * Instead, recheck all watermarks at order-0 as they
1931 * are the most important. If watermarks are ok, kswapd will go
1932 * back to sleep. High-order users can still perform direct
1933 * reclaim if they wish.
1934 */
1935 if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
1936 order = sc.order = 0;
1937
1875 goto loop_again; 1938 goto loop_again;
1876 } 1939 }
1877 1940
1878 return nr_reclaimed; 1941 return sc.nr_reclaimed;
1879} 1942}
1880 1943
1881/* 1944/*
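
The comment block added at the end of balance_pgdat() explains the new fallback: if a full scan for a high-order request freed fewer than SWAP_CLUSTER_MAX pages, kswapd gives up on the high order and rechecks only the order-0 watermarks instead of looping forever. A trimmed-down model of that decision; the reclaim numbers and names are invented.

/* Minimal model of kswapd's "stop chasing high-order pages" fallback. */
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

/* Pretend reclaim pass: a fragmented machine frees very little when
 * asked for high-order (contiguous) pages. Numbers are invented. */
static unsigned long balance_once(int order)
{
	return order > 0 ? 5 : 64;
}

int main(void)
{
	int order = 3;
	unsigned long reclaimed;

	for (;;) {
		reclaimed = balance_once(order);
		if (reclaimed >= SWAP_CLUSTER_MAX)
			break;                 /* watermarks look fine, go back to sleep */
		if (order > 0) {
			/* Fragmentation: give up on the high order and recheck
			 * order-0 only; high-order callers can still fall back
			 * to direct reclaim. */
			order = 0;
			continue;
		}
		break;                         /* order 0 and still short: stop here */
	}
	printf("settled at order %d, last pass reclaimed %lu\n", order, reclaimed);
	return 0;
}
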
@@ -1902,7 +1965,7 @@ static int kswapd(void *p)
1902 }; 1965 };
1903 node_to_cpumask_ptr(cpumask, pgdat->node_id); 1966 node_to_cpumask_ptr(cpumask, pgdat->node_id);
1904 1967
1905 if (!cpus_empty(*cpumask)) 1968 if (!cpumask_empty(cpumask))
1906 set_cpus_allowed_ptr(tsk, cpumask); 1969 set_cpus_allowed_ptr(tsk, cpumask);
1907 current->reclaim_state = &reclaim_state; 1970 current->reclaim_state = &reclaim_state;
1908 1971
@@ -2141,7 +2204,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
2141 pg_data_t *pgdat = NODE_DATA(nid); 2204 pg_data_t *pgdat = NODE_DATA(nid);
2142 node_to_cpumask_ptr(mask, pgdat->node_id); 2205 node_to_cpumask_ptr(mask, pgdat->node_id);
2143 2206
2144 if (any_online_cpu(*mask) < nr_cpu_ids) 2207 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
2145 /* One of our CPUs online: restore mask */ 2208 /* One of our CPUs online: restore mask */
2146 set_cpus_allowed_ptr(pgdat->kswapd, mask); 2209 set_cpus_allowed_ptr(pgdat->kswapd, mask);
2147 } 2210 }
@@ -2227,7 +2290,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2227 struct task_struct *p = current; 2290 struct task_struct *p = current;
2228 struct reclaim_state reclaim_state; 2291 struct reclaim_state reclaim_state;
2229 int priority; 2292 int priority;
2230 unsigned long nr_reclaimed = 0;
2231 struct scan_control sc = { 2293 struct scan_control sc = {
2232 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 2294 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
2233 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), 2295 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -2260,9 +2322,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2260 priority = ZONE_RECLAIM_PRIORITY; 2322 priority = ZONE_RECLAIM_PRIORITY;
2261 do { 2323 do {
2262 note_zone_scanning_priority(zone, priority); 2324 note_zone_scanning_priority(zone, priority);
2263 nr_reclaimed += shrink_zone(priority, zone, &sc); 2325 shrink_zone(priority, zone, &sc);
2264 priority--; 2326 priority--;
2265 } while (priority >= 0 && nr_reclaimed < nr_pages); 2327 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
2266 } 2328 }
2267 2329
2268 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2330 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -2286,13 +2348,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2286 * Update nr_reclaimed by the number of slab pages we 2348 * Update nr_reclaimed by the number of slab pages we
2287 * reclaimed from this zone. 2349 * reclaimed from this zone.
2288 */ 2350 */
2289 nr_reclaimed += slab_reclaimable - 2351 sc.nr_reclaimed += slab_reclaimable -
2290 zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2352 zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2291 } 2353 }
2292 2354
2293 p->reclaim_state = NULL; 2355 p->reclaim_state = NULL;
2294 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 2356 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
2295 return nr_reclaimed >= nr_pages; 2357 return sc.nr_reclaimed >= nr_pages;
2296} 2358}
2297 2359
2298int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 2360int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
@@ -2393,6 +2455,7 @@ retry:
2393 2455
2394 __dec_zone_state(zone, NR_UNEVICTABLE); 2456 __dec_zone_state(zone, NR_UNEVICTABLE);
2395 list_move(&page->lru, &zone->lru[l].list); 2457 list_move(&page->lru, &zone->lru[l].list);
2458 mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
2396 __inc_zone_state(zone, NR_INACTIVE_ANON + l); 2459 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
2397 __count_vm_event(UNEVICTABLE_PGRESCUED); 2460 __count_vm_event(UNEVICTABLE_PGRESCUED);
2398 } else { 2461 } else {
@@ -2401,6 +2464,7 @@ retry:
2401 */ 2464 */
2402 SetPageUnevictable(page); 2465 SetPageUnevictable(page);
2403 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); 2466 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
2467 mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
2404 if (page_evictable(page, NULL)) 2468 if (page_evictable(page, NULL))
2405 goto retry; 2469 goto retry;
2406 } 2470 }
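
These hunks keep the per-cgroup lists in step while pages are rescued from, or re-culled onto, the unevictable list, and the retry handles a page that becomes evictable again between the check and the move. A single-threaded sketch of that rescue-or-cull shape; every name is invented and the race recheck cannot actually fire in this model.

/* Sketch of the rescue-or-cull logic for formerly unevictable pages. */
#include <stdbool.h>
#include <stdio.h>

enum lru { LRU_INACTIVE_ANON, LRU_UNEVICTABLE };

struct page_model {
	bool mlocked;          /* stands in for whatever made it unevictable */
	enum lru list;
};

static bool page_evictable(const struct page_model *p)
{
	return !p->mlocked;
}

static void check_move(struct page_model *p)
{
retry:
	if (page_evictable(p)) {
		p->list = LRU_INACTIVE_ANON;   /* rescued; group list updated alongside */
	} else {
		p->list = LRU_UNEVICTABLE;     /* culled again */
		if (page_evictable(p))         /* raced with an munlock? then redo */
			goto retry;            /* (cannot happen in this single-threaded model) */
	}
}

int main(void)
{
	struct page_model p = { .mlocked = false, .list = LRU_UNEVICTABLE };
	check_move(&p);
	printf("rescued=%d\n", p.list == LRU_INACTIVE_ANON);
	return 0;
}
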
@@ -2472,7 +2536,7 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
2472 * back onto @zone's unevictable list. 2536 * back onto @zone's unevictable list.
2473 */ 2537 */
2474#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ 2538#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
2475void scan_zone_unevictable_pages(struct zone *zone) 2539static void scan_zone_unevictable_pages(struct zone *zone)
2476{ 2540{
2477 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; 2541 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
2478 unsigned long scan; 2542 unsigned long scan;
@@ -2514,7 +2578,7 @@ void scan_zone_unevictable_pages(struct zone *zone)
2514 * that has possibly/probably made some previously unevictable pages 2578 * that has possibly/probably made some previously unevictable pages
2515 * evictable. 2579 * evictable.
2516 */ 2580 */
2517void scan_all_zones_unevictable_pages(void) 2581static void scan_all_zones_unevictable_pages(void)
2518{ 2582{
2519 struct zone *zone; 2583 struct zone *zone;
2520 2584
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c3ccfda23adc..91149746bb8d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -20,7 +20,7 @@
20DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 20DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
21EXPORT_PER_CPU_SYMBOL(vm_event_states); 21EXPORT_PER_CPU_SYMBOL(vm_event_states);
22 22
23static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) 23static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
24{ 24{
25 int cpu; 25 int cpu;
26 int i; 26 int i;
@@ -43,7 +43,7 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
43void all_vm_events(unsigned long *ret) 43void all_vm_events(unsigned long *ret)
44{ 44{
45 get_online_cpus(); 45 get_online_cpus();
46 sum_vm_events(ret, &cpu_online_map); 46 sum_vm_events(ret, cpu_online_mask);
47 put_online_cpus(); 47 put_online_cpus();
48} 48}
49EXPORT_SYMBOL_GPL(all_vm_events); 49EXPORT_SYMBOL_GPL(all_vm_events);
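
The vmstat.c hunks switch sum_vm_events() to the const struct cpumask * convention and pass cpu_online_mask directly; the underlying operation is just summing per-CPU event counters over whichever mask is supplied. A user-space analogue, with a plain bitmask standing in for struct cpumask and invented counter values.

/* User-space analogue of sum_vm_events(): add up per-CPU counters for
 * every CPU set in a mask. Illustrative only. */
#include <stdio.h>

#define NR_CPUS    8
#define NR_EVENTS  3

static unsigned long per_cpu_events[NR_CPUS][NR_EVENTS] = {
	{ 10, 2, 0 }, { 7, 1, 3 }, { 0, 0, 0 }, { 5, 5, 5 },
};

static void sum_events(unsigned long *ret, unsigned long online_mask)
{
	for (int i = 0; i < NR_EVENTS; i++)
		ret[i] = 0;
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!(online_mask & (1UL << cpu)))
			continue;               /* skip CPUs not in the mask */
		for (int i = 0; i < NR_EVENTS; i++)
			ret[i] += per_cpu_events[cpu][i];
	}
}

int main(void)
{
	unsigned long totals[NR_EVENTS];
	sum_events(totals, 0x0bUL);             /* CPUs 0, 1 and 3 "online" */
	printf("%lu %lu %lu\n", totals[0], totals[1], totals[2]);
	return 0;
}
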