path: root/mm
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |    6
-rw-r--r--  mm/Makefile           |    4
-rw-r--r--  mm/backing-dev.c      |    6
-rw-r--r--  mm/bootmem.c          |    8
-rw-r--r--  mm/filemap.c          |   30
-rw-r--r--  mm/filemap_xip.c      |    2
-rw-r--r--  mm/fremap.c           |    2
-rw-r--r--  mm/hugetlb.c          |   46
-rw-r--r--  mm/internal.h         |    2
-rw-r--r--  mm/memcontrol.c       |    3
-rw-r--r--  mm/memory.c           |  176
-rw-r--r--  mm/memory_hotplug.c   |   20
-rw-r--r--  mm/migrate.c          |   89
-rw-r--r--  mm/mlock.c            |    9
-rw-r--r--  mm/mmap.c             |   22
-rw-r--r--  mm/mprotect.c         |    6
-rw-r--r--  mm/oom_kill.c         |  109
-rw-r--r--  mm/page-writeback.c   |  245
-rw-r--r--  mm/page_alloc.c       |  135
-rw-r--r--  mm/page_cgroup.c      |    2
-rw-r--r--  mm/page_io.c          |    6
-rw-r--r--  mm/rmap.c             |   60
-rw-r--r--  mm/shmem.c            |   82
-rw-r--r--  mm/swap.c             |   44
-rw-r--r--  mm/swap_state.c       |   31
-rw-r--r--  mm/swapfile.c         |  576
-rw-r--r--  mm/tiny-shmem.c       |  134
-rw-r--r--  mm/vmalloc.c          |   50
-rw-r--r--  mm/vmscan.c           |  143
29 files changed, 1183 insertions, 865 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 5b5790f8a816..a5b77811fdf2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -181,12 +181,6 @@ config MIGRATION
 	  example on NUMA systems to put pages nearer to the processors accessing
 	  the page.
 
-config RESOURCES_64BIT
-	bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
-	default 64BIT
-	help
-	  This option allows memory and IO resources to be 64 bit.
-
 config PHYS_ADDR_T_64BIT
 	def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
 
diff --git a/mm/Makefile b/mm/Makefile
index 51c27709cc7c..72255be57f89 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o pdflush.o \
-			   readahead.o swap.o truncate.o vmscan.o \
+			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 			   page_isolation.o mm_init.o $(mmu-y)
 
@@ -21,9 +21,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
-obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
-obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_SLAB) += slab.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index a7c6c5613ec9..8e8587444132 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -24,9 +24,9 @@ static void bdi_debug_init(void)
 static int bdi_debug_stats_show(struct seq_file *m, void *v)
 {
 	struct backing_dev_info *bdi = m->private;
-	long background_thresh;
-	long dirty_thresh;
-	long bdi_thresh;
+	unsigned long background_thresh;
+	unsigned long dirty_thresh;
+	unsigned long bdi_thresh;
 
 	get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
 
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ac5a891f142a..51a0ccf61e0e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -435,6 +435,10 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 	unsigned long fallback = 0;
 	unsigned long min, max, start, sidx, midx, step;
 
+	bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
+		bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
+		align, goal, limit);
+
 	BUG_ON(!size);
 	BUG_ON(align & (align - 1));
 	BUG_ON(limit && goal + size > limit);
@@ -442,10 +446,6 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 	if (!bdata->node_bootmem_map)
 		return NULL;
 
-	bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
-		bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
-		align, goal, limit);
-
 	min = bdata->node_min_pfn;
 	max = bdata->node_low_pfn;
 
diff --git a/mm/filemap.c b/mm/filemap.c
index f5769b4dc075..2f55a1e2baf7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -210,7 +210,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 	int ret;
 	struct writeback_control wbc = {
 		.sync_mode = sync_mode,
-		.nr_to_write = mapping->nrpages * 2,
+		.nr_to_write = LONG_MAX,
 		.range_start = start,
 		.range_end = end,
 	};
@@ -741,7 +741,14 @@ repeat:
 		page = __page_cache_alloc(gfp_mask);
 		if (!page)
 			return NULL;
-		err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
+		/*
+		 * We want a regular kernel memory (not highmem or DMA etc)
+		 * allocation for the radix tree nodes, but we need to honour
+		 * the context-specific requirements the caller has asked for.
+		 * GFP_RECLAIM_MASK collects those requirements.
+		 */
+		err = add_to_page_cache_lru(page, mapping, index,
+						(gfp_mask & GFP_RECLAIM_MASK));
 		if (unlikely(err)) {
 			page_cache_release(page);
 			page = NULL;
@@ -950,7 +957,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
 		return NULL;
 	}
 	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
-	if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
+	if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
 		page_cache_release(page);
 		page = NULL;
 	}
@@ -1317,7 +1324,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		goto out; /* skip atime */
 	size = i_size_read(inode);
 	if (pos < size) {
-		retval = filemap_write_and_wait(mapping);
+		retval = filemap_write_and_wait_range(mapping, pos,
+					pos + iov_length(iov, nr_segs) - 1);
 		if (!retval) {
 			retval = mapping->a_ops->direct_IO(READ, iocb,
 						iov, pos, nr_segs);
@@ -1530,7 +1538,6 @@ retry_find:
 	/*
 	 * Found the page and have a reference on it.
 	 */
-	mark_page_accessed(page);
 	ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
 	vmf->page = page;
 	return ret | VM_FAULT_LOCKED;
@@ -2060,18 +2067,10 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	if (count != ocount)
 		*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
 
-	/*
-	 * Unmap all mmappings of the file up-front.
-	 *
-	 * This will cause any pte dirty bits to be propagated into the
-	 * pageframes for the subsequent filemap_write_and_wait().
-	 */
 	write_len = iov_length(iov, *nr_segs);
 	end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
-	if (mapping_mapped(mapping))
-		unmap_mapping_range(mapping, pos, write_len, 0);
 
-	written = filemap_write_and_wait(mapping);
+	written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
 	if (written)
 		goto out;
 
@@ -2291,7 +2290,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	 * the file data here, to try to honour O_DIRECT expectations.
 	 */
 	if (unlikely(file->f_flags & O_DIRECT) && written)
-		status = filemap_write_and_wait(mapping);
+		status = filemap_write_and_wait_range(mapping,
+					pos, pos + written - 1);
 
 	return written ? written : status;
 }
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b5167dfb2f2d..0c04615651b7 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -193,7 +193,7 @@ retry:
 		/* Nuke the page table entry. */
 		flush_cache_page(vma, address, pte_pfn(*pte));
 		pteval = ptep_clear_flush_notify(vma, address, pte);
-		page_remove_rmap(page, vma);
+		page_remove_rmap(page);
 		dec_mm_counter(mm, file_rss);
 		BUG_ON(pte_dirty(pteval));
 		pte_unmap_unlock(pte, ptl);
diff --git a/mm/fremap.c b/mm/fremap.c
index 7d12ca70ef7b..62d5bbda921a 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (page) {
 		if (pte_dirty(pte))
 			set_page_dirty(page);
-		page_remove_rmap(page, vma);
+		page_remove_rmap(page);
 		page_cache_release(page);
 		update_hiwater_rss(mm);
 		dec_mm_counter(mm, file_rss);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6058b53dcb89..618e98304080 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -220,6 +220,35 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
 }
 
 /*
+ * Return the size of the pages allocated when backing a VMA. In the majority
+ * cases this will be same size as used by the page table entries.
+ */
+unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
+{
+	struct hstate *hstate;
+
+	if (!is_vm_hugetlb_page(vma))
+		return PAGE_SIZE;
+
+	hstate = hstate_vma(vma);
+
+	return 1UL << (hstate->order + PAGE_SHIFT);
+}
+
+/*
+ * Return the page size being used by the MMU to back a VMA. In the majority
+ * of cases, the page size used by the kernel matches the MMU size. On
+ * architectures where it differs, an architecture-specific version of this
+ * function is required.
+ */
+#ifndef vma_mmu_pagesize
+unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
+{
+	return vma_kernel_pagesize(vma);
+}
+#endif
+
+/*
  * Flags for MAP_PRIVATE reservations. These are stored in the bottom
  * bits of the reservation map pointer, which are always clear due to
  * alignment.
@@ -371,8 +400,10 @@ static void clear_huge_page(struct page *page,
 {
 	int i;
 
-	if (unlikely(sz > MAX_ORDER_NR_PAGES))
-		return clear_gigantic_page(page, addr, sz);
+	if (unlikely(sz > MAX_ORDER_NR_PAGES)) {
+		clear_gigantic_page(page, addr, sz);
+		return;
+	}
 
 	might_sleep();
 	for (i = 0; i < sz/PAGE_SIZE; i++) {
@@ -404,8 +435,10 @@ static void copy_huge_page(struct page *dst, struct page *src,
 	int i;
 	struct hstate *h = hstate_vma(vma);
 
-	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES))
-		return copy_gigantic_page(dst, src, addr, vma);
+	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+		copy_gigantic_page(dst, src, addr, vma);
+		return;
+	}
 
 	might_sleep();
 	for (i = 0; i < pages_per_huge_page(h); i++) {
@@ -972,7 +1005,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	return page;
 }
 
-__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
+int __weak alloc_bootmem_huge_page(struct hstate *h)
 {
 	struct huge_bootmem_page *m;
 	int nr_nodes = nodes_weight(node_online_map);
@@ -991,8 +1024,7 @@ __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
 		 * puts them into the mem_map).
 		 */
 		m = addr;
-		if (m)
-			goto found;
+		goto found;
 	}
 	hstate_next_node(h);
 	nr_nodes--;
diff --git a/mm/internal.h b/mm/internal.h
index 13333bc2eb68..478223b73a2a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -49,6 +49,7 @@ extern void putback_lru_page(struct page *page);
 /*
  * in mm/page_alloc.c
  */
+extern unsigned long highest_memmap_pfn;
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
 
 /*
@@ -275,6 +276,7 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 #define GUP_FLAGS_WRITE                  0x1
 #define GUP_FLAGS_FORCE                  0x2
 #define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
+#define GUP_FLAGS_IGNORE_SIGKILL         0x8
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		     unsigned long start, int len, int flags,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 866dcc7eeb0c..51ee96545579 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -779,7 +779,8 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 	return 0;
 }
 
-int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
+static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
+				unsigned long long val)
 {
 
 	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
diff --git a/mm/memory.c b/mm/memory.c
index 7b9db658aca2..3f8fa06b963b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -52,6 +52,9 @@
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/swapops.h>
+#include <linux/elf.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -59,9 +62,6 @@
 #include <asm/tlbflush.h>
 #include <asm/pgtable.h>
 
-#include <linux/swapops.h>
-#include <linux/elf.h>
-
 #include "internal.h"
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -375,15 +375,65 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
  *
  * The calling function must still handle the error.
  */
-static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
-			  unsigned long vaddr)
+static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
+			  pte_t pte, struct page *page)
 {
-	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
-			"vm_flags = %lx, vaddr = %lx\n",
-		(long long)pte_val(pte),
-		(vma->vm_mm == current->mm ? current->comm : "???"),
-		vma->vm_flags, vaddr);
+	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
+	pud_t *pud = pud_offset(pgd, addr);
+	pmd_t *pmd = pmd_offset(pud, addr);
+	struct address_space *mapping;
+	pgoff_t index;
+	static unsigned long resume;
+	static unsigned long nr_shown;
+	static unsigned long nr_unshown;
+
+	/*
+	 * Allow a burst of 60 reports, then keep quiet for that minute;
+	 * or allow a steady drip of one report per second.
+	 */
+	if (nr_shown == 60) {
+		if (time_before(jiffies, resume)) {
+			nr_unshown++;
+			return;
+		}
+		if (nr_unshown) {
+			printk(KERN_ALERT
+				"BUG: Bad page map: %lu messages suppressed\n",
+				nr_unshown);
+			nr_unshown = 0;
+		}
+		nr_shown = 0;
+	}
+	if (nr_shown++ == 0)
+		resume = jiffies + 60 * HZ;
+
+	mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
+	index = linear_page_index(vma, addr);
+
+	printk(KERN_ALERT
+		"BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
+		current->comm,
+		(long long)pte_val(pte), (long long)pmd_val(*pmd));
+	if (page) {
+		printk(KERN_ALERT
+		"page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
+		page, (void *)page->flags, page_count(page),
+		page_mapcount(page), page->mapping, page->index);
+	}
+	printk(KERN_ALERT
+		"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
+		(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
+	/*
+	 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
+	 */
+	if (vma->vm_ops)
+		print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
+				(unsigned long)vma->vm_ops->fault);
+	if (vma->vm_file && vma->vm_file->f_op)
+		print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
+				(unsigned long)vma->vm_file->f_op->mmap);
 	dump_stack();
+	add_taint(TAINT_BAD_PAGE);
 }
 
 static inline int is_cow_mapping(unsigned int flags)
@@ -441,21 +491,18 @@ static inline int is_cow_mapping(unsigned int flags)
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 				pte_t pte)
 {
-	unsigned long pfn;
+	unsigned long pfn = pte_pfn(pte);
 
 	if (HAVE_PTE_SPECIAL) {
-		if (likely(!pte_special(pte))) {
-			VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-			return pte_page(pte);
-		}
-		VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
+		if (likely(!pte_special(pte)))
+			goto check_pfn;
+		if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
+			print_bad_pte(vma, addr, pte, NULL);
 		return NULL;
 	}
 
 	/* !HAVE_PTE_SPECIAL case follows: */
 
-	pfn = pte_pfn(pte);
-
 	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
 		if (vma->vm_flags & VM_MIXEDMAP) {
 			if (!pfn_valid(pfn))
@@ -471,11 +518,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		}
 	}
 
-	VM_BUG_ON(!pfn_valid(pfn));
+check_pfn:
+	if (unlikely(pfn > highest_memmap_pfn)) {
+		print_bad_pte(vma, addr, pte, NULL);
+		return NULL;
+	}
 
 	/*
 	 * NOTE! We still have PageReserved() pages in the page tables.
-	 *
 	 * eg. VDSO mappings can cause them to exist.
 	 */
 out:
@@ -767,11 +817,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			else {
 				if (pte_dirty(ptent))
 					set_page_dirty(page);
-				if (pte_young(ptent))
-					SetPageReferenced(page);
+				if (pte_young(ptent) &&
+				    likely(!VM_SequentialReadHint(vma)))
+					mark_page_accessed(page);
 				file_rss--;
 			}
-			page_remove_rmap(page, vma);
+			page_remove_rmap(page);
+			if (unlikely(page_mapcount(page) < 0))
+				print_bad_pte(vma, addr, ptent, page);
 			tlb_remove_page(tlb, page);
 			continue;
 		}
@@ -781,8 +834,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		 */
 		if (unlikely(details))
 			continue;
-		if (!pte_file(ptent))
-			free_swap_and_cache(pte_to_swp_entry(ptent));
+		if (pte_file(ptent)) {
+			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
+				print_bad_pte(vma, addr, ptent, NULL);
+		} else if
+		  (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
+			print_bad_pte(vma, addr, ptent, NULL);
 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
 
@@ -1153,6 +1210,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	int write = !!(flags & GUP_FLAGS_WRITE);
 	int force = !!(flags & GUP_FLAGS_FORCE);
 	int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
+	int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
 
 	if (len <= 0)
 		return 0;
@@ -1231,12 +1289,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		struct page *page;
 
 		/*
-		 * If tsk is ooming, cut off its access to large memory
-		 * allocations. It has a pending SIGKILL, but it can't
-		 * be processed until returning to user space.
+		 * If we have a pending SIGKILL, don't keep faulting
+		 * pages and potentially allocating memory, unless
+		 * current is handling munlock--e.g., on exit. In
+		 * that case, we are not allocating memory. Rather,
+		 * we're only unlocking already resident/mapped pages.
 		 */
-		if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
-			return i ? i : -ENOMEM;
+		if (unlikely(!ignore_sigkill &&
+				fatal_signal_pending(current)))
+			return i ? i : -ERESTARTSYS;
 
 		if (write)
 			foll_flags |= FOLL_WRITE;
@@ -1263,9 +1324,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			 * do_wp_page has broken COW when necessary,
 			 * even if maybe_mkwrite decided not to set
 			 * pte_write. We can thus safely do subsequent
-			 * page lookups as if they were reads.
+			 * page lookups as if they were reads. But only
+			 * do so when looping for pte_write is futile:
+			 * in some cases userspace may also be wanting
+			 * to write to the gotten user page, which a
+			 * read fault here might prevent (a readonly
+			 * page might get reCOWed by userspace write).
 			 */
-			if (ret & VM_FAULT_WRITE)
+			if ((ret & VM_FAULT_WRITE) &&
+			    !(vma->vm_flags & VM_WRITE))
 				foll_flags &= ~FOLL_WRITE;
 
 			cond_resched();
@@ -1644,6 +1711,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 
 	BUG_ON(pmd_huge(*pmd));
 
+	arch_enter_lazy_mmu_mode();
+
 	token = pmd_pgtable(*pmd);
 
 	do {
@@ -1652,6 +1721,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
+	arch_leave_lazy_mmu_mode();
+
 	if (mm != &init_mm)
 		pte_unmap_unlock(pte-1, ptl);
 	return err;
@@ -1837,10 +1908,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * not dirty accountable.
 	 */
 	if (PageAnon(old_page)) {
-		if (trylock_page(old_page)) {
-			reuse = can_share_swap_page(old_page);
-			unlock_page(old_page);
+		if (!trylock_page(old_page)) {
+			page_cache_get(old_page);
+			pte_unmap_unlock(page_table, ptl);
+			lock_page(old_page);
+			page_table = pte_offset_map_lock(mm, pmd, address,
+							 &ptl);
+			if (!pte_same(*page_table, orig_pte)) {
+				unlock_page(old_page);
+				page_cache_release(old_page);
+				goto unlock;
+			}
+			page_cache_release(old_page);
 		}
+		reuse = reuse_swap_page(old_page);
+		unlock_page(old_page);
 	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
 					(VM_WRITE|VM_SHARED))) {
 		/*
@@ -1943,11 +2025,7 @@ gotten:
 	 * thread doing COW.
 	 */
 	ptep_clear_flush_notify(vma, address, page_table);
-	SetPageSwapBacked(new_page);
-	lru_cache_add_active_or_unevictable(new_page, vma);
 	page_add_new_anon_rmap(new_page, vma, address);
-
-//TODO: is this safe? do_anonymous_page() does it this way.
 	set_pte_at(mm, address, page_table, entry);
 	update_mmu_cache(vma, address, entry);
 	if (old_page) {
@@ -1973,7 +2051,7 @@ gotten:
 	 * mapcount is visible. So transitively, TLBs to
 	 * old page will be flushed before it can be reused.
 	 */
-	page_remove_rmap(old_page, vma);
+	page_remove_rmap(old_page);
 	}
 
 	/* Free the old page.. */
@@ -2374,7 +2452,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	inc_mm_counter(mm, anon_rss);
 	pte = mk_pte(page, vma->vm_page_prot);
-	if (write_access && can_share_swap_page(page)) {
+	if (write_access && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
 		write_access = 0;
 	}
@@ -2385,7 +2463,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	swap_free(entry);
 	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
-		remove_exclusive_swap_page(page);
+		try_to_free_swap(page);
 	unlock_page(page);
 
 	if (write_access) {
@@ -2442,8 +2520,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte_none(*page_table))
 		goto release;
 	inc_mm_counter(mm, anon_rss);
-	SetPageSwapBacked(page);
-	lru_cache_add_active_or_unevictable(page, vma);
 	page_add_new_anon_rmap(page, vma, address);
 	set_pte_at(mm, address, page_table, entry);
 
@@ -2591,8 +2667,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		if (anon) {
 			inc_mm_counter(mm, anon_rss);
-			SetPageSwapBacked(page);
-			lru_cache_add_active_or_unevictable(page, vma);
 			page_add_new_anon_rmap(page, vma, address);
 		} else {
 			inc_mm_counter(mm, file_rss);
@@ -2602,7 +2676,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 				get_page(dirty_page);
 			}
 		}
-//TODO: is this safe? do_anonymous_page() does it this way.
 		set_pte_at(mm, address, page_table, entry);
 
 		/* no need to invalidate: a not-present page won't be cached */
@@ -2666,12 +2739,11 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		return 0;
 
-	if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
-			!(vma->vm_flags & VM_CAN_NONLINEAR))) {
+	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
 		/*
 		 * Page table corrupted: show pte and kill process.
 		 */
-		print_bad_pte(vma, orig_pte, address);
+		print_bad_pte(vma, address, orig_pte, NULL);
 		return VM_FAULT_OOM;
 	}
 
@@ -2953,7 +3025,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 {
 	resource_size_t phys_addr;
 	unsigned long prot = 0;
-	void *maddr;
+	void __iomem *maddr;
 	int offset = addr & (PAGE_SIZE-1);
 
 	if (follow_phys(vma, addr, write, &prot, &phys_addr))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b17371185468..c083cf5fd6df 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -216,7 +216,8 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 	return 0;
 }
 
-static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn)
+static int __meminit __add_section(int nid, struct zone *zone,
+					unsigned long phys_start_pfn)
 {
 	int nr_pages = PAGES_PER_SECTION;
 	int ret;
@@ -234,7 +235,7 @@ static int __meminit __add_section(struct zone *zone, unsigned long phys_start_p
 	if (ret < 0)
 		return ret;
 
-	return register_new_memory(__pfn_to_section(phys_start_pfn));
+	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
 }
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
@@ -273,8 +274,8 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
  * call this function after deciding the zone to which to
  * add the new pages.
  */
-int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn,
+int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
 			unsigned long nr_pages)
 {
 	unsigned long i;
 	int err = 0;
@@ -284,7 +285,7 @@ int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn,
 	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
 
 	for (i = start_sec; i <= end_sec; i++) {
-		err = __add_section(zone, i << PFN_SECTION_SHIFT);
+		err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
 
 		/*
 		 * EEXIST is finally dealt with by ioresource collision
@@ -626,15 +627,12 @@ int scan_lru_pages(unsigned long start, unsigned long end)
 }
 
 static struct page *
-hotremove_migrate_alloc(struct page *page,
-			unsigned long private,
-			int **x)
+hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
 {
-	/* This should be improoooooved!! */
-	return alloc_page(GFP_HIGHUSER_PAGECACHE);
+	/* This should be improooooved!! */
+	return alloc_page(GFP_HIGHUSER_MOVABLE);
 }
 
-
 #define NR_OFFLINE_AT_ONCE_PAGES	(256)
 static int
 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
diff --git a/mm/migrate.c b/mm/migrate.c
index 21631ab8c08b..55373983c9c6 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -300,12 +300,10 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	 * Now we know that no one else is looking at the page.
 	 */
 	get_page(newpage);	/* add cache reference */
-#ifdef CONFIG_SWAP
 	if (PageSwapCache(page)) {
 		SetPageSwapCache(newpage);
 		set_page_private(newpage, page_private(page));
 	}
-#endif
 
 	radix_tree_replace_slot(pslot, newpage);
 
@@ -373,9 +371,7 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
 
 	mlock_migrate_page(newpage, page);
 
-#ifdef CONFIG_SWAP
 	ClearPageSwapCache(page);
-#endif
 	ClearPagePrivate(page);
 	set_page_private(page, 0);
 	/* page->mapping contains a flag for PageAnon() */
@@ -848,12 +844,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
 		struct vm_area_struct *vma;
 		struct page *page;
 
-		/*
-		 * A valid page pointer that will not match any of the
-		 * pages that will be moved.
-		 */
-		pp->page = ZERO_PAGE(0);
-
 		err = -EFAULT;
 		vma = find_vma(mm, pp->addr);
 		if (!vma || !vma_migratable(vma))
@@ -919,41 +909,43 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
 				const int __user *nodes,
 				int __user *status, int flags)
 {
-	struct page_to_node *pm = NULL;
+	struct page_to_node *pm;
 	nodemask_t task_nodes;
-	int err = 0;
-	int i;
+	unsigned long chunk_nr_pages;
+	unsigned long chunk_start;
+	int err;
 
 	task_nodes = cpuset_mems_allowed(task);
 
-	/* Limit nr_pages so that the multiplication may not overflow */
-	if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
-		err = -E2BIG;
-		goto out;
-	}
-
-	pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
-	if (!pm) {
-		err = -ENOMEM;
+	err = -ENOMEM;
+	pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
+	if (!pm)
 		goto out;
-	}
-
 	/*
-	 * Get parameters from user space and initialize the pm
-	 * array. Return various errors if the user did something wrong.
+	 * Store a chunk of page_to_node array in a page,
+	 * but keep the last one as a marker
 	 */
-	for (i = 0; i < nr_pages; i++) {
-		const void __user *p;
+	chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
 
-		err = -EFAULT;
-		if (get_user(p, pages + i))
-			goto out_pm;
+	for (chunk_start = 0;
+	     chunk_start < nr_pages;
+	     chunk_start += chunk_nr_pages) {
+		int j;
 
-		pm[i].addr = (unsigned long)p;
-		if (nodes) {
+		if (chunk_start + chunk_nr_pages > nr_pages)
+			chunk_nr_pages = nr_pages - chunk_start;
+
+		/* fill the chunk pm with addrs and nodes from user-space */
+		for (j = 0; j < chunk_nr_pages; j++) {
+			const void __user *p;
 			int node;
 
-			if (get_user(node, nodes + i))
+			err = -EFAULT;
+			if (get_user(p, pages + j + chunk_start))
+				goto out_pm;
+			pm[j].addr = (unsigned long) p;
+
+			if (get_user(node, nodes + j + chunk_start))
 				goto out_pm;
 
 			err = -ENODEV;
@@ -964,22 +956,29 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
 			if (!node_isset(node, task_nodes))
 				goto out_pm;
 
-			pm[i].node = node;
-		} else
-			pm[i].node = 0; /* anything to not match MAX_NUMNODES */
-	}
-	/* End marker */
-	pm[nr_pages].node = MAX_NUMNODES;
+			pm[j].node = node;
+		}
+
+		/* End marker for this chunk */
+		pm[chunk_nr_pages].node = MAX_NUMNODES;
+
+		/* Migrate this chunk */
+		err = do_move_page_to_node_array(mm, pm,
+						 flags & MPOL_MF_MOVE_ALL);
+		if (err < 0)
+			goto out_pm;
 
-	err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL);
-	if (err >= 0)
 		/* Return status information */
-		for (i = 0; i < nr_pages; i++)
-			if (put_user(pm[i].status, status + i))
+		for (j = 0; j < chunk_nr_pages; j++)
+			if (put_user(pm[j].status, status + j + chunk_start)) {
 				err = -EFAULT;
+				goto out_pm;
+			}
+	}
+	err = 0;
 
 out_pm:
-	vfree(pm);
+	free_page((unsigned long)pm);
 out:
 	return err;
 }
diff --git a/mm/mlock.c b/mm/mlock.c
index 3035a56e7616..e125156c664e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -173,12 +173,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
 			(atomic_read(&mm->mm_users) != 0));
 
 	/*
-	 * mlock: don't page populate if page has PROT_NONE permission.
-	 * munlock: the pages always do munlock althrough
-	 *          its has PROT_NONE permission.
+	 * mlock: don't page populate if vma has PROT_NONE permission.
+	 * munlock: always do munlock although the vma has PROT_NONE
+	 *          permission, or SIGKILL is pending.
 	 */
 	if (!mlock)
-		gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS;
+		gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS |
+			     GUP_FLAGS_IGNORE_SIGKILL;
 
 	if (vma->vm_flags & VM_WRITE)
 		gup_flags |= GUP_FLAGS_WRITE;
diff --git a/mm/mmap.c b/mm/mmap.c
index 2c778fcfd9bd..a910c045cfd4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -413,7 +413,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
 
 static void __vma_link_file(struct vm_area_struct *vma)
 {
-	struct file * file;
+	struct file *file;
 
 	file = vma->vm_file;
 	if (file) {
@@ -474,11 +474,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
  * insert vm structure into list and rbtree and anon_vma,
  * but it has already been inserted into prio_tree earlier.
  */
-static void
-__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
+static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 {
-	struct vm_area_struct * __vma, * prev;
-	struct rb_node ** rb_link, * rb_parent;
+	struct vm_area_struct *__vma, *prev;
+	struct rb_node **rb_link, *rb_parent;
 
 	__vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
 	BUG_ON(__vma && __vma->vm_start < vma->vm_end);
@@ -908,7 +907,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
  * The caller must hold down_write(current->mm->mmap_sem).
  */
 
-unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
+unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 			unsigned long len, unsigned long prot,
 			unsigned long flags, unsigned long pgoff)
 {
@@ -1464,7 +1463,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 EXPORT_SYMBOL(get_unmapped_area);
 
 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
-struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 {
 	struct vm_area_struct *vma = NULL;
 
@@ -1507,7 +1506,7 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
 			struct vm_area_struct **pprev)
 {
 	struct vm_area_struct *vma = NULL, *prev = NULL;
-	struct rb_node * rb_node;
+	struct rb_node *rb_node;
 	if (!mm)
 		goto out;
 
@@ -1541,7 +1540,7 @@ out:
  * update accounting. This is shared with both the
  * grow-up and grow-down cases.
  */
-static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow)
+static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct rlimit *rlim = current->signal->rlim;
@@ -2091,6 +2090,9 @@ void exit_mmap(struct mm_struct *mm)
 	arch_exit_mmap(mm);
 	mmu_notifier_release(mm);
 
+	if (!mm->mmap)	/* Can happen if dup_mmap() received an OOM */
+		return;
+
 	if (mm->locked_vm) {
 		vma = mm->mmap;
 		while (vma) {
@@ -2103,7 +2105,7 @@ void exit_mmap(struct mm_struct *mm)
 	lru_add_drain();
 	flush_cache_mm(mm);
 	tlb = tlb_gather_mmu(mm, 1);
-	/* Don't update_hiwater_rss(mm) here, do_exit already did */
+	/* update_hiwater_rss(mm) here? but nobody should be looking */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index cfb4c4852062..d0f6e7ce09f1 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -22,6 +22,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/mmu_notifier.h>
+#include <linux/migrate.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -59,8 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			ptent = pte_mkwrite(ptent);
 
 		ptep_modify_prot_commit(mm, addr, pte, ptent);
-#ifdef CONFIG_MIGRATION
-		} else if (!pte_file(oldpte)) {
+		} else if (PAGE_MIGRATION && !pte_file(oldpte)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
 
 			if (is_write_migration_entry(entry)) {
@@ -72,9 +72,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 				set_pte_at(mm, addr, pte,
 					swp_entry_to_pte(entry));
 			}
-#endif
 		}
-
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 558f9afe6e4e..6b9e758c98a5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -31,7 +31,7 @@
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks;
-static DEFINE_SPINLOCK(zone_scan_mutex);
+static DEFINE_SPINLOCK(zone_scan_lock);
 /* #define DEBUG */
 
 /**
@@ -392,6 +392,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 		printk(KERN_WARNING "%s invoked oom-killer: "
 			"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
 			current->comm, gfp_mask, order, current->oomkilladj);
+		task_lock(current);
+		cpuset_print_task_mems_allowed(current);
+		task_unlock(current);
 		dump_stack();
 		show_mem();
 		if (sysctl_oom_dump_tasks)
@@ -470,7 +473,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
 	struct zone *zone;
 	int ret = 1;
 
-	spin_lock(&zone_scan_mutex);
+	spin_lock(&zone_scan_lock);
 	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
 		if (zone_is_oom_locked(zone)) {
 			ret = 0;
@@ -480,7 +483,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
 
 	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
 		/*
-		 * Lock each zone in the zonelist under zone_scan_mutex so a
+		 * Lock each zone in the zonelist under zone_scan_lock so a
 		 * parallel invocation of try_set_zone_oom() doesn't succeed
 		 * when it shouldn't.
 		 */
@@ -488,7 +491,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
 	}
 
 out:
-	spin_unlock(&zone_scan_mutex);
+	spin_unlock(&zone_scan_lock);
 	return ret;
 }
 
@@ -502,11 +505,74 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
 	struct zoneref *z;
 	struct zone *zone;
 
-	spin_lock(&zone_scan_mutex);
+	spin_lock(&zone_scan_lock);
 	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
 		zone_clear_flag(zone, ZONE_OOM_LOCKED);
 	}
-	spin_unlock(&zone_scan_mutex);
+	spin_unlock(&zone_scan_lock);
+}
+
+/*
+ * Must be called with tasklist_lock held for read.
+ */
+static void __out_of_memory(gfp_t gfp_mask, int order)
+{
+	if (sysctl_oom_kill_allocating_task) {
+		oom_kill_process(current, gfp_mask, order, 0, NULL,
+				"Out of memory (oom_kill_allocating_task)");
+
+	} else {
+		unsigned long points;
+		struct task_struct *p;
+
+retry:
+		/*
+		 * Rambo mode: Shoot down a process and hope it solves whatever
+		 * issues we may have.
+		 */
+		p = select_bad_process(&points, NULL);
+
+		if (PTR_ERR(p) == -1UL)
+			return;
+
+		/* Found nothing?!?! Either we hang forever, or we panic. */
+		if (!p) {
+			read_unlock(&tasklist_lock);
+			panic("Out of memory and no killable processes...\n");
+		}
+
+		if (oom_kill_process(p, gfp_mask, order, points, NULL,
+				     "Out of memory"))
+			goto retry;
+	}
+}
+
+/*
+ * pagefault handler calls into here because it is out of memory but
+ * doesn't know exactly how or why.
+ */
+void pagefault_out_of_memory(void)
+{
+	unsigned long freed = 0;
+
+	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
+	if (freed > 0)
+		/* Got some memory back in the last second. */
+		return;
+
+	if (sysctl_panic_on_oom)
+		panic("out of memory from page fault. panic_on_oom is selected.\n");
+
+	read_lock(&tasklist_lock);
+	__out_of_memory(0, 0); /* unknown gfp_mask and order */
+	read_unlock(&tasklist_lock);
+
+	/*
+	 * Give "p" a good chance of killing itself before we
+	 * retry to allocate memory.
+	 */
+	if (!test_thread_flag(TIF_MEMDIE))
+		schedule_timeout_uninterruptible(1);
 }
 
 /**
@@ -522,8 +588,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
  */
 void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 {
-	struct task_struct *p;
-	unsigned long points = 0;
 	unsigned long freed = 0;
 	enum oom_constraint constraint;
 
@@ -544,7 +608,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 
 	switch (constraint) {
 	case CONSTRAINT_MEMORY_POLICY:
-		oom_kill_process(current, gfp_mask, order, points, NULL,
+		oom_kill_process(current, gfp_mask, order, 0, NULL,
 				"No available memory (MPOL_BIND)");
 		break;
 
@@ -553,35 +617,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 		panic("out of memory. panic_on_oom is selected\n");
 		/* Fall-through */
 	case CONSTRAINT_CPUSET:
-		if (sysctl_oom_kill_allocating_task) {
-			oom_kill_process(current, gfp_mask, order, points, NULL,
-					"Out of memory (oom_kill_allocating_task)");
-			break;
-		}
-retry:
-		/*
-		 * Rambo mode: Shoot down a process and hope it solves whatever
-		 * issues we may have.
-		 */
-		p = select_bad_process(&points, NULL);
-
-		if (PTR_ERR(p) == -1UL)
-			goto out;
-
-		/* Found nothing?!?! Either we hang forever, or we panic. */
-		if (!p) {
-			read_unlock(&tasklist_lock);
-			panic("Out of memory and no killable processes...\n");
-		}
-
-		if (oom_kill_process(p, gfp_mask, order, points, NULL,
-					"Out of memory"))
-			goto retry;
-
+		__out_of_memory(gfp_mask, order);
 		break;
 	}
 
-out:
 	read_unlock(&tasklist_lock);
 
 	/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2970e35fd03f..b493db7841dc 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void)
 int dirty_background_ratio = 5;
 
 /*
+ * dirty_background_bytes starts at 0 (disabled) so that it is a function of
+ * dirty_background_ratio * the amount of dirtyable memory
+ */
+unsigned long dirty_background_bytes;
+
+/*
  * free highmem will not be subtracted from the total free memory
  * for calculating free ratios if vm_highmem_is_dirtyable is true
  */
@@ -80,6 +86,12 @@ int vm_highmem_is_dirtyable;
 int vm_dirty_ratio = 10;
 
 /*
+ * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
+ * vm_dirty_ratio * the amount of dirtyable memory
+ */
+unsigned long vm_dirty_bytes;
+
+/*
  * The interval between `kupdate'-style writebacks, in jiffies
  */
 int dirty_writeback_interval = 5 * HZ;
@@ -135,23 +147,75 @@ static int calc_period_shift(void)
135{ 147{
136 unsigned long dirty_total; 148 unsigned long dirty_total;
137 149
138 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; 150 if (vm_dirty_bytes)
151 dirty_total = vm_dirty_bytes / PAGE_SIZE;
152 else
153 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
154 100;
139 return 2 + ilog2(dirty_total - 1); 155 return 2 + ilog2(dirty_total - 1);
140} 156}
141 157
142/* 158/*
143 * update the period when the dirty ratio changes. 159 * update the period when the dirty threshold changes.
144 */ 160 */
161static void update_completion_period(void)
162{
163 int shift = calc_period_shift();
164 prop_change_shift(&vm_completions, shift);
165 prop_change_shift(&vm_dirties, shift);
166}
167
168int dirty_background_ratio_handler(struct ctl_table *table, int write,
169 struct file *filp, void __user *buffer, size_t *lenp,
170 loff_t *ppos)
171{
172 int ret;
173
174 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
175 if (ret == 0 && write)
176 dirty_background_bytes = 0;
177 return ret;
178}
179
180int dirty_background_bytes_handler(struct ctl_table *table, int write,
181 struct file *filp, void __user *buffer, size_t *lenp,
182 loff_t *ppos)
183{
184 int ret;
185
186 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
187 if (ret == 0 && write)
188 dirty_background_ratio = 0;
189 return ret;
190}
191
145int dirty_ratio_handler(struct ctl_table *table, int write, 192int dirty_ratio_handler(struct ctl_table *table, int write,
146 struct file *filp, void __user *buffer, size_t *lenp, 193 struct file *filp, void __user *buffer, size_t *lenp,
147 loff_t *ppos) 194 loff_t *ppos)
148{ 195{
149 int old_ratio = vm_dirty_ratio; 196 int old_ratio = vm_dirty_ratio;
150 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 197 int ret;
198
199 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
151 if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 200 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
152 int shift = calc_period_shift(); 201 update_completion_period();
153 prop_change_shift(&vm_completions, shift); 202 vm_dirty_bytes = 0;
154 prop_change_shift(&vm_dirties, shift); 203 }
204 return ret;
205}
206
207
208int dirty_bytes_handler(struct ctl_table *table, int write,
209 struct file *filp, void __user *buffer, size_t *lenp,
210 loff_t *ppos)
211{
212 int old_bytes = vm_dirty_bytes;
213 int ret;
214
215 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
216 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
217 update_completion_period();
218 vm_dirty_ratio = 0;
155 } 219 }
156 return ret; 220 return ret;
157} 221}
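
The two pairs of handlers above make the ratio and bytes controls mutually exclusive: whichever sysctl was written last takes effect and its counterpart is zeroed. A minimal userspace sketch of that policy follows; set_dirty_ratio()/set_dirty_bytes() are made-up names, not kernel functions.

/*
 * Minimal userspace sketch (not kernel code) of the mutual exclusion the
 * handlers above implement: whichever of vm_dirty_ratio / vm_dirty_bytes
 * was written last wins, and the other is zeroed so only one is in effect.
 */
#include <stdio.h>

static int vm_dirty_ratio = 10;		/* percent of dirtyable memory */
static unsigned long vm_dirty_bytes;	/* 0 means "use the ratio"     */

static void set_dirty_ratio(int ratio)
{
	vm_dirty_ratio = ratio;
	vm_dirty_bytes = 0;		/* bytes limit no longer applies */
}

static void set_dirty_bytes(unsigned long bytes)
{
	vm_dirty_bytes = bytes;
	vm_dirty_ratio = 0;		/* ratio limit no longer applies */
}

int main(void)
{
	set_dirty_ratio(20);
	printf("ratio=%d bytes=%lu\n", vm_dirty_ratio, vm_dirty_bytes);
	set_dirty_bytes(256UL << 20);	/* 256 MB hard limit */
	printf("ratio=%d bytes=%lu\n", vm_dirty_ratio, vm_dirty_bytes);
	return 0;
}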
@@ -362,26 +426,32 @@ unsigned long determine_dirtyable_memory(void)
362} 426}
363 427
364void 428void
365get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, 429get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
366 struct backing_dev_info *bdi) 430 unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
367{ 431{
368 int background_ratio; /* Percentages */ 432 unsigned long background;
369 int dirty_ratio; 433 unsigned long dirty;
370 long background;
371 long dirty;
372 unsigned long available_memory = determine_dirtyable_memory(); 434 unsigned long available_memory = determine_dirtyable_memory();
373 struct task_struct *tsk; 435 struct task_struct *tsk;
374 436
375 dirty_ratio = vm_dirty_ratio; 437 if (vm_dirty_bytes)
376 if (dirty_ratio < 5) 438 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
377 dirty_ratio = 5; 439 else {
440 int dirty_ratio;
378 441
379 background_ratio = dirty_background_ratio; 442 dirty_ratio = vm_dirty_ratio;
380 if (background_ratio >= dirty_ratio) 443 if (dirty_ratio < 5)
381 background_ratio = dirty_ratio / 2; 444 dirty_ratio = 5;
445 dirty = (dirty_ratio * available_memory) / 100;
446 }
447
448 if (dirty_background_bytes)
449 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
450 else
451 background = (dirty_background_ratio * available_memory) / 100;
382 452
383 background = (background_ratio * available_memory) / 100; 453 if (background >= dirty)
384 dirty = (dirty_ratio * available_memory) / 100; 454 background = dirty / 2;
385 tsk = current; 455 tsk = current;
386 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { 456 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
387 background += background / 4; 457 background += background / 4;
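
Roughly, the reworked get_dirty_limits() derives each threshold from the byte limit when one is set, falls back to the percentage otherwise, and then forces the background threshold below the dirty threshold. A simplified userspace sketch of that arithmetic, assuming 4 KB pages and ignoring the per-bdi and per-task adjustments:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static void dirty_limits(unsigned long available_memory, /* in pages */
			 int dirty_ratio, unsigned long dirty_bytes,
			 int background_ratio, unsigned long background_bytes,
			 unsigned long *pdirty, unsigned long *pbackground)
{
	unsigned long dirty, background;

	if (dirty_bytes)
		dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE);
	else {
		if (dirty_ratio < 5)		/* the kernel enforces a 5% floor */
			dirty_ratio = 5;
		dirty = dirty_ratio * available_memory / 100;
	}

	if (background_bytes)
		background = DIV_ROUND_UP(background_bytes, PAGE_SIZE);
	else
		background = background_ratio * available_memory / 100;

	if (background >= dirty)		/* keep background strictly lower */
		background = dirty / 2;

	*pdirty = dirty;
	*pbackground = background;
}

int main(void)
{
	unsigned long dirty, background;

	dirty_limits(1 << 20, 10, 0, 5, 0, &dirty, &background);
	printf("ratio mode: dirty=%lu background=%lu pages\n", dirty, background);

	dirty_limits(1 << 20, 0, 512UL << 20, 0, 0, &dirty, &background);
	printf("bytes mode: dirty=%lu background=%lu pages\n", dirty, background);
	return 0;
}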
@@ -423,9 +493,9 @@ static void balance_dirty_pages(struct address_space *mapping)
423{ 493{
424 long nr_reclaimable, bdi_nr_reclaimable; 494 long nr_reclaimable, bdi_nr_reclaimable;
425 long nr_writeback, bdi_nr_writeback; 495 long nr_writeback, bdi_nr_writeback;
426 long background_thresh; 496 unsigned long background_thresh;
427 long dirty_thresh; 497 unsigned long dirty_thresh;
428 long bdi_thresh; 498 unsigned long bdi_thresh;
429 unsigned long pages_written = 0; 499 unsigned long pages_written = 0;
430 unsigned long write_chunk = sync_writeback_pages(); 500 unsigned long write_chunk = sync_writeback_pages();
431 501
@@ -580,8 +650,8 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
580 650
581void throttle_vm_writeout(gfp_t gfp_mask) 651void throttle_vm_writeout(gfp_t gfp_mask)
582{ 652{
583 long background_thresh; 653 unsigned long background_thresh;
584 long dirty_thresh; 654 unsigned long dirty_thresh;
585 655
586 for ( ; ; ) { 656 for ( ; ; ) {
587 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); 657 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
@@ -624,8 +694,8 @@ static void background_writeout(unsigned long _min_pages)
624 }; 694 };
625 695
626 for ( ; ; ) { 696 for ( ; ; ) {
627 long background_thresh; 697 unsigned long background_thresh;
628 long dirty_thresh; 698 unsigned long dirty_thresh;
629 699
630 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); 700 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
631 if (global_page_state(NR_FILE_DIRTY) + 701 if (global_page_state(NR_FILE_DIRTY) +
@@ -868,9 +938,11 @@ int write_cache_pages(struct address_space *mapping,
868 int done = 0; 938 int done = 0;
869 struct pagevec pvec; 939 struct pagevec pvec;
870 int nr_pages; 940 int nr_pages;
941 pgoff_t uninitialized_var(writeback_index);
871 pgoff_t index; 942 pgoff_t index;
872 pgoff_t end; /* Inclusive */ 943 pgoff_t end; /* Inclusive */
873 int scanned = 0; 944 pgoff_t done_index;
945 int cycled;
874 int range_whole = 0; 946 int range_whole = 0;
875 long nr_to_write = wbc->nr_to_write; 947 long nr_to_write = wbc->nr_to_write;
876 948
@@ -881,83 +953,134 @@ int write_cache_pages(struct address_space *mapping,
881 953
882 pagevec_init(&pvec, 0); 954 pagevec_init(&pvec, 0);
883 if (wbc->range_cyclic) { 955 if (wbc->range_cyclic) {
884 index = mapping->writeback_index; /* Start from prev offset */ 956 writeback_index = mapping->writeback_index; /* prev offset */
957 index = writeback_index;
958 if (index == 0)
959 cycled = 1;
960 else
961 cycled = 0;
885 end = -1; 962 end = -1;
886 } else { 963 } else {
887 index = wbc->range_start >> PAGE_CACHE_SHIFT; 964 index = wbc->range_start >> PAGE_CACHE_SHIFT;
888 end = wbc->range_end >> PAGE_CACHE_SHIFT; 965 end = wbc->range_end >> PAGE_CACHE_SHIFT;
889 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 966 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
890 range_whole = 1; 967 range_whole = 1;
891 scanned = 1; 968 cycled = 1; /* ignore range_cyclic tests */
892 } 969 }
893retry: 970retry:
894 while (!done && (index <= end) && 971 done_index = index;
895 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 972 while (!done && (index <= end)) {
896 PAGECACHE_TAG_DIRTY, 973 int i;
897 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 974
898 unsigned i; 975 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
976 PAGECACHE_TAG_DIRTY,
977 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
978 if (nr_pages == 0)
979 break;
899 980
900 scanned = 1;
901 for (i = 0; i < nr_pages; i++) { 981 for (i = 0; i < nr_pages; i++) {
902 struct page *page = pvec.pages[i]; 982 struct page *page = pvec.pages[i];
903 983
904 /* 984 /*
905 * At this point we hold neither mapping->tree_lock nor 985 * At this point, the page may be truncated or
906 * lock on the page itself: the page may be truncated or 986 * invalidated (changing page->mapping to NULL), or
907 * invalidated (changing page->mapping to NULL), or even 987 * even swizzled back from swapper_space to tmpfs file
908 * swizzled back from swapper_space to tmpfs file 988 * mapping. However, page->index will not change
909 * mapping 989 * because we have a reference on the page.
910 */ 990 */
991 if (page->index > end) {
992 /*
993 * can't be range_cyclic (1st pass) because
994 * end == -1 in that case.
995 */
996 done = 1;
997 break;
998 }
999
1000 done_index = page->index + 1;
1001
911 lock_page(page); 1002 lock_page(page);
912 1003
1004 /*
1005 * Page truncated or invalidated. We can freely skip it
1006 * then, even for data integrity operations: the page
1007 * has disappeared concurrently, so there could be no
 1008 * real expectation of this data integrity operation
1009 * even if there is now a new, dirty page at the same
1010 * pagecache address.
1011 */
913 if (unlikely(page->mapping != mapping)) { 1012 if (unlikely(page->mapping != mapping)) {
1013continue_unlock:
914 unlock_page(page); 1014 unlock_page(page);
915 continue; 1015 continue;
916 } 1016 }
917 1017
918 if (!wbc->range_cyclic && page->index > end) { 1018 if (!PageDirty(page)) {
919 done = 1; 1019 /* someone wrote it for us */
920 unlock_page(page); 1020 goto continue_unlock;
921 continue;
922 } 1021 }
923 1022
924 if (wbc->sync_mode != WB_SYNC_NONE) 1023 if (PageWriteback(page)) {
925 wait_on_page_writeback(page); 1024 if (wbc->sync_mode != WB_SYNC_NONE)
926 1025 wait_on_page_writeback(page);
927 if (PageWriteback(page) || 1026 else
928 !clear_page_dirty_for_io(page)) { 1027 goto continue_unlock;
929 unlock_page(page);
930 continue;
931 } 1028 }
932 1029
933 ret = (*writepage)(page, wbc, data); 1030 BUG_ON(PageWriteback(page));
1031 if (!clear_page_dirty_for_io(page))
1032 goto continue_unlock;
934 1033
935 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 1034 ret = (*writepage)(page, wbc, data);
936 unlock_page(page); 1035 if (unlikely(ret)) {
937 ret = 0; 1036 if (ret == AOP_WRITEPAGE_ACTIVATE) {
1037 unlock_page(page);
1038 ret = 0;
1039 } else {
1040 /*
1041 * done_index is set past this page,
1042 * so media errors will not choke
1043 * background writeout for the entire
1044 * file. This has consequences for
1045 * range_cyclic semantics (ie. it may
1046 * not be suitable for data integrity
1047 * writeout).
1048 */
1049 done = 1;
1050 break;
1051 }
1052 }
1053
1054 if (wbc->sync_mode == WB_SYNC_NONE) {
1055 wbc->nr_to_write--;
1056 if (wbc->nr_to_write <= 0) {
1057 done = 1;
1058 break;
1059 }
938 } 1060 }
939 if (ret || (--nr_to_write <= 0))
940 done = 1;
941 if (wbc->nonblocking && bdi_write_congested(bdi)) { 1061 if (wbc->nonblocking && bdi_write_congested(bdi)) {
942 wbc->encountered_congestion = 1; 1062 wbc->encountered_congestion = 1;
943 done = 1; 1063 done = 1;
1064 break;
944 } 1065 }
945 } 1066 }
946 pagevec_release(&pvec); 1067 pagevec_release(&pvec);
947 cond_resched(); 1068 cond_resched();
948 } 1069 }
949 if (!scanned && !done) { 1070 if (!cycled) {
950 /* 1071 /*
1072 * range_cyclic:
951 * We hit the last page and there is more work to be done: wrap 1073 * We hit the last page and there is more work to be done: wrap
952 * back to the start of the file 1074 * back to the start of the file
953 */ 1075 */
954 scanned = 1; 1076 cycled = 1;
955 index = 0; 1077 index = 0;
1078 end = writeback_index - 1;
956 goto retry; 1079 goto retry;
957 } 1080 }
958 if (!wbc->no_nrwrite_index_update) { 1081 if (!wbc->no_nrwrite_index_update) {
959 if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) 1082 if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
960 mapping->writeback_index = index; 1083 mapping->writeback_index = done_index;
961 wbc->nr_to_write = nr_to_write; 1084 wbc->nr_to_write = nr_to_write;
962 } 1085 }
963 1086
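
The rewritten write_cache_pages() handles range_cyclic in at most two passes: from writeback_index to the end of the file, then once more over [0, writeback_index - 1], while done_index records where the next scan should resume. A toy userspace sketch of just that control flow (an int array stands in for dirty pages; this is not the pagevec-based kernel loop):

#include <stdio.h>

#define NR_PAGES 16

int main(void)
{
	int dirty[NR_PAGES] = { [1] = 1, [4] = 1, [9] = 1, [14] = 1 };
	unsigned long writeback_index = 8;	/* where the last pass stopped */
	unsigned long index = writeback_index;
	unsigned long end = NR_PAGES - 1;	/* stands in for end = -1 */
	unsigned long done_index;
	int cycled = (index == 0);

retry:
	done_index = index;
	for (; index <= end; index++) {
		if (!dirty[index])
			continue;
		done_index = index + 1;
		dirty[index] = 0;		/* stand-in for ->writepage() */
		printf("wrote page %lu\n", index);
	}
	if (!cycled) {
		cycled = 1;			/* second and final pass */
		index = 0;
		end = writeback_index - 1;
		goto retry;
	}
	printf("next scan resumes at page %lu\n", done_index);
	return 0;
}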
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d8ac01474563..7bf22e045318 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,7 +69,7 @@ EXPORT_SYMBOL(node_states);
69 69
70unsigned long totalram_pages __read_mostly; 70unsigned long totalram_pages __read_mostly;
71unsigned long totalreserve_pages __read_mostly; 71unsigned long totalreserve_pages __read_mostly;
72long nr_swap_pages; 72unsigned long highest_memmap_pfn __read_mostly;
73int percpu_pagelist_fraction; 73int percpu_pagelist_fraction;
74 74
75#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 75#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -223,19 +223,41 @@ static inline int bad_range(struct zone *zone, struct page *page)
223 223
224static void bad_page(struct page *page) 224static void bad_page(struct page *page)
225{ 225{
226 printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG 226 static unsigned long resume;
227 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 227 static unsigned long nr_shown;
228 current->comm, page, (int)(2*sizeof(unsigned long)), 228 static unsigned long nr_unshown;
229 (unsigned long)page->flags, page->mapping, 229
230 page_mapcount(page), page_count(page)); 230 /*
231 * Allow a burst of 60 reports, then keep quiet for that minute;
232 * or allow a steady drip of one report per second.
233 */
234 if (nr_shown == 60) {
235 if (time_before(jiffies, resume)) {
236 nr_unshown++;
237 goto out;
238 }
239 if (nr_unshown) {
240 printk(KERN_ALERT
241 "BUG: Bad page state: %lu messages suppressed\n",
242 nr_unshown);
243 nr_unshown = 0;
244 }
245 nr_shown = 0;
246 }
247 if (nr_shown++ == 0)
248 resume = jiffies + 60 * HZ;
249
250 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
251 current->comm, page_to_pfn(page));
252 printk(KERN_ALERT
253 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
254 page, (void *)page->flags, page_count(page),
255 page_mapcount(page), page->mapping, page->index);
231 256
232 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
233 KERN_EMERG "Backtrace:\n");
234 dump_stack(); 257 dump_stack();
235 page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; 258out:
236 set_page_count(page, 0); 259 /* Leave bad fields for debug, except PageBuddy could make trouble */
237 reset_page_mapcount(page); 260 __ClearPageBuddy(page);
238 page->mapping = NULL;
239 add_taint(TAINT_BAD_PAGE); 261 add_taint(TAINT_BAD_PAGE);
240} 262}
241 263
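
bad_page() now throttles its own output: a burst of up to 60 reports, then silence until a minute has passed since the burst began, with a count of how many reports were suppressed. A standalone userspace sketch of the same throttling, using time() in place of jiffies and a made-up report() helper:

#include <stdio.h>
#include <time.h>

static void report(int i)
{
	static time_t resume;
	static unsigned long nr_shown, nr_unshown;
	time_t now = time(NULL);

	if (nr_shown == 60) {
		if (now < resume) {		/* still inside the quiet minute */
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			printf("%lu messages suppressed\n", nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = now + 60;		/* quiet window: 60 seconds */

	printf("bad page report %d\n", i);
}

int main(void)
{
	for (int i = 0; i < 200; i++)
		report(i);			/* only the first 60 print in a quick run */
	return 0;
}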
@@ -292,25 +314,31 @@ void prep_compound_gigantic_page(struct page *page, unsigned long order)
292} 314}
293#endif 315#endif
294 316
295static void destroy_compound_page(struct page *page, unsigned long order) 317static int destroy_compound_page(struct page *page, unsigned long order)
296{ 318{
297 int i; 319 int i;
298 int nr_pages = 1 << order; 320 int nr_pages = 1 << order;
321 int bad = 0;
299 322
300 if (unlikely(compound_order(page) != order)) 323 if (unlikely(compound_order(page) != order) ||
324 unlikely(!PageHead(page))) {
301 bad_page(page); 325 bad_page(page);
326 bad++;
327 }
302 328
303 if (unlikely(!PageHead(page)))
304 bad_page(page);
305 __ClearPageHead(page); 329 __ClearPageHead(page);
330
306 for (i = 1; i < nr_pages; i++) { 331 for (i = 1; i < nr_pages; i++) {
307 struct page *p = page + i; 332 struct page *p = page + i;
308 333
309 if (unlikely(!PageTail(p) | 334 if (unlikely(!PageTail(p) | (p->first_page != page))) {
310 (p->first_page != page)))
311 bad_page(page); 335 bad_page(page);
336 bad++;
337 }
312 __ClearPageTail(p); 338 __ClearPageTail(p);
313 } 339 }
340
341 return bad;
314} 342}
315 343
316static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 344static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
@@ -430,7 +458,8 @@ static inline void __free_one_page(struct page *page,
430 int migratetype = get_pageblock_migratetype(page); 458 int migratetype = get_pageblock_migratetype(page);
431 459
432 if (unlikely(PageCompound(page))) 460 if (unlikely(PageCompound(page)))
433 destroy_compound_page(page, order); 461 if (unlikely(destroy_compound_page(page, order)))
462 return;
434 463
435 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 464 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
436 465
@@ -467,18 +496,13 @@ static inline int free_pages_check(struct page *page)
467 if (unlikely(page_mapcount(page) | 496 if (unlikely(page_mapcount(page) |
468 (page->mapping != NULL) | 497 (page->mapping != NULL) |
469 (page_count(page) != 0) | 498 (page_count(page) != 0) |
470 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) 499 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
471 bad_page(page); 500 bad_page(page);
472 if (PageDirty(page)) 501 return 1;
473 __ClearPageDirty(page); 502 }
474 if (PageSwapBacked(page)) 503 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
475 __ClearPageSwapBacked(page); 504 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
476 /* 505 return 0;
477 * For now, we report if PG_reserved was found set, but do not
478 * clear it, and do not free the page. But we shall soon need
479 * to do more, for when the ZERO_PAGE count wraps negative.
480 */
481 return PageReserved(page);
482} 506}
483 507
484/* 508/*
@@ -523,11 +547,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
523{ 547{
524 unsigned long flags; 548 unsigned long flags;
525 int i; 549 int i;
526 int reserved = 0; 550 int bad = 0;
527 551
528 for (i = 0 ; i < (1 << order) ; ++i) 552 for (i = 0 ; i < (1 << order) ; ++i)
529 reserved += free_pages_check(page + i); 553 bad += free_pages_check(page + i);
530 if (reserved) 554 if (bad)
531 return; 555 return;
532 556
533 if (!PageHighMem(page)) { 557 if (!PageHighMem(page)) {
@@ -612,23 +636,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
612 if (unlikely(page_mapcount(page) | 636 if (unlikely(page_mapcount(page) |
613 (page->mapping != NULL) | 637 (page->mapping != NULL) |
614 (page_count(page) != 0) | 638 (page_count(page) != 0) |
615 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) 639 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
616 bad_page(page); 640 bad_page(page);
617
618 /*
619 * For now, we report if PG_reserved was found set, but do not
620 * clear it, and do not allocate the page: as a safety net.
621 */
622 if (PageReserved(page))
623 return 1; 641 return 1;
642 }
624 643
625 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
626 1 << PG_referenced | 1 << PG_arch_1 |
627 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk
628#ifdef CONFIG_UNEVICTABLE_LRU
629 | 1 << PG_mlocked
630#endif
631 );
632 set_page_private(page, 0); 644 set_page_private(page, 0);
633 set_page_refcounted(page); 645 set_page_refcounted(page);
634 646
@@ -2609,6 +2621,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2609 unsigned long pfn; 2621 unsigned long pfn;
2610 struct zone *z; 2622 struct zone *z;
2611 2623
2624 if (highest_memmap_pfn < end_pfn - 1)
2625 highest_memmap_pfn = end_pfn - 1;
2626
2612 z = &NODE_DATA(nid)->node_zones[zone]; 2627 z = &NODE_DATA(nid)->node_zones[zone];
2613 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 2628 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
2614 /* 2629 /*
@@ -3381,10 +3396,8 @@ static void __init setup_usemap(struct pglist_data *pgdat,
3381{ 3396{
3382 unsigned long usemapsize = usemap_size(zonesize); 3397 unsigned long usemapsize = usemap_size(zonesize);
3383 zone->pageblock_flags = NULL; 3398 zone->pageblock_flags = NULL;
3384 if (usemapsize) { 3399 if (usemapsize)
3385 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); 3400 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
3386 memset(zone->pageblock_flags, 0, usemapsize);
3387 }
3388} 3401}
3389#else 3402#else
3390static void inline setup_usemap(struct pglist_data *pgdat, 3403static void inline setup_usemap(struct pglist_data *pgdat,
@@ -3469,9 +3482,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3469 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 3482 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
3470 if (realsize >= memmap_pages) { 3483 if (realsize >= memmap_pages) {
3471 realsize -= memmap_pages; 3484 realsize -= memmap_pages;
3472 printk(KERN_DEBUG 3485 if (memmap_pages)
3473 " %s zone: %lu pages used for memmap\n", 3486 printk(KERN_DEBUG
3474 zone_names[j], memmap_pages); 3487 " %s zone: %lu pages used for memmap\n",
3488 zone_names[j], memmap_pages);
3475 } else 3489 } else
3476 printk(KERN_WARNING 3490 printk(KERN_WARNING
3477 " %s zone: %lu pages exceeds realsize %lu\n", 3491 " %s zone: %lu pages exceeds realsize %lu\n",
@@ -4316,7 +4330,7 @@ void setup_per_zone_pages_min(void)
4316 * 1TB 101 10GB 4330 * 1TB 101 10GB
4317 * 10TB 320 32GB 4331 * 10TB 320 32GB
4318 */ 4332 */
4319void setup_per_zone_inactive_ratio(void) 4333static void setup_per_zone_inactive_ratio(void)
4320{ 4334{
4321 struct zone *zone; 4335 struct zone *zone;
4322 4336
@@ -4573,19 +4587,6 @@ void *__init alloc_large_system_hash(const char *tablename,
4573 return table; 4587 return table;
4574} 4588}
4575 4589
4576#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
4577struct page *pfn_to_page(unsigned long pfn)
4578{
4579 return __pfn_to_page(pfn);
4580}
4581unsigned long page_to_pfn(struct page *page)
4582{
4583 return __page_to_pfn(page);
4584}
4585EXPORT_SYMBOL(pfn_to_page);
4586EXPORT_SYMBOL(page_to_pfn);
4587#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
4588
4589/* Return a pointer to the bitmap storing bits affecting a block of pages */ 4590/* Return a pointer to the bitmap storing bits affecting a block of pages */
4590static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 4591static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
4591 unsigned long pfn) 4592 unsigned long pfn)
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index ab27ff750519..d6507a660ed6 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -101,7 +101,7 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
101} 101}
102 102
103/* __alloc_bootmem...() is protected by !slab_available() */ 103/* __alloc_bootmem...() is protected by !slab_available() */
104int __init_refok init_section_page_cgroup(unsigned long pfn) 104static int __init_refok init_section_page_cgroup(unsigned long pfn)
105{ 105{
106 struct mem_section *section; 106 struct mem_section *section;
107 struct page_cgroup *base, *pc; 107 struct page_cgroup *base, *pc;
diff --git a/mm/page_io.c b/mm/page_io.c
index 065c4480eaf0..dc6ce0afbded 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -98,7 +98,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
98 struct bio *bio; 98 struct bio *bio;
99 int ret = 0, rw = WRITE; 99 int ret = 0, rw = WRITE;
100 100
101 if (remove_exclusive_swap_page(page)) { 101 if (try_to_free_swap(page)) {
102 unlock_page(page); 102 unlock_page(page);
103 goto out; 103 goto out;
104 } 104 }
@@ -125,8 +125,8 @@ int swap_readpage(struct file *file, struct page *page)
125 struct bio *bio; 125 struct bio *bio;
126 int ret = 0; 126 int ret = 0;
127 127
128 BUG_ON(!PageLocked(page)); 128 VM_BUG_ON(!PageLocked(page));
129 BUG_ON(PageUptodate(page)); 129 VM_BUG_ON(PageUptodate(page));
130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page, 130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
131 end_swap_bio_read); 131 end_swap_bio_read);
132 if (bio == NULL) { 132 if (bio == NULL) {
diff --git a/mm/rmap.c b/mm/rmap.c
index 10993942d6c9..ac4af8cffbf9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -47,9 +47,9 @@
47#include <linux/rmap.h> 47#include <linux/rmap.h>
48#include <linux/rcupdate.h> 48#include <linux/rcupdate.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kallsyms.h>
51#include <linux/memcontrol.h> 50#include <linux/memcontrol.h>
52#include <linux/mmu_notifier.h> 51#include <linux/mmu_notifier.h>
52#include <linux/migrate.h>
53 53
54#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
55 55
@@ -191,7 +191,7 @@ void __init anon_vma_init(void)
191 * Getting a lock on a stable anon_vma from a page off the LRU is 191 * Getting a lock on a stable anon_vma from a page off the LRU is
192 * tricky: page_lock_anon_vma rely on RCU to guard against the races. 192 * tricky: page_lock_anon_vma rely on RCU to guard against the races.
193 */ 193 */
194struct anon_vma *page_lock_anon_vma(struct page *page) 194static struct anon_vma *page_lock_anon_vma(struct page *page)
195{ 195{
196 struct anon_vma *anon_vma; 196 struct anon_vma *anon_vma;
197 unsigned long anon_mapping; 197 unsigned long anon_mapping;
@@ -211,7 +211,7 @@ out:
211 return NULL; 211 return NULL;
212} 212}
213 213
214void page_unlock_anon_vma(struct anon_vma *anon_vma) 214static void page_unlock_anon_vma(struct anon_vma *anon_vma)
215{ 215{
216 spin_unlock(&anon_vma->lock); 216 spin_unlock(&anon_vma->lock);
217 rcu_read_unlock(); 217 rcu_read_unlock();
@@ -359,8 +359,17 @@ static int page_referenced_one(struct page *page,
359 goto out_unmap; 359 goto out_unmap;
360 } 360 }
361 361
362 if (ptep_clear_flush_young_notify(vma, address, pte)) 362 if (ptep_clear_flush_young_notify(vma, address, pte)) {
363 referenced++; 363 /*
364 * Don't treat a reference through a sequentially read
365 * mapping as such. If the page has been used in
366 * another mapping, we will catch it; if this other
367 * mapping is already gone, the unmap path will have
368 * set PG_referenced or activated the page.
369 */
370 if (likely(!VM_SequentialReadHint(vma)))
371 referenced++;
372 }
364 373
365 /* Pretend the page is referenced if the task has the 374 /* Pretend the page is referenced if the task has the
366 swap token and is in the middle of a page fault. */ 375 swap token and is in the middle of a page fault. */
@@ -661,9 +670,14 @@ void page_add_anon_rmap(struct page *page,
661void page_add_new_anon_rmap(struct page *page, 670void page_add_new_anon_rmap(struct page *page,
662 struct vm_area_struct *vma, unsigned long address) 671 struct vm_area_struct *vma, unsigned long address)
663{ 672{
664 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 673 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
665 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ 674 SetPageSwapBacked(page);
675 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
666 __page_set_anon_rmap(page, vma, address); 676 __page_set_anon_rmap(page, vma, address);
677 if (page_evictable(page, vma))
678 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
679 else
680 add_page_to_unevictable_list(page);
667} 681}
668 682
669/** 683/**
@@ -693,7 +707,6 @@ void page_add_file_rmap(struct page *page)
693 */ 707 */
694void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) 708void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
695{ 709{
696 BUG_ON(page_mapcount(page) == 0);
697 if (PageAnon(page)) 710 if (PageAnon(page))
698 __page_check_anon_rmap(page, vma, address); 711 __page_check_anon_rmap(page, vma, address);
699 atomic_inc(&page->_mapcount); 712 atomic_inc(&page->_mapcount);
@@ -703,28 +716,12 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long
703/** 716/**
704 * page_remove_rmap - take down pte mapping from a page 717 * page_remove_rmap - take down pte mapping from a page
705 * @page: page to remove mapping from 718 * @page: page to remove mapping from
706 * @vma: the vm area in which the mapping is removed
707 * 719 *
708 * The caller needs to hold the pte lock. 720 * The caller needs to hold the pte lock.
709 */ 721 */
710void page_remove_rmap(struct page *page, struct vm_area_struct *vma) 722void page_remove_rmap(struct page *page)
711{ 723{
712 if (atomic_add_negative(-1, &page->_mapcount)) { 724 if (atomic_add_negative(-1, &page->_mapcount)) {
713 if (unlikely(page_mapcount(page) < 0)) {
714 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
715 printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page));
716 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
717 printk (KERN_EMERG " page->count = %x\n", page_count(page));
718 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
719 print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
720 if (vma->vm_ops) {
721 print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
722 }
723 if (vma->vm_file && vma->vm_file->f_op)
724 print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
725 BUG();
726 }
727
728 /* 725 /*
729 * Now that the last pte has gone, s390 must transfer dirty 726 * Now that the last pte has gone, s390 must transfer dirty
730 * flag from storage key to struct page. We can usually skip 727 * flag from storage key to struct page. We can usually skip
@@ -818,8 +815,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
818 spin_unlock(&mmlist_lock); 815 spin_unlock(&mmlist_lock);
819 } 816 }
820 dec_mm_counter(mm, anon_rss); 817 dec_mm_counter(mm, anon_rss);
821#ifdef CONFIG_MIGRATION 818 } else if (PAGE_MIGRATION) {
822 } else {
823 /* 819 /*
824 * Store the pfn of the page in a special migration 820 * Store the pfn of the page in a special migration
825 * pte. do_swap_page() will wait until the migration 821 * pte. do_swap_page() will wait until the migration
@@ -827,23 +823,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
827 */ 823 */
828 BUG_ON(!migration); 824 BUG_ON(!migration);
829 entry = make_migration_entry(page, pte_write(pteval)); 825 entry = make_migration_entry(page, pte_write(pteval));
830#endif
831 } 826 }
832 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 827 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
833 BUG_ON(pte_file(*pte)); 828 BUG_ON(pte_file(*pte));
834 } else 829 } else if (PAGE_MIGRATION && migration) {
835#ifdef CONFIG_MIGRATION
836 if (migration) {
837 /* Establish migration entry for a file page */ 830 /* Establish migration entry for a file page */
838 swp_entry_t entry; 831 swp_entry_t entry;
839 entry = make_migration_entry(page, pte_write(pteval)); 832 entry = make_migration_entry(page, pte_write(pteval));
840 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 833 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
841 } else 834 } else
842#endif
843 dec_mm_counter(mm, file_rss); 835 dec_mm_counter(mm, file_rss);
844 836
845 837
846 page_remove_rmap(page, vma); 838 page_remove_rmap(page);
847 page_cache_release(page); 839 page_cache_release(page);
848 840
849out_unmap: 841out_unmap:
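
try_to_unmap_one() drops its #ifdef CONFIG_MIGRATION blocks in favour of testing the PAGE_MIGRATION constant, so the migration branches stay visible to the compiler and are simply optimized away when migration is disabled. A generic sketch of that pattern with invented names (CONFIG_FOO, FEATURE_FOO, handle_foo()):

#include <stdio.h>

#ifdef CONFIG_FOO
#define FEATURE_FOO 1
#else
#define FEATURE_FOO 0
#endif

static void handle_foo(int v)
{
	printf("foo path: %d\n", v);
}

static void process(int v, int want_foo)
{
	if (FEATURE_FOO && want_foo)	/* dead-code eliminated when FOO is off */
		handle_foo(v);
	else
		printf("common path: %d\n", v);
}

int main(void)
{
	process(1, 1);
	process(2, 0);
	return 0;
}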
@@ -958,7 +950,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
958 if (pte_dirty(pteval)) 950 if (pte_dirty(pteval))
959 set_page_dirty(page); 951 set_page_dirty(page);
960 952
961 page_remove_rmap(page, vma); 953 page_remove_rmap(page);
962 page_cache_release(page); 954 page_cache_release(page);
963 dec_mm_counter(mm, file_rss); 955 dec_mm_counter(mm, file_rss);
964 (*mapcount)--; 956 (*mapcount)--;
diff --git a/mm/shmem.c b/mm/shmem.c
index f1b0d4871f3a..5941f9801363 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -14,31 +14,39 @@
14 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> 14 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
15 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> 15 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
16 * 16 *
17 * tiny-shmem:
18 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
19 *
17 * This file is released under the GPL. 20 * This file is released under the GPL.
18 */ 21 */
19 22
23#include <linux/fs.h>
24#include <linux/init.h>
25#include <linux/vfs.h>
26#include <linux/mount.h>
27#include <linux/file.h>
28#include <linux/mm.h>
29#include <linux/module.h>
30#include <linux/swap.h>
31
32static struct vfsmount *shm_mnt;
33
34#ifdef CONFIG_SHMEM
20/* 35/*
21 * This virtual memory filesystem is heavily based on the ramfs. It 36 * This virtual memory filesystem is heavily based on the ramfs. It
22 * extends ramfs by the ability to use swap and honor resource limits 37 * extends ramfs by the ability to use swap and honor resource limits
23 * which makes it a completely usable filesystem. 38 * which makes it a completely usable filesystem.
24 */ 39 */
25 40
26#include <linux/module.h>
27#include <linux/init.h>
28#include <linux/fs.h>
29#include <linux/xattr.h> 41#include <linux/xattr.h>
30#include <linux/exportfs.h> 42#include <linux/exportfs.h>
31#include <linux/generic_acl.h> 43#include <linux/generic_acl.h>
32#include <linux/mm.h>
33#include <linux/mman.h> 44#include <linux/mman.h>
34#include <linux/file.h>
35#include <linux/swap.h>
36#include <linux/pagemap.h> 45#include <linux/pagemap.h>
37#include <linux/string.h> 46#include <linux/string.h>
38#include <linux/slab.h> 47#include <linux/slab.h>
39#include <linux/backing-dev.h> 48#include <linux/backing-dev.h>
40#include <linux/shmem_fs.h> 49#include <linux/shmem_fs.h>
41#include <linux/mount.h>
42#include <linux/writeback.h> 50#include <linux/writeback.h>
43#include <linux/vfs.h> 51#include <linux/vfs.h>
44#include <linux/blkdev.h> 52#include <linux/blkdev.h>
@@ -1444,7 +1452,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1444 if (error) 1452 if (error)
1445 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1453 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1446 1454
1447 mark_page_accessed(vmf->page);
1448 return ret | VM_FAULT_LOCKED; 1455 return ret | VM_FAULT_LOCKED;
1449} 1456}
1450 1457
@@ -2486,7 +2493,6 @@ static struct file_system_type tmpfs_fs_type = {
2486 .get_sb = shmem_get_sb, 2493 .get_sb = shmem_get_sb,
2487 .kill_sb = kill_litter_super, 2494 .kill_sb = kill_litter_super,
2488}; 2495};
2489static struct vfsmount *shm_mnt;
2490 2496
2491static int __init init_tmpfs(void) 2497static int __init init_tmpfs(void)
2492{ 2498{
@@ -2525,7 +2531,51 @@ out4:
2525 shm_mnt = ERR_PTR(error); 2531 shm_mnt = ERR_PTR(error);
2526 return error; 2532 return error;
2527} 2533}
2528module_init(init_tmpfs) 2534
2535#else /* !CONFIG_SHMEM */
2536
2537/*
2538 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
2539 *
2540 * This is intended for small systems where the benefits of the full
2541 * shmem code (swap-backed and resource-limited) are outweighed by
2542 * their complexity. On systems without swap this code should be
2543 * effectively equivalent, but much lighter weight.
2544 */
2545
2546#include <linux/ramfs.h>
2547
2548static struct file_system_type tmpfs_fs_type = {
2549 .name = "tmpfs",
2550 .get_sb = ramfs_get_sb,
2551 .kill_sb = kill_litter_super,
2552};
2553
2554static int __init init_tmpfs(void)
2555{
2556 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
2557
2558 shm_mnt = kern_mount(&tmpfs_fs_type);
2559 BUG_ON(IS_ERR(shm_mnt));
2560
2561 return 0;
2562}
2563
2564int shmem_unuse(swp_entry_t entry, struct page *page)
2565{
2566 return 0;
2567}
2568
2569#define shmem_file_operations ramfs_file_operations
2570#define shmem_vm_ops generic_file_vm_ops
2571#define shmem_get_inode ramfs_get_inode
2572#define shmem_acct_size(a, b) 0
2573#define shmem_unacct_size(a, b) do {} while (0)
2574#define SHMEM_MAX_BYTES LLONG_MAX
2575
2576#endif /* CONFIG_SHMEM */
2577
2578/* common code */
2529 2579
2530/** 2580/**
2531 * shmem_file_setup - get an unlinked file living in tmpfs 2581 * shmem_file_setup - get an unlinked file living in tmpfs
@@ -2569,12 +2619,20 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2569 if (!inode) 2619 if (!inode)
2570 goto close_file; 2620 goto close_file;
2571 2621
2622#ifdef CONFIG_SHMEM
2572 SHMEM_I(inode)->flags = flags & VM_ACCOUNT; 2623 SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
2624#endif
2573 d_instantiate(dentry, inode); 2625 d_instantiate(dentry, inode);
2574 inode->i_size = size; 2626 inode->i_size = size;
2575 inode->i_nlink = 0; /* It is unlinked */ 2627 inode->i_nlink = 0; /* It is unlinked */
2576 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, 2628 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
2577 &shmem_file_operations); 2629 &shmem_file_operations);
2630
2631#ifndef CONFIG_MMU
2632 error = ramfs_nommu_expand_for_mapping(inode, size);
2633 if (error)
2634 goto close_file;
2635#endif
2578 return file; 2636 return file;
2579 2637
2580close_file: 2638close_file:
@@ -2606,3 +2664,5 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2606 vma->vm_ops = &shmem_vm_ops; 2664 vma->vm_ops = &shmem_vm_ops;
2607 return 0; 2665 return 0;
2608} 2666}
2667
2668module_init(init_tmpfs)
diff --git a/mm/swap.c b/mm/swap.c
index b135ec90cdeb..ba2c0e8b8b54 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -246,25 +246,6 @@ void add_page_to_unevictable_list(struct page *page)
246 spin_unlock_irq(&zone->lru_lock); 246 spin_unlock_irq(&zone->lru_lock);
247} 247}
248 248
249/**
250 * lru_cache_add_active_or_unevictable
251 * @page: the page to be added to LRU
252 * @vma: vma in which page is mapped for determining reclaimability
253 *
254 * place @page on active or unevictable LRU list, depending on
255 * page_evictable(). Note that if the page is not evictable,
256 * it goes directly back onto it's zone's unevictable list. It does
257 * NOT use a per cpu pagevec.
258 */
259void lru_cache_add_active_or_unevictable(struct page *page,
260 struct vm_area_struct *vma)
261{
262 if (page_evictable(page, vma))
263 lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page));
264 else
265 add_page_to_unevictable_list(page);
266}
267
268/* 249/*
269 * Drain pages out of the cpu's pagevecs. 250 * Drain pages out of the cpu's pagevecs.
270 * Either "cpu" is the current CPU, and preemption has already been 251 * Either "cpu" is the current CPU, and preemption has already been
@@ -398,28 +379,6 @@ void __pagevec_release(struct pagevec *pvec)
398EXPORT_SYMBOL(__pagevec_release); 379EXPORT_SYMBOL(__pagevec_release);
399 380
400/* 381/*
401 * pagevec_release() for pages which are known to not be on the LRU
402 *
403 * This function reinitialises the caller's pagevec.
404 */
405void __pagevec_release_nonlru(struct pagevec *pvec)
406{
407 int i;
408 struct pagevec pages_to_free;
409
410 pagevec_init(&pages_to_free, pvec->cold);
411 for (i = 0; i < pagevec_count(pvec); i++) {
412 struct page *page = pvec->pages[i];
413
414 VM_BUG_ON(PageLRU(page));
415 if (put_page_testzero(page))
416 pagevec_add(&pages_to_free, page);
417 }
418 pagevec_free(&pages_to_free);
419 pagevec_reinit(pvec);
420}
421
422/*
423 * Add the passed pages to the LRU, then drop the caller's refcount 382 * Add the passed pages to the LRU, then drop the caller's refcount
424 * on them. Reinitialises the caller's pagevec. 383 * on them. Reinitialises the caller's pagevec.
425 */ 384 */
@@ -495,8 +454,7 @@ void pagevec_swap_free(struct pagevec *pvec)
495 struct page *page = pvec->pages[i]; 454 struct page *page = pvec->pages[i];
496 455
497 if (PageSwapCache(page) && trylock_page(page)) { 456 if (PageSwapCache(page) && trylock_page(page)) {
498 if (PageSwapCache(page)) 457 try_to_free_swap(page);
499 remove_exclusive_swap_page_ref(page);
500 unlock_page(page); 458 unlock_page(page);
501 } 459 }
502 } 460 }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3353c9029cef..81c825f67a7f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -72,10 +72,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
72{ 72{
73 int error; 73 int error;
74 74
75 BUG_ON(!PageLocked(page)); 75 VM_BUG_ON(!PageLocked(page));
76 BUG_ON(PageSwapCache(page)); 76 VM_BUG_ON(PageSwapCache(page));
77 BUG_ON(PagePrivate(page)); 77 VM_BUG_ON(!PageSwapBacked(page));
78 BUG_ON(!PageSwapBacked(page)); 78
79 error = radix_tree_preload(gfp_mask); 79 error = radix_tree_preload(gfp_mask);
80 if (!error) { 80 if (!error) {
81 page_cache_get(page); 81 page_cache_get(page);
@@ -108,10 +108,9 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
108 */ 108 */
109void __delete_from_swap_cache(struct page *page) 109void __delete_from_swap_cache(struct page *page)
110{ 110{
111 BUG_ON(!PageLocked(page)); 111 VM_BUG_ON(!PageLocked(page));
112 BUG_ON(!PageSwapCache(page)); 112 VM_BUG_ON(!PageSwapCache(page));
113 BUG_ON(PageWriteback(page)); 113 VM_BUG_ON(PageWriteback(page));
114 BUG_ON(PagePrivate(page));
115 114
116 radix_tree_delete(&swapper_space.page_tree, page_private(page)); 115 radix_tree_delete(&swapper_space.page_tree, page_private(page));
117 set_page_private(page, 0); 116 set_page_private(page, 0);
@@ -129,13 +128,13 @@ void __delete_from_swap_cache(struct page *page)
129 * Allocate swap space for the page and add the page to the 128 * Allocate swap space for the page and add the page to the
130 * swap cache. Caller needs to hold the page lock. 129 * swap cache. Caller needs to hold the page lock.
131 */ 130 */
132int add_to_swap(struct page * page, gfp_t gfp_mask) 131int add_to_swap(struct page *page)
133{ 132{
134 swp_entry_t entry; 133 swp_entry_t entry;
135 int err; 134 int err;
136 135
137 BUG_ON(!PageLocked(page)); 136 VM_BUG_ON(!PageLocked(page));
138 BUG_ON(!PageUptodate(page)); 137 VM_BUG_ON(!PageUptodate(page));
139 138
140 for (;;) { 139 for (;;) {
141 entry = get_swap_page(); 140 entry = get_swap_page();
@@ -154,7 +153,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
154 * Add it to the swap cache and mark it dirty 153 * Add it to the swap cache and mark it dirty
155 */ 154 */
156 err = add_to_swap_cache(page, entry, 155 err = add_to_swap_cache(page, entry,
157 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); 156 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
158 157
159 switch (err) { 158 switch (err) {
160 case 0: /* Success */ 159 case 0: /* Success */
@@ -196,14 +195,14 @@ void delete_from_swap_cache(struct page *page)
196 * If we are the only user, then try to free up the swap cache. 195 * If we are the only user, then try to free up the swap cache.
197 * 196 *
198 * Its ok to check for PageSwapCache without the page lock 197 * Its ok to check for PageSwapCache without the page lock
199 * here because we are going to recheck again inside 198 * here because we are going to recheck again inside
200 * exclusive_swap_page() _with_ the lock. 199 * try_to_free_swap() _with_ the lock.
201 * - Marcelo 200 * - Marcelo
202 */ 201 */
203static inline void free_swap_cache(struct page *page) 202static inline void free_swap_cache(struct page *page)
204{ 203{
205 if (PageSwapCache(page) && trylock_page(page)) { 204 if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
206 remove_exclusive_swap_page(page); 205 try_to_free_swap(page);
207 unlock_page(page); 206 unlock_page(page);
208 } 207 }
209} 208}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 54a9f87e5162..eec5ca758a23 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -16,6 +16,7 @@
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/shm.h> 17#include <linux/shm.h>
18#include <linux/blkdev.h> 18#include <linux/blkdev.h>
19#include <linux/random.h>
19#include <linux/writeback.h> 20#include <linux/writeback.h>
20#include <linux/proc_fs.h> 21#include <linux/proc_fs.h>
21#include <linux/seq_file.h> 22#include <linux/seq_file.h>
@@ -35,6 +36,7 @@
35 36
36static DEFINE_SPINLOCK(swap_lock); 37static DEFINE_SPINLOCK(swap_lock);
37static unsigned int nr_swapfiles; 38static unsigned int nr_swapfiles;
39long nr_swap_pages;
38long total_swap_pages; 40long total_swap_pages;
39static int swap_overflow; 41static int swap_overflow;
40static int least_priority; 42static int least_priority;
@@ -83,15 +85,96 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
83 up_read(&swap_unplug_sem); 85 up_read(&swap_unplug_sem);
84} 86}
85 87
88/*
89 * swapon tells the device that all the old swap contents can be discarded,
90 * to allow the swap device to optimize its wear-levelling.
91 */
92static int discard_swap(struct swap_info_struct *si)
93{
94 struct swap_extent *se;
95 int err = 0;
96
97 list_for_each_entry(se, &si->extent_list, list) {
98 sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
99 sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
100
101 if (se->start_page == 0) {
102 /* Do not discard the swap header page! */
103 start_block += 1 << (PAGE_SHIFT - 9);
104 nr_blocks -= 1 << (PAGE_SHIFT - 9);
105 if (!nr_blocks)
106 continue;
107 }
108
109 err = blkdev_issue_discard(si->bdev, start_block,
110 nr_blocks, GFP_KERNEL);
111 if (err)
112 break;
113
114 cond_resched();
115 }
116 return err; /* That will often be -EOPNOTSUPP */
117}
118
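
From userspace, the closest analogue to what discard_swap() requests is the BLKDISCARD ioctl, which tells a block device that a byte range is unused. A hedged sketch follows; it needs root, a device that actually supports discard, and it destroys the data in the given range, so treat it as illustration only:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	uint64_t range[2];
	int fd;

	if (argc != 4) {
		fprintf(stderr, "usage: %s <blockdev> <offset-bytes> <length-bytes>\n",
			argv[0]);
		return 1;
	}

	fd = open(argv[1], O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	range[0] = strtoull(argv[2], NULL, 0);	/* start, in bytes */
	range[1] = strtoull(argv[3], NULL, 0);	/* length, in bytes */

	if (ioctl(fd, BLKDISCARD, range) < 0)	/* often fails: not supported */
		perror("BLKDISCARD");
	else
		printf("discarded %llu bytes at offset %llu\n",
		       (unsigned long long)range[1], (unsigned long long)range[0]);

	close(fd);
	return 0;
}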
119/*
120 * swap allocation tells the device that a cluster of swap can now be discarded,
121 * to allow the swap device to optimize its wear-levelling.
122 */
123static void discard_swap_cluster(struct swap_info_struct *si,
124 pgoff_t start_page, pgoff_t nr_pages)
125{
126 struct swap_extent *se = si->curr_swap_extent;
127 int found_extent = 0;
128
129 while (nr_pages) {
130 struct list_head *lh;
131
132 if (se->start_page <= start_page &&
133 start_page < se->start_page + se->nr_pages) {
134 pgoff_t offset = start_page - se->start_page;
135 sector_t start_block = se->start_block + offset;
136 sector_t nr_blocks = se->nr_pages - offset;
137
138 if (nr_blocks > nr_pages)
139 nr_blocks = nr_pages;
140 start_page += nr_blocks;
141 nr_pages -= nr_blocks;
142
143 if (!found_extent++)
144 si->curr_swap_extent = se;
145
146 start_block <<= PAGE_SHIFT - 9;
147 nr_blocks <<= PAGE_SHIFT - 9;
148 if (blkdev_issue_discard(si->bdev, start_block,
149 nr_blocks, GFP_NOIO))
150 break;
151 }
152
153 lh = se->list.next;
154 if (lh == &si->extent_list)
155 lh = lh->next;
156 se = list_entry(lh, struct swap_extent, list);
157 }
158}
159
160static int wait_for_discard(void *word)
161{
162 schedule();
163 return 0;
164}
165
86#define SWAPFILE_CLUSTER 256 166#define SWAPFILE_CLUSTER 256
87#define LATENCY_LIMIT 256 167#define LATENCY_LIMIT 256
88 168
89static inline unsigned long scan_swap_map(struct swap_info_struct *si) 169static inline unsigned long scan_swap_map(struct swap_info_struct *si)
90{ 170{
91 unsigned long offset, last_in_cluster; 171 unsigned long offset;
172 unsigned long scan_base;
173 unsigned long last_in_cluster = 0;
92 int latency_ration = LATENCY_LIMIT; 174 int latency_ration = LATENCY_LIMIT;
175 int found_free_cluster = 0;
93 176
94 /* 177 /*
95 * We try to cluster swap pages by allocating them sequentially 178 * We try to cluster swap pages by allocating them sequentially
96 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this 179 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this
97 * way, however, we resort to first-free allocation, starting 180 * way, however, we resort to first-free allocation, starting
@@ -99,16 +182,42 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
99 * all over the entire swap partition, so that we reduce 182 * all over the entire swap partition, so that we reduce
100 * overall disk seek times between swap pages. -- sct 183 * overall disk seek times between swap pages. -- sct
101 * But we do now try to find an empty cluster. -Andrea 184 * But we do now try to find an empty cluster. -Andrea
185 * And we let swap pages go all over an SSD partition. Hugh
102 */ 186 */
103 187
104 si->flags += SWP_SCANNING; 188 si->flags += SWP_SCANNING;
105 if (unlikely(!si->cluster_nr)) { 189 scan_base = offset = si->cluster_next;
106 si->cluster_nr = SWAPFILE_CLUSTER - 1; 190
107 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) 191 if (unlikely(!si->cluster_nr--)) {
108 goto lowest; 192 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
193 si->cluster_nr = SWAPFILE_CLUSTER - 1;
194 goto checks;
195 }
196 if (si->flags & SWP_DISCARDABLE) {
197 /*
198 * Start range check on racing allocations, in case
199 * they overlap the cluster we eventually decide on
200 * (we scan without swap_lock to allow preemption).
201 * It's hardly conceivable that cluster_nr could be
202 * wrapped during our scan, but don't depend on it.
203 */
204 if (si->lowest_alloc)
205 goto checks;
206 si->lowest_alloc = si->max;
207 si->highest_alloc = 0;
208 }
109 spin_unlock(&swap_lock); 209 spin_unlock(&swap_lock);
110 210
111 offset = si->lowest_bit; 211 /*
212 * If seek is expensive, start searching for new cluster from
213 * start of partition, to minimize the span of allocated swap.
214 * But if seek is cheap, search from our current position, so
215 * that swap is allocated from all over the partition: if the
216 * Flash Translation Layer only remaps within limited zones,
217 * we don't want to wear out the first zone too quickly.
218 */
219 if (!(si->flags & SWP_SOLIDSTATE))
220 scan_base = offset = si->lowest_bit;
112 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 221 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
113 222
114 /* Locate the first empty (unaligned) cluster */ 223 /* Locate the first empty (unaligned) cluster */
@@ -117,43 +226,124 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
117 last_in_cluster = offset + SWAPFILE_CLUSTER; 226 last_in_cluster = offset + SWAPFILE_CLUSTER;
118 else if (offset == last_in_cluster) { 227 else if (offset == last_in_cluster) {
119 spin_lock(&swap_lock); 228 spin_lock(&swap_lock);
120 si->cluster_next = offset-SWAPFILE_CLUSTER+1; 229 offset -= SWAPFILE_CLUSTER - 1;
121 goto cluster; 230 si->cluster_next = offset;
231 si->cluster_nr = SWAPFILE_CLUSTER - 1;
232 found_free_cluster = 1;
233 goto checks;
122 } 234 }
123 if (unlikely(--latency_ration < 0)) { 235 if (unlikely(--latency_ration < 0)) {
124 cond_resched(); 236 cond_resched();
125 latency_ration = LATENCY_LIMIT; 237 latency_ration = LATENCY_LIMIT;
126 } 238 }
127 } 239 }
240
241 offset = si->lowest_bit;
242 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
243
244 /* Locate the first empty (unaligned) cluster */
245 for (; last_in_cluster < scan_base; offset++) {
246 if (si->swap_map[offset])
247 last_in_cluster = offset + SWAPFILE_CLUSTER;
248 else if (offset == last_in_cluster) {
249 spin_lock(&swap_lock);
250 offset -= SWAPFILE_CLUSTER - 1;
251 si->cluster_next = offset;
252 si->cluster_nr = SWAPFILE_CLUSTER - 1;
253 found_free_cluster = 1;
254 goto checks;
255 }
256 if (unlikely(--latency_ration < 0)) {
257 cond_resched();
258 latency_ration = LATENCY_LIMIT;
259 }
260 }
261
262 offset = scan_base;
128 spin_lock(&swap_lock); 263 spin_lock(&swap_lock);
129 goto lowest; 264 si->cluster_nr = SWAPFILE_CLUSTER - 1;
265 si->lowest_alloc = 0;
130 } 266 }
131 267
132 si->cluster_nr--; 268checks:
133cluster: 269 if (!(si->flags & SWP_WRITEOK))
134 offset = si->cluster_next;
135 if (offset > si->highest_bit)
136lowest: offset = si->lowest_bit;
137checks: if (!(si->flags & SWP_WRITEOK))
138 goto no_page; 270 goto no_page;
139 if (!si->highest_bit) 271 if (!si->highest_bit)
140 goto no_page; 272 goto no_page;
141 if (!si->swap_map[offset]) { 273 if (offset > si->highest_bit)
142 if (offset == si->lowest_bit) 274 scan_base = offset = si->lowest_bit;
143 si->lowest_bit++; 275 if (si->swap_map[offset])
144 if (offset == si->highest_bit) 276 goto scan;
145 si->highest_bit--; 277
146 si->inuse_pages++; 278 if (offset == si->lowest_bit)
147 if (si->inuse_pages == si->pages) { 279 si->lowest_bit++;
148 si->lowest_bit = si->max; 280 if (offset == si->highest_bit)
149 si->highest_bit = 0; 281 si->highest_bit--;
282 si->inuse_pages++;
283 if (si->inuse_pages == si->pages) {
284 si->lowest_bit = si->max;
285 si->highest_bit = 0;
286 }
287 si->swap_map[offset] = 1;
288 si->cluster_next = offset + 1;
289 si->flags -= SWP_SCANNING;
290
291 if (si->lowest_alloc) {
292 /*
293 * Only set when SWP_DISCARDABLE, and there's a scan
294 * for a free cluster in progress or just completed.
295 */
296 if (found_free_cluster) {
297 /*
298 * To optimize wear-levelling, discard the
299 * old data of the cluster, taking care not to
300 * discard any of its pages that have already
301 * been allocated by racing tasks (offset has
302 * already stepped over any at the beginning).
303 */
304 if (offset < si->highest_alloc &&
305 si->lowest_alloc <= last_in_cluster)
306 last_in_cluster = si->lowest_alloc - 1;
307 si->flags |= SWP_DISCARDING;
308 spin_unlock(&swap_lock);
309
310 if (offset < last_in_cluster)
311 discard_swap_cluster(si, offset,
312 last_in_cluster - offset + 1);
313
314 spin_lock(&swap_lock);
315 si->lowest_alloc = 0;
316 si->flags &= ~SWP_DISCARDING;
317
318 smp_mb(); /* wake_up_bit advises this */
319 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
320
321 } else if (si->flags & SWP_DISCARDING) {
322 /*
323 * Delay using pages allocated by racing tasks
324 * until the whole discard has been issued. We
325 * could defer that delay until swap_writepage,
326 * but it's easier to keep this self-contained.
327 */
328 spin_unlock(&swap_lock);
329 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
330 wait_for_discard, TASK_UNINTERRUPTIBLE);
331 spin_lock(&swap_lock);
332 } else {
333 /*
334 * Note pages allocated by racing tasks while
335 * scan for a free cluster is in progress, so
336 * that its final discard can exclude them.
337 */
338 if (offset < si->lowest_alloc)
339 si->lowest_alloc = offset;
340 if (offset > si->highest_alloc)
341 si->highest_alloc = offset;
150 } 342 }
151 si->swap_map[offset] = 1;
152 si->cluster_next = offset + 1;
153 si->flags -= SWP_SCANNING;
154 return offset;
155 } 343 }
344 return offset;
156 345
346scan:
157 spin_unlock(&swap_lock); 347 spin_unlock(&swap_lock);
158 while (++offset <= si->highest_bit) { 348 while (++offset <= si->highest_bit) {
159 if (!si->swap_map[offset]) { 349 if (!si->swap_map[offset]) {
@@ -165,8 +355,18 @@ checks: if (!(si->flags & SWP_WRITEOK))
165 latency_ration = LATENCY_LIMIT; 355 latency_ration = LATENCY_LIMIT;
166 } 356 }
167 } 357 }
358 offset = si->lowest_bit;
359 while (++offset < scan_base) {
360 if (!si->swap_map[offset]) {
361 spin_lock(&swap_lock);
362 goto checks;
363 }
364 if (unlikely(--latency_ration < 0)) {
365 cond_resched();
366 latency_ration = LATENCY_LIMIT;
367 }
368 }
168 spin_lock(&swap_lock); 369 spin_lock(&swap_lock);
169 goto lowest;
170 370
171no_page: 371no_page:
172 si->flags -= SWP_SCANNING; 372 si->flags -= SWP_SCANNING;
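
On SWP_SOLIDSTATE devices the reworked scan_swap_map() keeps allocating from cluster_next rather than always restarting at lowest_bit, so writes spread over the whole device. A toy next-fit allocator over a small array captures just that ordering (none of the locking, clustering or discard logic above):

#include <stdio.h>

#define NSLOTS 16

static unsigned char swap_map[NSLOTS];	/* 0 = free, 1 = in use */
static unsigned int cluster_next;	/* where the next search starts */

static int alloc_slot(void)
{
	for (unsigned int n = 0; n < NSLOTS; n++) {
		unsigned int slot = (cluster_next + n) % NSLOTS;

		if (!swap_map[slot]) {
			swap_map[slot] = 1;
			cluster_next = slot + 1;	/* resume after it next time */
			return slot;
		}
	}
	return -1;				/* map is full */
}

int main(void)
{
	for (int i = 0; i < 6; i++)
		printf("allocated slot %d\n", alloc_slot());

	swap_map[2] = 0;			/* free an early slot */
	printf("after freeing slot 2, next allocation lands at %d\n", alloc_slot());
	return 0;
}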
@@ -268,7 +468,7 @@ bad_nofile:
268 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); 468 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
269out: 469out:
270 return NULL; 470 return NULL;
271} 471}
272 472
273static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) 473static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
274{ 474{
@@ -326,97 +526,58 @@ static inline int page_swapcount(struct page *page)
326} 526}
327 527
328/* 528/*
329 * We can use this swap cache entry directly 529 * We can write to an anon page without COW if there are no other references
330 * if there are no other references to it. 530 * to it. And as a side-effect, free up its swap: because the old content
531 * on disk will never be read, and seeking back there to write new content
532 * later would only waste time away from clustering.
331 */ 533 */
332int can_share_swap_page(struct page *page) 534int reuse_swap_page(struct page *page)
333{ 535{
334 int count; 536 int count;
335 537
336 BUG_ON(!PageLocked(page)); 538 VM_BUG_ON(!PageLocked(page));
337 count = page_mapcount(page); 539 count = page_mapcount(page);
338 if (count <= 1 && PageSwapCache(page)) 540 if (count <= 1 && PageSwapCache(page)) {
339 count += page_swapcount(page); 541 count += page_swapcount(page);
542 if (count == 1 && !PageWriteback(page)) {
543 delete_from_swap_cache(page);
544 SetPageDirty(page);
545 }
546 }
340 return count == 1; 547 return count == 1;
341} 548}
342 549
343/* 550/*
344 * Work out if there are any other processes sharing this 551 * If swap is getting full, or if there are no more mappings of this page,
345 * swap cache page. Free it if you can. Return success. 552 * then try_to_free_swap is called to free its swap space.
346 */ 553 */
347static int remove_exclusive_swap_page_count(struct page *page, int count) 554int try_to_free_swap(struct page *page)
348{ 555{
349 int retval; 556 VM_BUG_ON(!PageLocked(page));
350 struct swap_info_struct * p;
351 swp_entry_t entry;
352
353 BUG_ON(PagePrivate(page));
354 BUG_ON(!PageLocked(page));
355 557
356 if (!PageSwapCache(page)) 558 if (!PageSwapCache(page))
357 return 0; 559 return 0;
358 if (PageWriteback(page)) 560 if (PageWriteback(page))
359 return 0; 561 return 0;
360 if (page_count(page) != count) /* us + cache + ptes */ 562 if (page_swapcount(page))
361 return 0;
362
363 entry.val = page_private(page);
364 p = swap_info_get(entry);
365 if (!p)
366 return 0; 563 return 0;
367 564
368 /* Is the only swap cache user the cache itself? */ 565 delete_from_swap_cache(page);
369 retval = 0; 566 SetPageDirty(page);
370 if (p->swap_map[swp_offset(entry)] == 1) { 567 return 1;
371 /* Recheck the page count with the swapcache lock held.. */
372 spin_lock_irq(&swapper_space.tree_lock);
373 if ((page_count(page) == count) && !PageWriteback(page)) {
374 __delete_from_swap_cache(page);
375 SetPageDirty(page);
376 retval = 1;
377 }
378 spin_unlock_irq(&swapper_space.tree_lock);
379 }
380 spin_unlock(&swap_lock);
381
382 if (retval) {
383 swap_free(entry);
384 page_cache_release(page);
385 }
386
387 return retval;
388}
389
390/*
391 * Most of the time the page should have two references: one for the
392 * process and one for the swap cache.
393 */
394int remove_exclusive_swap_page(struct page *page)
395{
396 return remove_exclusive_swap_page_count(page, 2);
397}
398
399/*
400 * The pageout code holds an extra reference to the page. That raises
401 * the reference count to test for to 2 for a page that is only in the
402 * swap cache plus 1 for each process that maps the page.
403 */
404int remove_exclusive_swap_page_ref(struct page *page)
405{
406 return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page));
407} 568}
408 569
409/* 570/*
410 * Free the swap entry like above, but also try to 571 * Free the swap entry like above, but also try to
411 * free the page cache entry if it is the last user. 572 * free the page cache entry if it is the last user.
412 */ 573 */
413void free_swap_and_cache(swp_entry_t entry) 574int free_swap_and_cache(swp_entry_t entry)
414{ 575{
415 struct swap_info_struct * p; 576 struct swap_info_struct *p;
416 struct page *page = NULL; 577 struct page *page = NULL;
417 578
418 if (is_migration_entry(entry)) 579 if (is_migration_entry(entry))
419 return; 580 return 1;
420 581
421 p = swap_info_get(entry); 582 p = swap_info_get(entry);
422 if (p) { 583 if (p) {
@@ -430,20 +591,19 @@ void free_swap_and_cache(swp_entry_t entry)
430 spin_unlock(&swap_lock); 591 spin_unlock(&swap_lock);
431 } 592 }
432 if (page) { 593 if (page) {
433 int one_user; 594 /*
434 595 * Not mapped elsewhere, or swap space full? Free it!
435 BUG_ON(PagePrivate(page)); 596 * Also recheck PageSwapCache now page is locked (above).
436 one_user = (page_count(page) == 2); 597 */
437 /* Only cache user (+us), or swap space full? Free it! */
438 /* Also recheck PageSwapCache after page is locked (above) */
439 if (PageSwapCache(page) && !PageWriteback(page) && 598 if (PageSwapCache(page) && !PageWriteback(page) &&
440 (one_user || vm_swap_full())) { 599 (!page_mapped(page) || vm_swap_full())) {
441 delete_from_swap_cache(page); 600 delete_from_swap_cache(page);
442 SetPageDirty(page); 601 SetPageDirty(page);
443 } 602 }
444 unlock_page(page); 603 unlock_page(page);
445 page_cache_release(page); 604 page_cache_release(page);
446 } 605 }
606 return p != NULL;
447} 607}
448 608
449#ifdef CONFIG_HIBERNATION 609#ifdef CONFIG_HIBERNATION
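
For orientation, the two reworked helpers above are meant to be used roughly as follows. This is an illustrative sketch assuming kernel context (a locked page on the reclaim side, a swap entry taken from a pte on the unmap side), not code from the patch itself:

	/* Illustrative fragments only; names are those introduced in this diff. */

	/* Reclaim side (cf. the vmscan.c hunks later in this diff): the page
	 * is locked; if swap is getting tight, hand its slot back --
	 * try_to_free_swap() itself rechecks page_swapcount(). */
	if (PageSwapCache(page) && vm_swap_full())
		try_to_free_swap(page);

	/* Unmap side: drop this pte's reference on the swap slot; if only the
	 * swap cache is left holding it, the cached copy goes too.  A zero
	 * return now means the entry did not belong to any swap area. */
	if (!free_swap_and_cache(entry))
		printk(KERN_WARNING "bad swap entry in pte\n");	/* illustrative handling */
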
@@ -776,10 +936,10 @@ static int try_to_unuse(unsigned int type)
776 break; 936 break;
777 } 937 }
778 938
779 /* 939 /*
780 * Get a page for the entry, using the existing swap 940 * Get a page for the entry, using the existing swap
781 * cache page if there is one. Otherwise, get a clean 941 * cache page if there is one. Otherwise, get a clean
782 * page and read the swap into it. 942 * page and read the swap into it.
783 */ 943 */
784 swap_map = &si->swap_map[i]; 944 swap_map = &si->swap_map[i];
785 entry = swp_entry(type, i); 945 entry = swp_entry(type, i);
@@ -930,7 +1090,16 @@ static int try_to_unuse(unsigned int type)
930 lock_page(page); 1090 lock_page(page);
931 wait_on_page_writeback(page); 1091 wait_on_page_writeback(page);
932 } 1092 }
933 if (PageSwapCache(page)) 1093
1094 /*
1095 * It is conceivable that a racing task removed this page from
1096 * swap cache just before we acquired the page lock at the top,
1097 * or while we dropped it in unuse_mm(). The page might even
1098 * be back in swap cache on another swap area: that we must not
1099 * delete, since it may not have been written out to swap yet.
1100 */
1101 if (PageSwapCache(page) &&
1102 likely(page_private(page) == entry.val))
934 delete_from_swap_cache(page); 1103 delete_from_swap_cache(page);
935 1104
936 /* 1105 /*
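
The page_private(page) == entry.val guard above works because, for a page in swap cache, page_private() holds the encoded swp_entry_t of the slot backing it. A small illustrative helper spelling the check out; kernel context assumed, and the helper name is made up, not part of the patch:

	/* Illustrative helper, not in the tree: does this swap-cache page
	 * still belong to the entry being unused?  A mismatch means the page
	 * was re-added to swap cache for a different slot -- possibly on
	 * another swap area -- and must not be deleted on this area's behalf. */
	static inline int page_backs_entry(struct page *page, swp_entry_t entry)
	{
		return PageSwapCache(page) && page_private(page) == entry.val;
	}
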
@@ -1203,26 +1372,6 @@ out:
1203 return ret; 1372 return ret;
1204} 1373}
1205 1374
1206#if 0 /* We don't need this yet */
1207#include <linux/backing-dev.h>
1208int page_queue_congested(struct page *page)
1209{
1210 struct backing_dev_info *bdi;
1211
1212 BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
1213
1214 if (PageSwapCache(page)) {
1215 swp_entry_t entry = { .val = page_private(page) };
1216 struct swap_info_struct *sis;
1217
1218 sis = get_swap_info_struct(swp_type(entry));
1219 bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
1220 } else
1221 bdi = page->mapping->backing_dev_info;
1222 return bdi_write_congested(bdi);
1223}
1224#endif
1225
1226asmlinkage long sys_swapoff(const char __user * specialfile) 1375asmlinkage long sys_swapoff(const char __user * specialfile)
1227{ 1376{
1228 struct swap_info_struct * p = NULL; 1377 struct swap_info_struct * p = NULL;
@@ -1233,7 +1382,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1233 char * pathname; 1382 char * pathname;
1234 int i, type, prev; 1383 int i, type, prev;
1235 int err; 1384 int err;
1236 1385
1237 if (!capable(CAP_SYS_ADMIN)) 1386 if (!capable(CAP_SYS_ADMIN))
1238 return -EPERM; 1387 return -EPERM;
1239 1388
@@ -1253,7 +1402,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1253 spin_lock(&swap_lock); 1402 spin_lock(&swap_lock);
1254 for (type = swap_list.head; type >= 0; type = swap_info[type].next) { 1403 for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
1255 p = swap_info + type; 1404 p = swap_info + type;
1256 if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { 1405 if (p->flags & SWP_WRITEOK) {
1257 if (p->swap_file->f_mapping == mapping) 1406 if (p->swap_file->f_mapping == mapping)
1258 break; 1407 break;
1259 } 1408 }
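
The swapoff lookup above now tests SWP_WRITEOK instead of masking with SWP_ACTIVE. For reference, the flag bits involved look roughly like the definitions in include/linux/swap.h of this kernel generation (reproduced for illustration; the exact bit layout is the header's, not this hunk's):

	enum {
		SWP_USED	= (1 << 0),	/* slot in swap_info[] is in use */
		SWP_WRITEOK	= (1 << 1),	/* ok to write to this swap area */
		/* SWP_DISCARDABLE, SWP_SOLIDSTATE, ... occupy higher bits */
		SWP_ACTIVE	= (SWP_USED | SWP_WRITEOK),
	};

An area being swapped off keeps SWP_USED but has SWP_WRITEOK cleared early, so testing SWP_WRITEOK alone still selects exactly the areas that are online for writing.
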
@@ -1426,12 +1575,12 @@ static int swap_show(struct seq_file *swap, void *v)
1426 file = ptr->swap_file; 1575 file = ptr->swap_file;
1427 len = seq_path(swap, &file->f_path, " \t\n\\"); 1576 len = seq_path(swap, &file->f_path, " \t\n\\");
1428 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1577 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1429 len < 40 ? 40 - len : 1, " ", 1578 len < 40 ? 40 - len : 1, " ",
1430 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? 1579 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1431 "partition" : "file\t", 1580 "partition" : "file\t",
1432 ptr->pages << (PAGE_SHIFT - 10), 1581 ptr->pages << (PAGE_SHIFT - 10),
1433 ptr->inuse_pages << (PAGE_SHIFT - 10), 1582 ptr->inuse_pages << (PAGE_SHIFT - 10),
1434 ptr->prio); 1583 ptr->prio);
1435 return 0; 1584 return 0;
1436} 1585}
1437 1586
@@ -1487,12 +1636,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1487 int i, prev; 1636 int i, prev;
1488 int error; 1637 int error;
1489 union swap_header *swap_header = NULL; 1638 union swap_header *swap_header = NULL;
1490 int swap_header_version;
1491 unsigned int nr_good_pages = 0; 1639 unsigned int nr_good_pages = 0;
1492 int nr_extents = 0; 1640 int nr_extents = 0;
1493 sector_t span; 1641 sector_t span;
1494 unsigned long maxpages = 1; 1642 unsigned long maxpages = 1;
1495 int swapfilesize; 1643 unsigned long swapfilepages;
1496 unsigned short *swap_map = NULL; 1644 unsigned short *swap_map = NULL;
1497 struct page *page = NULL; 1645 struct page *page = NULL;
1498 struct inode *inode = NULL; 1646 struct inode *inode = NULL;
@@ -1570,7 +1718,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1570 goto bad_swap; 1718 goto bad_swap;
1571 } 1719 }
1572 1720
1573 swapfilesize = i_size_read(inode) >> PAGE_SHIFT; 1721 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1574 1722
1575 /* 1723 /*
1576 * Read the swap header. 1724 * Read the swap header.
@@ -1584,101 +1732,86 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1584 error = PTR_ERR(page); 1732 error = PTR_ERR(page);
1585 goto bad_swap; 1733 goto bad_swap;
1586 } 1734 }
1587 kmap(page); 1735 swap_header = kmap(page);
1588 swap_header = page_address(page);
1589 1736
1590 if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) 1737 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1591 swap_header_version = 1;
1592 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
1593 swap_header_version = 2;
1594 else {
1595 printk(KERN_ERR "Unable to find swap-space signature\n"); 1738 printk(KERN_ERR "Unable to find swap-space signature\n");
1596 error = -EINVAL; 1739 error = -EINVAL;
1597 goto bad_swap; 1740 goto bad_swap;
1598 } 1741 }
1599 1742
1600 switch (swap_header_version) { 1743 /* swap partition endianess hack... */
1601 case 1: 1744 if (swab32(swap_header->info.version) == 1) {
1602 printk(KERN_ERR "version 0 swap is no longer supported. " 1745 swab32s(&swap_header->info.version);
1603 "Use mkswap -v1 %s\n", name); 1746 swab32s(&swap_header->info.last_page);
1747 swab32s(&swap_header->info.nr_badpages);
1748 for (i = 0; i < swap_header->info.nr_badpages; i++)
1749 swab32s(&swap_header->info.badpages[i]);
1750 }
1751 /* Check the swap header's sub-version */
1752 if (swap_header->info.version != 1) {
1753 printk(KERN_WARNING
1754 "Unable to handle swap header version %d\n",
1755 swap_header->info.version);
1604 error = -EINVAL; 1756 error = -EINVAL;
1605 goto bad_swap; 1757 goto bad_swap;
1606 case 2: 1758 }
1607 /* swap partition endianess hack... */
1608 if (swab32(swap_header->info.version) == 1) {
1609 swab32s(&swap_header->info.version);
1610 swab32s(&swap_header->info.last_page);
1611 swab32s(&swap_header->info.nr_badpages);
1612 for (i = 0; i < swap_header->info.nr_badpages; i++)
1613 swab32s(&swap_header->info.badpages[i]);
1614 }
1615 /* Check the swap header's sub-version and the size of
1616 the swap file and bad block lists */
1617 if (swap_header->info.version != 1) {
1618 printk(KERN_WARNING
1619 "Unable to handle swap header version %d\n",
1620 swap_header->info.version);
1621 error = -EINVAL;
1622 goto bad_swap;
1623 }
1624 1759
1625 p->lowest_bit = 1; 1760 p->lowest_bit = 1;
1626 p->cluster_next = 1; 1761 p->cluster_next = 1;
1627 1762
1628 /* 1763 /*
1629 * Find out how many pages are allowed for a single swap 1764 * Find out how many pages are allowed for a single swap
1630 * device. There are two limiting factors: 1) the number of 1765 * device. There are two limiting factors: 1) the number of
1631 * bits for the swap offset in the swp_entry_t type and 1766 * bits for the swap offset in the swp_entry_t type and
1632 * 2) the number of bits in a swap pte as defined by 1767 * 2) the number of bits in a swap pte as defined by
1633 * the different architectures. In order to find the 1768 * the different architectures. In order to find the
1634 * largest possible bit mask a swap entry with swap type 0 1769 * largest possible bit mask a swap entry with swap type 0
1635 * and swap offset ~0UL is created, encoded to a swap pte, 1770 * and swap offset ~0UL is created, encoded to a swap pte,
1636 * decoded to a swp_entry_t again and finally the swap 1771 * decoded to a swp_entry_t again and finally the swap
1637 * offset is extracted. This will mask all the bits from 1772 * offset is extracted. This will mask all the bits from
1638 * the initial ~0UL mask that can't be encoded in either 1773 * the initial ~0UL mask that can't be encoded in either
1639 * the swp_entry_t or the architecture definition of a 1774 * the swp_entry_t or the architecture definition of a
1640 * swap pte. 1775 * swap pte.
1641 */ 1776 */
1642 maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1; 1777 maxpages = swp_offset(pte_to_swp_entry(
1643 if (maxpages > swap_header->info.last_page) 1778 swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1;
1644 maxpages = swap_header->info.last_page; 1779 if (maxpages > swap_header->info.last_page)
1645 p->highest_bit = maxpages - 1; 1780 maxpages = swap_header->info.last_page;
1781 p->highest_bit = maxpages - 1;
1646 1782
1647 error = -EINVAL; 1783 error = -EINVAL;
1648 if (!maxpages) 1784 if (!maxpages)
1649 goto bad_swap; 1785 goto bad_swap;
1650 if (swapfilesize && maxpages > swapfilesize) { 1786 if (swapfilepages && maxpages > swapfilepages) {
1651 printk(KERN_WARNING 1787 printk(KERN_WARNING
1652 "Swap area shorter than signature indicates\n"); 1788 "Swap area shorter than signature indicates\n");
1653 goto bad_swap; 1789 goto bad_swap;
1654 } 1790 }
1655 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 1791 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1656 goto bad_swap; 1792 goto bad_swap;
1657 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1793 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1658 goto bad_swap; 1794 goto bad_swap;
1659 1795
1660 /* OK, set up the swap map and apply the bad block list */ 1796 /* OK, set up the swap map and apply the bad block list */
1661 swap_map = vmalloc(maxpages * sizeof(short)); 1797 swap_map = vmalloc(maxpages * sizeof(short));
1662 if (!swap_map) { 1798 if (!swap_map) {
1663 error = -ENOMEM; 1799 error = -ENOMEM;
1664 goto bad_swap; 1800 goto bad_swap;
1665 } 1801 }
1666 1802
1667 error = 0; 1803 memset(swap_map, 0, maxpages * sizeof(short));
1668 memset(swap_map, 0, maxpages * sizeof(short)); 1804 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1669 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1805 int page_nr = swap_header->info.badpages[i];
1670 int page_nr = swap_header->info.badpages[i]; 1806 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
1671 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) 1807 error = -EINVAL;
1672 error = -EINVAL;
1673 else
1674 swap_map[page_nr] = SWAP_MAP_BAD;
1675 }
1676 nr_good_pages = swap_header->info.last_page -
1677 swap_header->info.nr_badpages -
1678 1 /* header page */;
1679 if (error)
1680 goto bad_swap; 1808 goto bad_swap;
1809 }
1810 swap_map[page_nr] = SWAP_MAP_BAD;
1681 } 1811 }
1812 nr_good_pages = swap_header->info.last_page -
1813 swap_header->info.nr_badpages -
1814 1 /* header page */;
1682 1815
1683 if (nr_good_pages) { 1816 if (nr_good_pages) {
1684 swap_map[0] = SWAP_MAP_BAD; 1817 swap_map[0] = SWAP_MAP_BAD;
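
The maxpages calculation in the hunk above caps a swap area at whatever offset survives a round trip through the architecture's swap pte encoding. The same masking idea in standalone form, assuming purely for illustration an architecture whose swap pte can hold 27 offset bits (the helper is a stand-in for the arch encode/decode functions):

#include <stdio.h>

/* Hypothetical arch: only 27 offset bits fit in a swap pte. */
#define SWP_OFFSET_BITS 27

static unsigned long offset_roundtrip(unsigned long offset)
{
	/* "encode to pte": bits that don't fit are dropped ... */
	unsigned long in_pte = offset & ((1UL << SWP_OFFSET_BITS) - 1);
	/* ... "decode back": only what the pte could carry survives. */
	return in_pte;
}

int main(void)
{
	/* Mirrors: swp_offset(pte_to_swp_entry(swp_entry_to_pte(
	 *		swp_entry(0, ~0UL)))) - 1			*/
	unsigned long maxpages = offset_roundtrip(~0UL) - 1;

	printf("%lu pages (~%lu GB with 4K pages)\n", maxpages, maxpages >> 18);
	return 0;
}
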
@@ -1697,6 +1830,13 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1697 goto bad_swap; 1830 goto bad_swap;
1698 } 1831 }
1699 1832
1833 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
1834 p->flags |= SWP_SOLIDSTATE;
1835 p->cluster_next = 1 + (random32() % p->highest_bit);
1836 }
1837 if (discard_swap(p) == 0)
1838 p->flags |= SWP_DISCARDABLE;
1839
1700 mutex_lock(&swapon_mutex); 1840 mutex_lock(&swapon_mutex);
1701 spin_lock(&swap_lock); 1841 spin_lock(&swap_lock);
1702 if (swap_flags & SWAP_FLAG_PREFER) 1842 if (swap_flags & SWAP_FLAG_PREFER)
@@ -1705,14 +1845,16 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1705 else 1845 else
1706 p->prio = --least_priority; 1846 p->prio = --least_priority;
1707 p->swap_map = swap_map; 1847 p->swap_map = swap_map;
1708 p->flags = SWP_ACTIVE; 1848 p->flags |= SWP_WRITEOK;
1709 nr_swap_pages += nr_good_pages; 1849 nr_swap_pages += nr_good_pages;
1710 total_swap_pages += nr_good_pages; 1850 total_swap_pages += nr_good_pages;
1711 1851
1712 printk(KERN_INFO "Adding %uk swap on %s. " 1852 printk(KERN_INFO "Adding %uk swap on %s. "
1713 "Priority:%d extents:%d across:%lluk\n", 1853 "Priority:%d extents:%d across:%lluk %s%s\n",
1714 nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, 1854 nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
1715 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10)); 1855 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
1856 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
1857 (p->flags & SWP_DISCARDABLE) ? "D" : "");
1716 1858
1717 /* insert swap space into swap_list: */ 1859 /* insert swap space into swap_list: */
1718 prev = -1; 1860 prev = -1;
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
deleted file mode 100644
index 3e67d575ee6e..000000000000
--- a/mm/tiny-shmem.c
+++ /dev/null
@@ -1,134 +0,0 @@
1/*
2 * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code
3 *
4 * Matt Mackall <mpm@selenic.com> January, 2004
5 * derived from mm/shmem.c and fs/ramfs/inode.c
6 *
7 * This is intended for small systems where the benefits of the full
8 * shmem code (swap-backed and resource-limited) are outweighed by
9 * their complexity. On systems without swap this code should be
10 * effectively equivalent, but much lighter weight.
11 */
12
13#include <linux/fs.h>
14#include <linux/init.h>
15#include <linux/vfs.h>
16#include <linux/mount.h>
17#include <linux/file.h>
18#include <linux/mm.h>
19#include <linux/module.h>
20#include <linux/swap.h>
21#include <linux/ramfs.h>
22
23static struct file_system_type tmpfs_fs_type = {
24 .name = "tmpfs",
25 .get_sb = ramfs_get_sb,
26 .kill_sb = kill_litter_super,
27};
28
29static struct vfsmount *shm_mnt;
30
31static int __init init_tmpfs(void)
32{
33 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
34
35 shm_mnt = kern_mount(&tmpfs_fs_type);
36 BUG_ON(IS_ERR(shm_mnt));
37
38 return 0;
39}
40module_init(init_tmpfs)
41
42/**
43 * shmem_file_setup - get an unlinked file living in tmpfs
44 * @name: name for dentry (to be seen in /proc/<pid>/maps)
45 * @size: size to be set for the file
46 * @flags: vm_flags
47 */
48struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
49{
50 int error;
51 struct file *file;
52 struct inode *inode;
53 struct dentry *dentry, *root;
54 struct qstr this;
55
56 if (IS_ERR(shm_mnt))
57 return (void *)shm_mnt;
58
59 error = -ENOMEM;
60 this.name = name;
61 this.len = strlen(name);
62 this.hash = 0; /* will go */
63 root = shm_mnt->mnt_root;
64 dentry = d_alloc(root, &this);
65 if (!dentry)
66 goto put_memory;
67
68 error = -ENFILE;
69 file = get_empty_filp();
70 if (!file)
71 goto put_dentry;
72
73 error = -ENOSPC;
74 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
75 if (!inode)
76 goto close_file;
77
78 d_instantiate(dentry, inode);
79 inode->i_size = size;
80 inode->i_nlink = 0; /* It is unlinked */
81 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
82 &ramfs_file_operations);
83
84#ifndef CONFIG_MMU
85 error = ramfs_nommu_expand_for_mapping(inode, size);
86 if (error)
87 goto close_file;
88#endif
89 return file;
90
91close_file:
92 put_filp(file);
93put_dentry:
94 dput(dentry);
95put_memory:
96 return ERR_PTR(error);
97}
98EXPORT_SYMBOL_GPL(shmem_file_setup);
99
100/**
101 * shmem_zero_setup - setup a shared anonymous mapping
102 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
103 */
104int shmem_zero_setup(struct vm_area_struct *vma)
105{
106 struct file *file;
107 loff_t size = vma->vm_end - vma->vm_start;
108
109 file = shmem_file_setup("dev/zero", size, vma->vm_flags);
110 if (IS_ERR(file))
111 return PTR_ERR(file);
112
113 if (vma->vm_file)
114 fput(vma->vm_file);
115 vma->vm_file = file;
116 vma->vm_ops = &generic_file_vm_ops;
117 return 0;
118}
119
120int shmem_unuse(swp_entry_t entry, struct page *page)
121{
122 return 0;
123}
124
125#ifndef CONFIG_MMU
126unsigned long shmem_get_unmapped_area(struct file *file,
127 unsigned long addr,
128 unsigned long len,
129 unsigned long pgoff,
130 unsigned long flags)
131{
132 return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
133}
134#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 7465f22fec0c..c5db9a7264d9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -14,6 +14,7 @@
14#include <linux/highmem.h> 14#include <linux/highmem.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/spinlock.h> 16#include <linux/spinlock.h>
17#include <linux/mutex.h>
17#include <linux/interrupt.h> 18#include <linux/interrupt.h>
18#include <linux/proc_fs.h> 19#include <linux/proc_fs.h>
19#include <linux/seq_file.h> 20#include <linux/seq_file.h>
@@ -381,8 +382,9 @@ found:
381 goto retry; 382 goto retry;
382 } 383 }
383 if (printk_ratelimit()) 384 if (printk_ratelimit())
384 printk(KERN_WARNING "vmap allocation failed: " 385 printk(KERN_WARNING
385 "use vmalloc=<size> to increase size.\n"); 386 "vmap allocation for size %lu failed: "
387 "use vmalloc=<size> to increase size.\n", size);
386 return ERR_PTR(-EBUSY); 388 return ERR_PTR(-EBUSY);
387 } 389 }
388 390
@@ -432,6 +434,27 @@ static void unmap_vmap_area(struct vmap_area *va)
432 vunmap_page_range(va->va_start, va->va_end); 434 vunmap_page_range(va->va_start, va->va_end);
433} 435}
434 436
437static void vmap_debug_free_range(unsigned long start, unsigned long end)
438{
439 /*
440 * Unmap page tables and force a TLB flush immediately if
441 * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free
442 * bugs similarly to those in linear kernel virtual address
443 * space after a page has been freed.
444 *
445 * All the lazy freeing logic is still retained, in order to
446 * minimise intrusiveness of this debugging feature.
447 *
448 * This is going to be *slow* (linear kernel virtual address
449 * debugging doesn't do a broadcast TLB flush so it is a lot
450 * faster).
451 */
452#ifdef CONFIG_DEBUG_PAGEALLOC
453 vunmap_page_range(start, end);
454 flush_tlb_kernel_range(start, end);
455#endif
456}
457
435/* 458/*
436 * lazy_max_pages is the maximum amount of virtual address space we gather up 459 * lazy_max_pages is the maximum amount of virtual address space we gather up
437 * before attempting to purge with a TLB flush. 460 * before attempting to purge with a TLB flush.
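
The comment in vmap_debug_free_range() describes the bug class being targeted: with lazy purging, a vmap area stays mapped for a while after it is freed, so a stale pointer may appear to keep working. A deliberately buggy kernel-style sketch of the access that now faults immediately under CONFIG_DEBUG_PAGEALLOC; illustrative only, and the function name is made up:

	/* Kernel context assumed; this is the bug pattern, do not copy. */
	static void example_use_after_vfree(void)
	{
		char *buf = vmalloc(PAGE_SIZE);

		if (!buf)
			return;
		vfree(buf);
		/* Without DEBUG_PAGEALLOC the lazily retained mapping can let
		 * this slip by until the next purge; with the hunk above,
		 * vfree() unmaps and flushes the TLB, so the stale write
		 * faults right here. */
		buf[0] = 0;
	}
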
@@ -472,7 +495,7 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
472static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, 495static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
473 int sync, int force_flush) 496 int sync, int force_flush)
474{ 497{
475 static DEFINE_SPINLOCK(purge_lock); 498 static DEFINE_MUTEX(purge_lock);
476 LIST_HEAD(valist); 499 LIST_HEAD(valist);
477 struct vmap_area *va; 500 struct vmap_area *va;
478 int nr = 0; 501 int nr = 0;
@@ -483,10 +506,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
483 * the case that isn't actually used at the moment anyway. 506 * the case that isn't actually used at the moment anyway.
484 */ 507 */
485 if (!sync && !force_flush) { 508 if (!sync && !force_flush) {
486 if (!spin_trylock(&purge_lock)) 509 if (!mutex_trylock(&purge_lock))
487 return; 510 return;
488 } else 511 } else
489 spin_lock(&purge_lock); 512 mutex_lock(&purge_lock);
490 513
491 rcu_read_lock(); 514 rcu_read_lock();
492 list_for_each_entry_rcu(va, &vmap_area_list, list) { 515 list_for_each_entry_rcu(va, &vmap_area_list, list) {
@@ -518,7 +541,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
518 __free_vmap_area(va); 541 __free_vmap_area(va);
519 spin_unlock(&vmap_area_lock); 542 spin_unlock(&vmap_area_lock);
520 } 543 }
521 spin_unlock(&purge_lock); 544 mutex_unlock(&purge_lock);
522} 545}
523 546
524/* 547/*
@@ -912,6 +935,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
912 BUG_ON(addr & (PAGE_SIZE-1)); 935 BUG_ON(addr & (PAGE_SIZE-1));
913 936
914 debug_check_no_locks_freed(mem, size); 937 debug_check_no_locks_freed(mem, size);
938 vmap_debug_free_range(addr, addr+size);
915 939
916 if (likely(count <= VMAP_MAX_ALLOC)) 940 if (likely(count <= VMAP_MAX_ALLOC))
917 vb_free(mem, size); 941 vb_free(mem, size);
@@ -1128,6 +1152,8 @@ struct vm_struct *remove_vm_area(const void *addr)
1128 if (va && va->flags & VM_VM_AREA) { 1152 if (va && va->flags & VM_VM_AREA) {
1129 struct vm_struct *vm = va->private; 1153 struct vm_struct *vm = va->private;
1130 struct vm_struct *tmp, **p; 1154 struct vm_struct *tmp, **p;
1155
1156 vmap_debug_free_range(va->va_start, va->va_end);
1131 free_unmap_vmap_area(va); 1157 free_unmap_vmap_area(va);
1132 vm->size -= PAGE_SIZE; 1158 vm->size -= PAGE_SIZE;
1133 1159
@@ -1375,7 +1401,8 @@ void *vmalloc_user(unsigned long size)
1375 struct vm_struct *area; 1401 struct vm_struct *area;
1376 void *ret; 1402 void *ret;
1377 1403
1378 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); 1404 ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
1405 PAGE_KERNEL, -1, __builtin_return_address(0));
1379 if (ret) { 1406 if (ret) {
1380 area = find_vm_area(ret); 1407 area = find_vm_area(ret);
1381 area->flags |= VM_USERMAP; 1408 area->flags |= VM_USERMAP;
@@ -1420,7 +1447,8 @@ EXPORT_SYMBOL(vmalloc_node);
1420 1447
1421void *vmalloc_exec(unsigned long size) 1448void *vmalloc_exec(unsigned long size)
1422{ 1449{
1423 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); 1450 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
1451 -1, __builtin_return_address(0));
1424} 1452}
1425 1453
1426#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 1454#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
@@ -1440,7 +1468,8 @@ void *vmalloc_exec(unsigned long size)
1440 */ 1468 */
1441void *vmalloc_32(unsigned long size) 1469void *vmalloc_32(unsigned long size)
1442{ 1470{
1443 return __vmalloc(size, GFP_VMALLOC32, PAGE_KERNEL); 1471 return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL,
1472 -1, __builtin_return_address(0));
1444} 1473}
1445EXPORT_SYMBOL(vmalloc_32); 1474EXPORT_SYMBOL(vmalloc_32);
1446 1475
@@ -1456,7 +1485,8 @@ void *vmalloc_32_user(unsigned long size)
1456 struct vm_struct *area; 1485 struct vm_struct *area;
1457 void *ret; 1486 void *ret;
1458 1487
1459 ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); 1488 ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1489 -1, __builtin_return_address(0));
1460 if (ret) { 1490 if (ret) {
1461 area = find_vm_area(ret); 1491 area = find_vm_area(ret);
1462 area->flags |= VM_USERMAP; 1492 area->flags |= VM_USERMAP;
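
These wrappers now go through __vmalloc_node() so they can pass __builtin_return_address(0), the return address into their own caller; that address is what gets recorded against the vm area and later resolved to a symbol in /proc/vmallocinfo, so allocations are attributed to the real caller rather than to the convenience wrapper. A tiny standalone illustration of what the GCC builtin yields (names here are made up):

#include <stdio.h>

/* Kept out of line so there is a genuine return address to report. */
static __attribute__((noinline)) const void *who_called_me(void)
{
	return __builtin_return_address(0);	/* address inside our caller */
}

int main(void)
{
	printf("allocation requested from %p\n", who_called_me());
	return 0;
}
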
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d196f46c8808..b07c48b09a93 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,6 +52,9 @@ struct scan_control {
52 /* Incremented by the number of inactive pages that were scanned */ 52 /* Incremented by the number of inactive pages that were scanned */
53 unsigned long nr_scanned; 53 unsigned long nr_scanned;
54 54
55 /* Number of pages freed so far during a call to shrink_zones() */
56 unsigned long nr_reclaimed;
57
55 /* This context's GFP mask */ 58 /* This context's GFP mask */
56 gfp_t gfp_mask; 59 gfp_t gfp_mask;
57 60
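
The new field changes the calling convention used through the rest of this file: rather than every shrink_* function returning a count that its caller re-sums, the running total lives in the scan_control that is already threaded down the call chain. Schematically (simplified fragments, not the real functions):

	/* Before: each level returns pages freed and the caller adds. */
	nr_reclaimed += shrink_zone(priority, zone, sc);

	/* After: shrink_zone() is void and accumulates into shared state,
	 * so any level can consult the total, e.g. to stop early. */
	shrink_zone(priority, zone, sc);
	if (sc->nr_reclaimed >= sc->swap_cluster_max)
		goto out;	/* freed enough for this allocation attempt */
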
@@ -617,7 +620,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
617 referenced && page_mapping_inuse(page)) 620 referenced && page_mapping_inuse(page))
618 goto activate_locked; 621 goto activate_locked;
619 622
620#ifdef CONFIG_SWAP
621 /* 623 /*
622 * Anonymous process memory has backing store? 624 * Anonymous process memory has backing store?
623 * Try to allocate it some swap space here. 625 * Try to allocate it some swap space here.
@@ -625,20 +627,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
625 if (PageAnon(page) && !PageSwapCache(page)) { 627 if (PageAnon(page) && !PageSwapCache(page)) {
626 if (!(sc->gfp_mask & __GFP_IO)) 628 if (!(sc->gfp_mask & __GFP_IO))
627 goto keep_locked; 629 goto keep_locked;
628 switch (try_to_munlock(page)) { 630 if (!add_to_swap(page))
629 case SWAP_FAIL: /* shouldn't happen */
630 case SWAP_AGAIN:
631 goto keep_locked;
632 case SWAP_MLOCK:
633 goto cull_mlocked;
634 case SWAP_SUCCESS:
635 ; /* fall thru'; add to swap cache */
636 }
637 if (!add_to_swap(page, GFP_ATOMIC))
638 goto activate_locked; 631 goto activate_locked;
639 may_enter_fs = 1; 632 may_enter_fs = 1;
640 } 633 }
641#endif /* CONFIG_SWAP */
642 634
643 mapping = page_mapping(page); 635 mapping = page_mapping(page);
644 636
@@ -752,6 +744,8 @@ free_it:
752 continue; 744 continue;
753 745
754cull_mlocked: 746cull_mlocked:
747 if (PageSwapCache(page))
748 try_to_free_swap(page);
755 unlock_page(page); 749 unlock_page(page);
756 putback_lru_page(page); 750 putback_lru_page(page);
757 continue; 751 continue;
@@ -759,7 +753,7 @@ cull_mlocked:
759activate_locked: 753activate_locked:
760 /* Not a candidate for swapping, so reclaim swap space. */ 754 /* Not a candidate for swapping, so reclaim swap space. */
761 if (PageSwapCache(page) && vm_swap_full()) 755 if (PageSwapCache(page) && vm_swap_full())
762 remove_exclusive_swap_page_ref(page); 756 try_to_free_swap(page);
763 VM_BUG_ON(PageActive(page)); 757 VM_BUG_ON(PageActive(page));
764 SetPageActive(page); 758 SetPageActive(page);
765 pgactivate++; 759 pgactivate++;
@@ -1173,11 +1167,6 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1173 zone->prev_priority = priority; 1167 zone->prev_priority = priority;
1174} 1168}
1175 1169
1176static inline int zone_is_near_oom(struct zone *zone)
1177{
1178 return zone->pages_scanned >= (zone_lru_pages(zone) * 3);
1179}
1180
1181/* 1170/*
1182 * This moves pages from the active list to the inactive list. 1171 * This moves pages from the active list to the inactive list.
1183 * 1172 *
@@ -1248,6 +1237,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1248 list_add(&page->lru, &l_inactive); 1237 list_add(&page->lru, &l_inactive);
1249 } 1238 }
1250 1239
1240 /*
1241 * Move the pages to the [file or anon] inactive list.
1242 */
1243 pagevec_init(&pvec, 1);
1244 pgmoved = 0;
1245 lru = LRU_BASE + file * LRU_FILE;
1246
1251 spin_lock_irq(&zone->lru_lock); 1247 spin_lock_irq(&zone->lru_lock);
1252 /* 1248 /*
1253 * Count referenced pages from currently used mappings as 1249 * Count referenced pages from currently used mappings as
@@ -1255,15 +1251,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1255 * This helps balance scan pressure between file and anonymous 1251 * This helps balance scan pressure between file and anonymous
1256 * pages in get_scan_ratio. 1252 * pages in get_scan_ratio.
1257 */ 1253 */
1258 zone->recent_rotated[!!file] += pgmoved; 1254 if (scan_global_lru(sc))
1259 1255 zone->recent_rotated[!!file] += pgmoved;
1260 /*
1261 * Move the pages to the [file or anon] inactive list.
1262 */
1263 pagevec_init(&pvec, 1);
1264 1256
1265 pgmoved = 0;
1266 lru = LRU_BASE + file * LRU_FILE;
1267 while (!list_empty(&l_inactive)) { 1257 while (!list_empty(&l_inactive)) {
1268 page = lru_to_page(&l_inactive); 1258 page = lru_to_page(&l_inactive);
1269 prefetchw_prev_lru_page(page, &l_inactive, flags); 1259 prefetchw_prev_lru_page(page, &l_inactive, flags);
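
The hoisted lru = LRU_BASE + file * LRU_FILE selects the inactive list of the right type for the pages being demoted. For reference, the lru_list index arithmetic of this kernel generation works out as below (mirroring include/linux/mmzone.h; shown for illustration):

#define LRU_BASE	0
#define LRU_ACTIVE	1
#define LRU_FILE	2

enum lru_list {
	LRU_INACTIVE_ANON = LRU_BASE,				/* 0 */
	LRU_ACTIVE_ANON   = LRU_BASE + LRU_ACTIVE,		/* 1 */
	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,		/* 2 */
	LRU_ACTIVE_FILE   = LRU_BASE + LRU_FILE + LRU_ACTIVE,	/* 3 */
	LRU_UNEVICTABLE,					/* 4 */
	NR_LRU_LISTS
};

/* So LRU_BASE + file * LRU_FILE is LRU_INACTIVE_ANON for file == 0
 * and LRU_INACTIVE_FILE for file == 1. */
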
@@ -1336,12 +1326,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1336 unsigned long anon_prio, file_prio; 1326 unsigned long anon_prio, file_prio;
1337 unsigned long ap, fp; 1327 unsigned long ap, fp;
1338 1328
1339 anon = zone_page_state(zone, NR_ACTIVE_ANON) +
1340 zone_page_state(zone, NR_INACTIVE_ANON);
1341 file = zone_page_state(zone, NR_ACTIVE_FILE) +
1342 zone_page_state(zone, NR_INACTIVE_FILE);
1343 free = zone_page_state(zone, NR_FREE_PAGES);
1344
1345 /* If we have no swap space, do not bother scanning anon pages. */ 1329 /* If we have no swap space, do not bother scanning anon pages. */
1346 if (nr_swap_pages <= 0) { 1330 if (nr_swap_pages <= 0) {
1347 percent[0] = 0; 1331 percent[0] = 0;
@@ -1349,6 +1333,12 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1349 return; 1333 return;
1350 } 1334 }
1351 1335
1336 anon = zone_page_state(zone, NR_ACTIVE_ANON) +
1337 zone_page_state(zone, NR_INACTIVE_ANON);
1338 file = zone_page_state(zone, NR_ACTIVE_FILE) +
1339 zone_page_state(zone, NR_INACTIVE_FILE);
1340 free = zone_page_state(zone, NR_FREE_PAGES);
1341
1352 /* If we have very few page cache pages, force-scan anon pages. */ 1342 /* If we have very few page cache pages, force-scan anon pages. */
1353 if (unlikely(file + free <= zone->pages_high)) { 1343 if (unlikely(file + free <= zone->pages_high)) {
1354 percent[0] = 100; 1344 percent[0] = 100;
@@ -1408,14 +1398,15 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1408/* 1398/*
1409 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1399 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1410 */ 1400 */
1411static unsigned long shrink_zone(int priority, struct zone *zone, 1401static void shrink_zone(int priority, struct zone *zone,
1412 struct scan_control *sc) 1402 struct scan_control *sc)
1413{ 1403{
1414 unsigned long nr[NR_LRU_LISTS]; 1404 unsigned long nr[NR_LRU_LISTS];
1415 unsigned long nr_to_scan; 1405 unsigned long nr_to_scan;
1416 unsigned long nr_reclaimed = 0;
1417 unsigned long percent[2]; /* anon @ 0; file @ 1 */ 1406 unsigned long percent[2]; /* anon @ 0; file @ 1 */
1418 enum lru_list l; 1407 enum lru_list l;
1408 unsigned long nr_reclaimed = sc->nr_reclaimed;
1409 unsigned long swap_cluster_max = sc->swap_cluster_max;
1419 1410
1420 get_scan_ratio(zone, sc, percent); 1411 get_scan_ratio(zone, sc, percent);
1421 1412
@@ -1431,7 +1422,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1431 } 1422 }
1432 zone->lru[l].nr_scan += scan; 1423 zone->lru[l].nr_scan += scan;
1433 nr[l] = zone->lru[l].nr_scan; 1424 nr[l] = zone->lru[l].nr_scan;
1434 if (nr[l] >= sc->swap_cluster_max) 1425 if (nr[l] >= swap_cluster_max)
1435 zone->lru[l].nr_scan = 0; 1426 zone->lru[l].nr_scan = 0;
1436 else 1427 else
1437 nr[l] = 0; 1428 nr[l] = 0;
@@ -1450,16 +1441,28 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1450 nr[LRU_INACTIVE_FILE]) { 1441 nr[LRU_INACTIVE_FILE]) {
1451 for_each_evictable_lru(l) { 1442 for_each_evictable_lru(l) {
1452 if (nr[l]) { 1443 if (nr[l]) {
1453 nr_to_scan = min(nr[l], 1444 nr_to_scan = min(nr[l], swap_cluster_max);
1454 (unsigned long)sc->swap_cluster_max);
1455 nr[l] -= nr_to_scan; 1445 nr[l] -= nr_to_scan;
1456 1446
1457 nr_reclaimed += shrink_list(l, nr_to_scan, 1447 nr_reclaimed += shrink_list(l, nr_to_scan,
1458 zone, sc, priority); 1448 zone, sc, priority);
1459 } 1449 }
1460 } 1450 }
1451 /*
1452 * On large memory systems, scan >> priority can become
1453 * really large. This is fine for the starting priority;
1454 * we want to put equal scanning pressure on each zone.
1455 * However, if the VM has a harder time of freeing pages,
1456 * with multiple processes reclaiming pages, the total
1457 * freeing target can get unreasonably large.
1458 */
1459 if (nr_reclaimed > swap_cluster_max &&
1460 priority < DEF_PRIORITY && !current_is_kswapd())
1461 break;
1461 } 1462 }
1462 1463
1464 sc->nr_reclaimed = nr_reclaimed;
1465
1463 /* 1466 /*
1464 * Even if we did not try to evict anon pages at all, we want to 1467 * Even if we did not try to evict anon pages at all, we want to
1465 * rebalance the anon lru active/inactive ratio. 1468 * rebalance the anon lru active/inactive ratio.
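
The reasoning in the "scan >> priority" comment above is easier to see with numbers. A standalone back-of-the-envelope program; the zone size is hypothetical, while SWAP_CLUSTER_MAX is 32 and DEF_PRIORITY is 12 in kernels of this era:

#include <stdio.h>

int main(void)
{
	/* Hypothetical zone with 16 GB of LRU pages, 4K each. */
	unsigned long lru_pages = (16UL << 30) / 4096;	/* 4194304 pages */

	/* Priority drops by one per failed pass (target doubles each time);
	 * sampled every third step here for brevity. */
	for (int priority = 12; priority >= 0; priority -= 3)
		printf("priority %2d: scan target %7lu pages\n",
		       priority, lru_pages >> priority);

	/* At priority 12 the target is 1024 pages; at priority 0 it is the
	 * whole zone.  The early break once nr_reclaimed exceeds
	 * swap_cluster_max (32) keeps a direct reclaimer from chasing that
	 * huge target when it only needed a few pages. */
	return 0;
}
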
@@ -1470,7 +1473,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1470 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1473 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1471 1474
1472 throttle_vm_writeout(sc->gfp_mask); 1475 throttle_vm_writeout(sc->gfp_mask);
1473 return nr_reclaimed;
1474} 1476}
1475 1477
1476/* 1478/*
@@ -1484,16 +1486,13 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1484 * b) The zones may be over pages_high but they must go *over* pages_high to 1486 * b) The zones may be over pages_high but they must go *over* pages_high to
1485 * satisfy the `incremental min' zone defense algorithm. 1487 * satisfy the `incremental min' zone defense algorithm.
1486 * 1488 *
1487 * Returns the number of reclaimed pages.
1488 *
1489 * If a zone is deemed to be full of pinned pages then just give it a light 1489 * If a zone is deemed to be full of pinned pages then just give it a light
1490 * scan then give up on it. 1490 * scan then give up on it.
1491 */ 1491 */
1492static unsigned long shrink_zones(int priority, struct zonelist *zonelist, 1492static void shrink_zones(int priority, struct zonelist *zonelist,
1493 struct scan_control *sc) 1493 struct scan_control *sc)
1494{ 1494{
1495 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1495 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1496 unsigned long nr_reclaimed = 0;
1497 struct zoneref *z; 1496 struct zoneref *z;
1498 struct zone *zone; 1497 struct zone *zone;
1499 1498
@@ -1524,10 +1523,8 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1524 priority); 1523 priority);
1525 } 1524 }
1526 1525
1527 nr_reclaimed += shrink_zone(priority, zone, sc); 1526 shrink_zone(priority, zone, sc);
1528 } 1527 }
1529
1530 return nr_reclaimed;
1531} 1528}
1532 1529
1533/* 1530/*
@@ -1552,7 +1549,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1552 int priority; 1549 int priority;
1553 unsigned long ret = 0; 1550 unsigned long ret = 0;
1554 unsigned long total_scanned = 0; 1551 unsigned long total_scanned = 0;
1555 unsigned long nr_reclaimed = 0;
1556 struct reclaim_state *reclaim_state = current->reclaim_state; 1552 struct reclaim_state *reclaim_state = current->reclaim_state;
1557 unsigned long lru_pages = 0; 1553 unsigned long lru_pages = 0;
1558 struct zoneref *z; 1554 struct zoneref *z;
@@ -1580,7 +1576,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1580 sc->nr_scanned = 0; 1576 sc->nr_scanned = 0;
1581 if (!priority) 1577 if (!priority)
1582 disable_swap_token(); 1578 disable_swap_token();
1583 nr_reclaimed += shrink_zones(priority, zonelist, sc); 1579 shrink_zones(priority, zonelist, sc);
1584 /* 1580 /*
1585 * Don't shrink slabs when reclaiming memory from 1581 * Don't shrink slabs when reclaiming memory from
1586 * over limit cgroups 1582 * over limit cgroups
@@ -1588,13 +1584,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1588 if (scan_global_lru(sc)) { 1584 if (scan_global_lru(sc)) {
1589 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); 1585 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
1590 if (reclaim_state) { 1586 if (reclaim_state) {
1591 nr_reclaimed += reclaim_state->reclaimed_slab; 1587 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
1592 reclaim_state->reclaimed_slab = 0; 1588 reclaim_state->reclaimed_slab = 0;
1593 } 1589 }
1594 } 1590 }
1595 total_scanned += sc->nr_scanned; 1591 total_scanned += sc->nr_scanned;
1596 if (nr_reclaimed >= sc->swap_cluster_max) { 1592 if (sc->nr_reclaimed >= sc->swap_cluster_max) {
1597 ret = nr_reclaimed; 1593 ret = sc->nr_reclaimed;
1598 goto out; 1594 goto out;
1599 } 1595 }
1600 1596
@@ -1617,7 +1613,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1617 } 1613 }
1618 /* top priority shrink_zones still had more to do? don't OOM, then */ 1614 /* top priority shrink_zones still had more to do? don't OOM, then */
1619 if (!sc->all_unreclaimable && scan_global_lru(sc)) 1615 if (!sc->all_unreclaimable && scan_global_lru(sc))
1620 ret = nr_reclaimed; 1616 ret = sc->nr_reclaimed;
1621out: 1617out:
1622 /* 1618 /*
1623 * Now that we've scanned all the zones at this priority level, note 1619 * Now that we've scanned all the zones at this priority level, note
@@ -1712,7 +1708,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1712 int priority; 1708 int priority;
1713 int i; 1709 int i;
1714 unsigned long total_scanned; 1710 unsigned long total_scanned;
1715 unsigned long nr_reclaimed;
1716 struct reclaim_state *reclaim_state = current->reclaim_state; 1711 struct reclaim_state *reclaim_state = current->reclaim_state;
1717 struct scan_control sc = { 1712 struct scan_control sc = {
1718 .gfp_mask = GFP_KERNEL, 1713 .gfp_mask = GFP_KERNEL,
@@ -1731,7 +1726,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1731 1726
1732loop_again: 1727loop_again:
1733 total_scanned = 0; 1728 total_scanned = 0;
1734 nr_reclaimed = 0; 1729 sc.nr_reclaimed = 0;
1735 sc.may_writepage = !laptop_mode; 1730 sc.may_writepage = !laptop_mode;
1736 count_vm_event(PAGEOUTRUN); 1731 count_vm_event(PAGEOUTRUN);
1737 1732
@@ -1817,11 +1812,11 @@ loop_again:
1817 */ 1812 */
1818 if (!zone_watermark_ok(zone, order, 8*zone->pages_high, 1813 if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
1819 end_zone, 0)) 1814 end_zone, 0))
1820 nr_reclaimed += shrink_zone(priority, zone, &sc); 1815 shrink_zone(priority, zone, &sc);
1821 reclaim_state->reclaimed_slab = 0; 1816 reclaim_state->reclaimed_slab = 0;
1822 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1817 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1823 lru_pages); 1818 lru_pages);
1824 nr_reclaimed += reclaim_state->reclaimed_slab; 1819 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
1825 total_scanned += sc.nr_scanned; 1820 total_scanned += sc.nr_scanned;
1826 if (zone_is_all_unreclaimable(zone)) 1821 if (zone_is_all_unreclaimable(zone))
1827 continue; 1822 continue;
@@ -1835,7 +1830,7 @@ loop_again:
1835 * even in laptop mode 1830 * even in laptop mode
1836 */ 1831 */
1837 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 1832 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
1838 total_scanned > nr_reclaimed + nr_reclaimed / 2) 1833 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
1839 sc.may_writepage = 1; 1834 sc.may_writepage = 1;
1840 } 1835 }
1841 if (all_zones_ok) 1836 if (all_zones_ok)
@@ -1853,7 +1848,7 @@ loop_again:
1853 * matches the direct reclaim path behaviour in terms of impact 1848 * matches the direct reclaim path behaviour in terms of impact
1854 * on zone->*_priority. 1849 * on zone->*_priority.
1855 */ 1850 */
1856 if (nr_reclaimed >= SWAP_CLUSTER_MAX) 1851 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
1857 break; 1852 break;
1858 } 1853 }
1859out: 1854out:
@@ -1872,10 +1867,27 @@ out:
1872 1867
1873 try_to_freeze(); 1868 try_to_freeze();
1874 1869
1870 /*
1871 * Fragmentation may mean that the system cannot be
1872 * rebalanced for high-order allocations in all zones.
1873 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
1874 * it means the zones have been fully scanned and are still
1875 * not balanced. For high-order allocations, there is
1876 * little point trying all over again as kswapd may
1877 * infinite loop.
1878 *
1879 * Instead, recheck all watermarks at order-0 as they
1880 * are the most important. If watermarks are ok, kswapd will go
1881 * back to sleep. High-order users can still perform direct
1882 * reclaim if they wish.
1883 */
1884 if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
1885 order = sc.order = 0;
1886
1875 goto loop_again; 1887 goto loop_again;
1876 } 1888 }
1877 1889
1878 return nr_reclaimed; 1890 return sc.nr_reclaimed;
1879} 1891}
1880 1892
1881/* 1893/*
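
The order-0 fallback above concerns high-order requests, i.e. allocations of 2^order physically contiguous pages, which fragmentation can make impossible to satisfy no matter how much kswapd frees. A standalone illustration of the sizes involved, with a 4K page size assumed:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;	/* assumed PAGE_SIZE */

	/* An order-n allocation needs 2^n contiguous pages. */
	for (int order = 0; order <= 4; order++)
		printf("order %d: %2d contiguous pages = %3lu KB\n",
		       order, 1 << order,
		       ((1UL << order) * page_size) >> 10);

	/* Once a full scan frees fewer than SWAP_CLUSTER_MAX pages, kswapd
	 * stops chasing such requests, rechecks only the order-0 watermarks,
	 * and leaves further high-order reclaim to the allocator's
	 * direct-reclaim path. */
	return 0;
}
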
@@ -2227,7 +2239,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2227 struct task_struct *p = current; 2239 struct task_struct *p = current;
2228 struct reclaim_state reclaim_state; 2240 struct reclaim_state reclaim_state;
2229 int priority; 2241 int priority;
2230 unsigned long nr_reclaimed = 0;
2231 struct scan_control sc = { 2242 struct scan_control sc = {
2232 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 2243 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
2233 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), 2244 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -2260,9 +2271,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2260 priority = ZONE_RECLAIM_PRIORITY; 2271 priority = ZONE_RECLAIM_PRIORITY;
2261 do { 2272 do {
2262 note_zone_scanning_priority(zone, priority); 2273 note_zone_scanning_priority(zone, priority);
2263 nr_reclaimed += shrink_zone(priority, zone, &sc); 2274 shrink_zone(priority, zone, &sc);
2264 priority--; 2275 priority--;
2265 } while (priority >= 0 && nr_reclaimed < nr_pages); 2276 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
2266 } 2277 }
2267 2278
2268 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2279 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -2286,13 +2297,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2286 * Update nr_reclaimed by the number of slab pages we 2297 * Update nr_reclaimed by the number of slab pages we
2287 * reclaimed from this zone. 2298 * reclaimed from this zone.
2288 */ 2299 */
2289 nr_reclaimed += slab_reclaimable - 2300 sc.nr_reclaimed += slab_reclaimable -
2290 zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2301 zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2291 } 2302 }
2292 2303
2293 p->reclaim_state = NULL; 2304 p->reclaim_state = NULL;
2294 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 2305 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
2295 return nr_reclaimed >= nr_pages; 2306 return sc.nr_reclaimed >= nr_pages;
2296} 2307}
2297 2308
2298int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 2309int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
@@ -2472,7 +2483,7 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
2472 * back onto @zone's unevictable list. 2483 * back onto @zone's unevictable list.
2473 */ 2484 */
2474#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ 2485#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
2475void scan_zone_unevictable_pages(struct zone *zone) 2486static void scan_zone_unevictable_pages(struct zone *zone)
2476{ 2487{
2477 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; 2488 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
2478 unsigned long scan; 2489 unsigned long scan;
@@ -2514,7 +2525,7 @@ void scan_zone_unevictable_pages(struct zone *zone)
2514 * that has possibly/probably made some previously unevictable pages 2525 * that has possibly/probably made some previously unevictable pages
2515 * evictable. 2526 * evictable.
2516 */ 2527 */
2517void scan_all_zones_unevictable_pages(void) 2528static void scan_all_zones_unevictable_pages(void)
2518{ 2529{
2519 struct zone *zone; 2530 struct zone *zone;
2520 2531