Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c      177
-rw-r--r--  mm/madvise.c      103
-rw-r--r--  mm/memory.c        57
-rw-r--r--  mm/mempolicy.c    110
-rw-r--r--  mm/mmap.c          57
-rw-r--r--  mm/msync.c          2
-rw-r--r--  mm/nommu.c          2
-rw-r--r--  mm/oom_kill.c       7
-rw-r--r--  mm/page_alloc.c   423
-rw-r--r--  mm/rmap.c          21
-rw-r--r--  mm/shmem.c        143
-rw-r--r--  mm/slab.c           1
-rw-r--r--  mm/swapfile.c      55
-rw-r--r--  mm/vmscan.c       103
14 files changed, 918 insertions, 343 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4eb5ae3fbe10..fbd1111ea119 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7,10 +7,14 @@
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/hugetlb.h>
11#include <linux/sysctl.h> 10#include <linux/sysctl.h>
12#include <linux/highmem.h> 11#include <linux/highmem.h>
13#include <linux/nodemask.h> 12#include <linux/nodemask.h>
13#include <linux/pagemap.h>
14#include <asm/page.h>
15#include <asm/pgtable.h>
16
17#include <linux/hugetlb.h>
14 18
15const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 19const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
16static unsigned long nr_huge_pages, free_huge_pages; 20static unsigned long nr_huge_pages, free_huge_pages;
@@ -249,6 +253,72 @@ struct vm_operations_struct hugetlb_vm_ops = {
249 .nopage = hugetlb_nopage, 253 .nopage = hugetlb_nopage,
250}; 254};
251 255
256static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
257{
258 pte_t entry;
259
260 if (vma->vm_flags & VM_WRITE) {
261 entry =
262 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
263 } else {
264 entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
265 }
266 entry = pte_mkyoung(entry);
267 entry = pte_mkhuge(entry);
268
269 return entry;
270}
271
272int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
273 struct vm_area_struct *vma)
274{
275 pte_t *src_pte, *dst_pte, entry;
276 struct page *ptepage;
277 unsigned long addr = vma->vm_start;
278 unsigned long end = vma->vm_end;
279
280 while (addr < end) {
281 dst_pte = huge_pte_alloc(dst, addr);
282 if (!dst_pte)
283 goto nomem;
284 src_pte = huge_pte_offset(src, addr);
285 BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */
286 entry = *src_pte;
287 ptepage = pte_page(entry);
288 get_page(ptepage);
289 add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
290 set_huge_pte_at(dst, addr, dst_pte, entry);
291 addr += HPAGE_SIZE;
292 }
293 return 0;
294
295nomem:
296 return -ENOMEM;
297}
298
299void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
300 unsigned long end)
301{
302 struct mm_struct *mm = vma->vm_mm;
303 unsigned long address;
304 pte_t pte;
305 struct page *page;
306
307 WARN_ON(!is_vm_hugetlb_page(vma));
308 BUG_ON(start & ~HPAGE_MASK);
309 BUG_ON(end & ~HPAGE_MASK);
310
311 for (address = start; address < end; address += HPAGE_SIZE) {
312 pte = huge_ptep_get_and_clear(mm, address, huge_pte_offset(mm, address));
313 if (pte_none(pte))
314 continue;
315 page = pte_page(pte);
316 put_page(page);
317 }
318 add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
319 flush_tlb_range(vma, start, end);
320}
321
252void zap_hugepage_range(struct vm_area_struct *vma, 322void zap_hugepage_range(struct vm_area_struct *vma,
253 unsigned long start, unsigned long length) 323 unsigned long start, unsigned long length)
254{ 324{
@@ -258,3 +328,108 @@ void zap_hugepage_range(struct vm_area_struct *vma,
258 unmap_hugepage_range(vma, start, start + length); 328 unmap_hugepage_range(vma, start, start + length);
259 spin_unlock(&mm->page_table_lock); 329 spin_unlock(&mm->page_table_lock);
260} 330}
331
332int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
333{
334 struct mm_struct *mm = current->mm;
335 unsigned long addr;
336 int ret = 0;
337
338 WARN_ON(!is_vm_hugetlb_page(vma));
339 BUG_ON(vma->vm_start & ~HPAGE_MASK);
340 BUG_ON(vma->vm_end & ~HPAGE_MASK);
341
342 hugetlb_prefault_arch_hook(mm);
343
344 spin_lock(&mm->page_table_lock);
345 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
346 unsigned long idx;
347 pte_t *pte = huge_pte_alloc(mm, addr);
348 struct page *page;
349
350 if (!pte) {
351 ret = -ENOMEM;
352 goto out;
353 }
354 if (! pte_none(*pte))
355 hugetlb_clean_stale_pgtable(pte);
356
357 idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
358 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
359 page = find_get_page(mapping, idx);
360 if (!page) {
361 /* charge the fs quota first */
362 if (hugetlb_get_quota(mapping)) {
363 ret = -ENOMEM;
364 goto out;
365 }
366 page = alloc_huge_page();
367 if (!page) {
368 hugetlb_put_quota(mapping);
369 ret = -ENOMEM;
370 goto out;
371 }
372 ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
373 if (! ret) {
374 unlock_page(page);
375 } else {
376 hugetlb_put_quota(mapping);
377 free_huge_page(page);
378 goto out;
379 }
380 }
381 add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
382 set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
383 }
384out:
385 spin_unlock(&mm->page_table_lock);
386 return ret;
387}
388
389int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
390 struct page **pages, struct vm_area_struct **vmas,
391 unsigned long *position, int *length, int i)
392{
393 unsigned long vpfn, vaddr = *position;
394 int remainder = *length;
395
396 BUG_ON(!is_vm_hugetlb_page(vma));
397
398 vpfn = vaddr/PAGE_SIZE;
399 while (vaddr < vma->vm_end && remainder) {
400
401 if (pages) {
402 pte_t *pte;
403 struct page *page;
404
405 /* Some archs (sparc64, sh*) have multiple
406 * pte_ts to each hugepage. We have to make
407 * sure we get the first, for the page
408 * indexing below to work. */
409 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
410
411 /* hugetlb should be locked, and hence, prefaulted */
412 WARN_ON(!pte || pte_none(*pte));
413
414 page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
415
416 WARN_ON(!PageCompound(page));
417
418 get_page(page);
419 pages[i] = page;
420 }
421
422 if (vmas)
423 vmas[i] = vma;
424
425 vaddr += PAGE_SIZE;
426 ++vpfn;
427 --remainder;
428 ++i;
429 }
430
431 *length = remainder;
432 *position = vaddr;
433
434 return i;
435}
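
The hugetlb_prefault() loop above converts each faulting address into a page-cache index at huge-page granularity: the offset of the address within the VMA, in huge pages, plus the file offset of the mapping converted to the same units. A stand-alone sketch of that arithmetic (user-space C; PAGE_SHIFT 12 and HPAGE_SHIFT 21, i.e. 4 KB base pages and 2 MB huge pages, are assumptions here and are architecture-dependent, not taken from this diff):

#include <stdio.h>

#define PAGE_SHIFT  12   /* assumed: 4 KB base pages */
#define HPAGE_SHIFT 21   /* assumed: 2 MB huge pages */

/* Same index calculation as hugetlb_prefault(): VMA-relative offset in
 * huge-page units, plus the mapping's file offset in huge-page units. */
static unsigned long hugetlb_idx(unsigned long addr, unsigned long vm_start,
                                 unsigned long vm_pgoff)
{
    return ((addr - vm_start) >> HPAGE_SHIFT)
            + (vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
}

int main(void)
{
    unsigned long vm_start = 0x40000000UL;               /* arbitrary */
    unsigned long vm_pgoff = (4UL << 20) >> PAGE_SHIFT;  /* 4 MB into the file */
    unsigned long addr = vm_start + 2 * (1UL << HPAGE_SHIFT);

    printf("idx = %lu\n", hugetlb_idx(addr, vm_start, vm_pgoff));
    return 0;
}

This prints idx = 4: two huge pages into the VMA plus a two-huge-page file offset, which is the slot hugetlb_prefault() would look up with find_get_page().
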
diff --git a/mm/madvise.c b/mm/madvise.c
index 944b5e52d812..e3108054733c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -8,17 +8,47 @@
8#include <linux/mman.h> 8#include <linux/mman.h>
9#include <linux/pagemap.h> 9#include <linux/pagemap.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/mempolicy.h>
11#include <linux/hugetlb.h> 12#include <linux/hugetlb.h>
12 13
13/* 14/*
14 * We can potentially split a vm area into separate 15 * We can potentially split a vm area into separate
15 * areas, each area with its own behavior. 16 * areas, each area with its own behavior.
16 */ 17 */
17static long madvise_behavior(struct vm_area_struct * vma, unsigned long start, 18static long madvise_behavior(struct vm_area_struct * vma,
18 unsigned long end, int behavior) 19 struct vm_area_struct **prev,
20 unsigned long start, unsigned long end, int behavior)
19{ 21{
20 struct mm_struct * mm = vma->vm_mm; 22 struct mm_struct * mm = vma->vm_mm;
21 int error = 0; 23 int error = 0;
24 pgoff_t pgoff;
25 int new_flags = vma->vm_flags & ~VM_READHINTMASK;
26
27 switch (behavior) {
28 case MADV_SEQUENTIAL:
29 new_flags |= VM_SEQ_READ;
30 break;
31 case MADV_RANDOM:
32 new_flags |= VM_RAND_READ;
33 break;
34 default:
35 break;
36 }
37
38 if (new_flags == vma->vm_flags) {
39 *prev = vma;
40 goto success;
41 }
42
43 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
44 *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
45 vma->vm_file, pgoff, vma_policy(vma));
46 if (*prev) {
47 vma = *prev;
48 goto success;
49 }
50
51 *prev = vma;
22 52
23 if (start != vma->vm_start) { 53 if (start != vma->vm_start) {
24 error = split_vma(mm, vma, start, 1); 54 error = split_vma(mm, vma, start, 1);
@@ -36,21 +66,12 @@ static long madvise_behavior(struct vm_area_struct * vma, unsigned long start,
36 * vm_flags is protected by the mmap_sem held in write mode. 66 * vm_flags is protected by the mmap_sem held in write mode.
37 */ 67 */
38 VM_ClearReadHint(vma); 68 VM_ClearReadHint(vma);
39 69 vma->vm_flags = new_flags;
40 switch (behavior) {
41 case MADV_SEQUENTIAL:
42 vma->vm_flags |= VM_SEQ_READ;
43 break;
44 case MADV_RANDOM:
45 vma->vm_flags |= VM_RAND_READ;
46 break;
47 default:
48 break;
49 }
50 70
51out: 71out:
52 if (error == -ENOMEM) 72 if (error == -ENOMEM)
53 error = -EAGAIN; 73 error = -EAGAIN;
74success:
54 return error; 75 return error;
55} 76}
56 77
@@ -58,6 +79,7 @@ out:
58 * Schedule all required I/O operations. Do not wait for completion. 79 * Schedule all required I/O operations. Do not wait for completion.
59 */ 80 */
60static long madvise_willneed(struct vm_area_struct * vma, 81static long madvise_willneed(struct vm_area_struct * vma,
82 struct vm_area_struct ** prev,
61 unsigned long start, unsigned long end) 83 unsigned long start, unsigned long end)
62{ 84{
63 struct file *file = vma->vm_file; 85 struct file *file = vma->vm_file;
@@ -65,6 +87,7 @@ static long madvise_willneed(struct vm_area_struct * vma,
65 if (!file) 87 if (!file)
66 return -EBADF; 88 return -EBADF;
67 89
90 *prev = vma;
68 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 91 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
69 if (end > vma->vm_end) 92 if (end > vma->vm_end)
70 end = vma->vm_end; 93 end = vma->vm_end;
@@ -95,8 +118,10 @@ static long madvise_willneed(struct vm_area_struct * vma,
95 * dirty pages is already available as msync(MS_INVALIDATE). 118 * dirty pages is already available as msync(MS_INVALIDATE).
96 */ 119 */
97static long madvise_dontneed(struct vm_area_struct * vma, 120static long madvise_dontneed(struct vm_area_struct * vma,
121 struct vm_area_struct ** prev,
98 unsigned long start, unsigned long end) 122 unsigned long start, unsigned long end)
99{ 123{
124 *prev = vma;
100 if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) 125 if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
101 return -EINVAL; 126 return -EINVAL;
102 127
@@ -111,8 +136,8 @@ static long madvise_dontneed(struct vm_area_struct * vma,
111 return 0; 136 return 0;
112} 137}
113 138
114static long madvise_vma(struct vm_area_struct * vma, unsigned long start, 139static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
115 unsigned long end, int behavior) 140 unsigned long start, unsigned long end, int behavior)
116{ 141{
117 long error = -EBADF; 142 long error = -EBADF;
118 143
@@ -120,15 +145,15 @@ static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
120 case MADV_NORMAL: 145 case MADV_NORMAL:
121 case MADV_SEQUENTIAL: 146 case MADV_SEQUENTIAL:
122 case MADV_RANDOM: 147 case MADV_RANDOM:
123 error = madvise_behavior(vma, start, end, behavior); 148 error = madvise_behavior(vma, prev, start, end, behavior);
124 break; 149 break;
125 150
126 case MADV_WILLNEED: 151 case MADV_WILLNEED:
127 error = madvise_willneed(vma, start, end); 152 error = madvise_willneed(vma, prev, start, end);
128 break; 153 break;
129 154
130 case MADV_DONTNEED: 155 case MADV_DONTNEED:
131 error = madvise_dontneed(vma, start, end); 156 error = madvise_dontneed(vma, prev, start, end);
132 break; 157 break;
133 158
134 default: 159 default:
@@ -175,8 +200,8 @@ static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
175 */ 200 */
176asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) 201asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
177{ 202{
178 unsigned long end; 203 unsigned long end, tmp;
179 struct vm_area_struct * vma; 204 struct vm_area_struct * vma, *prev;
180 int unmapped_error = 0; 205 int unmapped_error = 0;
181 int error = -EINVAL; 206 int error = -EINVAL;
182 size_t len; 207 size_t len;
@@ -202,40 +227,42 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
202 /* 227 /*
203 * If the interval [start,end) covers some unmapped address 228 * If the interval [start,end) covers some unmapped address
204 * ranges, just ignore them, but return -ENOMEM at the end. 229 * ranges, just ignore them, but return -ENOMEM at the end.
230 * - different from the way of handling in mlock etc.
205 */ 231 */
206 vma = find_vma(current->mm, start); 232 vma = find_vma_prev(current->mm, start, &prev);
233 if (!vma && prev)
234 vma = prev->vm_next;
207 for (;;) { 235 for (;;) {
208 /* Still start < end. */ 236 /* Still start < end. */
209 error = -ENOMEM; 237 error = -ENOMEM;
210 if (!vma) 238 if (!vma)
211 goto out; 239 goto out;
212 240
213 /* Here start < vma->vm_end. */ 241 /* Here start < (end|vma->vm_end). */
214 if (start < vma->vm_start) { 242 if (start < vma->vm_start) {
215 unmapped_error = -ENOMEM; 243 unmapped_error = -ENOMEM;
216 start = vma->vm_start; 244 start = vma->vm_start;
245 if (start >= end)
246 goto out;
217 } 247 }
218 248
219 /* Here vma->vm_start <= start < vma->vm_end. */ 249 /* Here vma->vm_start <= start < (end|vma->vm_end) */
220 if (end <= vma->vm_end) { 250 tmp = vma->vm_end;
221 if (start < end) { 251 if (end < tmp)
222 error = madvise_vma(vma, start, end, 252 tmp = end;
223 behavior);
224 if (error)
225 goto out;
226 }
227 error = unmapped_error;
228 goto out;
229 }
230 253
231 /* Here vma->vm_start <= start < vma->vm_end < end. */ 254 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
232 error = madvise_vma(vma, start, vma->vm_end, behavior); 255 error = madvise_vma(vma, &prev, start, tmp, behavior);
233 if (error) 256 if (error)
234 goto out; 257 goto out;
235 start = vma->vm_end; 258 start = tmp;
236 vma = vma->vm_next; 259 if (start < prev->vm_end)
260 start = prev->vm_end;
261 error = unmapped_error;
262 if (start >= end)
263 goto out;
264 vma = prev->vm_next;
237 } 265 }
238
239out: 266out:
240 up_write(&current->mm->mmap_sem); 267 up_write(&current->mm->mmap_sem);
241 return error; 268 return error;
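
The reworked sys_madvise() loop above tracks the previous VMA so that a merge performed inside madvise_behavior() cannot leave the walk holding a stale vma pointer; the user-visible interface is unchanged. A small, hypothetical test program exercising the three advice classes dispatched by madvise_vma() (plain user-space C, not taken from the kernel tree):

#define _DEFAULT_SOURCE          /* for MAP_ANONYMOUS */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 16 * 4096;
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    memset(p, 0xaa, len);

    /* Read-ahead hints: these may split or merge VMAs, exactly the
     * cases madvise_behavior() now handles via vma_merge()/split_vma(). */
    if (madvise(p, len / 2, MADV_SEQUENTIAL))
        perror("MADV_SEQUENTIAL");
    if (madvise(p + len / 2, len / 2, MADV_RANDOM))
        perror("MADV_RANDOM");

    /* Discard the pages; anonymous memory reads back as zeroes. */
    if (madvise(p, len, MADV_DONTNEED))
        perror("MADV_DONTNEED");
    printf("first byte after MADV_DONTNEED: %#x\n", p[0]);

    munmap(p, len);
    return 0;
}
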
diff --git a/mm/memory.c b/mm/memory.c
index d209f745db7f..da91b7bf9986 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -840,23 +840,8 @@ check_user_page_readable(struct mm_struct *mm, unsigned long address)
840{ 840{
841 return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL; 841 return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL;
842} 842}
843
844EXPORT_SYMBOL(check_user_page_readable); 843EXPORT_SYMBOL(check_user_page_readable);
845 844
846/*
847 * Given a physical address, is there a useful struct page pointing to
848 * it? This may become more complex in the future if we start dealing
849 * with IO-aperture pages for direct-IO.
850 */
851
852static inline struct page *get_page_map(struct page *page)
853{
854 if (!pfn_valid(page_to_pfn(page)))
855 return NULL;
856 return page;
857}
858
859
860static inline int 845static inline int
861untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, 846untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
862 unsigned long address) 847 unsigned long address)
@@ -887,7 +872,6 @@ untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
887 return 0; 872 return 0;
888} 873}
889 874
890
891int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 875int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
892 unsigned long start, int len, int write, int force, 876 unsigned long start, int len, int write, int force,
893 struct page **pages, struct vm_area_struct **vmas) 877 struct page **pages, struct vm_area_struct **vmas)
@@ -951,21 +935,21 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
951 } 935 }
952 spin_lock(&mm->page_table_lock); 936 spin_lock(&mm->page_table_lock);
953 do { 937 do {
954 struct page *map; 938 struct page *page;
955 int lookup_write = write; 939 int lookup_write = write;
956 940
957 cond_resched_lock(&mm->page_table_lock); 941 cond_resched_lock(&mm->page_table_lock);
958 while (!(map = follow_page(mm, start, lookup_write))) { 942 while (!(page = follow_page(mm, start, lookup_write))) {
959 /* 943 /*
960 * Shortcut for anonymous pages. We don't want 944 * Shortcut for anonymous pages. We don't want
961 * to force the creation of pages tables for 945 * to force the creation of pages tables for
962 * insanly big anonymously mapped areas that 946 * insanely big anonymously mapped areas that
963 * nobody touched so far. This is important 947 * nobody touched so far. This is important
964 * for doing a core dump for these mappings. 948 * for doing a core dump for these mappings.
965 */ 949 */
966 if (!lookup_write && 950 if (!lookup_write &&
967 untouched_anonymous_page(mm,vma,start)) { 951 untouched_anonymous_page(mm,vma,start)) {
968 map = ZERO_PAGE(start); 952 page = ZERO_PAGE(start);
969 break; 953 break;
970 } 954 }
971 spin_unlock(&mm->page_table_lock); 955 spin_unlock(&mm->page_table_lock);
@@ -994,30 +978,21 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
994 spin_lock(&mm->page_table_lock); 978 spin_lock(&mm->page_table_lock);
995 } 979 }
996 if (pages) { 980 if (pages) {
997 pages[i] = get_page_map(map); 981 pages[i] = page;
998 if (!pages[i]) { 982 flush_dcache_page(page);
999 spin_unlock(&mm->page_table_lock); 983 if (!PageReserved(page))
1000 while (i--) 984 page_cache_get(page);
1001 page_cache_release(pages[i]);
1002 i = -EFAULT;
1003 goto out;
1004 }
1005 flush_dcache_page(pages[i]);
1006 if (!PageReserved(pages[i]))
1007 page_cache_get(pages[i]);
1008 } 985 }
1009 if (vmas) 986 if (vmas)
1010 vmas[i] = vma; 987 vmas[i] = vma;
1011 i++; 988 i++;
1012 start += PAGE_SIZE; 989 start += PAGE_SIZE;
1013 len--; 990 len--;
1014 } while(len && start < vma->vm_end); 991 } while (len && start < vma->vm_end);
1015 spin_unlock(&mm->page_table_lock); 992 spin_unlock(&mm->page_table_lock);
1016 } while(len); 993 } while (len);
1017out:
1018 return i; 994 return i;
1019} 995}
1020
1021EXPORT_SYMBOL(get_user_pages); 996EXPORT_SYMBOL(get_user_pages);
1022 997
1023static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, 998static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
@@ -1264,7 +1239,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1264 } 1239 }
1265 old_page = pfn_to_page(pfn); 1240 old_page = pfn_to_page(pfn);
1266 1241
1267 if (!TestSetPageLocked(old_page)) { 1242 if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
1268 int reuse = can_share_swap_page(old_page); 1243 int reuse = can_share_swap_page(old_page);
1269 unlock_page(old_page); 1244 unlock_page(old_page);
1270 if (reuse) { 1245 if (reuse) {
@@ -1711,10 +1686,6 @@ static int do_swap_page(struct mm_struct * mm,
1711 } 1686 }
1712 1687
1713 /* The page isn't present yet, go ahead with the fault. */ 1688 /* The page isn't present yet, go ahead with the fault. */
1714
1715 swap_free(entry);
1716 if (vm_swap_full())
1717 remove_exclusive_swap_page(page);
1718 1689
1719 inc_mm_counter(mm, rss); 1690 inc_mm_counter(mm, rss);
1720 pte = mk_pte(page, vma->vm_page_prot); 1691 pte = mk_pte(page, vma->vm_page_prot);
@@ -1722,12 +1693,16 @@ static int do_swap_page(struct mm_struct * mm,
1722 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 1693 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
1723 write_access = 0; 1694 write_access = 0;
1724 } 1695 }
1725 unlock_page(page);
1726 1696
1727 flush_icache_page(vma, page); 1697 flush_icache_page(vma, page);
1728 set_pte_at(mm, address, page_table, pte); 1698 set_pte_at(mm, address, page_table, pte);
1729 page_add_anon_rmap(page, vma, address); 1699 page_add_anon_rmap(page, vma, address);
1730 1700
1701 swap_free(entry);
1702 if (vm_swap_full())
1703 remove_exclusive_swap_page(page);
1704 unlock_page(page);
1705
1731 if (write_access) { 1706 if (write_access) {
1732 if (do_wp_page(mm, vma, address, 1707 if (do_wp_page(mm, vma, address,
1733 page_table, pmd, pte) == VM_FAULT_OOM) 1708 page_table, pmd, pte) == VM_FAULT_OOM)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08c41da429cf..cb41c31e7c87 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -238,46 +238,80 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
238} 238}
239 239
240/* Ensure all existing pages follow the policy. */ 240/* Ensure all existing pages follow the policy. */
241static int 241static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
242verify_pages(struct mm_struct *mm, 242 unsigned long addr, unsigned long end, unsigned long *nodes)
243 unsigned long addr, unsigned long end, unsigned long *nodes)
244{ 243{
245 while (addr < end) { 244 pte_t *orig_pte;
246 struct page *p; 245 pte_t *pte;
247 pte_t *pte; 246
248 pmd_t *pmd; 247 spin_lock(&mm->page_table_lock);
249 pud_t *pud; 248 orig_pte = pte = pte_offset_map(pmd, addr);
250 pgd_t *pgd; 249 do {
251 pgd = pgd_offset(mm, addr); 250 unsigned long pfn;
252 if (pgd_none(*pgd)) { 251 unsigned int nid;
253 unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK; 252
254 if (next > addr) 253 if (!pte_present(*pte))
255 break;
256 addr = next;
257 continue; 254 continue;
258 } 255 pfn = pte_pfn(*pte);
259 pud = pud_offset(pgd, addr); 256 if (!pfn_valid(pfn))
260 if (pud_none(*pud)) {
261 addr = (addr + PUD_SIZE) & PUD_MASK;
262 continue; 257 continue;
263 } 258 nid = pfn_to_nid(pfn);
264 pmd = pmd_offset(pud, addr); 259 if (!test_bit(nid, nodes))
265 if (pmd_none(*pmd)) { 260 break;
266 addr = (addr + PMD_SIZE) & PMD_MASK; 261 } while (pte++, addr += PAGE_SIZE, addr != end);
262 pte_unmap(orig_pte);
263 spin_unlock(&mm->page_table_lock);
264 return addr != end;
265}
266
267static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
268 unsigned long addr, unsigned long end, unsigned long *nodes)
269{
270 pmd_t *pmd;
271 unsigned long next;
272
273 pmd = pmd_offset(pud, addr);
274 do {
275 next = pmd_addr_end(addr, end);
276 if (pmd_none_or_clear_bad(pmd))
267 continue; 277 continue;
268 } 278 if (check_pte_range(mm, pmd, addr, next, nodes))
269 p = NULL; 279 return -EIO;
270 pte = pte_offset_map(pmd, addr); 280 } while (pmd++, addr = next, addr != end);
271 if (pte_present(*pte)) 281 return 0;
272 p = pte_page(*pte); 282}
273 pte_unmap(pte); 283
274 if (p) { 284static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
275 unsigned nid = page_to_nid(p); 285 unsigned long addr, unsigned long end, unsigned long *nodes)
276 if (!test_bit(nid, nodes)) 286{
277 return -EIO; 287 pud_t *pud;
278 } 288 unsigned long next;
279 addr += PAGE_SIZE; 289
280 } 290 pud = pud_offset(pgd, addr);
291 do {
292 next = pud_addr_end(addr, end);
293 if (pud_none_or_clear_bad(pud))
294 continue;
295 if (check_pmd_range(mm, pud, addr, next, nodes))
296 return -EIO;
297 } while (pud++, addr = next, addr != end);
298 return 0;
299}
300
301static inline int check_pgd_range(struct mm_struct *mm,
302 unsigned long addr, unsigned long end, unsigned long *nodes)
303{
304 pgd_t *pgd;
305 unsigned long next;
306
307 pgd = pgd_offset(mm, addr);
308 do {
309 next = pgd_addr_end(addr, end);
310 if (pgd_none_or_clear_bad(pgd))
311 continue;
312 if (check_pud_range(mm, pgd, addr, next, nodes))
313 return -EIO;
314 } while (pgd++, addr = next, addr != end);
281 return 0; 315 return 0;
282} 316}
283 317
@@ -299,7 +333,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
299 if (prev && prev->vm_end < vma->vm_start) 333 if (prev && prev->vm_end < vma->vm_start)
300 return ERR_PTR(-EFAULT); 334 return ERR_PTR(-EFAULT);
301 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { 335 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
302 err = verify_pages(vma->vm_mm, 336 err = check_pgd_range(vma->vm_mm,
303 vma->vm_start, vma->vm_end, nodes); 337 vma->vm_start, vma->vm_end, nodes);
304 if (err) { 338 if (err) {
305 first = ERR_PTR(err); 339 first = ERR_PTR(err);
@@ -721,7 +755,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or
721 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK); 755 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
722 page = __alloc_pages(gfp, order, zl); 756 page = __alloc_pages(gfp, order, zl);
723 if (page && page_zone(page) == zl->zones[0]) { 757 if (page && page_zone(page) == zl->zones[0]) {
724 zl->zones[0]->pageset[get_cpu()].interleave_hit++; 758 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
725 put_cpu(); 759 put_cpu();
726 } 760 }
727 return page; 761 return page;
diff --git a/mm/mmap.c b/mm/mmap.c
index de54acd9942f..da3fa90a0aae 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1175,7 +1175,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1175 (!vma || addr + len <= vma->vm_start)) 1175 (!vma || addr + len <= vma->vm_start))
1176 return addr; 1176 return addr;
1177 } 1177 }
1178 start_addr = addr = mm->free_area_cache; 1178 if (len > mm->cached_hole_size) {
1179 start_addr = addr = mm->free_area_cache;
1180 } else {
1181 start_addr = addr = TASK_UNMAPPED_BASE;
1182 mm->cached_hole_size = 0;
1183 }
1179 1184
1180full_search: 1185full_search:
1181 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 1186 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
@@ -1186,7 +1191,9 @@ full_search:
1186 * some holes. 1191 * some holes.
1187 */ 1192 */
1188 if (start_addr != TASK_UNMAPPED_BASE) { 1193 if (start_addr != TASK_UNMAPPED_BASE) {
1189 start_addr = addr = TASK_UNMAPPED_BASE; 1194 addr = TASK_UNMAPPED_BASE;
1195 start_addr = addr;
1196 mm->cached_hole_size = 0;
1190 goto full_search; 1197 goto full_search;
1191 } 1198 }
1192 return -ENOMEM; 1199 return -ENOMEM;
@@ -1198,19 +1205,22 @@ full_search:
1198 mm->free_area_cache = addr + len; 1205 mm->free_area_cache = addr + len;
1199 return addr; 1206 return addr;
1200 } 1207 }
1208 if (addr + mm->cached_hole_size < vma->vm_start)
1209 mm->cached_hole_size = vma->vm_start - addr;
1201 addr = vma->vm_end; 1210 addr = vma->vm_end;
1202 } 1211 }
1203} 1212}
1204#endif 1213#endif
1205 1214
1206void arch_unmap_area(struct vm_area_struct *area) 1215void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1207{ 1216{
1208 /* 1217 /*
1209 * Is this a new hole at the lowest possible address? 1218 * Is this a new hole at the lowest possible address?
1210 */ 1219 */
1211 if (area->vm_start >= TASK_UNMAPPED_BASE && 1220 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) {
1212 area->vm_start < area->vm_mm->free_area_cache) 1221 mm->free_area_cache = addr;
1213 area->vm_mm->free_area_cache = area->vm_start; 1222 mm->cached_hole_size = ~0UL;
1223 }
1214} 1224}
1215 1225
1216/* 1226/*
@@ -1240,6 +1250,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1240 return addr; 1250 return addr;
1241 } 1251 }
1242 1252
1253 /* check if free_area_cache is useful for us */
1254 if (len <= mm->cached_hole_size) {
1255 mm->cached_hole_size = 0;
1256 mm->free_area_cache = mm->mmap_base;
1257 }
1258
1243 /* either no address requested or can't fit in requested address hole */ 1259 /* either no address requested or can't fit in requested address hole */
1244 addr = mm->free_area_cache; 1260 addr = mm->free_area_cache;
1245 1261
@@ -1251,6 +1267,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1251 return (mm->free_area_cache = addr-len); 1267 return (mm->free_area_cache = addr-len);
1252 } 1268 }
1253 1269
1270 if (mm->mmap_base < len)
1271 goto bottomup;
1272
1254 addr = mm->mmap_base-len; 1273 addr = mm->mmap_base-len;
1255 1274
1256 do { 1275 do {
@@ -1264,38 +1283,45 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1264 /* remember the address as a hint for next time */ 1283 /* remember the address as a hint for next time */
1265 return (mm->free_area_cache = addr); 1284 return (mm->free_area_cache = addr);
1266 1285
1286 /* remember the largest hole we saw so far */
1287 if (addr + mm->cached_hole_size < vma->vm_start)
1288 mm->cached_hole_size = vma->vm_start - addr;
1289
1267 /* try just below the current vma->vm_start */ 1290 /* try just below the current vma->vm_start */
1268 addr = vma->vm_start-len; 1291 addr = vma->vm_start-len;
1269 } while (len < vma->vm_start); 1292 } while (len < vma->vm_start);
1270 1293
1294bottomup:
1271 /* 1295 /*
1272 * A failed mmap() very likely causes application failure, 1296 * A failed mmap() very likely causes application failure,
1273 * so fall back to the bottom-up function here. This scenario 1297 * so fall back to the bottom-up function here. This scenario
1274 * can happen with large stack limits and large mmap() 1298 * can happen with large stack limits and large mmap()
1275 * allocations. 1299 * allocations.
1276 */ 1300 */
1277 mm->free_area_cache = TASK_UNMAPPED_BASE; 1301 mm->cached_hole_size = ~0UL;
1302 mm->free_area_cache = TASK_UNMAPPED_BASE;
1278 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); 1303 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
1279 /* 1304 /*
1280 * Restore the topdown base: 1305 * Restore the topdown base:
1281 */ 1306 */
1282 mm->free_area_cache = mm->mmap_base; 1307 mm->free_area_cache = mm->mmap_base;
1308 mm->cached_hole_size = ~0UL;
1283 1309
1284 return addr; 1310 return addr;
1285} 1311}
1286#endif 1312#endif
1287 1313
1288void arch_unmap_area_topdown(struct vm_area_struct *area) 1314void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
1289{ 1315{
1290 /* 1316 /*
1291 * Is this a new hole at the highest possible address? 1317 * Is this a new hole at the highest possible address?
1292 */ 1318 */
1293 if (area->vm_end > area->vm_mm->free_area_cache) 1319 if (addr > mm->free_area_cache)
1294 area->vm_mm->free_area_cache = area->vm_end; 1320 mm->free_area_cache = addr;
1295 1321
1296 /* dont allow allocations above current base */ 1322 /* dont allow allocations above current base */
1297 if (area->vm_mm->free_area_cache > area->vm_mm->mmap_base) 1323 if (mm->free_area_cache > mm->mmap_base)
1298 area->vm_mm->free_area_cache = area->vm_mm->mmap_base; 1324 mm->free_area_cache = mm->mmap_base;
1299} 1325}
1300 1326
1301unsigned long 1327unsigned long
@@ -1595,7 +1621,6 @@ static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
1595 if (area->vm_flags & VM_LOCKED) 1621 if (area->vm_flags & VM_LOCKED)
1596 area->vm_mm->locked_vm -= len >> PAGE_SHIFT; 1622 area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
1597 vm_stat_unaccount(area); 1623 vm_stat_unaccount(area);
1598 area->vm_mm->unmap_area(area);
1599 remove_vm_struct(area); 1624 remove_vm_struct(area);
1600} 1625}
1601 1626
@@ -1649,6 +1674,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1649{ 1674{
1650 struct vm_area_struct **insertion_point; 1675 struct vm_area_struct **insertion_point;
1651 struct vm_area_struct *tail_vma = NULL; 1676 struct vm_area_struct *tail_vma = NULL;
1677 unsigned long addr;
1652 1678
1653 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 1679 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
1654 do { 1680 do {
@@ -1659,6 +1685,11 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1659 } while (vma && vma->vm_start < end); 1685 } while (vma && vma->vm_start < end);
1660 *insertion_point = vma; 1686 *insertion_point = vma;
1661 tail_vma->vm_next = NULL; 1687 tail_vma->vm_next = NULL;
1688 if (mm->unmap_area == arch_unmap_area)
1689 addr = prev ? prev->vm_end : mm->mmap_base;
1690 else
1691 addr = vma ? vma->vm_start : mm->mmap_base;
1692 mm->unmap_area(mm, addr);
1662 mm->mmap_cache = NULL; /* Kill the cache. */ 1693 mm->mmap_cache = NULL; /* Kill the cache. */
1663} 1694}
1664 1695
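
The cached_hole_size bookkeeping added above lets the bottom-up search skip a rescan from TASK_UNMAPPED_BASE when the request is larger than every hole already passed over, and restart from the base (recomputing the cache) when a lower hole might still fit. A simplified stand-alone model of that first-fit search, using a flat array of busy ranges instead of the VMA list (the constants and helper names are invented for illustration, not kernel code):

#include <stdio.h>

struct area { unsigned long start, end; };   /* sorted, busy [start, end) */

static const unsigned long base = 0x10000;   /* TASK_UNMAPPED_BASE stand-in */
static unsigned long free_area_cache = 0x10000;
static unsigned long cached_hole_size;       /* largest hole skipped so far */

static unsigned long get_area(const struct area *used, int n, unsigned long len)
{
    unsigned long addr;
    int i;

    if (len > cached_hole_size) {
        addr = free_area_cache;              /* no skipped hole can fit */
    } else {
        addr = base;                         /* a lower hole might fit */
        cached_hole_size = 0;
    }

    for (i = 0; i < n && used[i].end <= addr; i++)
        ;                                    /* skip busy areas below addr */
    for (; i < n; i++) {
        if (addr + len <= used[i].start)
            break;                           /* hole is big enough */
        if (used[i].start > addr && used[i].start - addr > cached_hole_size)
            cached_hole_size = used[i].start - addr;
        addr = used[i].end;
    }
    free_area_cache = addr + len;
    return addr;
}

int main(void)
{
    /* two busy ranges with a 0x8000-byte hole between them */
    struct area used[] = { { 0x10000, 0x20000 }, { 0x28000, 0x30000 } };

    printf("%#lx\n", get_area(used, 2, 0x4000));   /* 0x20000: first fit */
    printf("%#lx\n", get_area(used, 2, 0x4000));   /* 0x24000: resumes from cache */
    return 0;
}

The second request starts at free_area_cache rather than back at base, which is the point of the optimisation.
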
diff --git a/mm/msync.c b/mm/msync.c
index 090f426bca7d..d0f5a1bce7cb 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -34,6 +34,8 @@ static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
34 34
35 if (!pte_present(*pte)) 35 if (!pte_present(*pte))
36 continue; 36 continue;
37 if (!pte_maybe_dirty(*pte))
38 continue;
37 pfn = pte_pfn(*pte); 39 pfn = pte_pfn(*pte);
38 if (!pfn_valid(pfn)) 40 if (!pfn_valid(pfn))
39 continue; 41 continue;
diff --git a/mm/nommu.c b/mm/nommu.c
index c53e9c8f6b4a..ce74452c02d9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1067,7 +1067,7 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
1067 return -ENOMEM; 1067 return -ENOMEM;
1068} 1068}
1069 1069
1070void arch_unmap_area(struct vm_area_struct *area) 1070void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1071{ 1071{
1072} 1072}
1073 1073
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4bbb1cb10495..59666d905f19 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -258,6 +258,10 @@ void out_of_memory(unsigned int __nocast gfp_mask)
258 struct mm_struct *mm = NULL; 258 struct mm_struct *mm = NULL;
259 task_t * p; 259 task_t * p;
260 260
261 printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
262 /* print memory stats */
263 show_mem();
264
261 read_lock(&tasklist_lock); 265 read_lock(&tasklist_lock);
262retry: 266retry:
263 p = select_bad_process(); 267 p = select_bad_process();
@@ -268,12 +272,9 @@ retry:
268 /* Found nothing?!?! Either we hang forever, or we panic. */ 272 /* Found nothing?!?! Either we hang forever, or we panic. */
269 if (!p) { 273 if (!p) {
270 read_unlock(&tasklist_lock); 274 read_unlock(&tasklist_lock);
271 show_free_areas();
272 panic("Out of memory and no killable processes...\n"); 275 panic("Out of memory and no killable processes...\n");
273 } 276 }
274 277
275 printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
276 show_free_areas();
277 mm = oom_kill_process(p); 278 mm = oom_kill_process(p);
278 if (!mm) 279 if (!mm)
279 goto retry; 280 goto retry;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b1061b1962f8..206920796f5f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -105,11 +105,13 @@ static void bad_page(const char *function, struct page *page)
105 printk(KERN_EMERG "Backtrace:\n"); 105 printk(KERN_EMERG "Backtrace:\n");
106 dump_stack(); 106 dump_stack();
107 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); 107 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
108 page->flags &= ~(1 << PG_private | 108 page->flags &= ~(1 << PG_lru |
109 1 << PG_private |
109 1 << PG_locked | 110 1 << PG_locked |
110 1 << PG_lru |
111 1 << PG_active | 111 1 << PG_active |
112 1 << PG_dirty | 112 1 << PG_dirty |
113 1 << PG_reclaim |
114 1 << PG_slab |
113 1 << PG_swapcache | 115 1 << PG_swapcache |
114 1 << PG_writeback); 116 1 << PG_writeback);
115 set_page_count(page, 0); 117 set_page_count(page, 0);
@@ -440,14 +442,17 @@ void set_page_refs(struct page *page, int order)
440 */ 442 */
441static void prep_new_page(struct page *page, int order) 443static void prep_new_page(struct page *page, int order)
442{ 444{
443 if (page->mapping || page_mapcount(page) || 445 if ( page_mapcount(page) ||
444 (page->flags & ( 446 page->mapping != NULL ||
447 page_count(page) != 0 ||
448 (page->flags & (
449 1 << PG_lru |
445 1 << PG_private | 450 1 << PG_private |
446 1 << PG_locked | 451 1 << PG_locked |
447 1 << PG_lru |
448 1 << PG_active | 452 1 << PG_active |
449 1 << PG_dirty | 453 1 << PG_dirty |
450 1 << PG_reclaim | 454 1 << PG_reclaim |
455 1 << PG_slab |
451 1 << PG_swapcache | 456 1 << PG_swapcache |
452 1 << PG_writeback ))) 457 1 << PG_writeback )))
453 bad_page(__FUNCTION__, page); 458 bad_page(__FUNCTION__, page);
@@ -511,6 +516,36 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
511 return allocated; 516 return allocated;
512} 517}
513 518
519#ifdef CONFIG_NUMA
520/* Called from the slab reaper to drain remote pagesets */
521void drain_remote_pages(void)
522{
523 struct zone *zone;
524 int i;
525 unsigned long flags;
526
527 local_irq_save(flags);
528 for_each_zone(zone) {
529 struct per_cpu_pageset *pset;
530
531 /* Do not drain local pagesets */
532 if (zone->zone_pgdat->node_id == numa_node_id())
533 continue;
534
535 pset = zone->pageset[smp_processor_id()];
536 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
537 struct per_cpu_pages *pcp;
538
539 pcp = &pset->pcp[i];
540 if (pcp->count)
541 pcp->count -= free_pages_bulk(zone, pcp->count,
542 &pcp->list, 0);
543 }
544 }
545 local_irq_restore(flags);
546}
547#endif
548
514#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 549#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
515static void __drain_pages(unsigned int cpu) 550static void __drain_pages(unsigned int cpu)
516{ 551{
@@ -520,7 +555,7 @@ static void __drain_pages(unsigned int cpu)
520 for_each_zone(zone) { 555 for_each_zone(zone) {
521 struct per_cpu_pageset *pset; 556 struct per_cpu_pageset *pset;
522 557
523 pset = &zone->pageset[cpu]; 558 pset = zone_pcp(zone, cpu);
524 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 559 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
525 struct per_cpu_pages *pcp; 560 struct per_cpu_pages *pcp;
526 561
@@ -583,12 +618,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
583 618
584 local_irq_save(flags); 619 local_irq_save(flags);
585 cpu = smp_processor_id(); 620 cpu = smp_processor_id();
586 p = &z->pageset[cpu]; 621 p = zone_pcp(z,cpu);
587 if (pg == orig) { 622 if (pg == orig) {
588 z->pageset[cpu].numa_hit++; 623 p->numa_hit++;
589 } else { 624 } else {
590 p->numa_miss++; 625 p->numa_miss++;
591 zonelist->zones[0]->pageset[cpu].numa_foreign++; 626 zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
592 } 627 }
593 if (pg == NODE_DATA(numa_node_id())) 628 if (pg == NODE_DATA(numa_node_id()))
594 p->local_node++; 629 p->local_node++;
@@ -615,12 +650,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
615 if (PageAnon(page)) 650 if (PageAnon(page))
616 page->mapping = NULL; 651 page->mapping = NULL;
617 free_pages_check(__FUNCTION__, page); 652 free_pages_check(__FUNCTION__, page);
618 pcp = &zone->pageset[get_cpu()].pcp[cold]; 653 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
619 local_irq_save(flags); 654 local_irq_save(flags);
620 if (pcp->count >= pcp->high)
621 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
622 list_add(&page->lru, &pcp->list); 655 list_add(&page->lru, &pcp->list);
623 pcp->count++; 656 pcp->count++;
657 if (pcp->count >= pcp->high)
658 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
624 local_irq_restore(flags); 659 local_irq_restore(flags);
625 put_cpu(); 660 put_cpu();
626} 661}
@@ -659,7 +694,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
659 if (order == 0) { 694 if (order == 0) {
660 struct per_cpu_pages *pcp; 695 struct per_cpu_pages *pcp;
661 696
662 pcp = &zone->pageset[get_cpu()].pcp[cold]; 697 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
663 local_irq_save(flags); 698 local_irq_save(flags);
664 if (pcp->count <= pcp->low) 699 if (pcp->count <= pcp->low)
665 pcp->count += rmqueue_bulk(zone, 0, 700 pcp->count += rmqueue_bulk(zone, 0,
@@ -724,6 +759,16 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
724 return 1; 759 return 1;
725} 760}
726 761
762static inline int
763should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
764{
765 if (!z->reclaim_pages)
766 return 0;
767 if (gfp_mask & __GFP_NORECLAIM)
768 return 0;
769 return 1;
770}
771
727/* 772/*
728 * This is the 'heart' of the zoned buddy allocator. 773 * This is the 'heart' of the zoned buddy allocator.
729 */ 774 */
@@ -760,17 +805,32 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
760 805
761 classzone_idx = zone_idx(zones[0]); 806 classzone_idx = zone_idx(zones[0]);
762 807
763 restart: 808restart:
764 /* Go through the zonelist once, looking for a zone with enough free */ 809 /* Go through the zonelist once, looking for a zone with enough free */
765 for (i = 0; (z = zones[i]) != NULL; i++) { 810 for (i = 0; (z = zones[i]) != NULL; i++) {
766 811 int do_reclaim = should_reclaim_zone(z, gfp_mask);
767 if (!zone_watermark_ok(z, order, z->pages_low,
768 classzone_idx, 0, 0))
769 continue;
770 812
771 if (!cpuset_zone_allowed(z)) 813 if (!cpuset_zone_allowed(z))
772 continue; 814 continue;
773 815
816 /*
817 * If the zone is to attempt early page reclaim then this loop
818 * will try to reclaim pages and check the watermark a second
819 * time before giving up and falling back to the next zone.
820 */
821zone_reclaim_retry:
822 if (!zone_watermark_ok(z, order, z->pages_low,
823 classzone_idx, 0, 0)) {
824 if (!do_reclaim)
825 continue;
826 else {
827 zone_reclaim(z, gfp_mask, order);
828 /* Only try reclaim once */
829 do_reclaim = 0;
830 goto zone_reclaim_retry;
831 }
832 }
833
774 page = buffered_rmqueue(z, order, gfp_mask); 834 page = buffered_rmqueue(z, order, gfp_mask);
775 if (page) 835 if (page)
776 goto got_pg; 836 goto got_pg;
@@ -829,7 +889,7 @@ rebalance:
829 reclaim_state.reclaimed_slab = 0; 889 reclaim_state.reclaimed_slab = 0;
830 p->reclaim_state = &reclaim_state; 890 p->reclaim_state = &reclaim_state;
831 891
832 did_some_progress = try_to_free_pages(zones, gfp_mask, order); 892 did_some_progress = try_to_free_pages(zones, gfp_mask);
833 893
834 p->reclaim_state = NULL; 894 p->reclaim_state = NULL;
835 p->flags &= ~PF_MEMALLOC; 895 p->flags &= ~PF_MEMALLOC;
@@ -905,6 +965,7 @@ nopage:
905 " order:%d, mode:0x%x\n", 965 " order:%d, mode:0x%x\n",
906 p->comm, order, gfp_mask); 966 p->comm, order, gfp_mask);
907 dump_stack(); 967 dump_stack();
968 show_mem();
908 } 969 }
909 return NULL; 970 return NULL;
910got_pg: 971got_pg:
@@ -1114,7 +1175,7 @@ void get_full_page_state(struct page_state *ret)
1114 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); 1175 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
1115} 1176}
1116 1177
1117unsigned long __read_page_state(unsigned offset) 1178unsigned long __read_page_state(unsigned long offset)
1118{ 1179{
1119 unsigned long ret = 0; 1180 unsigned long ret = 0;
1120 int cpu; 1181 int cpu;
@@ -1128,7 +1189,7 @@ unsigned long __read_page_state(unsigned offset)
1128 return ret; 1189 return ret;
1129} 1190}
1130 1191
1131void __mod_page_state(unsigned offset, unsigned long delta) 1192void __mod_page_state(unsigned long offset, unsigned long delta)
1132{ 1193{
1133 unsigned long flags; 1194 unsigned long flags;
1134 void* ptr; 1195 void* ptr;
@@ -1237,22 +1298,23 @@ void show_free_areas(void)
1237 if (!cpu_possible(cpu)) 1298 if (!cpu_possible(cpu))
1238 continue; 1299 continue;
1239 1300
1240 pageset = zone->pageset + cpu; 1301 pageset = zone_pcp(zone, cpu);
1241 1302
1242 for (temperature = 0; temperature < 2; temperature++) 1303 for (temperature = 0; temperature < 2; temperature++)
1243 printk("cpu %d %s: low %d, high %d, batch %d\n", 1304 printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
1244 cpu, 1305 cpu,
1245 temperature ? "cold" : "hot", 1306 temperature ? "cold" : "hot",
1246 pageset->pcp[temperature].low, 1307 pageset->pcp[temperature].low,
1247 pageset->pcp[temperature].high, 1308 pageset->pcp[temperature].high,
1248 pageset->pcp[temperature].batch); 1309 pageset->pcp[temperature].batch,
1310 pageset->pcp[temperature].count);
1249 } 1311 }
1250 } 1312 }
1251 1313
1252 get_page_state(&ps); 1314 get_page_state(&ps);
1253 get_zone_counts(&active, &inactive, &free); 1315 get_zone_counts(&active, &inactive, &free);
1254 1316
1255 printk("\nFree pages: %11ukB (%ukB HighMem)\n", 1317 printk("Free pages: %11ukB (%ukB HighMem)\n",
1256 K(nr_free_pages()), 1318 K(nr_free_pages()),
1257 K(nr_free_highpages())); 1319 K(nr_free_highpages()));
1258 1320
@@ -1620,6 +1682,155 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1620 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1682 memmap_init_zone((size), (nid), (zone), (start_pfn))
1621#endif 1683#endif
1622 1684
1685static int __devinit zone_batchsize(struct zone *zone)
1686{
1687 int batch;
1688
1689 /*
1690 * The per-cpu-pages pools are set to around 1000th of the
1691 * size of the zone. But no more than 1/4 of a meg - there's
1692 * no point in going beyond the size of L2 cache.
1693 *
1694 * OK, so we don't know how big the cache is. So guess.
1695 */
1696 batch = zone->present_pages / 1024;
1697 if (batch * PAGE_SIZE > 256 * 1024)
1698 batch = (256 * 1024) / PAGE_SIZE;
1699 batch /= 4; /* We effectively *= 4 below */
1700 if (batch < 1)
1701 batch = 1;
1702
1703 /*
1704 * Clamp the batch to a 2^n - 1 value. Having a power
1705 * of 2 value was found to be more likely to have
1706 * suboptimal cache aliasing properties in some cases.
1707 *
1708 * For example if 2 tasks are alternately allocating
1709 * batches of pages, one task can end up with a lot
1710 * of pages of one half of the possible page colors
1711 * and the other with pages of the other colors.
1712 */
1713 batch = (1 << fls(batch + batch/2)) - 1;
1714 return batch;
1715}
1716
1717inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1718{
1719 struct per_cpu_pages *pcp;
1720
1721 pcp = &p->pcp[0]; /* hot */
1722 pcp->count = 0;
1723 pcp->low = 2 * batch;
1724 pcp->high = 6 * batch;
1725 pcp->batch = max(1UL, 1 * batch);
1726 INIT_LIST_HEAD(&pcp->list);
1727
1728 pcp = &p->pcp[1]; /* cold*/
1729 pcp->count = 0;
1730 pcp->low = 0;
1731 pcp->high = 2 * batch;
1732 pcp->batch = max(1UL, 1 * batch);
1733 INIT_LIST_HEAD(&pcp->list);
1734}
1735
1736#ifdef CONFIG_NUMA
1737/*
1738 * Boot pageset table. One per cpu which is going to be used for all
1739 * zones and all nodes. The parameters will be set in such a way
1740 * that an item put on a list will immediately be handed over to
1741 * the buddy list. This is safe since pageset manipulation is done
1742 * with interrupts disabled.
1743 *
1744 * Some NUMA counter updates may also be caught by the boot pagesets.
1745 * These will be discarded when bootup is complete.
1746 */
1747static struct per_cpu_pageset
1748 boot_pageset[NR_CPUS] __initdata;
1749
1750/*
1751 * Dynamically allocate memory for the
1752 * per cpu pageset array in struct zone.
1753 */
1754static int __devinit process_zones(int cpu)
1755{
1756 struct zone *zone, *dzone;
1757
1758 for_each_zone(zone) {
1759
1760 zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
1761 GFP_KERNEL, cpu_to_node(cpu));
1762 if (!zone->pageset[cpu])
1763 goto bad;
1764
1765 setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
1766 }
1767
1768 return 0;
1769bad:
1770 for_each_zone(dzone) {
1771 if (dzone == zone)
1772 break;
1773 kfree(dzone->pageset[cpu]);
1774 dzone->pageset[cpu] = NULL;
1775 }
1776 return -ENOMEM;
1777}
1778
1779static inline void free_zone_pagesets(int cpu)
1780{
1781#ifdef CONFIG_NUMA
1782 struct zone *zone;
1783
1784 for_each_zone(zone) {
1785 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
1786
1787 zone_pcp(zone, cpu) = NULL;
1788 kfree(pset);
1789 }
1790#endif
1791}
1792
1793static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
1794 unsigned long action,
1795 void *hcpu)
1796{
1797 int cpu = (long)hcpu;
1798 int ret = NOTIFY_OK;
1799
1800 switch (action) {
1801 case CPU_UP_PREPARE:
1802 if (process_zones(cpu))
1803 ret = NOTIFY_BAD;
1804 break;
1805#ifdef CONFIG_HOTPLUG_CPU
1806 case CPU_DEAD:
1807 free_zone_pagesets(cpu);
1808 break;
1809#endif
1810 default:
1811 break;
1812 }
1813 return ret;
1814}
1815
1816static struct notifier_block pageset_notifier =
1817 { &pageset_cpuup_callback, NULL, 0 };
1818
1819void __init setup_per_cpu_pageset()
1820{
1821 int err;
1822
1823 /* Initialize per_cpu_pageset for cpu 0.
1824 * A cpuup callback will do this for every cpu
1825 * as it comes online
1826 */
1827 err = process_zones(smp_processor_id());
1828 BUG_ON(err);
1829 register_cpu_notifier(&pageset_notifier);
1830}
1831
1832#endif
1833
1623/* 1834/*
1624 * Set up the zone data structures: 1835 * Set up the zone data structures:
1625 * - mark all pages reserved 1836 * - mark all pages reserved
@@ -1662,48 +1873,16 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1662 1873
1663 zone->temp_priority = zone->prev_priority = DEF_PRIORITY; 1874 zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
1664 1875
1665 /* 1876 batch = zone_batchsize(zone);
1666 * The per-cpu-pages pools are set to around 1000th of the
1667 * size of the zone. But no more than 1/4 of a meg - there's
1668 * no point in going beyond the size of L2 cache.
1669 *
1670 * OK, so we don't know how big the cache is. So guess.
1671 */
1672 batch = zone->present_pages / 1024;
1673 if (batch * PAGE_SIZE > 256 * 1024)
1674 batch = (256 * 1024) / PAGE_SIZE;
1675 batch /= 4; /* We effectively *= 4 below */
1676 if (batch < 1)
1677 batch = 1;
1678
1679 /*
1680 * Clamp the batch to a 2^n - 1 value. Having a power
1681 * of 2 value was found to be more likely to have
1682 * suboptimal cache aliasing properties in some cases.
1683 *
1684 * For example if 2 tasks are alternately allocating
1685 * batches of pages, one task can end up with a lot
1686 * of pages of one half of the possible page colors
1687 * and the other with pages of the other colors.
1688 */
1689 batch = (1 << fls(batch + batch/2)) - 1;
1690 1877
1691 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1878 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1692 struct per_cpu_pages *pcp; 1879#ifdef CONFIG_NUMA
1693 1880 /* Early boot. Slab allocator not functional yet */
1694 pcp = &zone->pageset[cpu].pcp[0]; /* hot */ 1881 zone->pageset[cpu] = &boot_pageset[cpu];
1695 pcp->count = 0; 1882 setup_pageset(&boot_pageset[cpu],0);
1696 pcp->low = 2 * batch; 1883#else
1697 pcp->high = 6 * batch; 1884 setup_pageset(zone_pcp(zone,cpu), batch);
1698 pcp->batch = 1 * batch; 1885#endif
1699 INIT_LIST_HEAD(&pcp->list);
1700
1701 pcp = &zone->pageset[cpu].pcp[1]; /* cold */
1702 pcp->count = 0;
1703 pcp->low = 0;
1704 pcp->high = 2 * batch;
1705 pcp->batch = 1 * batch;
1706 INIT_LIST_HEAD(&pcp->list);
1707 } 1886 }
1708 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 1887 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
1709 zone_names[j], realsize, batch); 1888 zone_names[j], realsize, batch);
@@ -1713,6 +1892,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1713 zone->nr_scan_inactive = 0; 1892 zone->nr_scan_inactive = 0;
1714 zone->nr_active = 0; 1893 zone->nr_active = 0;
1715 zone->nr_inactive = 0; 1894 zone->nr_inactive = 0;
1895 atomic_set(&zone->reclaim_in_progress, -1);
1716 if (!size) 1896 if (!size)
1717 continue; 1897 continue;
1718 1898
@@ -1853,6 +2033,115 @@ struct seq_operations fragmentation_op = {
1853 .show = frag_show, 2033 .show = frag_show,
1854}; 2034};
1855 2035
2036/*
2037 * Output information about zones in @pgdat.
2038 */
2039static int zoneinfo_show(struct seq_file *m, void *arg)
2040{
2041 pg_data_t *pgdat = arg;
2042 struct zone *zone;
2043 struct zone *node_zones = pgdat->node_zones;
2044 unsigned long flags;
2045
2046 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
2047 int i;
2048
2049 if (!zone->present_pages)
2050 continue;
2051
2052 spin_lock_irqsave(&zone->lock, flags);
2053 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
2054 seq_printf(m,
2055 "\n pages free %lu"
2056 "\n min %lu"
2057 "\n low %lu"
2058 "\n high %lu"
2059 "\n active %lu"
2060 "\n inactive %lu"
2061 "\n scanned %lu (a: %lu i: %lu)"
2062 "\n spanned %lu"
2063 "\n present %lu",
2064 zone->free_pages,
2065 zone->pages_min,
2066 zone->pages_low,
2067 zone->pages_high,
2068 zone->nr_active,
2069 zone->nr_inactive,
2070 zone->pages_scanned,
2071 zone->nr_scan_active, zone->nr_scan_inactive,
2072 zone->spanned_pages,
2073 zone->present_pages);
2074 seq_printf(m,
2075 "\n protection: (%lu",
2076 zone->lowmem_reserve[0]);
2077 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
2078 seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
2079 seq_printf(m,
2080 ")"
2081 "\n pagesets");
2082 for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
2083 struct per_cpu_pageset *pageset;
2084 int j;
2085
2086 pageset = zone_pcp(zone, i);
2087 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
2088 if (pageset->pcp[j].count)
2089 break;
2090 }
2091 if (j == ARRAY_SIZE(pageset->pcp))
2092 continue;
2093 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
2094 seq_printf(m,
2095 "\n cpu: %i pcp: %i"
2096 "\n count: %i"
2097 "\n low: %i"
2098 "\n high: %i"
2099 "\n batch: %i",
2100 i, j,
2101 pageset->pcp[j].count,
2102 pageset->pcp[j].low,
2103 pageset->pcp[j].high,
2104 pageset->pcp[j].batch);
2105 }
2106#ifdef CONFIG_NUMA
2107 seq_printf(m,
2108 "\n numa_hit: %lu"
2109 "\n numa_miss: %lu"
2110 "\n numa_foreign: %lu"
2111 "\n interleave_hit: %lu"
2112 "\n local_node: %lu"
2113 "\n other_node: %lu",
2114 pageset->numa_hit,
2115 pageset->numa_miss,
2116 pageset->numa_foreign,
2117 pageset->interleave_hit,
2118 pageset->local_node,
2119 pageset->other_node);
2120#endif
2121 }
2122 seq_printf(m,
2123 "\n all_unreclaimable: %u"
2124 "\n prev_priority: %i"
2125 "\n temp_priority: %i"
2126 "\n start_pfn: %lu",
2127 zone->all_unreclaimable,
2128 zone->prev_priority,
2129 zone->temp_priority,
2130 zone->zone_start_pfn);
2131 spin_unlock_irqrestore(&zone->lock, flags);
2132 seq_putc(m, '\n');
2133 }
2134 return 0;
2135}
2136
2137struct seq_operations zoneinfo_op = {
2138 .start = frag_start, /* iterate over all zones. The same as in
2139 * fragmentation. */
2140 .next = frag_next,
2141 .stop = frag_stop,
2142 .show = zoneinfo_show,
2143};
2144
1856static char *vmstat_text[] = { 2145static char *vmstat_text[] = {
1857 "nr_dirty", 2146 "nr_dirty",
1858 "nr_writeback", 2147 "nr_writeback",
@@ -2058,10 +2347,10 @@ static void setup_per_zone_pages_min(void)
2058 min_pages = 128; 2347 min_pages = 128;
2059 zone->pages_min = min_pages; 2348 zone->pages_min = min_pages;
2060 } else { 2349 } else {
2061 /* if it's a lowmem zone, reserve a number of pages 2350 /* if it's a lowmem zone, reserve a number of pages
2062 * proportionate to the zone's size. 2351 * proportionate to the zone's size.
2063 */ 2352 */
2064 zone->pages_min = (pages_min * zone->present_pages) / 2353 zone->pages_min = (pages_min * zone->present_pages) /
2065 lowmem_pages; 2354 lowmem_pages;
2066 } 2355 }
2067 2356
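
zone_batchsize(), factored out above, sizes the per-cpu page lists at roughly 1/1000th of the zone, caps the batch near 256 KB worth of pages, divides by four (setup_pageset() scales the list limits back up by 2x and 6x), and rounds to a 2^n - 1 value to avoid cache-aliasing artifacts. A user-space restatement of that arithmetic for a worked example (4 KB pages assumed; this is an illustration, not the kernel function itself):

#include <stdio.h>

/* fls() as the kernel defines it: position of the most significant set
 * bit counting from 1, or 0 when the argument is 0. */
static int fls(unsigned int x)
{
    int r = 0;

    while (x) {
        x >>= 1;
        r++;
    }
    return r;
}

static int zone_batchsize(unsigned long present_pages, unsigned long page_size)
{
    int batch;

    batch = present_pages / 1024;               /* ~1/1000th of the zone */
    if (batch * page_size > 256 * 1024)         /* no more than 1/4 MB   */
        batch = (256 * 1024) / page_size;
    batch /= 4;                                 /* pcp limits multiply it */
    if (batch < 1)
        batch = 1;
    return (1 << fls(batch + batch / 2)) - 1;   /* clamp to 2^n - 1      */
}

int main(void)
{
    /* 1 GB zone of 4 KB pages: 262144 -> 256 -> 64 -> 16 -> 31 */
    printf("batch = %d\n", zone_batchsize(262144, 4096));
    return 0;
}

For a 1 GB zone this prints batch = 31, so setup_pageset() above would let the hot per-cpu list grow to high = 6 * 31 = 186 pages before it is trimmed back.
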
diff --git a/mm/rmap.c b/mm/rmap.c
index 9827409eb7c7..89770bd25f31 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -539,27 +539,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
539 goto out_unmap; 539 goto out_unmap;
540 } 540 }
541 541
542 /*
543 * Don't pull an anonymous page out from under get_user_pages.
544 * GUP carefully breaks COW and raises page count (while holding
545 * page_table_lock, as we have here) to make sure that the page
546 * cannot be freed. If we unmap that page here, a user write
547 * access to the virtual address will bring back the page, but
548 * its raised count will (ironically) be taken to mean it's not
549 * an exclusive swap page, do_wp_page will replace it by a copy
550 * page, and the user never get to see the data GUP was holding
551 * the original page for.
552 *
553 * This test is also useful for when swapoff (unuse_process) has
554 * to drop page lock: its reference to the page stops existing
555 * ptes from being unmapped, so swapoff can make progress.
556 */
557 if (PageSwapCache(page) &&
558 page_count(page) != page_mapcount(page) + 2) {
559 ret = SWAP_FAIL;
560 goto out_unmap;
561 }
562
563 /* Nuke the page table entry. */ 542 /* Nuke the page table entry. */
564 flush_cache_page(vma, address, page_to_pfn(page)); 543 flush_cache_page(vma, address, page_to_pfn(page));
565 pteval = ptep_clear_flush(vma, address, pte); 544 pteval = ptep_clear_flush(vma, address, pte);
diff --git a/mm/shmem.c b/mm/shmem.c
index 61574b81d979..e64fa726a790 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -6,8 +6,8 @@
6 * 2000-2001 Christoph Rohland 6 * 2000-2001 Christoph Rohland
7 * 2000-2001 SAP AG 7 * 2000-2001 SAP AG
8 * 2002 Red Hat Inc. 8 * 2002 Red Hat Inc.
9 * Copyright (C) 2002-2004 Hugh Dickins. 9 * Copyright (C) 2002-2005 Hugh Dickins.
10 * Copyright (C) 2002-2004 VERITAS Software Corporation. 10 * Copyright (C) 2002-2005 VERITAS Software Corporation.
11 * Copyright (C) 2004 Andi Kleen, SuSE Labs 11 * Copyright (C) 2004 Andi Kleen, SuSE Labs
12 * 12 *
13 * Extended attribute support for tmpfs: 13 * Extended attribute support for tmpfs:
@@ -194,7 +194,7 @@ static DEFINE_SPINLOCK(shmem_swaplist_lock);
194static void shmem_free_blocks(struct inode *inode, long pages) 194static void shmem_free_blocks(struct inode *inode, long pages)
195{ 195{
196 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 196 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
197 if (sbinfo) { 197 if (sbinfo->max_blocks) {
198 spin_lock(&sbinfo->stat_lock); 198 spin_lock(&sbinfo->stat_lock);
199 sbinfo->free_blocks += pages; 199 sbinfo->free_blocks += pages;
200 inode->i_blocks -= pages*BLOCKS_PER_PAGE; 200 inode->i_blocks -= pages*BLOCKS_PER_PAGE;
@@ -357,7 +357,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
357 * page (and perhaps indirect index pages) yet to allocate: 357 * page (and perhaps indirect index pages) yet to allocate:
358 * a waste to allocate index if we cannot allocate data. 358 * a waste to allocate index if we cannot allocate data.
359 */ 359 */
360 if (sbinfo) { 360 if (sbinfo->max_blocks) {
361 spin_lock(&sbinfo->stat_lock); 361 spin_lock(&sbinfo->stat_lock);
362 if (sbinfo->free_blocks <= 1) { 362 if (sbinfo->free_blocks <= 1) {
363 spin_unlock(&sbinfo->stat_lock); 363 spin_unlock(&sbinfo->stat_lock);
@@ -677,8 +677,8 @@ static void shmem_delete_inode(struct inode *inode)
677 spin_unlock(&shmem_swaplist_lock); 677 spin_unlock(&shmem_swaplist_lock);
678 } 678 }
679 } 679 }
680 if (sbinfo) { 680 BUG_ON(inode->i_blocks);
681 BUG_ON(inode->i_blocks); 681 if (sbinfo->max_inodes) {
682 spin_lock(&sbinfo->stat_lock); 682 spin_lock(&sbinfo->stat_lock);
683 sbinfo->free_inodes++; 683 sbinfo->free_inodes++;
684 spin_unlock(&sbinfo->stat_lock); 684 spin_unlock(&sbinfo->stat_lock);
@@ -1080,7 +1080,7 @@ repeat:
1080 } else { 1080 } else {
1081 shmem_swp_unmap(entry); 1081 shmem_swp_unmap(entry);
1082 sbinfo = SHMEM_SB(inode->i_sb); 1082 sbinfo = SHMEM_SB(inode->i_sb);
1083 if (sbinfo) { 1083 if (sbinfo->max_blocks) {
1084 spin_lock(&sbinfo->stat_lock); 1084 spin_lock(&sbinfo->stat_lock);
1085 if (sbinfo->free_blocks == 0 || 1085 if (sbinfo->free_blocks == 0 ||
1086 shmem_acct_block(info->flags)) { 1086 shmem_acct_block(info->flags)) {
@@ -1269,7 +1269,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1269 struct shmem_inode_info *info; 1269 struct shmem_inode_info *info;
1270 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1270 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1271 1271
1272 if (sbinfo) { 1272 if (sbinfo->max_inodes) {
1273 spin_lock(&sbinfo->stat_lock); 1273 spin_lock(&sbinfo->stat_lock);
1274 if (!sbinfo->free_inodes) { 1274 if (!sbinfo->free_inodes) {
1275 spin_unlock(&sbinfo->stat_lock); 1275 spin_unlock(&sbinfo->stat_lock);
@@ -1319,7 +1319,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1319 mpol_shared_policy_init(&info->policy); 1319 mpol_shared_policy_init(&info->policy);
1320 break; 1320 break;
1321 } 1321 }
1322 } else if (sbinfo) { 1322 } else if (sbinfo->max_inodes) {
1323 spin_lock(&sbinfo->stat_lock); 1323 spin_lock(&sbinfo->stat_lock);
1324 sbinfo->free_inodes++; 1324 sbinfo->free_inodes++;
1325 spin_unlock(&sbinfo->stat_lock); 1325 spin_unlock(&sbinfo->stat_lock);
@@ -1328,31 +1328,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1328} 1328}
1329 1329
1330#ifdef CONFIG_TMPFS 1330#ifdef CONFIG_TMPFS
1331
1332static int shmem_set_size(struct shmem_sb_info *sbinfo,
1333 unsigned long max_blocks, unsigned long max_inodes)
1334{
1335 int error;
1336 unsigned long blocks, inodes;
1337
1338 spin_lock(&sbinfo->stat_lock);
1339 blocks = sbinfo->max_blocks - sbinfo->free_blocks;
1340 inodes = sbinfo->max_inodes - sbinfo->free_inodes;
1341 error = -EINVAL;
1342 if (max_blocks < blocks)
1343 goto out;
1344 if (max_inodes < inodes)
1345 goto out;
1346 error = 0;
1347 sbinfo->max_blocks = max_blocks;
1348 sbinfo->free_blocks = max_blocks - blocks;
1349 sbinfo->max_inodes = max_inodes;
1350 sbinfo->free_inodes = max_inodes - inodes;
1351out:
1352 spin_unlock(&sbinfo->stat_lock);
1353 return error;
1354}
1355
1356static struct inode_operations shmem_symlink_inode_operations; 1331static struct inode_operations shmem_symlink_inode_operations;
1357static struct inode_operations shmem_symlink_inline_operations; 1332static struct inode_operations shmem_symlink_inline_operations;
1358 1333
@@ -1607,15 +1582,17 @@ static int shmem_statfs(struct super_block *sb, struct kstatfs *buf)
1607 buf->f_type = TMPFS_MAGIC; 1582 buf->f_type = TMPFS_MAGIC;
1608 buf->f_bsize = PAGE_CACHE_SIZE; 1583 buf->f_bsize = PAGE_CACHE_SIZE;
1609 buf->f_namelen = NAME_MAX; 1584 buf->f_namelen = NAME_MAX;
1610 if (sbinfo) { 1585 spin_lock(&sbinfo->stat_lock);
1611 spin_lock(&sbinfo->stat_lock); 1586 if (sbinfo->max_blocks) {
1612 buf->f_blocks = sbinfo->max_blocks; 1587 buf->f_blocks = sbinfo->max_blocks;
1613 buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; 1588 buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
1589 }
1590 if (sbinfo->max_inodes) {
1614 buf->f_files = sbinfo->max_inodes; 1591 buf->f_files = sbinfo->max_inodes;
1615 buf->f_ffree = sbinfo->free_inodes; 1592 buf->f_ffree = sbinfo->free_inodes;
1616 spin_unlock(&sbinfo->stat_lock);
1617 } 1593 }
1618 /* else leave those fields 0 like simple_statfs */ 1594 /* else leave those fields 0 like simple_statfs */
1595 spin_unlock(&sbinfo->stat_lock);
1619 return 0; 1596 return 0;
1620} 1597}
1621 1598
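
The statfs change above widens the stat_lock critical section so the block and inode counters are read under one lock acquisition, giving a single consistent snapshot even when only one of the two limits is set (unlimited fields stay zero, as with simple_statfs). A userspace sketch of that locking shape; the pthread mutex and stats struct are stand-ins for the kernel spinlock and shmem_sb_info, not the real API.

/* One lock acquisition covers both counter pairs, as in the hunk above.
 * Compile with -pthread on older toolchains. */
#include <pthread.h>
#include <stdio.h>

struct stats {
	pthread_mutex_t lock;
	unsigned long max_blocks, free_blocks;
	unsigned long max_inodes, free_inodes;
};

struct report { unsigned long f_blocks, f_bfree, f_files, f_ffree; };

static void fill_report(struct stats *s, struct report *r)
{
	pthread_mutex_lock(&s->lock);
	if (s->max_blocks) {            /* 0 means "unlimited": leave fields 0 */
		r->f_blocks = s->max_blocks;
		r->f_bfree = s->free_blocks;
	}
	if (s->max_inodes) {
		r->f_files = s->max_inodes;
		r->f_ffree = s->free_inodes;
	}
	pthread_mutex_unlock(&s->lock);
}

int main(void)
{
	struct stats s = { .lock = PTHREAD_MUTEX_INITIALIZER,
			   .max_blocks = 100, .free_blocks = 60,
			   .max_inodes = 50,  .free_inodes = 45 };
	struct report r = { 0 };

	fill_report(&s, &r);
	printf("blocks %lu/%lu inodes %lu/%lu\n",
	       r.f_bfree, r.f_blocks, r.f_ffree, r.f_files);
	return 0;
}
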
@@ -1672,7 +1649,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
1672 * but each new link needs a new dentry, pinning lowmem, and 1649 * but each new link needs a new dentry, pinning lowmem, and
1673 * tmpfs dentries cannot be pruned until they are unlinked. 1650 * tmpfs dentries cannot be pruned until they are unlinked.
1674 */ 1651 */
1675 if (sbinfo) { 1652 if (sbinfo->max_inodes) {
1676 spin_lock(&sbinfo->stat_lock); 1653 spin_lock(&sbinfo->stat_lock);
1677 if (!sbinfo->free_inodes) { 1654 if (!sbinfo->free_inodes) {
1678 spin_unlock(&sbinfo->stat_lock); 1655 spin_unlock(&sbinfo->stat_lock);
@@ -1697,7 +1674,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
1697 1674
1698 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) { 1675 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) {
1699 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1676 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1700 if (sbinfo) { 1677 if (sbinfo->max_inodes) {
1701 spin_lock(&sbinfo->stat_lock); 1678 spin_lock(&sbinfo->stat_lock);
1702 sbinfo->free_inodes++; 1679 sbinfo->free_inodes++;
1703 spin_unlock(&sbinfo->stat_lock); 1680 spin_unlock(&sbinfo->stat_lock);
@@ -1921,22 +1898,42 @@ bad_val:
1921static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) 1898static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
1922{ 1899{
1923 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1900 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1924 unsigned long max_blocks = 0; 1901 unsigned long max_blocks = sbinfo->max_blocks;
1925 unsigned long max_inodes = 0; 1902 unsigned long max_inodes = sbinfo->max_inodes;
1903 unsigned long blocks;
1904 unsigned long inodes;
1905 int error = -EINVAL;
1906
1907 if (shmem_parse_options(data, NULL, NULL, NULL,
1908 &max_blocks, &max_inodes))
1909 return error;
1926 1910
1927 if (sbinfo) { 1911 spin_lock(&sbinfo->stat_lock);
1928 max_blocks = sbinfo->max_blocks; 1912 blocks = sbinfo->max_blocks - sbinfo->free_blocks;
1929 max_inodes = sbinfo->max_inodes; 1913 inodes = sbinfo->max_inodes - sbinfo->free_inodes;
1930 } 1914 if (max_blocks < blocks)
1931 if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes)) 1915 goto out;
1932 return -EINVAL; 1916 if (max_inodes < inodes)
1933 /* Keep it simple: disallow limited <-> unlimited remount */ 1917 goto out;
1934 if ((max_blocks || max_inodes) == !sbinfo) 1918 /*
1935 return -EINVAL; 1919 * Those tests also disallow limited->unlimited while any are in
1936 /* But allow the pointless unlimited -> unlimited remount */ 1920 * use, so i_blocks will always be zero when max_blocks is zero;
1937 if (!sbinfo) 1921 * but we must separately disallow unlimited->limited, because
1938 return 0; 1922 * in that case we have no record of how much is already in use.
1939 return shmem_set_size(sbinfo, max_blocks, max_inodes); 1923 */
1924 if (max_blocks && !sbinfo->max_blocks)
1925 goto out;
1926 if (max_inodes && !sbinfo->max_inodes)
1927 goto out;
1928
1929 error = 0;
1930 sbinfo->max_blocks = max_blocks;
1931 sbinfo->free_blocks = max_blocks - blocks;
1932 sbinfo->max_inodes = max_inodes;
1933 sbinfo->free_inodes = max_inodes - inodes;
1934out:
1935 spin_unlock(&sbinfo->stat_lock);
1936 return error;
1940} 1937}
1941#endif 1938#endif
1942 1939
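
The remount path above inlines what shmem_set_size() used to do and adds the two refusals spelled out in the new comment: a limit may not drop below current usage, and an unlimited (zero) limit may not later become finite, because usage was never tracked. A standalone sketch of those checks follows; the struct and function names are stand-ins, not the kernel's shmem_sb_info or entry points.

/* Model of the remount validation added above: reject shrinking a limit
 * below current usage, and reject unlimited (0) -> limited transitions. */
#include <stdio.h>

struct limits {
	unsigned long max_blocks, free_blocks;
	unsigned long max_inodes, free_inodes;
};

static int remount(struct limits *l, unsigned long new_blocks,
		   unsigned long new_inodes)
{
	unsigned long used_blocks = l->max_blocks - l->free_blocks;
	unsigned long used_inodes = l->max_inodes - l->free_inodes;

	if (new_blocks < used_blocks || new_inodes < used_inodes)
		return -1;              /* would shrink below usage */
	if ((new_blocks && !l->max_blocks) || (new_inodes && !l->max_inodes))
		return -1;              /* unlimited -> limited: no usage record */

	l->max_blocks = new_blocks;
	l->free_blocks = new_blocks - used_blocks;
	l->max_inodes = new_inodes;
	l->free_inodes = new_inodes - used_inodes;
	return 0;
}

int main(void)
{
	struct limits l = { 100, 40, 50, 45 };          /* 60 blocks, 5 inodes in use */

	printf("shrink below usage: %d\n", remount(&l, 50, 50));   /* rejected */
	printf("valid resize:       %d\n", remount(&l, 200, 50));  /* accepted */
	return 0;
}

Note that limited-to-unlimited while anything is in use is already caught by the first test, since nonzero usage always exceeds a new limit of zero; that is what the hunk's comment means by "those tests also disallow limited->unlimited while any are in use".
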
@@ -1961,11 +1958,11 @@ static int shmem_fill_super(struct super_block *sb,
1961 uid_t uid = current->fsuid; 1958 uid_t uid = current->fsuid;
1962 gid_t gid = current->fsgid; 1959 gid_t gid = current->fsgid;
1963 int err = -ENOMEM; 1960 int err = -ENOMEM;
1964 1961 struct shmem_sb_info *sbinfo;
1965#ifdef CONFIG_TMPFS
1966 unsigned long blocks = 0; 1962 unsigned long blocks = 0;
1967 unsigned long inodes = 0; 1963 unsigned long inodes = 0;
1968 1964
1965#ifdef CONFIG_TMPFS
1969 /* 1966 /*
1970 * Per default we only allow half of the physical ram per 1967 * Per default we only allow half of the physical ram per
1971 * tmpfs instance, limiting inodes to one per page of lowmem; 1968 * tmpfs instance, limiting inodes to one per page of lowmem;
@@ -1976,34 +1973,34 @@ static int shmem_fill_super(struct super_block *sb,
1976 inodes = totalram_pages - totalhigh_pages; 1973 inodes = totalram_pages - totalhigh_pages;
1977 if (inodes > blocks) 1974 if (inodes > blocks)
1978 inodes = blocks; 1975 inodes = blocks;
1979 1976 if (shmem_parse_options(data, &mode, &uid, &gid,
1980 if (shmem_parse_options(data, &mode, 1977 &blocks, &inodes))
1981 &uid, &gid, &blocks, &inodes))
1982 return -EINVAL; 1978 return -EINVAL;
1983 } 1979 }
1984
1985 if (blocks || inodes) {
1986 struct shmem_sb_info *sbinfo;
1987 sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL);
1988 if (!sbinfo)
1989 return -ENOMEM;
1990 sb->s_fs_info = sbinfo;
1991 spin_lock_init(&sbinfo->stat_lock);
1992 sbinfo->max_blocks = blocks;
1993 sbinfo->free_blocks = blocks;
1994 sbinfo->max_inodes = inodes;
1995 sbinfo->free_inodes = inodes;
1996 }
1997 sb->s_xattr = shmem_xattr_handlers;
1998#else 1980#else
1999 sb->s_flags |= MS_NOUSER; 1981 sb->s_flags |= MS_NOUSER;
2000#endif 1982#endif
2001 1983
1984 /* Round up to L1_CACHE_BYTES to resist false sharing */
1985 sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info),
1986 L1_CACHE_BYTES), GFP_KERNEL);
1987 if (!sbinfo)
1988 return -ENOMEM;
1989
1990 spin_lock_init(&sbinfo->stat_lock);
1991 sbinfo->max_blocks = blocks;
1992 sbinfo->free_blocks = blocks;
1993 sbinfo->max_inodes = inodes;
1994 sbinfo->free_inodes = inodes;
1995
1996 sb->s_fs_info = sbinfo;
2002 sb->s_maxbytes = SHMEM_MAX_BYTES; 1997 sb->s_maxbytes = SHMEM_MAX_BYTES;
2003 sb->s_blocksize = PAGE_CACHE_SIZE; 1998 sb->s_blocksize = PAGE_CACHE_SIZE;
2004 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 1999 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2005 sb->s_magic = TMPFS_MAGIC; 2000 sb->s_magic = TMPFS_MAGIC;
2006 sb->s_op = &shmem_ops; 2001 sb->s_op = &shmem_ops;
2002 sb->s_xattr = shmem_xattr_handlers;
2003
2007 inode = shmem_get_inode(sb, S_IFDIR | mode, 0); 2004 inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
2008 if (!inode) 2005 if (!inode)
2009 goto failed; 2006 goto failed;
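
With this hunk tmpfs always allocates a superblock info structure, padding the request up to a cache line "to resist false sharing"; the earlier if (sbinfo) tests throughout the file therefore become tests of max_blocks / max_inodes, with zero meaning unlimited. A small sketch of the rounding idiom under an assumed 64-byte line size; the kernel uses L1_CACHE_BYTES and kmalloc, and the struct here is a stand-in.

/* Round a small, frequently written allocation up to a cache line so other
 * allocations cannot share the line.  CACHE_LINE is an assumed value. */
#include <stdio.h>
#include <stdlib.h>

#define CACHE_LINE 64

struct sb_limits {
	unsigned long max_blocks, free_blocks;
	unsigned long max_inodes, free_inodes;
};

int main(void)
{
	size_t want = sizeof(struct sb_limits);
	size_t alloc = want > CACHE_LINE ? want : CACHE_LINE;
	struct sb_limits *info = malloc(alloc);

	if (!info)
		return 1;
	printf("struct %zu bytes, allocated %zu\n", want, alloc);
	free(info);
	return 0;
}
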
diff --git a/mm/slab.c b/mm/slab.c
index c78d343b3c5f..93cbbbb39f42 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2851,6 +2851,7 @@ next:
2851 } 2851 }
2852 check_irq_on(); 2852 check_irq_on();
2853 up(&cache_chain_sem); 2853 up(&cache_chain_sem);
2854 drain_remote_pages();
2854 /* Setup the next iteration */ 2855 /* Setup the next iteration */
2855 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); 2856 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
2856} 2857}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index da48405cd9a3..60cd24a55204 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -276,61 +276,37 @@ void swap_free(swp_entry_t entry)
276} 276}
277 277
278/* 278/*
279 * Check if we're the only user of a swap page, 279 * How many references to page are currently swapped out?
280 * when the page is locked.
281 */ 280 */
282static int exclusive_swap_page(struct page *page) 281static inline int page_swapcount(struct page *page)
283{ 282{
284 int retval = 0; 283 int count = 0;
285 struct swap_info_struct * p; 284 struct swap_info_struct *p;
286 swp_entry_t entry; 285 swp_entry_t entry;
287 286
288 entry.val = page->private; 287 entry.val = page->private;
289 p = swap_info_get(entry); 288 p = swap_info_get(entry);
290 if (p) { 289 if (p) {
291 /* Is the only swap cache user the cache itself? */ 290 /* Subtract the 1 for the swap cache itself */
292 if (p->swap_map[swp_offset(entry)] == 1) { 291 count = p->swap_map[swp_offset(entry)] - 1;
293 /* Recheck the page count with the swapcache lock held.. */
294 write_lock_irq(&swapper_space.tree_lock);
295 if (page_count(page) == 2)
296 retval = 1;
297 write_unlock_irq(&swapper_space.tree_lock);
298 }
299 swap_info_put(p); 292 swap_info_put(p);
300 } 293 }
301 return retval; 294 return count;
302} 295}
303 296
304/* 297/*
305 * We can use this swap cache entry directly 298 * We can use this swap cache entry directly
306 * if there are no other references to it. 299 * if there are no other references to it.
307 *
308 * Here "exclusive_swap_page()" does the real
309 * work, but we opportunistically check whether
310 * we need to get all the locks first..
311 */ 300 */
312int can_share_swap_page(struct page *page) 301int can_share_swap_page(struct page *page)
313{ 302{
314 int retval = 0; 303 int count;
315 304
316 if (!PageLocked(page)) 305 BUG_ON(!PageLocked(page));
317 BUG(); 306 count = page_mapcount(page);
318 switch (page_count(page)) { 307 if (count <= 1 && PageSwapCache(page))
319 case 3: 308 count += page_swapcount(page);
320 if (!PagePrivate(page)) 309 return count == 1;
321 break;
322 /* Fallthrough */
323 case 2:
324 if (!PageSwapCache(page))
325 break;
326 retval = exclusive_swap_page(page);
327 break;
328 case 1:
329 if (PageReserved(page))
330 break;
331 retval = 1;
332 }
333 return retval;
334} 310}
335 311
336/* 312/*
@@ -529,9 +505,10 @@ static int unuse_mm(struct mm_struct *mm,
529 505
530 if (!down_read_trylock(&mm->mmap_sem)) { 506 if (!down_read_trylock(&mm->mmap_sem)) {
531 /* 507 /*
532 * Our reference to the page stops try_to_unmap_one from 508 * Activate page so shrink_cache is unlikely to unmap its
533 * unmapping its ptes, so swapoff can make progress. 509 * ptes while lock is dropped, so swapoff can make progress.
534 */ 510 */
511 activate_page(page);
535 unlock_page(page); 512 unlock_page(page);
536 down_read(&mm->mmap_sem); 513 down_read(&mm->mmap_sem);
537 lock_page(page); 514 lock_page(page);
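
The swapfile.c changes above replace the old page_count()-based exclusivity test with arithmetic on counts that name their owners: a locked page's swap entry is private to the current mapping when its page-table mapcount, plus its swapped-out count (the swap_map entry minus the swap cache's own slot), equals one. A userspace model of that rule; the types and counters are illustrative, not the kernel's.

/* Model of the new can_share_swap_page() rule: the entry is exclusive when
 * mappings plus swapped-out references add up to exactly one user. */
#include <stdbool.h>
#include <stdio.h>

struct fake_page {
	int mapcount;       /* page-table mappings of the in-memory page */
	int swap_map;       /* swap_map[] counter; includes the swap cache itself */
	bool in_swapcache;
};

static bool can_share(const struct fake_page *p)
{
	int count = p->mapcount;

	if (count <= 1 && p->in_swapcache)
		count += p->swap_map - 1;   /* subtract the swap cache's reference */
	return count == 1;
}

int main(void)
{
	struct fake_page exclusive = { .mapcount = 1, .swap_map = 1, .in_swapcache = true };
	struct fake_page shared    = { .mapcount = 1, .swap_map = 2, .in_swapcache = true };

	printf("exclusive: %d\n", can_share(&exclusive));  /* 1 */
	printf("shared:    %d\n", can_share(&shared));     /* 0 */
	return 0;
}
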
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 269eded9b459..4b8e62a19370 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -74,6 +74,9 @@ struct scan_control {
74 74
75 int may_writepage; 75 int may_writepage;
76 76
77 /* Can pages be swapped as part of reclaim? */
78 int may_swap;
79
77 /* This context's SWAP_CLUSTER_MAX. If freeing memory for 80 /* This context's SWAP_CLUSTER_MAX. If freeing memory for
78 * suspend, we effectively ignore SWAP_CLUSTER_MAX. 81 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
79 * In this context, it doesn't matter that we scan the 82 * In this context, it doesn't matter that we scan the
@@ -180,17 +183,20 @@ EXPORT_SYMBOL(remove_shrinker);
180 * `lru_pages' represents the number of on-LRU pages in all the zones which 183 * `lru_pages' represents the number of on-LRU pages in all the zones which
181 * are eligible for the caller's allocation attempt. It is used for balancing 184 * are eligible for the caller's allocation attempt. It is used for balancing
182 * slab reclaim versus page reclaim. 185 * slab reclaim versus page reclaim.
186 *
187 * Returns the number of slab objects which we shrunk.
183 */ 188 */
184static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, 189static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
185 unsigned long lru_pages) 190 unsigned long lru_pages)
186{ 191{
187 struct shrinker *shrinker; 192 struct shrinker *shrinker;
193 int ret = 0;
188 194
189 if (scanned == 0) 195 if (scanned == 0)
190 scanned = SWAP_CLUSTER_MAX; 196 scanned = SWAP_CLUSTER_MAX;
191 197
192 if (!down_read_trylock(&shrinker_rwsem)) 198 if (!down_read_trylock(&shrinker_rwsem))
193 return 0; 199 return 1; /* Assume we'll be able to shrink next time */
194 200
195 list_for_each_entry(shrinker, &shrinker_list, list) { 201 list_for_each_entry(shrinker, &shrinker_list, list) {
196 unsigned long long delta; 202 unsigned long long delta;
@@ -209,10 +215,14 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
209 while (total_scan >= SHRINK_BATCH) { 215 while (total_scan >= SHRINK_BATCH) {
210 long this_scan = SHRINK_BATCH; 216 long this_scan = SHRINK_BATCH;
211 int shrink_ret; 217 int shrink_ret;
218 int nr_before;
212 219
220 nr_before = (*shrinker->shrinker)(0, gfp_mask);
213 shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); 221 shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
214 if (shrink_ret == -1) 222 if (shrink_ret == -1)
215 break; 223 break;
224 if (shrink_ret < nr_before)
225 ret += nr_before - shrink_ret;
216 mod_page_state(slabs_scanned, this_scan); 226 mod_page_state(slabs_scanned, this_scan);
217 total_scan -= this_scan; 227 total_scan -= this_scan;
218 228
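
shrink_slab() now samples each cache's object count before and after every batch and returns the total shrinkage, so balance_pgdat() can distinguish "slab is still giving memory back" from "nothing left to reclaim" before declaring a zone all_unreclaimable. A toy model of that accounting; the callback over a plain int stands in for the real ->shrinker interface.

/* Sample the object count before and after each batch and sum the delta,
 * as the hunk above does around (*shrinker->shrinker)(). */
#include <stdio.h>

#define SHRINK_BATCH 128

/* nr_to_scan == 0 means "just report how many objects you hold". */
static int toy_shrinker(int nr_to_scan, int *pool)
{
	if (nr_to_scan) {
		int freed = nr_to_scan / 2;   /* pretend half the scanned objects free */
		*pool = *pool > freed ? *pool - freed : 0;
	}
	return *pool;
}

int main(void)
{
	int pool = 1000, total_scan = 3 * SHRINK_BATCH, shrunk = 0;

	while (total_scan >= SHRINK_BATCH) {
		int nr_before = toy_shrinker(0, &pool);
		int nr_after  = toy_shrinker(SHRINK_BATCH, &pool);

		if (nr_after < nr_before)
			shrunk += nr_before - nr_after;
		total_scan -= SHRINK_BATCH;
	}
	printf("objects shrunk: %d (pool now %d)\n", shrunk, pool);
	return 0;
}
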
@@ -222,7 +232,7 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
222 shrinker->nr += total_scan; 232 shrinker->nr += total_scan;
223 } 233 }
224 up_read(&shrinker_rwsem); 234 up_read(&shrinker_rwsem);
225 return 0; 235 return ret;
226} 236}
227 237
228/* Called without lock on whether page is mapped, so answer is unstable */ 238/* Called without lock on whether page is mapped, so answer is unstable */
@@ -407,7 +417,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
407 * Anonymous process memory has backing store? 417 * Anonymous process memory has backing store?
408 * Try to allocate it some swap space here. 418 * Try to allocate it some swap space here.
409 */ 419 */
410 if (PageAnon(page) && !PageSwapCache(page)) { 420 if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) {
411 if (!add_to_swap(page)) 421 if (!add_to_swap(page))
412 goto activate_locked; 422 goto activate_locked;
413 } 423 }
@@ -890,7 +900,9 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
890 if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) 900 if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
891 continue; /* Let kswapd poll it */ 901 continue; /* Let kswapd poll it */
892 902
903 atomic_inc(&zone->reclaim_in_progress);
893 shrink_zone(zone, sc); 904 shrink_zone(zone, sc);
905 atomic_dec(&zone->reclaim_in_progress);
894 } 906 }
895} 907}
896 908
@@ -907,8 +919,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
907 * holds filesystem locks which prevent writeout this might not work, and the 919 * holds filesystem locks which prevent writeout this might not work, and the
908 * allocation attempt will fail. 920 * allocation attempt will fail.
909 */ 921 */
910int try_to_free_pages(struct zone **zones, 922int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
911 unsigned int gfp_mask, unsigned int order)
912{ 923{
913 int priority; 924 int priority;
914 int ret = 0; 925 int ret = 0;
@@ -920,6 +931,7 @@ int try_to_free_pages(struct zone **zones,
920 931
921 sc.gfp_mask = gfp_mask; 932 sc.gfp_mask = gfp_mask;
922 sc.may_writepage = 0; 933 sc.may_writepage = 0;
934 sc.may_swap = 1;
923 935
924 inc_page_state(allocstall); 936 inc_page_state(allocstall);
925 937
@@ -1020,6 +1032,7 @@ loop_again:
1020 total_reclaimed = 0; 1032 total_reclaimed = 0;
1021 sc.gfp_mask = GFP_KERNEL; 1033 sc.gfp_mask = GFP_KERNEL;
1022 sc.may_writepage = 0; 1034 sc.may_writepage = 0;
1035 sc.may_swap = 1;
1023 sc.nr_mapped = read_page_state(nr_mapped); 1036 sc.nr_mapped = read_page_state(nr_mapped);
1024 1037
1025 inc_page_state(pageoutrun); 1038 inc_page_state(pageoutrun);
@@ -1079,6 +1092,7 @@ scan:
1079 */ 1092 */
1080 for (i = 0; i <= end_zone; i++) { 1093 for (i = 0; i <= end_zone; i++) {
1081 struct zone *zone = pgdat->node_zones + i; 1094 struct zone *zone = pgdat->node_zones + i;
1095 int nr_slab;
1082 1096
1083 if (zone->present_pages == 0) 1097 if (zone->present_pages == 0)
1084 continue; 1098 continue;
@@ -1098,16 +1112,19 @@ scan:
1098 sc.nr_reclaimed = 0; 1112 sc.nr_reclaimed = 0;
1099 sc.priority = priority; 1113 sc.priority = priority;
1100 sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX; 1114 sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
1115 atomic_inc(&zone->reclaim_in_progress);
1101 shrink_zone(zone, &sc); 1116 shrink_zone(zone, &sc);
1117 atomic_dec(&zone->reclaim_in_progress);
1102 reclaim_state->reclaimed_slab = 0; 1118 reclaim_state->reclaimed_slab = 0;
1103 shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); 1119 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1120 lru_pages);
1104 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 1121 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
1105 total_reclaimed += sc.nr_reclaimed; 1122 total_reclaimed += sc.nr_reclaimed;
1106 total_scanned += sc.nr_scanned; 1123 total_scanned += sc.nr_scanned;
1107 if (zone->all_unreclaimable) 1124 if (zone->all_unreclaimable)
1108 continue; 1125 continue;
1109 if (zone->pages_scanned >= (zone->nr_active + 1126 if (nr_slab == 0 && zone->pages_scanned >=
1110 zone->nr_inactive) * 4) 1127 (zone->nr_active + zone->nr_inactive) * 4)
1111 zone->all_unreclaimable = 1; 1128 zone->all_unreclaimable = 1;
1112 /* 1129 /*
1113 * If we've done a decent amount of scanning and 1130 * If we've done a decent amount of scanning and
@@ -1309,3 +1326,73 @@ static int __init kswapd_init(void)
1309} 1326}
1310 1327
1311module_init(kswapd_init) 1328module_init(kswapd_init)
1329
1330
1331/*
1332 * Try to free up some pages from this zone through reclaim.
1333 */
1334int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order)
1335{
1336 struct scan_control sc;
1337 int nr_pages = 1 << order;
1338 int total_reclaimed = 0;
1339
1340 /* The reclaim may sleep, so don't do it if sleep isn't allowed */
1341 if (!(gfp_mask & __GFP_WAIT))
1342 return 0;
1343 if (zone->all_unreclaimable)
1344 return 0;
1345
1346 sc.gfp_mask = gfp_mask;
1347 sc.may_writepage = 0;
1348 sc.may_swap = 0;
1349 sc.nr_mapped = read_page_state(nr_mapped);
1350 sc.nr_scanned = 0;
1351 sc.nr_reclaimed = 0;
1352 /* scan at the highest priority */
1353 sc.priority = 0;
1354
1355 if (nr_pages > SWAP_CLUSTER_MAX)
1356 sc.swap_cluster_max = nr_pages;
1357 else
1358 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1359
1360 /* Don't reclaim the zone if there are other reclaimers active */
1361 if (!atomic_inc_and_test(&zone->reclaim_in_progress))
1362 goto out;
1363
1364 shrink_zone(zone, &sc);
1365 total_reclaimed = sc.nr_reclaimed;
1366
1367 out:
1368 atomic_dec(&zone->reclaim_in_progress);
1369 return total_reclaimed;
1370}
1371
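
zone_reclaim() above only runs when the allocation may sleep, scans at the highest priority with may_swap disabled, and gates itself with atomic_inc_and_test() on reclaim_in_progress so only one caller reclaims a given zone at a time; note that both the success path and the contended path fall through to the atomic_dec(). A userspace sketch of that guard using C11 atomics, assuming the counter idles at -1 so the first increment lands on zero; the bias is an assumption of this sketch, not shown in the hunk.

/* "One reclaimer per zone" guard: atomic_inc_and_test() succeeds only when
 * the increment lands on zero.  Userspace model, not the kernel API. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int reclaim_in_progress = -1;   /* assumed idle bias */

static bool try_enter_reclaim(void)
{
	/* previous value + 1 == 0 mirrors atomic_inc_and_test() */
	return atomic_fetch_add(&reclaim_in_progress, 1) + 1 == 0;
}

static void leave_reclaim(void)
{
	atomic_fetch_sub(&reclaim_in_progress, 1);
}

int main(void)
{
	if (try_enter_reclaim())
		printf("reclaiming this zone\n");
	else
		printf("someone else is already reclaiming\n");

	/* both paths undo their increment, as zone_reclaim() does at out: */
	leave_reclaim();
	return 0;
}
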
1372asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
1373 unsigned int state)
1374{
1375 struct zone *z;
1376 int i;
1377
1378 if (node >= MAX_NUMNODES || !node_online(node))
1379 return -EINVAL;
1380
1381 /* This will break if we ever add more zones */
1382 if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
1383 return -EINVAL;
1384
1385 for (i = 0; i < MAX_NR_ZONES; i++) {
1386 if (!(zone & 1<<i))
1387 continue;
1388
1389 z = &NODE_DATA(node)->node_zones[i];
1390
1391 if (state)
1392 z->reclaim_pages = 1;
1393 else
1394 z->reclaim_pages = 0;
1395 }
1396
1397 return 0;
1398}
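
sys_set_zone_reclaim() treats its zone argument as a bitmask indexed by zone number and flips reclaim_pages on every selected zone of the chosen node, which is why the validity test hard-codes the three zones of this era and carries the "will break if we ever add more zones" warning. A small model of that bitmask walk; the flag array stands in for the pgdat's node_zones, and the request value is an example.

/* Walk a zone bitmask and set a per-zone flag, as the syscall above does. */
#include <stdio.h>

enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, MAX_NR_ZONES };

static const char *zone_name[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };

int main(void)
{
	int reclaim_pages[MAX_NR_ZONES] = { 0 };
	unsigned int mask = 1 << ZONE_NORMAL | 1 << ZONE_HIGHMEM;  /* example request */

	for (int i = 0; i < MAX_NR_ZONES; i++) {
		if (!(mask & 1u << i))
			continue;
		reclaim_pages[i] = 1;          /* nonzero state enables zone reclaim */
	}
	for (int i = 0; i < MAX_NR_ZONES; i++)
		printf("%-8s reclaim_pages=%d\n", zone_name[i], reclaim_pages[i]);
	return 0;
}
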