Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |   2
-rw-r--r--  mm/bootmem.c        |  58
-rw-r--r--  mm/filemap.c        |  78
-rw-r--r--  mm/hugetlb.c        | 192
-rw-r--r--  mm/internal.h       |  21
-rw-r--r--  mm/madvise.c        |  35
-rw-r--r--  mm/memory.c         |  32
-rw-r--r--  mm/memory_hotplug.c |   1
-rw-r--r--  mm/mempolicy.c      | 102
-rw-r--r--  mm/nommu.c          |   7
-rw-r--r--  mm/page_alloc.c     | 343
-rw-r--r--  mm/readahead.c      |  15
-rw-r--r--  mm/rmap.c           |  57
-rw-r--r--  mm/shmem.c          |  36
-rw-r--r--  mm/swap.c           |  27
-rw-r--r--  mm/swap_state.c     |   4
-rw-r--r--  mm/swapfile.c       |  20
-rw-r--r--  mm/tiny-shmem.c     |  29
-rw-r--r--  mm/truncate.c       |  44
-rw-r--r--  mm/vmscan.c         | 125
20 files changed, 732 insertions(+), 496 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 21eb51d4da8f..b3db11f137e0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -11,7 +11,7 @@ choice
11 11
12config FLATMEM_MANUAL 12config FLATMEM_MANUAL
13 bool "Flat Memory" 13 bool "Flat Memory"
14 depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE 14 depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
15 help 15 help
16 This option allows you to change some of the ways that 16 This option allows you to change some of the ways that
17 Linux manages its memory internally. Most users will 17 Linux manages its memory internally. Most users will
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 16b9465eb4eb..35c32290f717 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -296,20 +296,12 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
296 unsigned long v = ~map[i / BITS_PER_LONG]; 296 unsigned long v = ~map[i / BITS_PER_LONG];
297 297
298 if (gofast && v == ~0UL) { 298 if (gofast && v == ~0UL) {
299 int j, order; 299 int order;
300 300
301 page = pfn_to_page(pfn); 301 page = pfn_to_page(pfn);
302 count += BITS_PER_LONG; 302 count += BITS_PER_LONG;
303 __ClearPageReserved(page);
304 order = ffs(BITS_PER_LONG) - 1; 303 order = ffs(BITS_PER_LONG) - 1;
305 set_page_refs(page, order); 304 __free_pages_bootmem(page, order);
306 for (j = 1; j < BITS_PER_LONG; j++) {
307 if (j + 16 < BITS_PER_LONG)
308 prefetchw(page + j + 16);
309 __ClearPageReserved(page + j);
310 set_page_count(page + j, 0);
311 }
312 __free_pages(page, order);
313 i += BITS_PER_LONG; 305 i += BITS_PER_LONG;
314 page += BITS_PER_LONG; 306 page += BITS_PER_LONG;
315 } else if (v) { 307 } else if (v) {
@@ -319,9 +311,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
319 for (m = 1; m && i < idx; m<<=1, page++, i++) { 311 for (m = 1; m && i < idx; m<<=1, page++, i++) {
320 if (v & m) { 312 if (v & m) {
321 count++; 313 count++;
322 __ClearPageReserved(page); 314 __free_pages_bootmem(page, 0);
323 set_page_refs(page, 0);
324 __free_page(page);
325 } 315 }
326 } 316 }
327 } else { 317 } else {
@@ -339,9 +329,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
339 count = 0; 329 count = 0;
340 for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { 330 for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
341 count++; 331 count++;
342 __ClearPageReserved(page); 332 __free_pages_bootmem(page, 0);
343 set_page_count(page, 1);
344 __free_page(page);
345 } 333 }
346 total += count; 334 total += count;
347 bdata->node_bootmem_map = NULL; 335 bdata->node_bootmem_map = NULL;
@@ -393,15 +381,14 @@ unsigned long __init free_all_bootmem (void)
393 return(free_all_bootmem_core(NODE_DATA(0))); 381 return(free_all_bootmem_core(NODE_DATA(0)));
394} 382}
395 383
396void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal, 384void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
397 unsigned long limit)
398{ 385{
399 pg_data_t *pgdat = pgdat_list; 386 pg_data_t *pgdat = pgdat_list;
400 void *ptr; 387 void *ptr;
401 388
402 for_each_pgdat(pgdat) 389 for_each_pgdat(pgdat)
403 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, 390 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
404 align, goal, limit))) 391 align, goal, 0)))
405 return(ptr); 392 return(ptr);
406 393
407 /* 394 /*
@@ -413,15 +400,40 @@ void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, un
413} 400}
414 401
415 402
416void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align, 403void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align,
417 unsigned long goal, unsigned long limit) 404 unsigned long goal)
418{ 405{
419 void *ptr; 406 void *ptr;
420 407
421 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit); 408 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
422 if (ptr) 409 if (ptr)
423 return (ptr); 410 return (ptr);
424 411
425 return __alloc_bootmem_limit(size, align, goal, limit); 412 return __alloc_bootmem(size, align, goal);
413}
414
415#define LOW32LIMIT 0xffffffff
416
417void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
418{
419 pg_data_t *pgdat = pgdat_list;
420 void *ptr;
421
422 for_each_pgdat(pgdat)
423 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
424 align, goal, LOW32LIMIT)))
425 return(ptr);
426
427 /*
428 * Whoops, we cannot satisfy the allocation request.
429 */
430 printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
431 panic("Out of low memory");
432 return NULL;
426} 433}
427 434
435void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
436 unsigned long align, unsigned long goal)
437{
438 return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT);
439}
diff --git a/mm/filemap.c b/mm/filemap.c
index 33a28bfde158..4ef24a397684 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -555,11 +555,12 @@ repeat:
555 page_cache_get(page); 555 page_cache_get(page);
556 if (TestSetPageLocked(page)) { 556 if (TestSetPageLocked(page)) {
557 read_unlock_irq(&mapping->tree_lock); 557 read_unlock_irq(&mapping->tree_lock);
558 lock_page(page); 558 __lock_page(page);
559 read_lock_irq(&mapping->tree_lock); 559 read_lock_irq(&mapping->tree_lock);
560 560
561 /* Has the page been truncated while we slept? */ 561 /* Has the page been truncated while we slept? */
562 if (page->mapping != mapping || page->index != offset) { 562 if (unlikely(page->mapping != mapping ||
563 page->index != offset)) {
563 unlock_page(page); 564 unlock_page(page);
564 page_cache_release(page); 565 page_cache_release(page);
565 goto repeat; 566 goto repeat;
@@ -831,8 +832,13 @@ readpage:
831 /* Start the actual read. The read will unlock the page. */ 832 /* Start the actual read. The read will unlock the page. */
832 error = mapping->a_ops->readpage(filp, page); 833 error = mapping->a_ops->readpage(filp, page);
833 834
834 if (unlikely(error)) 835 if (unlikely(error)) {
836 if (error == AOP_TRUNCATED_PAGE) {
837 page_cache_release(page);
838 goto find_page;
839 }
835 goto readpage_error; 840 goto readpage_error;
841 }
836 842
837 if (!PageUptodate(page)) { 843 if (!PageUptodate(page)) {
838 lock_page(page); 844 lock_page(page);
@@ -1152,26 +1158,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
1152{ 1158{
1153 struct address_space *mapping = file->f_mapping; 1159 struct address_space *mapping = file->f_mapping;
1154 struct page *page; 1160 struct page *page;
1155 int error; 1161 int ret;
1156 1162
1157 page = page_cache_alloc_cold(mapping); 1163 do {
1158 if (!page) 1164 page = page_cache_alloc_cold(mapping);
1159 return -ENOMEM; 1165 if (!page)
1166 return -ENOMEM;
1167
1168 ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1169 if (ret == 0)
1170 ret = mapping->a_ops->readpage(file, page);
1171 else if (ret == -EEXIST)
1172 ret = 0; /* losing race to add is OK */
1160 1173
1161 error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1162 if (!error) {
1163 error = mapping->a_ops->readpage(file, page);
1164 page_cache_release(page); 1174 page_cache_release(page);
1165 return error;
1166 }
1167 1175
1168 /* 1176 } while (ret == AOP_TRUNCATED_PAGE);
1169 * We arrive here in the unlikely event that someone 1177
1170 * raced with us and added our page to the cache first 1178 return ret;
1171 * or we are out of memory for radix-tree nodes.
1172 */
1173 page_cache_release(page);
1174 return error == -EEXIST ? 0 : error;
1175} 1179}
1176 1180
1177#define MMAP_LOTSAMISS (100) 1181#define MMAP_LOTSAMISS (100)
@@ -1331,10 +1335,14 @@ page_not_uptodate:
1331 goto success; 1335 goto success;
1332 } 1336 }
1333 1337
1334 if (!mapping->a_ops->readpage(file, page)) { 1338 error = mapping->a_ops->readpage(file, page);
1339 if (!error) {
1335 wait_on_page_locked(page); 1340 wait_on_page_locked(page);
1336 if (PageUptodate(page)) 1341 if (PageUptodate(page))
1337 goto success; 1342 goto success;
1343 } else if (error == AOP_TRUNCATED_PAGE) {
1344 page_cache_release(page);
1345 goto retry_find;
1338 } 1346 }
1339 1347
1340 /* 1348 /*
@@ -1358,10 +1366,14 @@ page_not_uptodate:
1358 goto success; 1366 goto success;
1359 } 1367 }
1360 ClearPageError(page); 1368 ClearPageError(page);
1361 if (!mapping->a_ops->readpage(file, page)) { 1369 error = mapping->a_ops->readpage(file, page);
1370 if (!error) {
1362 wait_on_page_locked(page); 1371 wait_on_page_locked(page);
1363 if (PageUptodate(page)) 1372 if (PageUptodate(page))
1364 goto success; 1373 goto success;
1374 } else if (error == AOP_TRUNCATED_PAGE) {
1375 page_cache_release(page);
1376 goto retry_find;
1365 } 1377 }
1366 1378
1367 /* 1379 /*
@@ -1444,10 +1456,14 @@ page_not_uptodate:
1444 goto success; 1456 goto success;
1445 } 1457 }
1446 1458
1447 if (!mapping->a_ops->readpage(file, page)) { 1459 error = mapping->a_ops->readpage(file, page);
1460 if (!error) {
1448 wait_on_page_locked(page); 1461 wait_on_page_locked(page);
1449 if (PageUptodate(page)) 1462 if (PageUptodate(page))
1450 goto success; 1463 goto success;
1464 } else if (error == AOP_TRUNCATED_PAGE) {
1465 page_cache_release(page);
1466 goto retry_find;
1451 } 1467 }
1452 1468
1453 /* 1469 /*
@@ -1470,10 +1486,14 @@ page_not_uptodate:
1470 } 1486 }
1471 1487
1472 ClearPageError(page); 1488 ClearPageError(page);
1473 if (!mapping->a_ops->readpage(file, page)) { 1489 error = mapping->a_ops->readpage(file, page);
1490 if (!error) {
1474 wait_on_page_locked(page); 1491 wait_on_page_locked(page);
1475 if (PageUptodate(page)) 1492 if (PageUptodate(page))
1476 goto success; 1493 goto success;
1494 } else if (error == AOP_TRUNCATED_PAGE) {
1495 page_cache_release(page);
1496 goto retry_find;
1477 } 1497 }
1478 1498
1479 /* 1499 /*
@@ -1934,12 +1954,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1934 status = a_ops->prepare_write(file, page, offset, offset+bytes); 1954 status = a_ops->prepare_write(file, page, offset, offset+bytes);
1935 if (unlikely(status)) { 1955 if (unlikely(status)) {
1936 loff_t isize = i_size_read(inode); 1956 loff_t isize = i_size_read(inode);
1957
1958 if (status != AOP_TRUNCATED_PAGE)
1959 unlock_page(page);
1960 page_cache_release(page);
1961 if (status == AOP_TRUNCATED_PAGE)
1962 continue;
1937 /* 1963 /*
1938 * prepare_write() may have instantiated a few blocks 1964 * prepare_write() may have instantiated a few blocks
1939 * outside i_size. Trim these off again. 1965 * outside i_size. Trim these off again.
1940 */ 1966 */
1941 unlock_page(page);
1942 page_cache_release(page);
1943 if (pos + bytes > isize) 1967 if (pos + bytes > isize)
1944 vmtruncate(inode, isize); 1968 vmtruncate(inode, isize);
1945 break; 1969 break;
@@ -1952,6 +1976,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1952 cur_iov, iov_base, bytes); 1976 cur_iov, iov_base, bytes);
1953 flush_dcache_page(page); 1977 flush_dcache_page(page);
1954 status = a_ops->commit_write(file, page, offset, offset+bytes); 1978 status = a_ops->commit_write(file, page, offset, offset+bytes);
1979 if (status == AOP_TRUNCATED_PAGE) {
1980 page_cache_release(page);
1981 continue;
1982 }
1955 if (likely(copied > 0)) { 1983 if (likely(copied > 0)) {
1956 if (!status) 1984 if (!status)
1957 status = copied; 1985 status = copied;
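
The filemap.c hunks above teach the readpage/prepare_write/commit_write callers to handle the new AOP_TRUNCATED_PAGE return code by releasing the page and retrying with a fresh lookup. Below is a minimal standalone sketch of that retry contract; every name in it is a stand-in used only for illustration, not kernel API.

/*
 * Standalone sketch (not kernel code) of the retry contract introduced
 * above: when an address_space operation reports AOP_TRUNCATED_PAGE the
 * caller drops its page and retries.  All names below are stand-ins.
 */
#include <stdio.h>

#define AOP_TRUNCATED_PAGE 1	/* stand-in value for the sketch */

struct fake_page {
	int truncated;		/* pretend the page got truncated once */
};

/* Stub "readpage": fails once with AOP_TRUNCATED_PAGE, then succeeds. */
static int stub_readpage(struct fake_page *page)
{
	if (page->truncated) {
		page->truncated = 0;
		return AOP_TRUNCATED_PAGE;
	}
	return 0;
}

static int read_with_retry(struct fake_page *page)
{
	int ret;

	do {
		/*
		 * In the kernel this is where the caller would release its
		 * page reference and look the page up again before the next
		 * attempt (compare page_cache_read() above).
		 */
		ret = stub_readpage(page);
	} while (ret == AOP_TRUNCATED_PAGE);

	return ret;
}

int main(void)
{
	struct fake_page page = { 1 };

	printf("readpage result after retry: %d\n", read_with_retry(&page));
	return 0;
}
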
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3e52df7c471b..f4c43d7980ba 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,8 @@
11#include <linux/highmem.h> 11#include <linux/highmem.h>
12#include <linux/nodemask.h> 12#include <linux/nodemask.h>
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/mempolicy.h>
15
14#include <asm/page.h> 16#include <asm/page.h>
15#include <asm/pgtable.h> 17#include <asm/pgtable.h>
16 18
@@ -36,18 +38,21 @@ static void enqueue_huge_page(struct page *page)
36 free_huge_pages_node[nid]++; 38 free_huge_pages_node[nid]++;
37} 39}
38 40
39static struct page *dequeue_huge_page(void) 41static struct page *dequeue_huge_page(struct vm_area_struct *vma,
42 unsigned long address)
40{ 43{
41 int nid = numa_node_id(); 44 int nid = numa_node_id();
42 struct page *page = NULL; 45 struct page *page = NULL;
46 struct zonelist *zonelist = huge_zonelist(vma, address);
47 struct zone **z;
43 48
44 if (list_empty(&hugepage_freelists[nid])) { 49 for (z = zonelist->zones; *z; z++) {
45 for (nid = 0; nid < MAX_NUMNODES; ++nid) 50 nid = (*z)->zone_pgdat->node_id;
46 if (!list_empty(&hugepage_freelists[nid])) 51 if (!list_empty(&hugepage_freelists[nid]))
47 break; 52 break;
48 } 53 }
49 if (nid >= 0 && nid < MAX_NUMNODES && 54
50 !list_empty(&hugepage_freelists[nid])) { 55 if (*z) {
51 page = list_entry(hugepage_freelists[nid].next, 56 page = list_entry(hugepage_freelists[nid].next,
52 struct page, lru); 57 struct page, lru);
53 list_del(&page->lru); 58 list_del(&page->lru);
@@ -85,13 +90,13 @@ void free_huge_page(struct page *page)
85 spin_unlock(&hugetlb_lock); 90 spin_unlock(&hugetlb_lock);
86} 91}
87 92
88struct page *alloc_huge_page(void) 93struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
89{ 94{
90 struct page *page; 95 struct page *page;
91 int i; 96 int i;
92 97
93 spin_lock(&hugetlb_lock); 98 spin_lock(&hugetlb_lock);
94 page = dequeue_huge_page(); 99 page = dequeue_huge_page(vma, addr);
95 if (!page) { 100 if (!page) {
96 spin_unlock(&hugetlb_lock); 101 spin_unlock(&hugetlb_lock);
97 return NULL; 102 return NULL;
@@ -194,7 +199,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
194 spin_lock(&hugetlb_lock); 199 spin_lock(&hugetlb_lock);
195 try_to_free_low(count); 200 try_to_free_low(count);
196 while (count < nr_huge_pages) { 201 while (count < nr_huge_pages) {
197 struct page *page = dequeue_huge_page(); 202 struct page *page = dequeue_huge_page(NULL, 0);
198 if (!page) 203 if (!page)
199 break; 204 break;
200 update_and_free_page(page); 205 update_and_free_page(page);
@@ -261,11 +266,12 @@ struct vm_operations_struct hugetlb_vm_ops = {
261 .nopage = hugetlb_nopage, 266 .nopage = hugetlb_nopage,
262}; 267};
263 268
264static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) 269static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
270 int writable)
265{ 271{
266 pte_t entry; 272 pte_t entry;
267 273
268 if (vma->vm_flags & VM_WRITE) { 274 if (writable) {
269 entry = 275 entry =
270 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); 276 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
271 } else { 277 } else {
@@ -277,12 +283,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
277 return entry; 283 return entry;
278} 284}
279 285
286static void set_huge_ptep_writable(struct vm_area_struct *vma,
287 unsigned long address, pte_t *ptep)
288{
289 pte_t entry;
290
291 entry = pte_mkwrite(pte_mkdirty(*ptep));
292 ptep_set_access_flags(vma, address, ptep, entry, 1);
293 update_mmu_cache(vma, address, entry);
294 lazy_mmu_prot_update(entry);
295}
296
297
280int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 298int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
281 struct vm_area_struct *vma) 299 struct vm_area_struct *vma)
282{ 300{
283 pte_t *src_pte, *dst_pte, entry; 301 pte_t *src_pte, *dst_pte, entry;
284 struct page *ptepage; 302 struct page *ptepage;
285 unsigned long addr; 303 unsigned long addr;
304 int cow;
305
306 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
286 307
287 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 308 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
288 src_pte = huge_pte_offset(src, addr); 309 src_pte = huge_pte_offset(src, addr);
@@ -294,6 +315,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
294 spin_lock(&dst->page_table_lock); 315 spin_lock(&dst->page_table_lock);
295 spin_lock(&src->page_table_lock); 316 spin_lock(&src->page_table_lock);
296 if (!pte_none(*src_pte)) { 317 if (!pte_none(*src_pte)) {
318 if (cow)
319 ptep_set_wrprotect(src, addr, src_pte);
297 entry = *src_pte; 320 entry = *src_pte;
298 ptepage = pte_page(entry); 321 ptepage = pte_page(entry);
299 get_page(ptepage); 322 get_page(ptepage);
@@ -345,57 +368,63 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
345 flush_tlb_range(vma, start, end); 368 flush_tlb_range(vma, start, end);
346} 369}
347 370
348static struct page *find_lock_huge_page(struct address_space *mapping, 371static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
349 unsigned long idx) 372 unsigned long address, pte_t *ptep, pte_t pte)
350{ 373{
351 struct page *page; 374 struct page *old_page, *new_page;
352 int err; 375 int i, avoidcopy;
353 struct inode *inode = mapping->host;
354 unsigned long size;
355 376
356retry: 377 old_page = pte_page(pte);
357 page = find_lock_page(mapping, idx);
358 if (page)
359 goto out;
360 378
361 /* Check to make sure the mapping hasn't been truncated */ 379 /* If no-one else is actually using this page, avoid the copy
362 size = i_size_read(inode) >> HPAGE_SHIFT; 380 * and just make the page writable */
363 if (idx >= size) 381 avoidcopy = (page_count(old_page) == 1);
364 goto out; 382 if (avoidcopy) {
383 set_huge_ptep_writable(vma, address, ptep);
384 return VM_FAULT_MINOR;
385 }
365 386
366 if (hugetlb_get_quota(mapping)) 387 page_cache_get(old_page);
367 goto out; 388 new_page = alloc_huge_page(vma, address);
368 page = alloc_huge_page(); 389
369 if (!page) { 390 if (!new_page) {
370 hugetlb_put_quota(mapping); 391 page_cache_release(old_page);
371 goto out; 392
393 /* Logically this is OOM, not a SIGBUS, but an OOM
394 * could cause the kernel to go killing other
395 * processes which won't help the hugepage situation
396 * at all (?) */
397 return VM_FAULT_SIGBUS;
372 } 398 }
373 399
374 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 400 spin_unlock(&mm->page_table_lock);
375 if (err) { 401 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
376 put_page(page); 402 copy_user_highpage(new_page + i, old_page + i,
377 hugetlb_put_quota(mapping); 403 address + i*PAGE_SIZE);
378 if (err == -EEXIST) 404 spin_lock(&mm->page_table_lock);
379 goto retry; 405
380 page = NULL; 406 ptep = huge_pte_offset(mm, address & HPAGE_MASK);
407 if (likely(pte_same(*ptep, pte))) {
408 /* Break COW */
409 set_huge_pte_at(mm, address, ptep,
410 make_huge_pte(vma, new_page, 1));
411 /* Make the old page be freed below */
412 new_page = old_page;
381 } 413 }
382out: 414 page_cache_release(new_page);
383 return page; 415 page_cache_release(old_page);
416 return VM_FAULT_MINOR;
384} 417}
385 418
386int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 419int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
387 unsigned long address, int write_access) 420 unsigned long address, pte_t *ptep, int write_access)
388{ 421{
389 int ret = VM_FAULT_SIGBUS; 422 int ret = VM_FAULT_SIGBUS;
390 unsigned long idx; 423 unsigned long idx;
391 unsigned long size; 424 unsigned long size;
392 pte_t *pte;
393 struct page *page; 425 struct page *page;
394 struct address_space *mapping; 426 struct address_space *mapping;
395 427 pte_t new_pte;
396 pte = huge_pte_alloc(mm, address);
397 if (!pte)
398 goto out;
399 428
400 mapping = vma->vm_file->f_mapping; 429 mapping = vma->vm_file->f_mapping;
401 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 430 idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -405,9 +434,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
405 * Use page lock to guard against racing truncation 434 * Use page lock to guard against racing truncation
406 * before we get page_table_lock. 435 * before we get page_table_lock.
407 */ 436 */
408 page = find_lock_huge_page(mapping, idx); 437retry:
409 if (!page) 438 page = find_lock_page(mapping, idx);
410 goto out; 439 if (!page) {
440 if (hugetlb_get_quota(mapping))
441 goto out;
442 page = alloc_huge_page(vma, address);
443 if (!page) {
444 hugetlb_put_quota(mapping);
445 goto out;
446 }
447
448 if (vma->vm_flags & VM_SHARED) {
449 int err;
450
451 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
452 if (err) {
453 put_page(page);
454 hugetlb_put_quota(mapping);
455 if (err == -EEXIST)
456 goto retry;
457 goto out;
458 }
459 } else
460 lock_page(page);
461 }
411 462
412 spin_lock(&mm->page_table_lock); 463 spin_lock(&mm->page_table_lock);
413 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 464 size = i_size_read(mapping->host) >> HPAGE_SHIFT;
@@ -415,11 +466,19 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
415 goto backout; 466 goto backout;
416 467
417 ret = VM_FAULT_MINOR; 468 ret = VM_FAULT_MINOR;
418 if (!pte_none(*pte)) 469 if (!pte_none(*ptep))
419 goto backout; 470 goto backout;
420 471
421 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); 472 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
422 set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page)); 473 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
474 && (vma->vm_flags & VM_SHARED)));
475 set_huge_pte_at(mm, address, ptep, new_pte);
476
477 if (write_access && !(vma->vm_flags & VM_SHARED)) {
478 /* Optimization, do the COW without a second fault */
479 ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
480 }
481
423 spin_unlock(&mm->page_table_lock); 482 spin_unlock(&mm->page_table_lock);
424 unlock_page(page); 483 unlock_page(page);
425out: 484out:
@@ -433,6 +492,33 @@ backout:
433 goto out; 492 goto out;
434} 493}
435 494
495int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
496 unsigned long address, int write_access)
497{
498 pte_t *ptep;
499 pte_t entry;
500 int ret;
501
502 ptep = huge_pte_alloc(mm, address);
503 if (!ptep)
504 return VM_FAULT_OOM;
505
506 entry = *ptep;
507 if (pte_none(entry))
508 return hugetlb_no_page(mm, vma, address, ptep, write_access);
509
510 ret = VM_FAULT_MINOR;
511
512 spin_lock(&mm->page_table_lock);
513 /* Check for a racing update before calling hugetlb_cow */
514 if (likely(pte_same(entry, *ptep)))
515 if (write_access && !pte_write(entry))
516 ret = hugetlb_cow(mm, vma, address, ptep, entry);
517 spin_unlock(&mm->page_table_lock);
518
519 return ret;
520}
521
436int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 522int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
437 struct page **pages, struct vm_area_struct **vmas, 523 struct page **pages, struct vm_area_struct **vmas,
438 unsigned long *position, int *length, int i) 524 unsigned long *position, int *length, int i)
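
The hugetlb.c hunks above add copy-on-write handling for private huge page mappings (hugetlb_cow) and policy-aware huge page allocation. The userspace sketch below shows how the private COW path would be exercised on a kernel carrying this patch; the /mnt/huge mount point, the 2MB huge page size, and the use of ftruncate() to size the file are assumptions for illustration, not requirements stated by the patch.

/*
 * Userspace sketch (not from the patch): a MAP_PRIVATE mapping of a
 * hugetlbfs file plus a write fault is what reaches hugetlb_cow().
 * The mount point, page size and ftruncate() call are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_LEN (2UL * 1024 * 1024)	/* assumed huge page size */

int main(void)
{
	int fd = open("/mnt/huge/cow-test", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0) {
		perror("open hugetlbfs file");
		return 1;
	}
	/* Size the file to one huge page (assumed to be supported). */
	if (ftruncate(fd, HPAGE_LEN) < 0)
		perror("ftruncate");

	/* Private mapping: writes should land in a per-process copy. */
	p = mmap(NULL, HPAGE_LEN, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(p, 0xaa, HPAGE_LEN);	/* write fault -> hugetlb_cow() path */

	munmap(p, HPAGE_LEN);
	close(fd);
	unlink("/mnt/huge/cow-test");
	return 0;
}
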
diff --git a/mm/internal.h b/mm/internal.h
index 6bf134e8fb3d..17256bb2f4ef 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -9,5 +9,22 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12/* page_alloc.c */ 12static inline void set_page_refs(struct page *page, int order)
13extern void set_page_refs(struct page *page, int order); 13{
14#ifdef CONFIG_MMU
15 set_page_count(page, 1);
16#else
17 int i;
18
19 /*
20 * We need to reference all the pages for this order, otherwise if
21 * anyone accesses one of the pages with (get/put) it will be freed.
22 * - eg: access_process_vm()
23 */
24 for (i = 0; i < (1 << order); i++)
25 set_page_count(page + i, 1);
26#endif /* CONFIG_MMU */
27}
28
29extern void fastcall __init __free_pages_bootmem(struct page *page,
30 unsigned int order);
diff --git a/mm/madvise.c b/mm/madvise.c
index 2b7cf0400a21..ae0ae3ea299a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -140,6 +140,36 @@ static long madvise_dontneed(struct vm_area_struct * vma,
140 return 0; 140 return 0;
141} 141}
142 142
143/*
144 * Application wants to free up the pages and associated backing store.
145 * This is effectively punching a hole into the middle of a file.
146 *
147 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
148 * Other filesystems return -ENOSYS.
149 */
150static long madvise_remove(struct vm_area_struct *vma,
151 unsigned long start, unsigned long end)
152{
153 struct address_space *mapping;
154 loff_t offset, endoff;
155
156 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
157 return -EINVAL;
158
159 if (!vma->vm_file || !vma->vm_file->f_mapping
160 || !vma->vm_file->f_mapping->host) {
161 return -EINVAL;
162 }
163
164 mapping = vma->vm_file->f_mapping;
165
166 offset = (loff_t)(start - vma->vm_start)
167 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
168 endoff = (loff_t)(end - vma->vm_start - 1)
169 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
170 return vmtruncate_range(mapping->host, offset, endoff);
171}
172
143static long 173static long
144madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, 174madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
145 unsigned long start, unsigned long end, int behavior) 175 unsigned long start, unsigned long end, int behavior)
@@ -152,6 +182,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
152 case MADV_RANDOM: 182 case MADV_RANDOM:
153 error = madvise_behavior(vma, prev, start, end, behavior); 183 error = madvise_behavior(vma, prev, start, end, behavior);
154 break; 184 break;
185 case MADV_REMOVE:
186 error = madvise_remove(vma, start, end);
187 break;
155 188
156 case MADV_WILLNEED: 189 case MADV_WILLNEED:
157 error = madvise_willneed(vma, prev, start, end); 190 error = madvise_willneed(vma, prev, start, end);
@@ -190,6 +223,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
190 * some pages ahead. 223 * some pages ahead.
191 * MADV_DONTNEED - the application is finished with the given range, 224 * MADV_DONTNEED - the application is finished with the given range,
192 * so the kernel can free resources associated with it. 225 * so the kernel can free resources associated with it.
226 * MADV_REMOVE - the application wants to free up the given range of
227 * pages and associated backing store.
193 * 228 *
194 * return values: 229 * return values:
195 * zero - success 230 * zero - success
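
The madvise.c hunks above introduce MADV_REMOVE, which punches a hole in a tmpfs/shmfs-backed mapping via vmtruncate_range(), freeing both the pages and their backing store. A small userspace sketch of how the new flag would be used follows; the tmpfs path and the fallback value used for the MADV_REMOVE constant are assumptions for illustration.

/*
 * Userspace sketch (not from the patch) of the new MADV_REMOVE: punch a
 * hole in a tmpfs-backed mapping.  On a kernel with this patch the
 * constant comes from the mman headers; the fallback value and the
 * /dev/shm path are assumptions for this illustration.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_REMOVE
#define MADV_REMOVE 9	/* assumed fallback value for the sketch */
#endif

int main(void)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	size_t len = 16 * pagesz;
	int fd = open("/dev/shm/madv-remove-test", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0 || ftruncate(fd, len) < 0) {
		perror("setup");
		return 1;
	}

	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Dirty the middle of the file, then punch it out again. */
	p[8 * pagesz] = 1;
	if (madvise(p + 4 * pagesz, 8 * pagesz, MADV_REMOVE) != 0)
		perror("madvise(MADV_REMOVE)");	/* non-tmpfs files get -ENOSYS */

	munmap(p, len);
	close(fd);
	unlink("/dev/shm/madv-remove-test");
	return 0;
}
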
diff --git a/mm/memory.c b/mm/memory.c
index d8dde07a3656..7197f9bcd384 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1498,7 +1498,7 @@ gotten:
1498 update_mmu_cache(vma, address, entry); 1498 update_mmu_cache(vma, address, entry);
1499 lazy_mmu_prot_update(entry); 1499 lazy_mmu_prot_update(entry);
1500 lru_cache_add_active(new_page); 1500 lru_cache_add_active(new_page);
1501 page_add_anon_rmap(new_page, vma, address); 1501 page_add_new_anon_rmap(new_page, vma, address);
1502 1502
1503 /* Free the old page.. */ 1503 /* Free the old page.. */
1504 new_page = old_page; 1504 new_page = old_page;
@@ -1770,9 +1770,32 @@ out_big:
1770out_busy: 1770out_busy:
1771 return -ETXTBSY; 1771 return -ETXTBSY;
1772} 1772}
1773
1774EXPORT_SYMBOL(vmtruncate); 1773EXPORT_SYMBOL(vmtruncate);
1775 1774
1775int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
1776{
1777 struct address_space *mapping = inode->i_mapping;
1778
1779 /*
1780 * If the underlying filesystem is not going to provide
1781 * a way to truncate a range of blocks (punch a hole) -
1782 * we should return failure right now.
1783 */
1784 if (!inode->i_op || !inode->i_op->truncate_range)
1785 return -ENOSYS;
1786
1787 down(&inode->i_sem);
1788 down_write(&inode->i_alloc_sem);
1789 unmap_mapping_range(mapping, offset, (end - offset), 1);
1790 truncate_inode_pages_range(mapping, offset, end);
1791 inode->i_op->truncate_range(inode, offset, end);
1792 up_write(&inode->i_alloc_sem);
1793 up(&inode->i_sem);
1794
1795 return 0;
1796}
1797EXPORT_SYMBOL(vmtruncate_range);
1798
1776/* 1799/*
1777 * Primitive swap readahead code. We simply read an aligned block of 1800 * Primitive swap readahead code. We simply read an aligned block of
1778 * (1 << page_cluster) entries in the swap area. This method is chosen 1801 * (1 << page_cluster) entries in the swap area. This method is chosen
@@ -1954,8 +1977,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
1954 goto release; 1977 goto release;
1955 inc_mm_counter(mm, anon_rss); 1978 inc_mm_counter(mm, anon_rss);
1956 lru_cache_add_active(page); 1979 lru_cache_add_active(page);
1957 SetPageReferenced(page); 1980 page_add_new_anon_rmap(page, vma, address);
1958 page_add_anon_rmap(page, vma, address);
1959 } else { 1981 } else {
1960 /* Map the ZERO_PAGE - vm_page_prot is readonly */ 1982 /* Map the ZERO_PAGE - vm_page_prot is readonly */
1961 page = ZERO_PAGE(address); 1983 page = ZERO_PAGE(address);
@@ -2086,7 +2108,7 @@ retry:
2086 if (anon) { 2108 if (anon) {
2087 inc_mm_counter(mm, anon_rss); 2109 inc_mm_counter(mm, anon_rss);
2088 lru_cache_add_active(new_page); 2110 lru_cache_add_active(new_page);
2089 page_add_anon_rmap(new_page, vma, address); 2111 page_add_new_anon_rmap(new_page, vma, address);
2090 } else { 2112 } else {
2091 inc_mm_counter(mm, file_rss); 2113 inc_mm_counter(mm, file_rss);
2092 page_add_file_rmap(new_page); 2114 page_add_file_rmap(new_page);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f6d4af8af8a8..a918f77f02f3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -42,7 +42,6 @@ extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
42 int nr_pages); 42 int nr_pages);
43static int __add_section(struct zone *zone, unsigned long phys_start_pfn) 43static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
44{ 44{
45 struct pglist_data *pgdat = zone->zone_pgdat;
46 int nr_pages = PAGES_PER_SECTION; 45 int nr_pages = PAGES_PER_SECTION;
47 int ret; 46 int ret;
48 47
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 72f402cc9c9a..0f1d2b8a952b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,7 +93,7 @@ static kmem_cache_t *sn_cache;
93 93
94/* Highest zone. An specific allocation for a zone below that is not 94/* Highest zone. An specific allocation for a zone below that is not
95 policied. */ 95 policied. */
96static int policy_zone; 96int policy_zone = ZONE_DMA;
97 97
98struct mempolicy default_policy = { 98struct mempolicy default_policy = {
99 .refcnt = ATOMIC_INIT(1), /* never free it */ 99 .refcnt = ATOMIC_INIT(1), /* never free it */
@@ -131,17 +131,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
131 if (!zl) 131 if (!zl)
132 return NULL; 132 return NULL;
133 num = 0; 133 num = 0;
134 for_each_node_mask(nd, *nodes) { 134 for_each_node_mask(nd, *nodes)
135 int k; 135 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
136 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
137 struct zone *z = &NODE_DATA(nd)->node_zones[k];
138 if (!z->present_pages)
139 continue;
140 zl->zones[num++] = z;
141 if (k > policy_zone)
142 policy_zone = k;
143 }
144 }
145 zl->zones[num] = NULL; 136 zl->zones[num] = NULL;
146 return zl; 137 return zl;
147} 138}
@@ -785,6 +776,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
785 return nid; 776 return nid;
786} 777}
787 778
779/* Determine a node number for interleave */
780static inline unsigned interleave_nid(struct mempolicy *pol,
781 struct vm_area_struct *vma, unsigned long addr, int shift)
782{
783 if (vma) {
784 unsigned long off;
785
786 off = vma->vm_pgoff;
787 off += (addr - vma->vm_start) >> shift;
788 return offset_il_node(pol, vma, off);
789 } else
790 return interleave_nodes(pol);
791}
792
793/* Return a zonelist suitable for a huge page allocation. */
794struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
795{
796 struct mempolicy *pol = get_vma_policy(current, vma, addr);
797
798 if (pol->policy == MPOL_INTERLEAVE) {
799 unsigned nid;
800
801 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
802 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
803 }
804 return zonelist_policy(GFP_HIGHUSER, pol);
805}
806
788/* Allocate a page in interleaved policy. 807/* Allocate a page in interleaved policy.
789 Own path because it needs to do special accounting. */ 808 Own path because it needs to do special accounting. */
790static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 809static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -833,15 +852,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
833 852
834 if (unlikely(pol->policy == MPOL_INTERLEAVE)) { 853 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
835 unsigned nid; 854 unsigned nid;
836 if (vma) { 855
837 unsigned long off; 856 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
838 off = vma->vm_pgoff;
839 off += (addr - vma->vm_start) >> PAGE_SHIFT;
840 nid = offset_il_node(pol, vma, off);
841 } else {
842 /* fall back to process interleaving */
843 nid = interleave_nodes(pol);
844 }
845 return alloc_page_interleave(gfp, 0, nid); 857 return alloc_page_interleave(gfp, 0, nid);
846 } 858 }
847 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); 859 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
@@ -940,54 +952,6 @@ void __mpol_free(struct mempolicy *p)
940} 952}
941 953
942/* 954/*
943 * Hugetlb policy. Same as above, just works with node numbers instead of
944 * zonelists.
945 */
946
947/* Find first node suitable for an allocation */
948int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
949{
950 struct mempolicy *pol = get_vma_policy(current, vma, addr);
951
952 switch (pol->policy) {
953 case MPOL_DEFAULT:
954 return numa_node_id();
955 case MPOL_BIND:
956 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
957 case MPOL_INTERLEAVE:
958 return interleave_nodes(pol);
959 case MPOL_PREFERRED:
960 return pol->v.preferred_node >= 0 ?
961 pol->v.preferred_node : numa_node_id();
962 }
963 BUG();
964 return 0;
965}
966
967/* Find secondary valid nodes for an allocation */
968int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
969{
970 struct mempolicy *pol = get_vma_policy(current, vma, addr);
971
972 switch (pol->policy) {
973 case MPOL_PREFERRED:
974 case MPOL_DEFAULT:
975 case MPOL_INTERLEAVE:
976 return 1;
977 case MPOL_BIND: {
978 struct zone **z;
979 for (z = pol->v.zonelist->zones; *z; z++)
980 if ((*z)->zone_pgdat->node_id == nid)
981 return 1;
982 return 0;
983 }
984 default:
985 BUG();
986 return 0;
987 }
988}
989
990/*
991 * Shared memory backing store policy support. 955 * Shared memory backing store policy support.
992 * 956 *
993 * Remember policies even when nobody has shared memory mapped. 957 * Remember policies even when nobody has shared memory mapped.
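
The mempolicy.c hunks above factor the interleave computation into interleave_nid() and reuse it for huge pages through huge_zonelist(). Below is a standalone sketch of that arithmetic: the offset is the VMA file offset plus the address offset shifted by the mapping's page shift. The final modulo step mirrors what offset_il_node() does over the allowed node set; the node list, the 2MB shift and the addresses are made-up values for illustration.

/*
 * Standalone sketch (not kernel code) of the interleave_nid() arithmetic
 * added above.  The modulo over the allowed node set mirrors
 * offset_il_node(); all concrete values here are illustrative.
 */
#include <stdio.h>

static unsigned interleave_node(unsigned long vm_pgoff, unsigned long vm_start,
				unsigned long addr, int shift,
				const int *nodes, int nnodes)
{
	unsigned long off = vm_pgoff + ((addr - vm_start) >> shift);

	return nodes[off % nnodes];
}

int main(void)
{
	const int allowed[] = { 0, 2, 3 };	/* example interleave node set */
	unsigned long vm_start = 0x60000000000UL;
	unsigned long addr = vm_start + 5UL * (1UL << 21);	/* 6th 2MB huge page */

	/* shift = HPAGE_SHIFT (21 for 2MB huge pages -- an assumption) */
	printf("huge page at %#lx interleaves onto node %d\n",
	       addr, interleave_node(0, vm_start, addr, 21, allowed, 3));
	return 0;
}
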
diff --git a/mm/nommu.c b/mm/nommu.c
index c1196812876b..c10262d68232 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1177,3 +1177,10 @@ int in_gate_area_no_task(unsigned long addr)
1177{ 1177{
1178 return 0; 1178 return 0;
1179} 1179}
1180
1181struct page *filemap_nopage(struct vm_area_struct *area,
1182 unsigned long address, int *type)
1183{
1184 BUG();
1185 return NULL;
1186}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fe14a8c87fc2..fd47494cb989 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -36,6 +36,7 @@
36#include <linux/memory_hotplug.h> 36#include <linux/memory_hotplug.h>
37#include <linux/nodemask.h> 37#include <linux/nodemask.h>
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/mempolicy.h>
39 40
40#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
41#include "internal.h" 42#include "internal.h"
@@ -53,6 +54,8 @@ unsigned long totalram_pages __read_mostly;
53unsigned long totalhigh_pages __read_mostly; 54unsigned long totalhigh_pages __read_mostly;
54long nr_swap_pages; 55long nr_swap_pages;
55 56
57static void fastcall free_hot_cold_page(struct page *page, int cold);
58
56/* 59/*
57 * results with 256, 32 in the lowmem_reserve sysctl: 60 * results with 256, 32 in the lowmem_reserve sysctl:
58 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 61 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
@@ -81,6 +84,7 @@ int min_free_kbytes = 1024;
81unsigned long __initdata nr_kernel_pages; 84unsigned long __initdata nr_kernel_pages;
82unsigned long __initdata nr_all_pages; 85unsigned long __initdata nr_all_pages;
83 86
87#ifdef CONFIG_DEBUG_VM
84static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 88static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
85{ 89{
86 int ret = 0; 90 int ret = 0;
@@ -122,16 +126,23 @@ static int bad_range(struct zone *zone, struct page *page)
122 return 0; 126 return 0;
123} 127}
124 128
125static void bad_page(const char *function, struct page *page) 129#else
130static inline int bad_range(struct zone *zone, struct page *page)
131{
132 return 0;
133}
134#endif
135
136static void bad_page(struct page *page)
126{ 137{
127 printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", 138 printk(KERN_EMERG "Bad page state in process '%s'\n"
128 function, current->comm, page); 139 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
129 printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 140 "Trying to fix it up, but a reboot is needed\n"
130 (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, 141 "Backtrace:\n",
131 page->mapping, page_mapcount(page), page_count(page)); 142 current->comm, page, (int)(2*sizeof(unsigned long)),
132 printk(KERN_EMERG "Backtrace:\n"); 143 (unsigned long)page->flags, page->mapping,
144 page_mapcount(page), page_count(page));
133 dump_stack(); 145 dump_stack();
134 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
135 page->flags &= ~(1 << PG_lru | 146 page->flags &= ~(1 << PG_lru |
136 1 << PG_private | 147 1 << PG_private |
137 1 << PG_locked | 148 1 << PG_locked |
@@ -184,19 +195,15 @@ static void destroy_compound_page(struct page *page, unsigned long order)
184 int i; 195 int i;
185 int nr_pages = 1 << order; 196 int nr_pages = 1 << order;
186 197
187 if (!PageCompound(page)) 198 if (unlikely(page[1].index != order))
188 return; 199 bad_page(page);
189
190 if (page[1].index != order)
191 bad_page(__FUNCTION__, page);
192 200
193 for (i = 0; i < nr_pages; i++) { 201 for (i = 0; i < nr_pages; i++) {
194 struct page *p = page + i; 202 struct page *p = page + i;
195 203
196 if (!PageCompound(p)) 204 if (unlikely(!PageCompound(p) |
197 bad_page(__FUNCTION__, page); 205 (page_private(p) != (unsigned long)page)))
198 if (page_private(p) != (unsigned long)page) 206 bad_page(page);
199 bad_page(__FUNCTION__, page);
200 ClearPageCompound(p); 207 ClearPageCompound(p);
201 } 208 }
202} 209}
@@ -255,14 +262,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
255/* 262/*
256 * This function checks whether a page is free && is the buddy 263 * This function checks whether a page is free && is the buddy
257 * we can do coalesce a page and its buddy if 264 * we can do coalesce a page and its buddy if
258 * (a) the buddy is free && 265 * (a) the buddy is not in a hole &&
259 * (b) the buddy is on the buddy system && 266 * (b) the buddy is free &&
260 * (c) a page and its buddy have the same order. 267 * (c) the buddy is on the buddy system &&
268 * (d) a page and its buddy have the same order.
261 * for recording page's order, we use page_private(page) and PG_private. 269 * for recording page's order, we use page_private(page) and PG_private.
262 * 270 *
263 */ 271 */
264static inline int page_is_buddy(struct page *page, int order) 272static inline int page_is_buddy(struct page *page, int order)
265{ 273{
274#ifdef CONFIG_HOLES_IN_ZONE
275 if (!pfn_valid(page_to_pfn(page)))
276 return 0;
277#endif
278
266 if (PagePrivate(page) && 279 if (PagePrivate(page) &&
267 (page_order(page) == order) && 280 (page_order(page) == order) &&
268 page_count(page) == 0) 281 page_count(page) == 0)
@@ -300,7 +313,7 @@ static inline void __free_pages_bulk (struct page *page,
300 unsigned long page_idx; 313 unsigned long page_idx;
301 int order_size = 1 << order; 314 int order_size = 1 << order;
302 315
303 if (unlikely(order)) 316 if (unlikely(PageCompound(page)))
304 destroy_compound_page(page, order); 317 destroy_compound_page(page, order);
305 318
306 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 319 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
@@ -314,17 +327,15 @@ static inline void __free_pages_bulk (struct page *page,
314 struct free_area *area; 327 struct free_area *area;
315 struct page *buddy; 328 struct page *buddy;
316 329
317 combined_idx = __find_combined_index(page_idx, order);
318 buddy = __page_find_buddy(page, page_idx, order); 330 buddy = __page_find_buddy(page, page_idx, order);
319
320 if (bad_range(zone, buddy))
321 break;
322 if (!page_is_buddy(buddy, order)) 331 if (!page_is_buddy(buddy, order))
323 break; /* Move the buddy up one level. */ 332 break; /* Move the buddy up one level. */
333
324 list_del(&buddy->lru); 334 list_del(&buddy->lru);
325 area = zone->free_area + order; 335 area = zone->free_area + order;
326 area->nr_free--; 336 area->nr_free--;
327 rmv_page_order(buddy); 337 rmv_page_order(buddy);
338 combined_idx = __find_combined_index(page_idx, order);
328 page = page + (combined_idx - page_idx); 339 page = page + (combined_idx - page_idx);
329 page_idx = combined_idx; 340 page_idx = combined_idx;
330 order++; 341 order++;
@@ -334,11 +345,11 @@ static inline void __free_pages_bulk (struct page *page,
334 zone->free_area[order].nr_free++; 345 zone->free_area[order].nr_free++;
335} 346}
336 347
337static inline int free_pages_check(const char *function, struct page *page) 348static inline int free_pages_check(struct page *page)
338{ 349{
339 if ( page_mapcount(page) || 350 if (unlikely(page_mapcount(page) |
340 page->mapping != NULL || 351 (page->mapping != NULL) |
341 page_count(page) != 0 || 352 (page_count(page) != 0) |
342 (page->flags & ( 353 (page->flags & (
343 1 << PG_lru | 354 1 << PG_lru |
344 1 << PG_private | 355 1 << PG_private |
@@ -348,8 +359,8 @@ static inline int free_pages_check(const char *function, struct page *page)
348 1 << PG_slab | 359 1 << PG_slab |
349 1 << PG_swapcache | 360 1 << PG_swapcache |
350 1 << PG_writeback | 361 1 << PG_writeback |
351 1 << PG_reserved ))) 362 1 << PG_reserved ))))
352 bad_page(function, page); 363 bad_page(page);
353 if (PageDirty(page)) 364 if (PageDirty(page))
354 __ClearPageDirty(page); 365 __ClearPageDirty(page);
355 /* 366 /*
@@ -375,11 +386,10 @@ static int
375free_pages_bulk(struct zone *zone, int count, 386free_pages_bulk(struct zone *zone, int count,
376 struct list_head *list, unsigned int order) 387 struct list_head *list, unsigned int order)
377{ 388{
378 unsigned long flags;
379 struct page *page = NULL; 389 struct page *page = NULL;
380 int ret = 0; 390 int ret = 0;
381 391
382 spin_lock_irqsave(&zone->lock, flags); 392 spin_lock(&zone->lock);
383 zone->all_unreclaimable = 0; 393 zone->all_unreclaimable = 0;
384 zone->pages_scanned = 0; 394 zone->pages_scanned = 0;
385 while (!list_empty(list) && count--) { 395 while (!list_empty(list) && count--) {
@@ -389,12 +399,13 @@ free_pages_bulk(struct zone *zone, int count,
389 __free_pages_bulk(page, zone, order); 399 __free_pages_bulk(page, zone, order);
390 ret++; 400 ret++;
391 } 401 }
392 spin_unlock_irqrestore(&zone->lock, flags); 402 spin_unlock(&zone->lock);
393 return ret; 403 return ret;
394} 404}
395 405
396void __free_pages_ok(struct page *page, unsigned int order) 406void __free_pages_ok(struct page *page, unsigned int order)
397{ 407{
408 unsigned long flags;
398 LIST_HEAD(list); 409 LIST_HEAD(list);
399 int i; 410 int i;
400 int reserved = 0; 411 int reserved = 0;
@@ -408,14 +419,49 @@ void __free_pages_ok(struct page *page, unsigned int order)
408#endif 419#endif
409 420
410 for (i = 0 ; i < (1 << order) ; ++i) 421 for (i = 0 ; i < (1 << order) ; ++i)
411 reserved += free_pages_check(__FUNCTION__, page + i); 422 reserved += free_pages_check(page + i);
412 if (reserved) 423 if (reserved)
413 return; 424 return;
414 425
415 list_add(&page->lru, &list); 426 list_add(&page->lru, &list);
416 mod_page_state(pgfree, 1 << order);
417 kernel_map_pages(page, 1<<order, 0); 427 kernel_map_pages(page, 1<<order, 0);
428 local_irq_save(flags);
429 __mod_page_state(pgfree, 1 << order);
418 free_pages_bulk(page_zone(page), 1, &list, order); 430 free_pages_bulk(page_zone(page), 1, &list, order);
431 local_irq_restore(flags);
432}
433
434/*
435 * permit the bootmem allocator to evade page validation on high-order frees
436 */
437void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
438{
439 if (order == 0) {
440 __ClearPageReserved(page);
441 set_page_count(page, 0);
442
443 free_hot_cold_page(page, 0);
444 } else {
445 LIST_HEAD(list);
446 int loop;
447
448 for (loop = 0; loop < BITS_PER_LONG; loop++) {
449 struct page *p = &page[loop];
450
451 if (loop + 16 < BITS_PER_LONG)
452 prefetchw(p + 16);
453 __ClearPageReserved(p);
454 set_page_count(p, 0);
455 }
456
457 arch_free_page(page, order);
458
459 mod_page_state(pgfree, 1 << order);
460
461 list_add(&page->lru, &list);
462 kernel_map_pages(page, 1 << order, 0);
463 free_pages_bulk(page_zone(page), 1, &list, order);
464 }
419} 465}
420 466
421 467
@@ -433,8 +479,7 @@ void __free_pages_ok(struct page *page, unsigned int order)
433 * 479 *
434 * -- wli 480 * -- wli
435 */ 481 */
436static inline struct page * 482static inline void expand(struct zone *zone, struct page *page,
437expand(struct zone *zone, struct page *page,
438 int low, int high, struct free_area *area) 483 int low, int high, struct free_area *area)
439{ 484{
440 unsigned long size = 1 << high; 485 unsigned long size = 1 << high;
@@ -448,24 +493,6 @@ expand(struct zone *zone, struct page *page,
448 area->nr_free++; 493 area->nr_free++;
449 set_page_order(&page[size], high); 494 set_page_order(&page[size], high);
450 } 495 }
451 return page;
452}
453
454void set_page_refs(struct page *page, int order)
455{
456#ifdef CONFIG_MMU
457 set_page_count(page, 1);
458#else
459 int i;
460
461 /*
462 * We need to reference all the pages for this order, otherwise if
463 * anyone accesses one of the pages with (get/put) it will be freed.
464 * - eg: access_process_vm()
465 */
466 for (i = 0; i < (1 << order); i++)
467 set_page_count(page + i, 1);
468#endif /* CONFIG_MMU */
469} 496}
470 497
471/* 498/*
@@ -473,9 +500,9 @@ void set_page_refs(struct page *page, int order)
473 */ 500 */
474static int prep_new_page(struct page *page, int order) 501static int prep_new_page(struct page *page, int order)
475{ 502{
476 if ( page_mapcount(page) || 503 if (unlikely(page_mapcount(page) |
477 page->mapping != NULL || 504 (page->mapping != NULL) |
478 page_count(page) != 0 || 505 (page_count(page) != 0) |
479 (page->flags & ( 506 (page->flags & (
480 1 << PG_lru | 507 1 << PG_lru |
481 1 << PG_private | 508 1 << PG_private |
@@ -486,8 +513,8 @@ static int prep_new_page(struct page *page, int order)
486 1 << PG_slab | 513 1 << PG_slab |
487 1 << PG_swapcache | 514 1 << PG_swapcache |
488 1 << PG_writeback | 515 1 << PG_writeback |
489 1 << PG_reserved ))) 516 1 << PG_reserved ))))
490 bad_page(__FUNCTION__, page); 517 bad_page(page);
491 518
492 /* 519 /*
493 * For now, we report if PG_reserved was found set, but do not 520 * For now, we report if PG_reserved was found set, but do not
@@ -525,7 +552,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
525 rmv_page_order(page); 552 rmv_page_order(page);
526 area->nr_free--; 553 area->nr_free--;
527 zone->free_pages -= 1UL << order; 554 zone->free_pages -= 1UL << order;
528 return expand(zone, page, order, current_order, area); 555 expand(zone, page, order, current_order, area);
556 return page;
529 } 557 }
530 558
531 return NULL; 559 return NULL;
@@ -539,21 +567,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
539static int rmqueue_bulk(struct zone *zone, unsigned int order, 567static int rmqueue_bulk(struct zone *zone, unsigned int order,
540 unsigned long count, struct list_head *list) 568 unsigned long count, struct list_head *list)
541{ 569{
542 unsigned long flags;
543 int i; 570 int i;
544 int allocated = 0;
545 struct page *page;
546 571
547 spin_lock_irqsave(&zone->lock, flags); 572 spin_lock(&zone->lock);
548 for (i = 0; i < count; ++i) { 573 for (i = 0; i < count; ++i) {
549 page = __rmqueue(zone, order); 574 struct page *page = __rmqueue(zone, order);
550 if (page == NULL) 575 if (unlikely(page == NULL))
551 break; 576 break;
552 allocated++;
553 list_add_tail(&page->lru, list); 577 list_add_tail(&page->lru, list);
554 } 578 }
555 spin_unlock_irqrestore(&zone->lock, flags); 579 spin_unlock(&zone->lock);
556 return allocated; 580 return i;
557} 581}
558 582
559#ifdef CONFIG_NUMA 583#ifdef CONFIG_NUMA
@@ -589,6 +613,7 @@ void drain_remote_pages(void)
589#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 613#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
590static void __drain_pages(unsigned int cpu) 614static void __drain_pages(unsigned int cpu)
591{ 615{
616 unsigned long flags;
592 struct zone *zone; 617 struct zone *zone;
593 int i; 618 int i;
594 619
@@ -600,8 +625,10 @@ static void __drain_pages(unsigned int cpu)
600 struct per_cpu_pages *pcp; 625 struct per_cpu_pages *pcp;
601 626
602 pcp = &pset->pcp[i]; 627 pcp = &pset->pcp[i];
628 local_irq_save(flags);
603 pcp->count -= free_pages_bulk(zone, pcp->count, 629 pcp->count -= free_pages_bulk(zone, pcp->count,
604 &pcp->list, 0); 630 &pcp->list, 0);
631 local_irq_restore(flags);
605 } 632 }
606 } 633 }
607} 634}
@@ -647,18 +674,14 @@ void drain_local_pages(void)
647} 674}
648#endif /* CONFIG_PM */ 675#endif /* CONFIG_PM */
649 676
650static void zone_statistics(struct zonelist *zonelist, struct zone *z) 677static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
651{ 678{
652#ifdef CONFIG_NUMA 679#ifdef CONFIG_NUMA
653 unsigned long flags;
654 int cpu;
655 pg_data_t *pg = z->zone_pgdat; 680 pg_data_t *pg = z->zone_pgdat;
656 pg_data_t *orig = zonelist->zones[0]->zone_pgdat; 681 pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
657 struct per_cpu_pageset *p; 682 struct per_cpu_pageset *p;
658 683
659 local_irq_save(flags); 684 p = zone_pcp(z, cpu);
660 cpu = smp_processor_id();
661 p = zone_pcp(z,cpu);
662 if (pg == orig) { 685 if (pg == orig) {
663 p->numa_hit++; 686 p->numa_hit++;
664 } else { 687 } else {
@@ -669,14 +692,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
669 p->local_node++; 692 p->local_node++;
670 else 693 else
671 p->other_node++; 694 p->other_node++;
672 local_irq_restore(flags);
673#endif 695#endif
674} 696}
675 697
676/* 698/*
677 * Free a 0-order page 699 * Free a 0-order page
678 */ 700 */
679static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
680static void fastcall free_hot_cold_page(struct page *page, int cold) 701static void fastcall free_hot_cold_page(struct page *page, int cold)
681{ 702{
682 struct zone *zone = page_zone(page); 703 struct zone *zone = page_zone(page);
@@ -687,14 +708,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
687 708
688 if (PageAnon(page)) 709 if (PageAnon(page))
689 page->mapping = NULL; 710 page->mapping = NULL;
690 if (free_pages_check(__FUNCTION__, page)) 711 if (free_pages_check(page))
691 return; 712 return;
692 713
693 inc_page_state(pgfree);
694 kernel_map_pages(page, 1, 0); 714 kernel_map_pages(page, 1, 0);
695 715
696 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 716 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
697 local_irq_save(flags); 717 local_irq_save(flags);
718 __inc_page_state(pgfree);
698 list_add(&page->lru, &pcp->list); 719 list_add(&page->lru, &pcp->list);
699 pcp->count++; 720 pcp->count++;
700 if (pcp->count >= pcp->high) 721 if (pcp->count >= pcp->high)
@@ -727,49 +748,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
727 * we cheat by calling it from here, in the order > 0 path. Saves a branch 748 * we cheat by calling it from here, in the order > 0 path. Saves a branch
728 * or two. 749 * or two.
729 */ 750 */
730static struct page * 751static struct page *buffered_rmqueue(struct zonelist *zonelist,
731buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) 752 struct zone *zone, int order, gfp_t gfp_flags)
732{ 753{
733 unsigned long flags; 754 unsigned long flags;
734 struct page *page; 755 struct page *page;
735 int cold = !!(gfp_flags & __GFP_COLD); 756 int cold = !!(gfp_flags & __GFP_COLD);
757 int cpu;
736 758
737again: 759again:
760 cpu = get_cpu();
738 if (order == 0) { 761 if (order == 0) {
739 struct per_cpu_pages *pcp; 762 struct per_cpu_pages *pcp;
740 763
741 page = NULL; 764 pcp = &zone_pcp(zone, cpu)->pcp[cold];
742 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
743 local_irq_save(flags); 765 local_irq_save(flags);
744 if (pcp->count <= pcp->low) 766 if (!pcp->count) {
745 pcp->count += rmqueue_bulk(zone, 0, 767 pcp->count += rmqueue_bulk(zone, 0,
746 pcp->batch, &pcp->list); 768 pcp->batch, &pcp->list);
747 if (pcp->count) { 769 if (unlikely(!pcp->count))
748 page = list_entry(pcp->list.next, struct page, lru); 770 goto failed;
749 list_del(&page->lru);
750 pcp->count--;
751 } 771 }
752 local_irq_restore(flags); 772 page = list_entry(pcp->list.next, struct page, lru);
753 put_cpu(); 773 list_del(&page->lru);
774 pcp->count--;
754 } else { 775 } else {
755 spin_lock_irqsave(&zone->lock, flags); 776 spin_lock_irqsave(&zone->lock, flags);
756 page = __rmqueue(zone, order); 777 page = __rmqueue(zone, order);
757 spin_unlock_irqrestore(&zone->lock, flags); 778 spin_unlock(&zone->lock);
779 if (!page)
780 goto failed;
758 } 781 }
759 782
760 if (page != NULL) { 783 __mod_page_state_zone(zone, pgalloc, 1 << order);
761 BUG_ON(bad_range(zone, page)); 784 zone_statistics(zonelist, zone, cpu);
762 mod_page_state_zone(zone, pgalloc, 1 << order); 785 local_irq_restore(flags);
763 if (prep_new_page(page, order)) 786 put_cpu();
764 goto again; 787
788 BUG_ON(bad_range(zone, page));
789 if (prep_new_page(page, order))
790 goto again;
765 791
766 if (gfp_flags & __GFP_ZERO) 792 if (gfp_flags & __GFP_ZERO)
767 prep_zero_page(page, order, gfp_flags); 793 prep_zero_page(page, order, gfp_flags);
768 794
769 if (order && (gfp_flags & __GFP_COMP)) 795 if (order && (gfp_flags & __GFP_COMP))
770 prep_compound_page(page, order); 796 prep_compound_page(page, order);
771 }
772 return page; 797 return page;
798
799failed:
800 local_irq_restore(flags);
801 put_cpu();
802 return NULL;
773} 803}
774 804
775#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 805#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
@@ -845,9 +875,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
845 continue; 875 continue;
846 } 876 }
847 877
848 page = buffered_rmqueue(*z, order, gfp_mask); 878 page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
849 if (page) { 879 if (page) {
850 zone_statistics(zonelist, *z);
851 break; 880 break;
852 } 881 }
853 } while (*(++z) != NULL); 882 } while (*(++z) != NULL);
@@ -903,8 +932,7 @@ restart:
903 alloc_flags |= ALLOC_HARDER; 932 alloc_flags |= ALLOC_HARDER;
904 if (gfp_mask & __GFP_HIGH) 933 if (gfp_mask & __GFP_HIGH)
905 alloc_flags |= ALLOC_HIGH; 934 alloc_flags |= ALLOC_HIGH;
906 if (wait) 935 alloc_flags |= ALLOC_CPUSET;
907 alloc_flags |= ALLOC_CPUSET;
908 936
909 /* 937 /*
910 * Go through the zonelist again. Let __GFP_HIGH and allocations 938 * Go through the zonelist again. Let __GFP_HIGH and allocations
@@ -926,7 +954,7 @@ restart:
926nofail_alloc: 954nofail_alloc:
927 /* go through the zonelist yet again, ignoring mins */ 955 /* go through the zonelist yet again, ignoring mins */
928 page = get_page_from_freelist(gfp_mask, order, 956 page = get_page_from_freelist(gfp_mask, order,
929 zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET); 957 zonelist, ALLOC_NO_WATERMARKS);
930 if (page) 958 if (page)
931 goto got_pg; 959 goto got_pg;
932 if (gfp_mask & __GFP_NOFAIL) { 960 if (gfp_mask & __GFP_NOFAIL) {
@@ -1171,12 +1199,11 @@ EXPORT_SYMBOL(nr_pagecache);
1171DEFINE_PER_CPU(long, nr_pagecache_local) = 0; 1199DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1172#endif 1200#endif
1173 1201
1174void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) 1202static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1175{ 1203{
1176 int cpu = 0; 1204 int cpu = 0;
1177 1205
1178 memset(ret, 0, sizeof(*ret)); 1206 memset(ret, 0, sizeof(*ret));
1179 cpus_and(*cpumask, *cpumask, cpu_online_map);
1180 1207
1181 cpu = first_cpu(*cpumask); 1208 cpu = first_cpu(*cpumask);
1182 while (cpu < NR_CPUS) { 1209 while (cpu < NR_CPUS) {
@@ -1224,12 +1251,12 @@ void get_full_page_state(struct page_state *ret)
1224 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); 1251 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
1225} 1252}
1226 1253
1227unsigned long __read_page_state(unsigned long offset) 1254unsigned long read_page_state_offset(unsigned long offset)
1228{ 1255{
1229 unsigned long ret = 0; 1256 unsigned long ret = 0;
1230 int cpu; 1257 int cpu;
1231 1258
1232 for_each_online_cpu(cpu) { 1259 for_each_cpu(cpu) {
1233 unsigned long in; 1260 unsigned long in;
1234 1261
1235 in = (unsigned long)&per_cpu(page_states, cpu) + offset; 1262 in = (unsigned long)&per_cpu(page_states, cpu) + offset;
@@ -1238,18 +1265,26 @@ unsigned long __read_page_state(unsigned long offset)
1238 return ret; 1265 return ret;
1239} 1266}
1240 1267
1241void __mod_page_state(unsigned long offset, unsigned long delta) 1268void __mod_page_state_offset(unsigned long offset, unsigned long delta)
1269{
1270 void *ptr;
1271
1272 ptr = &__get_cpu_var(page_states);
1273 *(unsigned long *)(ptr + offset) += delta;
1274}
1275EXPORT_SYMBOL(__mod_page_state_offset);
1276
1277void mod_page_state_offset(unsigned long offset, unsigned long delta)
1242{ 1278{
1243 unsigned long flags; 1279 unsigned long flags;
1244 void* ptr; 1280 void *ptr;
1245 1281
1246 local_irq_save(flags); 1282 local_irq_save(flags);
1247 ptr = &__get_cpu_var(page_states); 1283 ptr = &__get_cpu_var(page_states);
1248 *(unsigned long*)(ptr + offset) += delta; 1284 *(unsigned long *)(ptr + offset) += delta;
1249 local_irq_restore(flags); 1285 local_irq_restore(flags);
1250} 1286}
1251 1287EXPORT_SYMBOL(mod_page_state_offset);
1252EXPORT_SYMBOL(__mod_page_state);
1253 1288
1254void __get_zone_counts(unsigned long *active, unsigned long *inactive, 1289void __get_zone_counts(unsigned long *active, unsigned long *inactive,
1255 unsigned long *free, struct pglist_data *pgdat) 1290 unsigned long *free, struct pglist_data *pgdat)
@@ -1335,7 +1370,7 @@ void show_free_areas(void)
1335 show_node(zone); 1370 show_node(zone);
1336 printk("%s per-cpu:", zone->name); 1371 printk("%s per-cpu:", zone->name);
1337 1372
1338 if (!zone->present_pages) { 1373 if (!populated_zone(zone)) {
1339 printk(" empty\n"); 1374 printk(" empty\n");
1340 continue; 1375 continue;
1341 } else 1376 } else
@@ -1347,10 +1382,9 @@ void show_free_areas(void)
1347 pageset = zone_pcp(zone, cpu); 1382 pageset = zone_pcp(zone, cpu);
1348 1383
1349 for (temperature = 0; temperature < 2; temperature++) 1384 for (temperature = 0; temperature < 2; temperature++)
1350 printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", 1385 printk("cpu %d %s: high %d, batch %d used:%d\n",
1351 cpu, 1386 cpu,
1352 temperature ? "cold" : "hot", 1387 temperature ? "cold" : "hot",
1353 pageset->pcp[temperature].low,
1354 pageset->pcp[temperature].high, 1388 pageset->pcp[temperature].high,
1355 pageset->pcp[temperature].batch, 1389 pageset->pcp[temperature].batch,
1356 pageset->pcp[temperature].count); 1390 pageset->pcp[temperature].count);
@@ -1413,7 +1447,7 @@ void show_free_areas(void)
1413 1447
1414 show_node(zone); 1448 show_node(zone);
1415 printk("%s: ", zone->name); 1449 printk("%s: ", zone->name);
1416 if (!zone->present_pages) { 1450 if (!populated_zone(zone)) {
1417 printk("empty\n"); 1451 printk("empty\n");
1418 continue; 1452 continue;
1419 } 1453 }
@@ -1433,36 +1467,29 @@ void show_free_areas(void)
1433 1467
1434/* 1468/*
1435 * Builds allocation fallback zone lists. 1469 * Builds allocation fallback zone lists.
1470 *
1471 * Add all populated zones of a node to the zonelist.
1436 */ 1472 */
1437static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) 1473static int __init build_zonelists_node(pg_data_t *pgdat,
1438{ 1474 struct zonelist *zonelist, int nr_zones, int zone_type)
1439 switch (k) { 1475{
1440 struct zone *zone; 1476 struct zone *zone;
1441 default: 1477
1442 BUG(); 1478 BUG_ON(zone_type > ZONE_HIGHMEM);
1443 case ZONE_HIGHMEM: 1479
1444 zone = pgdat->node_zones + ZONE_HIGHMEM; 1480 do {
1445 if (zone->present_pages) { 1481 zone = pgdat->node_zones + zone_type;
1482 if (populated_zone(zone)) {
1446#ifndef CONFIG_HIGHMEM 1483#ifndef CONFIG_HIGHMEM
1447 BUG(); 1484 BUG_ON(zone_type > ZONE_NORMAL);
1448#endif 1485#endif
1449 zonelist->zones[j++] = zone; 1486 zonelist->zones[nr_zones++] = zone;
1487 check_highest_zone(zone_type);
1450 } 1488 }
1451 case ZONE_NORMAL: 1489 zone_type--;
1452 zone = pgdat->node_zones + ZONE_NORMAL;
1453 if (zone->present_pages)
1454 zonelist->zones[j++] = zone;
1455 case ZONE_DMA32:
1456 zone = pgdat->node_zones + ZONE_DMA32;
1457 if (zone->present_pages)
1458 zonelist->zones[j++] = zone;
1459 case ZONE_DMA:
1460 zone = pgdat->node_zones + ZONE_DMA;
1461 if (zone->present_pages)
1462 zonelist->zones[j++] = zone;
1463 }
1464 1490
1465 return j; 1491 } while (zone_type >= 0);
1492 return nr_zones;
1466} 1493}
1467 1494
1468static inline int highest_zone(int zone_bits) 1495static inline int highest_zone(int zone_bits)
@@ -1709,8 +1736,6 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1709 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { 1736 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
1710 if (!early_pfn_valid(pfn)) 1737 if (!early_pfn_valid(pfn))
1711 continue; 1738 continue;
1712 if (!early_pfn_in_nid(pfn, nid))
1713 continue;
1714 page = pfn_to_page(pfn); 1739 page = pfn_to_page(pfn);
1715 set_page_links(page, zone, nid, pfn); 1740 set_page_links(page, zone, nid, pfn);
1716 set_page_count(page, 1); 1741 set_page_count(page, 1);
@@ -1794,14 +1819,12 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1794 1819
1795 pcp = &p->pcp[0]; /* hot */ 1820 pcp = &p->pcp[0]; /* hot */
1796 pcp->count = 0; 1821 pcp->count = 0;
1797 pcp->low = 0;
1798 pcp->high = 6 * batch; 1822 pcp->high = 6 * batch;
1799 pcp->batch = max(1UL, 1 * batch); 1823 pcp->batch = max(1UL, 1 * batch);
1800 INIT_LIST_HEAD(&pcp->list); 1824 INIT_LIST_HEAD(&pcp->list);
1801 1825
1802 pcp = &p->pcp[1]; /* cold*/ 1826 pcp = &p->pcp[1]; /* cold*/
1803 pcp->count = 0; 1827 pcp->count = 0;
1804 pcp->low = 0;
1805 pcp->high = 2 * batch; 1828 pcp->high = 2 * batch;
1806 pcp->batch = max(1UL, batch/2); 1829 pcp->batch = max(1UL, batch/2);
1807 INIT_LIST_HEAD(&pcp->list); 1830 INIT_LIST_HEAD(&pcp->list);
@@ -2116,7 +2139,7 @@ static int frag_show(struct seq_file *m, void *arg)
2116 int order; 2139 int order;
2117 2140
2118 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 2141 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2119 if (!zone->present_pages) 2142 if (!populated_zone(zone))
2120 continue; 2143 continue;
2121 2144
2122 spin_lock_irqsave(&zone->lock, flags); 2145 spin_lock_irqsave(&zone->lock, flags);
@@ -2149,7 +2172,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2149 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { 2172 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
2150 int i; 2173 int i;
2151 2174
2152 if (!zone->present_pages) 2175 if (!populated_zone(zone))
2153 continue; 2176 continue;
2154 2177
2155 spin_lock_irqsave(&zone->lock, flags); 2178 spin_lock_irqsave(&zone->lock, flags);
@@ -2197,12 +2220,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2197 seq_printf(m, 2220 seq_printf(m,
2198 "\n cpu: %i pcp: %i" 2221 "\n cpu: %i pcp: %i"
2199 "\n count: %i" 2222 "\n count: %i"
2200 "\n low: %i"
2201 "\n high: %i" 2223 "\n high: %i"
2202 "\n batch: %i", 2224 "\n batch: %i",
2203 i, j, 2225 i, j,
2204 pageset->pcp[j].count, 2226 pageset->pcp[j].count,
2205 pageset->pcp[j].low,
2206 pageset->pcp[j].high, 2227 pageset->pcp[j].high,
2207 pageset->pcp[j].batch); 2228 pageset->pcp[j].batch);
2208 } 2229 }
@@ -2257,32 +2278,40 @@ static char *vmstat_text[] = {
2257 "pgpgout", 2278 "pgpgout",
2258 "pswpin", 2279 "pswpin",
2259 "pswpout", 2280 "pswpout",
2260 "pgalloc_high",
2261 2281
2282 "pgalloc_high",
2262 "pgalloc_normal", 2283 "pgalloc_normal",
2284 "pgalloc_dma32",
2263 "pgalloc_dma", 2285 "pgalloc_dma",
2286
2264 "pgfree", 2287 "pgfree",
2265 "pgactivate", 2288 "pgactivate",
2266 "pgdeactivate", 2289 "pgdeactivate",
2267 2290
2268 "pgfault", 2291 "pgfault",
2269 "pgmajfault", 2292 "pgmajfault",
2293
2270 "pgrefill_high", 2294 "pgrefill_high",
2271 "pgrefill_normal", 2295 "pgrefill_normal",
2296 "pgrefill_dma32",
2272 "pgrefill_dma", 2297 "pgrefill_dma",
2273 2298
2274 "pgsteal_high", 2299 "pgsteal_high",
2275 "pgsteal_normal", 2300 "pgsteal_normal",
2301 "pgsteal_dma32",
2276 "pgsteal_dma", 2302 "pgsteal_dma",
2303
2277 "pgscan_kswapd_high", 2304 "pgscan_kswapd_high",
2278 "pgscan_kswapd_normal", 2305 "pgscan_kswapd_normal",
2279 2306 "pgscan_kswapd_dma32",
2280 "pgscan_kswapd_dma", 2307 "pgscan_kswapd_dma",
2308
2281 "pgscan_direct_high", 2309 "pgscan_direct_high",
2282 "pgscan_direct_normal", 2310 "pgscan_direct_normal",
2311 "pgscan_direct_dma32",
2283 "pgscan_direct_dma", 2312 "pgscan_direct_dma",
2284 "pginodesteal",
2285 2313
2314 "pginodesteal",
2286 "slabs_scanned", 2315 "slabs_scanned",
2287 "kswapd_steal", 2316 "kswapd_steal",
2288 "kswapd_inodesteal", 2317 "kswapd_inodesteal",
diff --git a/mm/readahead.c b/mm/readahead.c
index 72e7adbb87c7..8d6eeaaa6296 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
158{ 158{
159 unsigned page_idx; 159 unsigned page_idx;
160 struct pagevec lru_pvec; 160 struct pagevec lru_pvec;
161 int ret = 0; 161 int ret;
162 162
163 if (mapping->a_ops->readpages) { 163 if (mapping->a_ops->readpages) {
164 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); 164 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
@@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp,
171 list_del(&page->lru); 171 list_del(&page->lru);
172 if (!add_to_page_cache(page, mapping, 172 if (!add_to_page_cache(page, mapping,
173 page->index, GFP_KERNEL)) { 173 page->index, GFP_KERNEL)) {
174 mapping->a_ops->readpage(filp, page); 174 ret = mapping->a_ops->readpage(filp, page);
175 if (!pagevec_add(&lru_pvec, page)) 175 if (ret != AOP_TRUNCATED_PAGE) {
176 __pagevec_lru_add(&lru_pvec); 176 if (!pagevec_add(&lru_pvec, page))
177 } else { 177 __pagevec_lru_add(&lru_pvec);
178 page_cache_release(page); 178 continue;
179 } /* else fall through to release */
179 } 180 }
181 page_cache_release(page);
180 } 182 }
181 pagevec_lru_add(&lru_pvec); 183 pagevec_lru_add(&lru_pvec);
184 ret = 0;
182out: 185out:
183 return ret; 186 return ret;
184} 187}
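
The read_pages() change makes the loop look at the ->readpage() return value: a result of AOP_TRUNCATED_PAGE ends with the page released, while the normal case queues it on the LRU pagevec and continues. A rough userspace sketch of that control-flow shape, with a hypothetical MODEL_TRUNCATED_PAGE standing in for the real AOP code:

/* Sketch of the reworked read_pages() loop: a page is queued for the LRU only
 * when ->readpage() did not report a truncated page; otherwise control falls
 * through to the release path. Purely illustrative; names are hypothetical. */
#include <stdio.h>

#define MODEL_TRUNCATED_PAGE  0x8001   /* stand-in for AOP_TRUNCATED_PAGE */

static int model_readpage(int index)
{
	/* Pretend every third page raced with truncation. */
	return (index % 3 == 2) ? MODEL_TRUNCATED_PAGE : 0;
}

int main(void)
{
	int queued = 0, released = 0;

	for (int index = 0; index < 10; index++) {
		int ret = model_readpage(index);

		if (ret != MODEL_TRUNCATED_PAGE) {
			queued++;          /* would go into the lru pagevec */
			continue;
		}
		released++;                /* would be page_cache_release()d */
	}
	printf("queued %d, released %d\n", queued, released);
	return 0;
}
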
diff --git a/mm/rmap.c b/mm/rmap.c
index f853c6def159..6f3f7db27128 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -435,6 +435,30 @@ int page_referenced(struct page *page, int is_locked)
435} 435}
436 436
437/** 437/**
438 * __page_set_anon_rmap - set up new anonymous rmap
439 * @page: the page to add the mapping to
440 * @vma: the vm area in which the mapping is added
441 * @address: the user virtual address mapped
442 */
443static void __page_set_anon_rmap(struct page *page,
444 struct vm_area_struct *vma, unsigned long address)
445{
446 struct anon_vma *anon_vma = vma->anon_vma;
447
448 BUG_ON(!anon_vma);
449 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
450 page->mapping = (struct address_space *) anon_vma;
451
452 page->index = linear_page_index(vma, address);
453
454 /*
455 * nr_mapped state can be updated without turning off
456 * interrupts because it is not modified via interrupt.
457 */
458 __inc_page_state(nr_mapped);
459}
460
461/**
438 * page_add_anon_rmap - add pte mapping to an anonymous page 462 * page_add_anon_rmap - add pte mapping to an anonymous page
439 * @page: the page to add the mapping to 463 * @page: the page to add the mapping to
440 * @vma: the vm area in which the mapping is added 464 * @vma: the vm area in which the mapping is added
@@ -445,20 +469,27 @@ int page_referenced(struct page *page, int is_locked)
445void page_add_anon_rmap(struct page *page, 469void page_add_anon_rmap(struct page *page,
446 struct vm_area_struct *vma, unsigned long address) 470 struct vm_area_struct *vma, unsigned long address)
447{ 471{
448 if (atomic_inc_and_test(&page->_mapcount)) { 472 if (atomic_inc_and_test(&page->_mapcount))
449 struct anon_vma *anon_vma = vma->anon_vma; 473 __page_set_anon_rmap(page, vma, address);
450
451 BUG_ON(!anon_vma);
452 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
453 page->mapping = (struct address_space *) anon_vma;
454
455 page->index = linear_page_index(vma, address);
456
457 inc_page_state(nr_mapped);
458 }
459 /* else checking page index and mapping is racy */ 474 /* else checking page index and mapping is racy */
460} 475}
461 476
477/*
478 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
479 * @page: the page to add the mapping to
480 * @vma: the vm area in which the mapping is added
481 * @address: the user virtual address mapped
482 *
483 * Same as page_add_anon_rmap but must only be called on *new* pages.
484 * This means the inc-and-test can be bypassed.
485 */
486void page_add_new_anon_rmap(struct page *page,
487 struct vm_area_struct *vma, unsigned long address)
488{
489 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
490 __page_set_anon_rmap(page, vma, address);
491}
492
462/** 493/**
463 * page_add_file_rmap - add pte mapping to a file page 494 * page_add_file_rmap - add pte mapping to a file page
464 * @page: the page to add the mapping to 495 * @page: the page to add the mapping to
@@ -471,7 +502,7 @@ void page_add_file_rmap(struct page *page)
471 BUG_ON(!pfn_valid(page_to_pfn(page))); 502 BUG_ON(!pfn_valid(page_to_pfn(page)));
472 503
473 if (atomic_inc_and_test(&page->_mapcount)) 504 if (atomic_inc_and_test(&page->_mapcount))
474 inc_page_state(nr_mapped); 505 __inc_page_state(nr_mapped);
475} 506}
476 507
477/** 508/**
@@ -495,7 +526,7 @@ void page_remove_rmap(struct page *page)
495 */ 526 */
496 if (page_test_and_clear_dirty(page)) 527 if (page_test_and_clear_dirty(page))
497 set_page_dirty(page); 528 set_page_dirty(page);
498 dec_page_state(nr_mapped); 529 __dec_page_state(nr_mapped);
499 } 530 }
500} 531}
501 532
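
Both rmap helpers rely on _mapcount starting at -1, so atomic_inc_and_test() fires exactly once, on the first mapping. The new page_add_new_anon_rmap() can skip the atomic read-modify-write entirely and store 0, because a freshly allocated page cannot be mapped by anyone else yet. A small sketch of that counting convention, using plain ints in place of atomic_t:

/* Model of the _mapcount convention used by rmap: the counter starts at -1,
 * so the first mapper is the one that brings it to 0. The "new page" path can
 * simply store 0 because nobody else can see a brand-new page yet. */
#include <assert.h>
#include <stdio.h>

struct model_page {
	int mapcount;   /* stand-in for atomic_t _mapcount, starts at -1 */
};

/* inc-and-test path used for pages that may already be mapped elsewhere */
static int add_anon_rmap(struct model_page *page)
{
	return ++page->mapcount == 0;   /* true only for the first mapping */
}

/* fast path for freshly allocated pages: no concurrent mappers possible */
static void add_new_anon_rmap(struct model_page *page)
{
	page->mapcount = 0;             /* "elevate count by 1 (starts at -1)" */
}

int main(void)
{
	struct model_page shared = { .mapcount = -1 };
	struct model_page fresh  = { .mapcount = -1 };

	assert(add_anon_rmap(&shared) == 1);  /* first mapping: account nr_mapped */
	assert(add_anon_rmap(&shared) == 0);  /* later mappings: nothing to do */

	add_new_anon_rmap(&fresh);            /* same end state, no atomic RMW */
	assert(fresh.mapcount == 0);

	printf("shared mapped %d times, fresh mapped %d time\n",
	       shared.mapcount + 1, fresh.mapcount + 1);
	return 0;
}
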
diff --git a/mm/shmem.c b/mm/shmem.c
index dc25565a61e9..a1f2f02af724 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -457,7 +457,7 @@ static void shmem_free_pages(struct list_head *next)
457 } while (next); 457 } while (next);
458} 458}
459 459
460static void shmem_truncate(struct inode *inode) 460static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
461{ 461{
462 struct shmem_inode_info *info = SHMEM_I(inode); 462 struct shmem_inode_info *info = SHMEM_I(inode);
463 unsigned long idx; 463 unsigned long idx;
@@ -475,18 +475,27 @@ static void shmem_truncate(struct inode *inode)
475 long nr_swaps_freed = 0; 475 long nr_swaps_freed = 0;
476 int offset; 476 int offset;
477 int freed; 477 int freed;
478 int punch_hole = 0;
478 479
479 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 480 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
480 idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 481 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
481 if (idx >= info->next_index) 482 if (idx >= info->next_index)
482 return; 483 return;
483 484
484 spin_lock(&info->lock); 485 spin_lock(&info->lock);
485 info->flags |= SHMEM_TRUNCATE; 486 info->flags |= SHMEM_TRUNCATE;
486 limit = info->next_index; 487 if (likely(end == (loff_t) -1)) {
487 info->next_index = idx; 488 limit = info->next_index;
489 info->next_index = idx;
490 } else {
491 limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
492 if (limit > info->next_index)
493 limit = info->next_index;
494 punch_hole = 1;
495 }
496
488 topdir = info->i_indirect; 497 topdir = info->i_indirect;
489 if (topdir && idx <= SHMEM_NR_DIRECT) { 498 if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
490 info->i_indirect = NULL; 499 info->i_indirect = NULL;
491 nr_pages_to_free++; 500 nr_pages_to_free++;
492 list_add(&topdir->lru, &pages_to_free); 501 list_add(&topdir->lru, &pages_to_free);
@@ -573,11 +582,12 @@ static void shmem_truncate(struct inode *inode)
573 set_page_private(subdir, page_private(subdir) - freed); 582 set_page_private(subdir, page_private(subdir) - freed);
574 if (offset) 583 if (offset)
575 spin_unlock(&info->lock); 584 spin_unlock(&info->lock);
576 BUG_ON(page_private(subdir) > offset); 585 if (!punch_hole)
586 BUG_ON(page_private(subdir) > offset);
577 } 587 }
578 if (offset) 588 if (offset)
579 offset = 0; 589 offset = 0;
580 else if (subdir) { 590 else if (subdir && !page_private(subdir)) {
581 dir[diroff] = NULL; 591 dir[diroff] = NULL;
582 nr_pages_to_free++; 592 nr_pages_to_free++;
583 list_add(&subdir->lru, &pages_to_free); 593 list_add(&subdir->lru, &pages_to_free);
@@ -594,7 +604,7 @@ done2:
594 * Also, though shmem_getpage checks i_size before adding to 604 * Also, though shmem_getpage checks i_size before adding to
595 * cache, no recheck after: so fix the narrow window there too. 605 * cache, no recheck after: so fix the narrow window there too.
596 */ 606 */
597 truncate_inode_pages(inode->i_mapping, inode->i_size); 607 truncate_inode_pages_range(inode->i_mapping, start, end);
598 } 608 }
599 609
600 spin_lock(&info->lock); 610 spin_lock(&info->lock);
@@ -614,6 +624,11 @@ done2:
614 } 624 }
615} 625}
616 626
627static void shmem_truncate(struct inode *inode)
628{
629 shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
630}
631
617static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) 632static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
618{ 633{
619 struct inode *inode = dentry->d_inode; 634 struct inode *inode = dentry->d_inode;
@@ -855,7 +870,7 @@ unlock:
855 swap_free(swap); 870 swap_free(swap);
856redirty: 871redirty:
857 set_page_dirty(page); 872 set_page_dirty(page);
858 return WRITEPAGE_ACTIVATE; /* Return with the page locked */ 873 return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */
859} 874}
860 875
861#ifdef CONFIG_NUMA 876#ifdef CONFIG_NUMA
@@ -1255,7 +1270,7 @@ out_nomem:
1255 return retval; 1270 return retval;
1256} 1271}
1257 1272
1258static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 1273int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1259{ 1274{
1260 file_accessed(file); 1275 file_accessed(file);
1261 vma->vm_ops = &shmem_vm_ops; 1276 vma->vm_ops = &shmem_vm_ops;
@@ -2083,6 +2098,7 @@ static struct file_operations shmem_file_operations = {
2083static struct inode_operations shmem_inode_operations = { 2098static struct inode_operations shmem_inode_operations = {
2084 .truncate = shmem_truncate, 2099 .truncate = shmem_truncate,
2085 .setattr = shmem_notify_change, 2100 .setattr = shmem_notify_change,
2101 .truncate_range = shmem_truncate_range,
2086}; 2102};
2087 2103
2088static struct inode_operations shmem_dir_inode_operations = { 2104static struct inode_operations shmem_dir_inode_operations = {
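
shmem_truncate_range() translates the byte range into page indexes before walking the metadata: idx is start rounded up to a page boundary, and limit is either info->next_index (the classic whole-file truncate, end == -1) or the rounded-up end clamped to next_index, with punch_hole set so partially covered metadata pages are not freed too eagerly. A sketch of just that index arithmetic, assuming a 4096-byte PAGE_CACHE_SIZE; the struct and function names are invented for the example:

/* Index arithmetic modelled on shmem_truncate_range(): translate a byte range
 * into [idx, limit) page indexes and decide whether this is a hole punch. */
#include <stdio.h>

#define PAGE_CACHE_SHIFT  12
#define PAGE_CACHE_SIZE   (1UL << PAGE_CACHE_SHIFT)

struct range_plan {
	unsigned long idx;     /* first index to truncate */
	unsigned long limit;   /* one past the last index to touch */
	int punch_hole;        /* 1 if this is a partial-range punch */
};

static struct range_plan plan_truncate(long long start, long long end,
				       unsigned long next_index)
{
	struct range_plan p;

	p.idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (end == -1LL) {                       /* classic truncate-to-size */
		p.limit = next_index;
		p.punch_hole = 0;
	} else {                                 /* madvise-style hole punch */
		p.limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
		if (p.limit > next_index)
			p.limit = next_index;
		p.punch_hole = 1;
	}
	return p;
}

int main(void)
{
	/* File currently has 100 pages allocated (next_index == 100). */
	struct range_plan whole = plan_truncate(40960, -1, 100);
	struct range_plan hole  = plan_truncate(8192, 20479, 100);

	printf("truncate to 40960: idx=%lu limit=%lu punch=%d\n",
	       whole.idx, whole.limit, whole.punch_hole);
	printf("punch 8192..20479: idx=%lu limit=%lu punch=%d\n",
	       hole.idx, hole.limit, hole.punch_hole);
	return 0;
}
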
diff --git a/mm/swap.c b/mm/swap.c
index 73d351439ef6..ee6d71ccfa56 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -156,16 +156,22 @@ void fastcall lru_cache_add_active(struct page *page)
156 put_cpu_var(lru_add_active_pvecs); 156 put_cpu_var(lru_add_active_pvecs);
157} 157}
158 158
159void lru_add_drain(void) 159static void __lru_add_drain(int cpu)
160{ 160{
161 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); 161 struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
162 162
163 /* CPU is dead, so no locking needed. */
163 if (pagevec_count(pvec)) 164 if (pagevec_count(pvec))
164 __pagevec_lru_add(pvec); 165 __pagevec_lru_add(pvec);
165 pvec = &__get_cpu_var(lru_add_active_pvecs); 166 pvec = &per_cpu(lru_add_active_pvecs, cpu);
166 if (pagevec_count(pvec)) 167 if (pagevec_count(pvec))
167 __pagevec_lru_add_active(pvec); 168 __pagevec_lru_add_active(pvec);
168 put_cpu_var(lru_add_pvecs); 169}
170
171void lru_add_drain(void)
172{
173 __lru_add_drain(get_cpu());
174 put_cpu();
169} 175}
170 176
171/* 177/*
@@ -412,17 +418,6 @@ void vm_acct_memory(long pages)
412} 418}
413 419
414#ifdef CONFIG_HOTPLUG_CPU 420#ifdef CONFIG_HOTPLUG_CPU
415static void lru_drain_cache(unsigned int cpu)
416{
417 struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
418
419 /* CPU is dead, so no locking needed. */
420 if (pagevec_count(pvec))
421 __pagevec_lru_add(pvec);
422 pvec = &per_cpu(lru_add_active_pvecs, cpu);
423 if (pagevec_count(pvec))
424 __pagevec_lru_add_active(pvec);
425}
426 421
427/* Drop the CPU's cached committed space back into the central pool. */ 422/* Drop the CPU's cached committed space back into the central pool. */
428static int cpu_swap_callback(struct notifier_block *nfb, 423static int cpu_swap_callback(struct notifier_block *nfb,
@@ -435,7 +430,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
435 if (action == CPU_DEAD) { 430 if (action == CPU_DEAD) {
436 atomic_add(*committed, &vm_committed_space); 431 atomic_add(*committed, &vm_committed_space);
437 *committed = 0; 432 *committed = 0;
438 lru_drain_cache((long)hcpu); 433 __lru_add_drain((long)hcpu);
439 } 434 }
440 return NOTIFY_OK; 435 return NOTIFY_OK;
441} 436}
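
In swap.c the old lru_drain_cache() disappears because lru_add_drain() and the CPU-hotplug callback now share one helper, __lru_add_drain(cpu), which flushes both of that CPU's pending pagevecs. A toy model of the factoring, with per-index arrays standing in for per-cpu data and invented sizes:

/* Model of factoring __lru_add_drain(cpu): one helper drains a given CPU's
 * pending vectors, used both by the local fast path and by the hotplug
 * callback for a dead CPU. Names and sizes are illustrative only. */
#include <stdio.h>

#define NCPUS        4
#define PVEC_SIZE   14

struct pagevec { int nr; int pages[PVEC_SIZE]; };

static struct pagevec lru_add_pvecs[NCPUS];
static struct pagevec lru_add_active_pvecs[NCPUS];
static int lru_pages;            /* stand-in for the global LRU lists */

static void pagevec_flush(struct pagevec *pvec)
{
	lru_pages += pvec->nr;
	pvec->nr = 0;
}

static void __lru_add_drain(int cpu)
{
	if (lru_add_pvecs[cpu].nr)
		pagevec_flush(&lru_add_pvecs[cpu]);
	if (lru_add_active_pvecs[cpu].nr)
		pagevec_flush(&lru_add_active_pvecs[cpu]);
}

int main(void)
{
	lru_add_pvecs[1].nr = 3;
	lru_add_active_pvecs[1].nr = 2;

	__lru_add_drain(1);          /* what lru_add_drain() does for get_cpu() */
	__lru_add_drain(3);          /* what the CPU_DEAD callback does */

	printf("%d pages moved to the LRU\n", lru_pages);
	return 0;
}
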
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0df9a57b1de8..fc2aecb70a95 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/pagevec.h>
17 18
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
19 20
@@ -272,12 +273,11 @@ void free_page_and_swap_cache(struct page *page)
272 */ 273 */
273void free_pages_and_swap_cache(struct page **pages, int nr) 274void free_pages_and_swap_cache(struct page **pages, int nr)
274{ 275{
275 int chunk = 16;
276 struct page **pagep = pages; 276 struct page **pagep = pages;
277 277
278 lru_add_drain(); 278 lru_add_drain();
279 while (nr) { 279 while (nr) {
280 int todo = min(chunk, nr); 280 int todo = min(nr, PAGEVEC_SIZE);
281 int i; 281 int i;
282 282
283 for (i = 0; i < todo; i++) 283 for (i = 0; i < todo; i++)
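
free_pages_and_swap_cache() now batches by PAGEVEC_SIZE rather than a hard-coded 16. The chunking loop itself is simple; a standalone sketch, with release_batch() as a hypothetical stand-in for the per-page release work:

/* Chunking pattern from free_pages_and_swap_cache(): walk an array in
 * PAGEVEC_SIZE-sized batches instead of a magic "16". Illustrative only. */
#include <stdio.h>

#define PAGEVEC_SIZE 14

static void release_batch(const int *pages, int n)
{
	(void)pages;   /* a real implementation would release each page here */
	printf("releasing a batch of %d pages\n", n);
}

int main(void)
{
	int pages[40];
	int nr = 40;
	const int *pagep = pages;

	while (nr) {
		int todo = nr < PAGEVEC_SIZE ? nr : PAGEVEC_SIZE;

		release_batch(pagep, todo);
		pagep += todo;
		nr -= todo;
	}
	return 0;
}
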
diff --git a/mm/swapfile.c b/mm/swapfile.c
index edafeace301f..6da4b28b896b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -211,6 +211,26 @@ noswap:
211 return (swp_entry_t) {0}; 211 return (swp_entry_t) {0};
212} 212}
213 213
214swp_entry_t get_swap_page_of_type(int type)
215{
216 struct swap_info_struct *si;
217 pgoff_t offset;
218
219 spin_lock(&swap_lock);
220 si = swap_info + type;
221 if (si->flags & SWP_WRITEOK) {
222 nr_swap_pages--;
223 offset = scan_swap_map(si);
224 if (offset) {
225 spin_unlock(&swap_lock);
226 return swp_entry(type, offset);
227 }
228 nr_swap_pages++;
229 }
230 spin_unlock(&swap_lock);
231 return (swp_entry_t) {0};
232}
233
214static struct swap_info_struct * swap_info_get(swp_entry_t entry) 234static struct swap_info_struct * swap_info_get(swp_entry_t entry)
215{ 235{
216 struct swap_info_struct * p; 236 struct swap_info_struct * p;
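
get_swap_page_of_type() allocates a slot from one specific swap area, charging nr_swap_pages up front and undoing the charge if the map scan finds nothing. A simplified single-threaded model of that logic; the array sizes and the scan_swap_map() stand-in are invented for the example:

/* Sketch of the get_swap_page_of_type() idea: allocate a swap slot from one
 * specific swap area only, instead of the usual rotation over all areas. */
#include <stdio.h>

#define NR_AREAS   2
#define AREA_SLOTS 8

struct swap_area {
	int writeok;                 /* stand-in for SWP_WRITEOK */
	char map[AREA_SLOTS];        /* 0 = free, 1 = in use; slot 0 reserved */
};

static struct swap_area areas[NR_AREAS] = {
	{ .writeok = 1, .map = {1, 1, 1, 0, 0, 0, 0, 0} },
	{ .writeok = 0 },
};
static int nr_swap_pages = 5;

struct swp_entry { int type; int offset; };   /* {0,0} means "no slot" */

/* Stand-in for scan_swap_map(): find and claim a free offset, or return 0. */
static int scan_swap_map(struct swap_area *si)
{
	for (int off = 1; off < AREA_SLOTS; off++) {
		if (!si->map[off]) {
			si->map[off] = 1;
			return off;          /* offset 0 is never handed out */
		}
	}
	return 0;
}

static struct swp_entry get_swap_page_of_type(int type)
{
	struct swap_area *si = &areas[type];

	if (si->writeok) {
		nr_swap_pages--;
		int offset = scan_swap_map(si);
		if (offset)
			return (struct swp_entry){ type, offset };
		nr_swap_pages++;             /* scan failed: undo the charge */
	}
	return (struct swp_entry){ 0, 0 };
}

int main(void)
{
	struct swp_entry e = get_swap_page_of_type(0);

	printf("got type=%d offset=%d, %d pages left\n",
	       e.type, e.offset, nr_swap_pages);
	return 0;
}
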
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index b58abcf44ed6..cdc6d431972b 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -81,13 +81,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
81 goto close_file; 81 goto close_file;
82 82
83 d_instantiate(dentry, inode); 83 d_instantiate(dentry, inode);
84 inode->i_size = size;
85 inode->i_nlink = 0; /* It is unlinked */ 84 inode->i_nlink = 0; /* It is unlinked */
85
86 file->f_vfsmnt = mntget(shm_mnt); 86 file->f_vfsmnt = mntget(shm_mnt);
87 file->f_dentry = dentry; 87 file->f_dentry = dentry;
88 file->f_mapping = inode->i_mapping; 88 file->f_mapping = inode->i_mapping;
89 file->f_op = &ramfs_file_operations; 89 file->f_op = &ramfs_file_operations;
90 file->f_mode = FMODE_WRITE | FMODE_READ; 90 file->f_mode = FMODE_WRITE | FMODE_READ;
91
92 /* notify everyone as to the change of file size */
93 error = do_truncate(dentry, size, file);
94 if (error < 0)
95 goto close_file;
96
91 return file; 97 return file;
92 98
93close_file: 99close_file:
@@ -123,3 +129,24 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
123{ 129{
124 return 0; 130 return 0;
125} 131}
132
133int shmem_mmap(struct file *file, struct vm_area_struct *vma)
134{
135 file_accessed(file);
136#ifndef CONFIG_MMU
137 return ramfs_nommu_mmap(file, vma);
138#else
139 return 0;
140#endif
141}
142
143#ifndef CONFIG_MMU
144unsigned long shmem_get_unmapped_area(struct file *file,
145 unsigned long addr,
146 unsigned long len,
147 unsigned long pgoff,
148 unsigned long flags)
149{
150 return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
151}
152#endif
diff --git a/mm/truncate.c b/mm/truncate.c
index 9173ab500604..7dee32745901 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -82,12 +82,15 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
82} 82}
83 83
84/** 84/**
85 * truncate_inode_pages - truncate *all* the pages from an offset 85 * truncate_inode_pages - truncate range of pages specified by start and
86 * end byte offsets
86 * @mapping: mapping to truncate 87 * @mapping: mapping to truncate
87 * @lstart: offset from which to truncate 88 * @lstart: offset from which to truncate
89 * @lend: offset to which to truncate
88 * 90 *
89 * Truncate the page cache at a set offset, removing the pages that are beyond 91 * Truncate the page cache, removing the pages that are between
90 * that offset (and zeroing out partial pages). 92 * specified offsets (and zeroing out partial page
93 * (if lstart is not page aligned)).
91 * 94 *
92 * Truncate takes two passes - the first pass is nonblocking. It will not 95 * Truncate takes two passes - the first pass is nonblocking. It will not
93 * block on page locks and it will not block on writeback. The second pass 96 * block on page locks and it will not block on writeback. The second pass
@@ -101,12 +104,12 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
101 * We pass down the cache-hot hint to the page freeing code. Even if the 104 * We pass down the cache-hot hint to the page freeing code. Even if the
102 * mapping is large, it is probably the case that the final pages are the most 105 * mapping is large, it is probably the case that the final pages are the most
103 * recently touched, and freeing happens in ascending file offset order. 106 * recently touched, and freeing happens in ascending file offset order.
104 *
105 * Called under (and serialised by) inode->i_sem.
106 */ 107 */
107void truncate_inode_pages(struct address_space *mapping, loff_t lstart) 108void truncate_inode_pages_range(struct address_space *mapping,
109 loff_t lstart, loff_t lend)
108{ 110{
109 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 111 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
112 pgoff_t end;
110 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 113 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
111 struct pagevec pvec; 114 struct pagevec pvec;
112 pgoff_t next; 115 pgoff_t next;
@@ -115,13 +118,22 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
115 if (mapping->nrpages == 0) 118 if (mapping->nrpages == 0)
116 return; 119 return;
117 120
121 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
122 end = (lend >> PAGE_CACHE_SHIFT);
123
118 pagevec_init(&pvec, 0); 124 pagevec_init(&pvec, 0);
119 next = start; 125 next = start;
120 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 126 while (next <= end &&
127 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
121 for (i = 0; i < pagevec_count(&pvec); i++) { 128 for (i = 0; i < pagevec_count(&pvec); i++) {
122 struct page *page = pvec.pages[i]; 129 struct page *page = pvec.pages[i];
123 pgoff_t page_index = page->index; 130 pgoff_t page_index = page->index;
124 131
132 if (page_index > end) {
133 next = page_index;
134 break;
135 }
136
125 if (page_index > next) 137 if (page_index > next)
126 next = page_index; 138 next = page_index;
127 next++; 139 next++;
@@ -157,9 +169,15 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
157 next = start; 169 next = start;
158 continue; 170 continue;
159 } 171 }
172 if (pvec.pages[0]->index > end) {
173 pagevec_release(&pvec);
174 break;
175 }
160 for (i = 0; i < pagevec_count(&pvec); i++) { 176 for (i = 0; i < pagevec_count(&pvec); i++) {
161 struct page *page = pvec.pages[i]; 177 struct page *page = pvec.pages[i];
162 178
179 if (page->index > end)
180 break;
163 lock_page(page); 181 lock_page(page);
164 wait_on_page_writeback(page); 182 wait_on_page_writeback(page);
165 if (page->index > next) 183 if (page->index > next)
@@ -171,7 +189,19 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
171 pagevec_release(&pvec); 189 pagevec_release(&pvec);
172 } 190 }
173} 191}
192EXPORT_SYMBOL(truncate_inode_pages_range);
174 193
194/**
195 * truncate_inode_pages - truncate *all* the pages from an offset
196 * @mapping: mapping to truncate
197 * @lstart: offset from which to truncate
198 *
199 * Called under (and serialised by) inode->i_sem.
200 */
201void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
202{
203 truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
204}
175EXPORT_SYMBOL(truncate_inode_pages); 205EXPORT_SYMBOL(truncate_inode_pages);
176 206
177/** 207/**
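
truncate_inode_pages_range() insists that lend name the last byte of a page (hence the BUG_ON on alignment), rounds lstart up to get the first whole page index, uses lend >> PAGE_CACHE_SHIFT as the last index, and keeps the in-page offset "partial" for zeroing the first page. A small worked example of that arithmetic, assuming 4096-byte pages:

/* Index arithmetic modelled on truncate_inode_pages_range(): lend must cover
 * whole pages (its low bits are all ones), start rounds lstart up, and
 * 'partial' is the byte offset inside the first page that stays intact. */
#include <assert.h>
#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
	long long lstart = 6000;                     /* truncate from byte 6000 */
	long long lend   = 4 * PAGE_CACHE_SIZE - 1;  /* ...through page 3 */

	/* Same sanity check as the kernel: lend must end on a page boundary. */
	assert((lend & (PAGE_CACHE_SIZE - 1)) == (PAGE_CACHE_SIZE - 1));

	unsigned long start   = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	unsigned long end     = lend >> PAGE_CACHE_SHIFT;
	unsigned long partial = lstart & (PAGE_CACHE_SIZE - 1);

	printf("whole pages [%lu..%lu] dropped, first %lu bytes of page %lld kept\n",
	       start, end, partial, lstart >> PAGE_CACHE_SHIFT);
	return 0;
}

The wrapper truncate_inode_pages() keeps its old behaviour by passing (loff_t)-1 as lend, which satisfies the alignment check and makes end cover the whole file.
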
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b0cd81c32de6..be8235fb1939 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -63,9 +63,6 @@ struct scan_control {
63 63
64 unsigned long nr_mapped; /* From page_state */ 64 unsigned long nr_mapped; /* From page_state */
65 65
66 /* How many pages shrink_cache() should reclaim */
67 int nr_to_reclaim;
68
69 /* Ask shrink_caches, or shrink_zone to scan at this priority */ 66 /* Ask shrink_caches, or shrink_zone to scan at this priority */
70 unsigned int priority; 67 unsigned int priority;
71 68
@@ -74,9 +71,6 @@ struct scan_control {
74 71
75 int may_writepage; 72 int may_writepage;
76 73
77 /* Can pages be swapped as part of reclaim? */
78 int may_swap;
79
80 /* This context's SWAP_CLUSTER_MAX. If freeing memory for 74 /* This context's SWAP_CLUSTER_MAX. If freeing memory for
81 * suspend, we effectively ignore SWAP_CLUSTER_MAX. 75 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
82 * In this context, it doesn't matter that we scan the 76 * In this context, it doesn't matter that we scan the
@@ -367,7 +361,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
367 res = mapping->a_ops->writepage(page, &wbc); 361 res = mapping->a_ops->writepage(page, &wbc);
368 if (res < 0) 362 if (res < 0)
369 handle_write_error(mapping, page, res); 363 handle_write_error(mapping, page, res);
370 if (res == WRITEPAGE_ACTIVATE) { 364 if (res == AOP_WRITEPAGE_ACTIVATE) {
371 ClearPageReclaim(page); 365 ClearPageReclaim(page);
372 return PAGE_ACTIVATE; 366 return PAGE_ACTIVATE;
373 } 367 }
@@ -430,8 +424,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
430 * Try to allocate it some swap space here. 424 * Try to allocate it some swap space here.
431 */ 425 */
432 if (PageAnon(page) && !PageSwapCache(page)) { 426 if (PageAnon(page) && !PageSwapCache(page)) {
433 if (!sc->may_swap)
434 goto keep_locked;
435 if (!add_to_swap(page)) 427 if (!add_to_swap(page))
436 goto activate_locked; 428 goto activate_locked;
437 } 429 }
@@ -653,17 +645,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
653 goto done; 645 goto done;
654 646
655 max_scan -= nr_scan; 647 max_scan -= nr_scan;
656 if (current_is_kswapd())
657 mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
658 else
659 mod_page_state_zone(zone, pgscan_direct, nr_scan);
660 nr_freed = shrink_list(&page_list, sc); 648 nr_freed = shrink_list(&page_list, sc);
661 if (current_is_kswapd())
662 mod_page_state(kswapd_steal, nr_freed);
663 mod_page_state_zone(zone, pgsteal, nr_freed);
664 sc->nr_to_reclaim -= nr_freed;
665 649
666 spin_lock_irq(&zone->lru_lock); 650 local_irq_disable();
651 if (current_is_kswapd()) {
652 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
653 __mod_page_state(kswapd_steal, nr_freed);
654 } else
655 __mod_page_state_zone(zone, pgscan_direct, nr_scan);
656 __mod_page_state_zone(zone, pgsteal, nr_freed);
657
658 spin_lock(&zone->lru_lock);
667 /* 659 /*
668 * Put back any unfreeable pages. 660 * Put back any unfreeable pages.
669 */ 661 */
@@ -825,11 +817,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
825 } 817 }
826 } 818 }
827 zone->nr_active += pgmoved; 819 zone->nr_active += pgmoved;
828 spin_unlock_irq(&zone->lru_lock); 820 spin_unlock(&zone->lru_lock);
829 pagevec_release(&pvec); 821
822 __mod_page_state_zone(zone, pgrefill, pgscanned);
823 __mod_page_state(pgdeactivate, pgdeactivate);
824 local_irq_enable();
830 825
831 mod_page_state_zone(zone, pgrefill, pgscanned); 826 pagevec_release(&pvec);
832 mod_page_state(pgdeactivate, pgdeactivate);
833} 827}
834 828
835/* 829/*
@@ -861,8 +855,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
861 else 855 else
862 nr_inactive = 0; 856 nr_inactive = 0;
863 857
864 sc->nr_to_reclaim = sc->swap_cluster_max;
865
866 while (nr_active || nr_inactive) { 858 while (nr_active || nr_inactive) {
867 if (nr_active) { 859 if (nr_active) {
868 sc->nr_to_scan = min(nr_active, 860 sc->nr_to_scan = min(nr_active,
@@ -876,8 +868,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
876 (unsigned long)sc->swap_cluster_max); 868 (unsigned long)sc->swap_cluster_max);
877 nr_inactive -= sc->nr_to_scan; 869 nr_inactive -= sc->nr_to_scan;
878 shrink_cache(zone, sc); 870 shrink_cache(zone, sc);
879 if (sc->nr_to_reclaim <= 0)
880 break;
881 } 871 }
882 } 872 }
883 873
@@ -910,7 +900,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
910 for (i = 0; zones[i] != NULL; i++) { 900 for (i = 0; zones[i] != NULL; i++) {
911 struct zone *zone = zones[i]; 901 struct zone *zone = zones[i];
912 902
913 if (zone->present_pages == 0) 903 if (!populated_zone(zone))
914 continue; 904 continue;
915 905
916 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 906 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
@@ -952,7 +942,6 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
952 942
953 sc.gfp_mask = gfp_mask; 943 sc.gfp_mask = gfp_mask;
954 sc.may_writepage = 0; 944 sc.may_writepage = 0;
955 sc.may_swap = 1;
956 945
957 inc_page_state(allocstall); 946 inc_page_state(allocstall);
958 947
@@ -1055,7 +1044,6 @@ loop_again:
1055 total_reclaimed = 0; 1044 total_reclaimed = 0;
1056 sc.gfp_mask = GFP_KERNEL; 1045 sc.gfp_mask = GFP_KERNEL;
1057 sc.may_writepage = 0; 1046 sc.may_writepage = 0;
1058 sc.may_swap = 1;
1059 sc.nr_mapped = read_page_state(nr_mapped); 1047 sc.nr_mapped = read_page_state(nr_mapped);
1060 1048
1061 inc_page_state(pageoutrun); 1049 inc_page_state(pageoutrun);
@@ -1084,7 +1072,7 @@ loop_again:
1084 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 1072 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
1085 struct zone *zone = pgdat->node_zones + i; 1073 struct zone *zone = pgdat->node_zones + i;
1086 1074
1087 if (zone->present_pages == 0) 1075 if (!populated_zone(zone))
1088 continue; 1076 continue;
1089 1077
1090 if (zone->all_unreclaimable && 1078 if (zone->all_unreclaimable &&
@@ -1121,7 +1109,7 @@ scan:
1121 struct zone *zone = pgdat->node_zones + i; 1109 struct zone *zone = pgdat->node_zones + i;
1122 int nr_slab; 1110 int nr_slab;
1123 1111
1124 if (zone->present_pages == 0) 1112 if (!populated_zone(zone))
1125 continue; 1113 continue;
1126 1114
1127 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1115 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
@@ -1273,7 +1261,7 @@ void wakeup_kswapd(struct zone *zone, int order)
1273{ 1261{
1274 pg_data_t *pgdat; 1262 pg_data_t *pgdat;
1275 1263
1276 if (zone->present_pages == 0) 1264 if (!populated_zone(zone))
1277 return; 1265 return;
1278 1266
1279 pgdat = zone->zone_pgdat; 1267 pgdat = zone->zone_pgdat;
@@ -1353,76 +1341,3 @@ static int __init kswapd_init(void)
1353} 1341}
1354 1342
1355module_init(kswapd_init) 1343module_init(kswapd_init)
1356
1357
1358/*
1359 * Try to free up some pages from this zone through reclaim.
1360 */
1361int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1362{
1363 struct scan_control sc;
1364 int nr_pages = 1 << order;
1365 int total_reclaimed = 0;
1366
1367 /* The reclaim may sleep, so don't do it if sleep isn't allowed */
1368 if (!(gfp_mask & __GFP_WAIT))
1369 return 0;
1370 if (zone->all_unreclaimable)
1371 return 0;
1372
1373 sc.gfp_mask = gfp_mask;
1374 sc.may_writepage = 0;
1375 sc.may_swap = 0;
1376 sc.nr_mapped = read_page_state(nr_mapped);
1377 sc.nr_scanned = 0;
1378 sc.nr_reclaimed = 0;
1379 /* scan at the highest priority */
1380 sc.priority = 0;
1381 disable_swap_token();
1382
1383 if (nr_pages > SWAP_CLUSTER_MAX)
1384 sc.swap_cluster_max = nr_pages;
1385 else
1386 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1387
1388 /* Don't reclaim the zone if there are other reclaimers active */
1389 if (atomic_read(&zone->reclaim_in_progress) > 0)
1390 goto out;
1391
1392 shrink_zone(zone, &sc);
1393 total_reclaimed = sc.nr_reclaimed;
1394
1395 out:
1396 return total_reclaimed;
1397}
1398
1399asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
1400 unsigned int state)
1401{
1402 struct zone *z;
1403 int i;
1404
1405 if (!capable(CAP_SYS_ADMIN))
1406 return -EACCES;
1407
1408 if (node >= MAX_NUMNODES || !node_online(node))
1409 return -EINVAL;
1410
1411 /* This will break if we ever add more zones */
1412 if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
1413 return -EINVAL;
1414
1415 for (i = 0; i < MAX_NR_ZONES; i++) {
1416 if (!(zone & 1<<i))
1417 continue;
1418
1419 z = &NODE_DATA(node)->node_zones[i];
1420
1421 if (state)
1422 z->reclaim_pages = 1;
1423 else
1424 z->reclaim_pages = 0;
1425 }
1426
1427 return 0;
1428}
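
The vmscan changes batch the statistics work: interrupts are disabled once, the raw __mod_page_state*() updates are done back to back, and zone->lru_lock is then taken with plain spin_lock() because interrupts are already off. A sketch of that pattern, with a flag standing in for the real interrupt state and hypothetical counter names:

/* Pattern used by the reworked shrink_cache()/refill_inactive_zone(): enter
 * the irq-off section once, do several raw counter updates, then take the LRU
 * lock without re-disabling interrupts. Modeled with a flag, not real IRQs. */
#include <assert.h>
#include <stdio.h>

static int irqs_disabled;
static unsigned long pgscan_kswapd, pgscan_direct, pgsteal, kswapd_steal;

static void local_irq_disable_model(void) { irqs_disabled = 1; }
static void local_irq_enable_model(void)  { irqs_disabled = 0; }

/* The "__" variants are only safe with interrupts off; assert that here. */
static void __mod_counter(unsigned long *counter, unsigned long delta)
{
	assert(irqs_disabled);
	*counter += delta;
}

static void account_scan(unsigned long nr_scan, unsigned long nr_freed,
			 int is_kswapd)
{
	local_irq_disable_model();            /* one irq-off section... */
	if (is_kswapd) {
		__mod_counter(&pgscan_kswapd, nr_scan);
		__mod_counter(&kswapd_steal, nr_freed);
	} else
		__mod_counter(&pgscan_direct, nr_scan);
	__mod_counter(&pgsteal, nr_freed);    /* ...covering all the updates */
	/* the real code takes zone->lru_lock here with plain spin_lock() */
	local_irq_enable_model();
}

int main(void)
{
	account_scan(32, 20, 1);
	printf("pgscan_kswapd=%lu pgsteal=%lu kswapd_steal=%lu\n",
	       pgscan_kswapd, pgsteal, kswapd_steal);
	return 0;
}

The same batching motivates the other direction of the patch: the nr_to_reclaim and may_swap fields of scan_control go away, so shrink_zone() simply scans its full quota instead of bailing out early.
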