Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c    | 11
-rw-r--r--  mm/madvise.c    | 13
-rw-r--r--  mm/memory.c     | 73
-rw-r--r--  mm/mempolicy.c  |  4
-rw-r--r--  mm/mmap.c       |  6
-rw-r--r--  mm/mremap.c     |  2
-rw-r--r--  mm/nommu.c      |  6
-rw-r--r--  mm/page_alloc.c | 25
8 files changed, 85 insertions(+), 55 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fbd1111ea119..6bf720bc662c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -301,6 +301,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
+	pte_t *ptep;
 	pte_t pte;
 	struct page *page;
 
@@ -309,9 +310,17 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	BUG_ON(end & ~HPAGE_MASK);
 
 	for (address = start; address < end; address += HPAGE_SIZE) {
-		pte = huge_ptep_get_and_clear(mm, address, huge_pte_offset(mm, address));
+		ptep = huge_pte_offset(mm, address);
+		if (!ptep)
+			/* This can happen on truncate, or if an
+			 * mmap() is aborted due to an error before
+			 * the prefault */
+			continue;
+
+		pte = huge_ptep_get_and_clear(mm, address, ptep);
 		if (pte_none(pte))
 			continue;
+
 		page = pte_page(pte);
 		put_page(page);
 	}
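
The hugetlb.c hunk makes unmap_hugepage_range() tolerate holes: huge_pte_offset() may return NULL for an address whose page tables were never instantiated (truncate, or an mmap() aborted before the prefault), so the pointer must be checked before huge_ptep_get_and_clear() dereferences it. A minimal sketch of that lookup-then-check loop, using invented toy_* names rather than the kernel API:

    #include <stdio.h>

    typedef unsigned long pte_val_t;

    /* Hypothetical page table: a sparse array where NULL means "never populated". */
    static pte_val_t *toy_table[8];

    static pte_val_t *toy_pte_offset(unsigned long idx)
    {
        return toy_table[idx];          /* may be NULL, like huge_pte_offset() */
    }

    static pte_val_t toy_get_and_clear(pte_val_t *ptep)
    {
        pte_val_t old = *ptep;
        *ptep = 0;
        return old;
    }

    static void toy_unmap_range(unsigned long start, unsigned long end)
    {
        unsigned long idx;

        for (idx = start; idx < end; idx++) {
            pte_val_t *ptep = toy_pte_offset(idx);

            if (!ptep)
                continue;               /* hole: nothing was ever mapped here */
            if (toy_get_and_clear(ptep))
                printf("cleared entry %lu\n", idx);
        }
    }

    int main(void)
    {
        static pte_val_t entry = 42;

        toy_table[3] = &entry;          /* only index 3 is populated */
        toy_unmap_range(0, 8);          /* must not crash on the NULL slots */
        return 0;
    }
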
diff --git a/mm/madvise.c b/mm/madvise.c
index 73180a22877e..c8c01a12fea4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -83,9 +83,6 @@ static long madvise_willneed(struct vm_area_struct * vma,
 {
 	struct file *file = vma->vm_file;
 
-	if (!file)
-		return -EBADF;
-
 	if (file->f_mapping->a_ops->get_xip_page) {
 		/* no bad return value, but ignore advice */
 		return 0;
@@ -140,11 +137,16 @@ static long madvise_dontneed(struct vm_area_struct * vma,
 	return 0;
 }
 
-static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
-		unsigned long start, unsigned long end, int behavior)
+static long
+madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
+	    unsigned long start, unsigned long end, int behavior)
 {
+	struct file *filp = vma->vm_file;
 	long error = -EBADF;
 
+	if (!filp)
+		goto out;
+
 	switch (behavior) {
 	case MADV_NORMAL:
 	case MADV_SEQUENTIAL:
@@ -165,6 +167,7 @@ static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev
 		break;
 	}
 
+out:
 	return error;
 }
 
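
The madvise.c hunks hoist the vma->vm_file test out of madvise_willneed() into the madvise_vma() dispatcher, with a single goto-out exit path so every behavior shares the check. A rough sketch of that dispatcher shape, with made-up names and error codes standing in for -EBADF/-EINVAL:

    #include <stdio.h>

    enum toy_behavior { TOY_NORMAL, TOY_WILLNEED, TOY_DONTNEED };

    struct toy_vma { const char *file; };

    /* Dispatcher in the goto-out style: one shared precondition, one exit. */
    static long toy_madvise_vma(struct toy_vma *vma, enum toy_behavior behavior)
    {
        long error = -9;                /* stand-in for -EBADF */

        if (!vma->file)
            goto out;                   /* common check, done once for all cases */

        switch (behavior) {
        case TOY_NORMAL:
        case TOY_WILLNEED:
        case TOY_DONTNEED:
            error = 0;
            break;
        default:
            error = -22;                /* stand-in for -EINVAL */
            break;
        }
    out:
        return error;
    }

    int main(void)
    {
        struct toy_vma anon = { NULL }, backed = { "file" };

        printf("%ld %ld\n", toy_madvise_vma(&anon, TOY_WILLNEED),
               toy_madvise_vma(&backed, TOY_WILLNEED));
        return 0;
    }
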
diff --git a/mm/memory.c b/mm/memory.c
index beabdefa6254..e046b7e4b530 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -776,8 +776,8 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
  * Do a quick page-table lookup for a single page.
  * mm->page_table_lock must be held.
  */
-static struct page *
-__follow_page(struct mm_struct *mm, unsigned long address, int read, int write)
+static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
+			int read, int write, int accessed)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -818,9 +818,11 @@ __follow_page(struct mm_struct *mm, unsigned long address, int read, int write)
 		pfn = pte_pfn(pte);
 		if (pfn_valid(pfn)) {
 			page = pfn_to_page(pfn);
-			if (write && !pte_dirty(pte) && !PageDirty(page))
-				set_page_dirty(page);
-			mark_page_accessed(page);
+			if (accessed) {
+				if (write && !pte_dirty(pte) && !PageDirty(page))
+					set_page_dirty(page);
+				mark_page_accessed(page);
+			}
 			return page;
 		}
 	}
@@ -829,16 +831,19 @@ out:
 	return NULL;
 }
 
-struct page *
+inline struct page *
 follow_page(struct mm_struct *mm, unsigned long address, int write)
 {
-	return __follow_page(mm, address, /*read*/0, write);
+	return __follow_page(mm, address, 0, write, 1);
 }
 
-int
-check_user_page_readable(struct mm_struct *mm, unsigned long address)
+/*
+ * check_user_page_readable() can be called from interrupt context by oprofile,
+ * so we need to avoid taking any non-irq-safe locks
+ */
+int check_user_page_readable(struct mm_struct *mm, unsigned long address)
 {
-	return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL;
+	return __follow_page(mm, address, 1, 0, 0) != NULL;
 }
 EXPORT_SYMBOL(check_user_page_readable);
 
@@ -908,9 +913,13 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			pud = pud_offset(pgd, pg);
 			BUG_ON(pud_none(*pud));
 			pmd = pmd_offset(pud, pg);
-			BUG_ON(pmd_none(*pmd));
+			if (pmd_none(*pmd))
+				return i ? : -EFAULT;
 			pte = pte_offset_map(pmd, pg);
-			BUG_ON(pte_none(*pte));
+			if (pte_none(*pte)) {
+				pte_unmap(pte);
+				return i ? : -EFAULT;
+			}
 			if (pages) {
 				pages[i] = pte_page(*pte);
 				get_page(pages[i]);
@@ -935,11 +944,13 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		}
 		spin_lock(&mm->page_table_lock);
 		do {
+			int write_access = write;
 			struct page *page;
-			int lookup_write = write;
 
 			cond_resched_lock(&mm->page_table_lock);
-			while (!(page = follow_page(mm, start, lookup_write))) {
+			while (!(page = follow_page(mm, start, write_access))) {
+				int ret;
+
 				/*
 				 * Shortcut for anonymous pages. We don't want
 				 * to force the creation of page tables for
@@ -947,13 +958,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				 * nobody touched so far. This is important
 				 * for doing a core dump for these mappings.
 				 */
-				if (!lookup_write &&
-				    untouched_anonymous_page(mm,vma,start)) {
+				if (!write && untouched_anonymous_page(mm,vma,start)) {
 					page = ZERO_PAGE(start);
 					break;
 				}
 				spin_unlock(&mm->page_table_lock);
-				switch (handle_mm_fault(mm,vma,start,write)) {
+				ret = __handle_mm_fault(mm, vma, start, write_access);
+
+				/*
+				 * The VM_FAULT_WRITE bit tells us that do_wp_page has
+				 * broken COW when necessary, even if maybe_mkwrite
+				 * decided not to set pte_write. We can thus safely do
+				 * subsequent page lookups as if they were reads.
+				 */
+				if (ret & VM_FAULT_WRITE)
+					write_access = 0;
+
+				switch (ret & ~VM_FAULT_WRITE) {
 				case VM_FAULT_MINOR:
 					tsk->min_flt++;
 					break;
@@ -967,14 +988,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			default:
 				BUG();
 			}
-			/*
-			 * Now that we have performed a write fault
-			 * and surely no longer have a shared page we
-			 * shouldn't write, we shouldn't ignore an
-			 * unwritable page in the page table if
-			 * we are forcing write access.
-			 */
-			lookup_write = write && !force;
 			spin_lock(&mm->page_table_lock);
 		}
 		if (pages) {
@@ -1224,6 +1237,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 	struct page *old_page, *new_page;
 	unsigned long pfn = pte_pfn(pte);
 	pte_t entry;
+	int ret;
 
 	if (unlikely(!pfn_valid(pfn))) {
 		/*
@@ -1251,7 +1265,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 			lazy_mmu_prot_update(entry);
 			pte_unmap(page_table);
 			spin_unlock(&mm->page_table_lock);
-			return VM_FAULT_MINOR;
+			return VM_FAULT_MINOR|VM_FAULT_WRITE;
 		}
 	}
 	pte_unmap(page_table);
@@ -1278,6 +1292,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
+	ret = VM_FAULT_MINOR;
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
 	if (likely(pte_same(*page_table, pte))) {
@@ -1294,12 +1309,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 
 		/* Free the old page.. */
 		new_page = old_page;
+		ret |= VM_FAULT_WRITE;
 	}
 	pte_unmap(page_table);
 	page_cache_release(new_page);
 	page_cache_release(old_page);
 	spin_unlock(&mm->page_table_lock);
-	return VM_FAULT_MINOR;
+	return ret;
 
 no_new_page:
 	page_cache_release(old_page);
@@ -1991,7 +2007,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 	if (write_access) {
 		if (!pte_write(entry))
 			return do_wp_page(mm, vma, address, pte, pmd, entry);
-
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
@@ -2006,7 +2021,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 /*
  * By the time we get here, we already hold the mm semaphore
  */
-int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
 		unsigned long address, int write_access)
 {
 	pgd_t *pgd;
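
The memory.c hunks thread a status bit through the fault path: do_wp_page() ORs VM_FAULT_WRITE into its return once COW has been broken, and get_user_pages() strips the bit before switching on the base code, clearing write_access so later follow_page() lookups behave as reads. A compilable sketch of the flag-in-return-value idiom, with invented TOY_* constants:

    #include <stdio.h>

    #define TOY_MINOR      1
    #define TOY_MAJOR      2
    #define TOY_WRITE_BIT  0x10    /* OR'ed in alongside the base code */

    /* Pretend fault handler: reports a minor fault and that COW was broken. */
    static int toy_handle_fault(int write)
    {
        return write ? (TOY_MINOR | TOY_WRITE_BIT) : TOY_MINOR;
    }

    int main(void)
    {
        int write_access = 1;
        int ret = toy_handle_fault(write_access);

        /* The flag rides along with the result; strip it before dispatch. */
        if (ret & TOY_WRITE_BIT)
            write_access = 0;       /* COW broken: future lookups as reads */

        switch (ret & ~TOY_WRITE_BIT) {
        case TOY_MINOR:
            printf("minor fault, write_access now %d\n", write_access);
            break;
        case TOY_MAJOR:
            printf("major fault\n");
            break;
        }
        return 0;
    }
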
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cb41c31e7c87..b4eababc8198 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -443,7 +443,7 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 	struct mempolicy *new;
 	DECLARE_BITMAP(nodes, MAX_NUMNODES);
 
-	if (mode > MPOL_MAX)
+	if (mode < 0 || mode > MPOL_MAX)
 		return -EINVAL;
 	err = get_nodes(nodes, nmask, maxnode, mode);
 	if (err)
@@ -1138,11 +1138,11 @@ void mpol_free_shared_policy(struct shared_policy *p)
 	while (next) {
 		n = rb_entry(next, struct sp_node, nd);
 		next = rb_next(&n->nd);
+		rb_erase(&n->nd, &p->root);
 		mpol_free(n->policy);
 		kmem_cache_free(sn_cache, n);
 	}
 	spin_unlock(&p->lock);
-	p->root = RB_ROOT;
 }
 
 /* assumes fs == KERNEL_DS */
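
The mpol_free_shared_policy() fix erases each node from the rbtree as it walks (saving the successor before unlinking), instead of freeing the nodes and resetting the root afterwards, so the tree stays consistent at every step. The same save-next-then-release discipline, sketched on a plain singly linked list with hypothetical toy_* names:

    #include <stdio.h>
    #include <stdlib.h>

    struct toy_node {
        int value;
        struct toy_node *next;
    };

    /* Free every node: grab the successor before unlinking/freeing current. */
    static void toy_free_all(struct toy_node **head)
    {
        struct toy_node *n = *head;

        while (n) {
            struct toy_node *next = n->next;    /* save first */

            free(n);                            /* then release */
            n = next;
        }
        *head = NULL;   /* container is empty and still consistent */
    }

    int main(void)
    {
        struct toy_node *head = NULL;
        int i;

        for (i = 0; i < 3; i++) {
            struct toy_node *n = malloc(sizeof(*n));

            n->value = i;
            n->next = head;
            head = n;
        }
        toy_free_all(&head);
        printf("list freed, head=%p\n", (void *)head);
        return 0;
    }
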
diff --git a/mm/mmap.c b/mm/mmap.c
index da3fa90a0aae..404319477e71 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -143,7 +143,11 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
 	   leave 3% of the size of this process for other processes */
 	allowed -= current->mm->total_vm / 32;
 
-	if (atomic_read(&vm_committed_space) < allowed)
+	/*
+	 * cast `allowed' as a signed long because vm_committed_space
+	 * sometimes has a negative value
+	 */
+	if (atomic_read(&vm_committed_space) < (long)allowed)
 		return 0;
 
 	vm_unacct_memory(pages);
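
The cast fixes a classic C conversion trap: atomic_read() returns a signed value, and comparing it against the unsigned long allowed promotes a transiently negative vm_committed_space to a huge unsigned number, inverting the test. The identical fix is applied in mm/nommu.c below. A standalone demonstration of the trap:

    #include <stdio.h>

    int main(void)
    {
        int committed = -5;             /* transiently negative counter */
        unsigned long allowed = 100;

        /* Unsigned comparison: -5 converts to a huge value, test fails. */
        if ((unsigned long)committed < allowed)
            printf("unsigned compare: would admit\n");
        else
            printf("unsigned compare: wrongly refuses\n");

        /* Signed comparison, as in the fix: -5 < 100 holds. */
        if (committed < (long)allowed)
            printf("signed compare: correctly admits\n");
        return 0;
    }
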
diff --git a/mm/mremap.c b/mm/mremap.c
index ec7238a78f36..fc45dc9a617b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -229,6 +229,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	 * since do_munmap() will decrement it by old_len == new_len
 	 */
 	mm->total_vm += new_len >> PAGE_SHIFT;
+	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
 
 	if (do_munmap(mm, old_addr, old_len) < 0) {
 		/* OOM: unable to split vma, just get accounts right */
@@ -243,7 +244,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		vma->vm_next->vm_flags |= VM_ACCOUNT;
 	}
 
-	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
 		mm->locked_vm += new_len >> PAGE_SHIFT;
 		if (new_len > old_len)
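
Moving __vm_stat_account() above do_munmap() matters because do_munmap() may free or merge the vma, leaving vma->vm_flags and vma->vm_file stale; the fields must be consumed while the object is still valid. A sketch of that ordering rule, with hypothetical toy_* helpers:

    #include <stdio.h>
    #include <stdlib.h>

    struct toy_vma { int flags; };

    static void toy_account(int flags)
    {
        printf("accounted flags=%d\n", flags);
    }

    /* May release the object, like do_munmap() can free or merge the vma. */
    static void toy_munmap(struct toy_vma *vma)
    {
        free(vma);
    }

    int main(void)
    {
        struct toy_vma *vma = malloc(sizeof(*vma));

        vma->flags = 7;

        /* Right order: read the fields while the object is still valid... */
        toy_account(vma->flags);
        /* ...then hand it to the call that may destroy it. */
        toy_munmap(vma);

        /* Reading vma->flags here would be a use-after-free. */
        return 0;
    }
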
diff --git a/mm/nommu.c b/mm/nommu.c
index ce74452c02d9..fd4e8df0f02d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1167,7 +1167,11 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
 	   leave 3% of the size of this process for other processes */
 	allowed -= current->mm->total_vm / 32;
 
-	if (atomic_read(&vm_committed_space) < allowed)
+	/*
+	 * cast `allowed' as a signed long because vm_committed_space
+	 * sometimes has a negative value
+	 */
+	if (atomic_read(&vm_committed_space) < (long)allowed)
 		return 0;
 
 	vm_unacct_memory(pages);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1d6ba6a4b594..8d088371196a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1061,20 +1061,19 @@ unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
 
 static unsigned int nr_free_zone_pages(int offset)
 {
-	pg_data_t *pgdat;
+	/* Just pick one node, since fallback list is circular */
+	pg_data_t *pgdat = NODE_DATA(numa_node_id());
 	unsigned int sum = 0;
 
-	for_each_pgdat(pgdat) {
-		struct zonelist *zonelist = pgdat->node_zonelists + offset;
-		struct zone **zonep = zonelist->zones;
-		struct zone *zone;
+	struct zonelist *zonelist = pgdat->node_zonelists + offset;
+	struct zone **zonep = zonelist->zones;
+	struct zone *zone;
 
 	for (zone = *zonep++; zone; zone = *zonep++) {
 		unsigned long size = zone->present_pages;
 		unsigned long high = zone->pages_high;
 		if (size > high)
 			sum += size - high;
-		}
 	}
 
 	return sum;
@@ -1861,7 +1860,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long *zholes_size)
 {
 	unsigned long i, j;
-	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
 	int cpu, nid = pgdat->node_id;
 	unsigned long zone_start_pfn = pgdat->node_start_pfn;
 
@@ -1934,9 +1932,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		zone->zone_mem_map = pfn_to_page(zone_start_pfn);
 		zone->zone_start_pfn = zone_start_pfn;
 
-		if ((zone_start_pfn) & (zone_required_alignment-1))
-			printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n");
-
 		memmap_init(size, nid, j, zone_start_pfn);
 
 		zonetable_add(zone, nid, j, zone_start_pfn, size);
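
In the nr_free_zone_pages() hunk, scanning a single node's zonelist suffices because the fallback list is circular and already reaches every zone in the system, so the for_each_pgdat() loop was redundant. The zones array is a NULL-terminated pointer list walked with a post-increment cursor; a self-contained sketch of that traversal, with toy_* stand-ins for struct zone and zonelist->zones:

    #include <stdio.h>

    struct toy_zone {
        unsigned long present;
        unsigned long high;
    };

    int main(void)
    {
        struct toy_zone dma = { 100, 10 }, normal = { 1000, 50 };
        /* NULL-terminated pointer array, like zonelist->zones. */
        struct toy_zone *zones[] = { &dma, &normal, NULL };

        struct toy_zone **zonep = zones;
        struct toy_zone *zone;
        unsigned long sum = 0;

        /* Post-increment cursor: fetch, test for the NULL sentinel, advance. */
        for (zone = *zonep++; zone; zone = *zonep++) {
            if (zone->present > zone->high)
                sum += zone->present - zone->high;
        }
        printf("free pages above high watermark: %lu\n", sum);
        return 0;
    }
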