7 files changed, 61 insertions, 35 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fbd1111ea119..6bf720bc662c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -301,6 +301,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
+        pte_t *ptep;
        pte_t pte;
        struct page *page;
@@ -309,9 +310,17 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
        BUG_ON(end & ~HPAGE_MASK);
        for (address = start; address < end; address += HPAGE_SIZE) {
-                pte = huge_ptep_get_and_clear(mm, address, huge_pte_offset(mm, address));
+                ptep = huge_pte_offset(mm, address);
+                if (! ptep)
+                        /* This can happen on truncate, or if an
+                         * mmap() is aborted due to an error before
+                         * the prefault */
+                        continue;
+                pte = huge_ptep_get_and_clear(mm, address, ptep);
                if (pte_none(pte))
                        continue;
                page = pte_page(pte);
                put_page(page);
        }
diff --git a/mm/memory.c b/mm/memory.c
index 6fe77acbc1cd..e046b7e4b530 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -913,9 +913,13 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                        pud = pud_offset(pgd, pg);
                        BUG_ON(pud_none(*pud));
                        pmd = pmd_offset(pud, pg);
-                        BUG_ON(pmd_none(*pmd));
+                        if (pmd_none(*pmd))
+                                return i ? : -EFAULT;
                        pte = pte_offset_map(pmd, pg);
-                        BUG_ON(pte_none(*pte));
+                        if (pte_none(*pte)) {
+                                pte_unmap(pte);
+                                return i ? : -EFAULT;
+                        }
                        if (pages) {
                                pages[i] = pte_page(*pte);
                                get_page(pages[i]);
@@ -940,11 +944,13 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                }
                spin_lock(&mm->page_table_lock);
                do {
+                        int write_access = write;
                        struct page *page;
-                        int lookup_write = write;
                        cond_resched_lock(&mm->page_table_lock);
-                        while (!(page = follow_page(mm, start, lookup_write))) {
+                        while (!(page = follow_page(mm, start, write_access))) {
+                                int ret;
                                /*
                                 * Shortcut for anonymous pages. We don't want
                                 * to force the creation of pages tables for
@@ -952,13 +958,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                 * nobody touched so far. This is important
                                 * for doing a core dump for these mappings.
                                 */
-                                if (!lookup_write &&
+                                if (!write && untouched_anonymous_page(mm,vma,start)) {
-                                    untouched_anonymous_page(mm,vma,start)) {
                                        page = ZERO_PAGE(start);
                                        break;
                                }
                                spin_unlock(&mm->page_table_lock);
-                                switch (handle_mm_fault(mm,vma,start,write)) {
+                                ret = __handle_mm_fault(mm, vma, start, write_access);
+                                /*
+                                 * The VM_FAULT_WRITE bit tells us that do_wp_page has
+                                 * broken COW when necessary, even if maybe_mkwrite
+                                 * decided not to set pte_write. We can thus safely do
+                                 * subsequent page lookups as if they were reads.
+                                 */
+                                if (ret & VM_FAULT_WRITE)
+                                        write_access = 0;
+                                
+                                switch (ret & ~VM_FAULT_WRITE) {
                                case VM_FAULT_MINOR:
                                        tsk->min_flt++;
                                        break;
@@ -972,14 +988,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                default:
                                        BUG();
                                }
-                                /*
-                                 * Now that we have performed a write fault
-                                 * and surely no longer have a shared page we
-                                 * shouldn't write, we shouldn't ignore an
-                                 * unwritable page in the page table if
-                                 * we are forcing write access.
-                                 */
-                                lookup_write = write && !force;
                                spin_lock(&mm->page_table_lock);
                        }
                        if (pages) {
@@ -1229,6 +1237,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
        struct page *old_page, *new_page;
        unsigned long pfn = pte_pfn(pte);
        pte_t entry;
+        int ret;
        if (unlikely(!pfn_valid(pfn))) {
                /*
@@ -1256,7 +1265,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
                        lazy_mmu_prot_update(entry);
                        pte_unmap(page_table);
                        spin_unlock(&mm->page_table_lock);
-                        return VM_FAULT_MINOR;
+                        return VM_FAULT_MINOR|VM_FAULT_WRITE;
                }
        }
        pte_unmap(page_table);
@@ -1283,6 +1292,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
        /*
         * Re-check the pte - we dropped the lock
         */
+        ret = VM_FAULT_MINOR;
        spin_lock(&mm->page_table_lock);
        page_table = pte_offset_map(pmd, address);
        if (likely(pte_same(*page_table, pte))) {
@@ -1299,12 +1309,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
                /* Free the old page.. */
                new_page = old_page;
+                ret |= VM_FAULT_WRITE;
        }
        pte_unmap(page_table);
        page_cache_release(new_page);
        page_cache_release(old_page);
        spin_unlock(&mm->page_table_lock);
-        return VM_FAULT_MINOR;
+        return ret;
 no_new_page:
        page_cache_release(old_page);
@@ -1996,7 +2007,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
        if (write_access) {
                if (!pte_write(entry))
                        return do_wp_page(mm, vma, address, pte, pmd, entry);
                entry = pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
@@ -2011,7 +2021,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 /*
 * By the time we get here, we already hold the mm semaphore
 */
-int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
                unsigned long address, int write_access)
 {
        pgd_t *pgd;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1694845526be..b4eababc8198 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -443,7 +443,7 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
        struct mempolicy *new;
        DECLARE_BITMAP(nodes, MAX_NUMNODES);
-        if (mode > MPOL_MAX)
+        if (mode < 0 || mode > MPOL_MAX)
                return -EINVAL;
        err = get_nodes(nodes, nmask, maxnode, mode);
        if (err)
diff --git a/mm/mmap.c b/mm/mmap.c
index da3fa90a0aae..404319477e71 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -143,7 +143,11 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
           leave 3% of the size of this process for other processes */
        allowed -= current->mm->total_vm / 32;
-        if (atomic_read(&vm_committed_space) < allowed)
+        /*
+         * cast `allowed' as a signed long because vm_committed_space
+         * sometimes has a negative value
+         */
+        if (atomic_read(&vm_committed_space) < (long)allowed)
                return 0;
        vm_unacct_memory(pages);
diff --git a/mm/mremap.c b/mm/mremap.c
index ec7238a78f36..fc45dc9a617b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -229,6 +229,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
         * since do_munmap() will decrement it by old_len == new_len
         */
        mm->total_vm += new_len >> PAGE_SHIFT;
+        __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
        if (do_munmap(mm, old_addr, old_len) < 0) {
                /* OOM: unable to split vma, just get accounts right */
@@ -243,7 +244,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                        vma->vm_next->vm_flags |= VM_ACCOUNT;
        }
-        __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
        if (vm_flags & VM_LOCKED) {
                mm->locked_vm += new_len >> PAGE_SHIFT;
                if (new_len > old_len)
diff --git a/mm/nommu.c b/mm/nommu.c
index ce74452c02d9..fd4e8df0f02d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1167,7 +1167,11 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
           leave 3% of the size of this process for other processes */
        allowed -= current->mm->total_vm / 32;
-        if (atomic_read(&vm_committed_space) < allowed)
+        /*
+         * cast `allowed' as a signed long because vm_committed_space
+         * sometimes has a negative value
+         */
+        if (atomic_read(&vm_committed_space) < (long)allowed)
                return 0;
        vm_unacct_memory(pages);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 42bccfb8464d..8d088371196a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1061,20 +1061,19 @@ unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
 static unsigned int nr_free_zone_pages(int offset)
 {
-        pg_data_t *pgdat;
+        /* Just pick one node, since fallback list is circular */
+        pg_data_t *pgdat = NODE_DATA(numa_node_id());
        unsigned int sum = 0;
-        for_each_pgdat(pgdat) {
+        struct zonelist *zonelist = pgdat->node_zonelists + offset;
-                struct zonelist *zonelist = pgdat->node_zonelists + offset;
+        struct zone **zonep = zonelist->zones;
-                struct zone **zonep = zonelist->zones;
+        struct zone *zone;
-                struct zone *zone;
-                for (zone = *zonep++; zone; zone = *zonep++) {
+        for (zone = *zonep++; zone; zone = *zonep++) {
-                        unsigned long size = zone->present_pages;
+                unsigned long size = zone->present_pages;
-                        unsigned long high = zone->pages_high;
+                unsigned long high = zone->pages_high;
-                        if (size > high)
+                if (size > high)
-                                sum += size - high;
+                        sum += size - high;
-                }
        }
        return sum;