Merge commit 'v2.6.38' into x86/mm

Conflicts:
	arch/x86/mm/numa_64.c

Merge reason: Resolve the conflict, update the branch to .38.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Ingo Molnar <mingo@elte.hu> 2011-03-15 03:29:44 -0400
committer: Ingo Molnar <mingo@elte.hu> 2011-03-15 03:29:44 -0400
commit: 8460b3e5bc64955aeefdd8357b3bf7b5ff79b3f2 (patch)
tree: 7e5f6d050b72ab08a4497e82a4a103fefb086e80 /mm
parent: 56396e6823fe9b42fe9cf9403d6ed67756255f70 (diff)
parent: 521cb40b0c44418a4fd36dc633f575813d59a43d (diff)
10 files changed, 116 insertions, 76 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e62ddb8f24b6..113e35c47502 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -650,10 +650,10 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag)
 static inline struct page *alloc_hugepage_vma(int defrag,
                                              struct vm_area_struct *vma,
-                                              unsigned long haddr)
+                                              unsigned long haddr, int nd)
 {
        return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
-                               HPAGE_PMD_ORDER, vma, haddr);
+                               HPAGE_PMD_ORDER, vma, haddr, nd);
 }
 #ifndef CONFIG_NUMA
@@ -678,7 +678,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                if (unlikely(khugepaged_enter(vma)))
                        return VM_FAULT_OOM;
                page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-                                          vma, haddr);
+                                          vma, haddr, numa_node_id());
                if (unlikely(!page))
                        goto out;
                if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
@@ -799,8 +799,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
        }
        for (i = 0; i < HPAGE_PMD_NR; i++) {
-                pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
+                pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE,
-                                          vma, address);
+                                               vma, address, page_to_nid(page));
                if (unlikely(!pages[i] ||
                             mem_cgroup_newpage_charge(pages[i], mm,
                                                       GFP_KERNEL))) {
@@ -902,7 +902,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        if (transparent_hugepage_enabled(vma) &&
            !transparent_hugepage_debug_cow())
                new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-                                              vma, haddr);
+                                              vma, haddr, numa_node_id());
        else
                new_page = NULL;
@@ -1745,7 +1745,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 static void collapse_huge_page(struct mm_struct *mm,
                               unsigned long address,
                               struct page **hpage,
-                               struct vm_area_struct *vma)
+                               struct vm_area_struct *vma,
+                               int node)
 {
        pgd_t *pgd;
        pud_t *pud;
@@ -1761,6 +1762,10 @@ static void collapse_huge_page(struct mm_struct *mm,
 #ifndef CONFIG_NUMA
        VM_BUG_ON(!*hpage);
        new_page = *hpage;
+        if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+                up_read(&mm->mmap_sem);
+                return;
+        }
 #else
        VM_BUG_ON(*hpage);
        /*
@@ -1773,18 +1778,19 @@ static void collapse_huge_page(struct mm_struct *mm,
         * mmap_sem in read mode is good idea also to allow greater
         * scalability.
         */
-        new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+        new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+                                      node);
        if (unlikely(!new_page)) {
                up_read(&mm->mmap_sem);
                *hpage = ERR_PTR(-ENOMEM);
                return;
        }
-#endif
        if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
                up_read(&mm->mmap_sem);
                put_page(new_page);
                return;
        }
+#endif
        /* after allocating the hugepage upgrade to mmap_sem write mode */
        up_read(&mm->mmap_sem);
@@ -1811,6 +1817,8 @@ static void collapse_huge_page(struct mm_struct *mm,
        /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
        if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
                goto out;
+        if (is_vma_temporary_stack(vma))
+                goto out;
        VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
        pgd = pgd_offset(mm, address);
@@ -1917,6 +1925,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
        struct page *page;
        unsigned long _address;
        spinlock_t *ptl;
+        int node = -1;
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
@@ -1947,6 +1956,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                page = vm_normal_page(vma, _address, pteval);
                if (unlikely(!page))
                        goto out_unmap;
+                /*
+                 * Chose the node of the first page. This could
+                 * be more sophisticated and look at more pages,
+                 * but isn't for now.
+                 */
+                if (node == -1)
+                        node = page_to_nid(page);
                VM_BUG_ON(PageCompound(page));
                if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
                        goto out_unmap;
@@ -1963,7 +1979,7 @@ out_unmap:
        pte_unmap_unlock(pte, ptl);
        if (ret)
                /* collapse_huge_page will return with the mmap_sem released */
-                collapse_huge_page(mm, address, hpage, vma);
+                collapse_huge_page(mm, address, hpage, vma, node);
 out:
        return ret;
 }
@@ -2032,32 +2048,27 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
                if ((!(vma->vm_flags & VM_HUGEPAGE) &&
                     !khugepaged_always()) ||
                    (vma->vm_flags & VM_NOHUGEPAGE)) {
+                skip:
                        progress++;
                        continue;
                }
                /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
-                if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
+                if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
-                        khugepaged_scan.address = vma->vm_end;
+                        goto skip;
-                        progress++;
+                if (is_vma_temporary_stack(vma))
-                        continue;
+                        goto skip;
-                }
                VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
                hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
                hend = vma->vm_end & HPAGE_PMD_MASK;
-                if (hstart >= hend) {
+                if (hstart >= hend)
-                        progress++;
+                        goto skip;
-                        continue;
+                if (khugepaged_scan.address > hend)
-                }
+                        goto skip;
                if (khugepaged_scan.address < hstart)
                        khugepaged_scan.address = hstart;
-                if (khugepaged_scan.address > hend) {
+                VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
-                        khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
-                        progress++;
-                        continue;
-                }
-                BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
                while (khugepaged_scan.address < hend) {
                        int ret;
@@ -2086,7 +2097,7 @@ breakouterloop:
 breakouterloop_mmap_sem:
        spin_lock(&khugepaged_mm_lock);
-        BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+        VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
        /*
         * Release the current mm_slot if this mm is about to die, or
         * if we scanned all vmas of this mm.
@@ -2241,9 +2252,9 @@ static int khugepaged(void *none)
        for (;;) {
                mutex_unlock(&khugepaged_mutex);
-                BUG_ON(khugepaged_thread != current);
+                VM_BUG_ON(khugepaged_thread != current);
                khugepaged_loop();
-                BUG_ON(khugepaged_thread != current);
+                VM_BUG_ON(khugepaged_thread != current);
                mutex_lock(&khugepaged_mutex);
                if (!khugepaged_enabled())
diff --git a/mm/memory.c b/mm/memory.c
index 8e8c18324863..5823698c2b71 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2648,6 +2648,7 @@ void unmap_mapping_range(struct address_space *mapping,
                details.last_index = ULONG_MAX;
        details.i_mmap_lock = &mapping->i_mmap_lock;
+        mutex_lock(&mapping->unmap_mutex);
        spin_lock(&mapping->i_mmap_lock);
        /* Protect against endless unmapping loops */
@@ -2664,6 +2665,7 @@ void unmap_mapping_range(struct address_space *mapping,
        if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
                unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
        spin_unlock(&mapping->i_mmap_lock);
+        mutex_unlock(&mapping->unmap_mutex);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 368fc9d23610..b53ec99f1428 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 }
 /* Return a zonelist indicated by gfp for node representing a mempolicy */
-static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
+static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
+        int nd)
 {
-        int nd = numa_node_id();
        switch (policy->mode) {
        case MPOL_PREFERRED:
                if (!(policy->flags & MPOL_F_LOCAL))
@@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
                zl = node_zonelist(interleave_nid(*mpol, vma, addr,
                                huge_page_shift(hstate_vma(vma))), gfp_flags);
        } else {
-                zl = policy_zonelist(gfp_flags, *mpol);
+                zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
                if ((*mpol)->mode == MPOL_BIND)
                        *nodemask = &(*mpol)->v.nodes;
        }
@@ -1820,7 +1819,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-                unsigned long addr)
+                unsigned long addr, int node)
 {
        struct mempolicy *pol = get_vma_policy(current, vma, addr);
        struct zonelist *zl;
@@ -1830,13 +1829,13 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
        if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
                unsigned nid;
-                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
+                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
                mpol_cond_put(pol);
                page = alloc_page_interleave(gfp, order, nid);
                put_mems_allowed();
                return page;
        }
-        zl = policy_zonelist(gfp, pol);
+        zl = policy_zonelist(gfp, pol, node);
        if (unlikely(mpol_needs_cond_ref(pol))) {
                /*
                 * slow path: ref counted shared policy
@@ -1892,7 +1891,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
                page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
        else
                page = __alloc_pages_nodemask(gfp, order,
-                        policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
+                                policy_zonelist(gfp, pol, numa_node_id()),
+                                policy_nodemask(gfp, pol));
        put_mems_allowed();
        return page;
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 766115253807..352de555626c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1287,14 +1287,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
                return -EPERM;
        /* Find the mm_struct */
-        read_lock(&tasklist_lock);
+        rcu_read_lock();
        task = pid ? find_task_by_vpid(pid) : current;
        if (!task) {
-                read_unlock(&tasklist_lock);
+                rcu_read_unlock();
                return -ESRCH;
        }
        mm = get_task_mm(task);
-        read_unlock(&tasklist_lock);
+        rcu_read_unlock();
        if (!mm)
                return -EINVAL;
diff --git a/mm/mremap.c b/mm/mremap.c
index 9925b6391b80..1de98d492ddc 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -94,9 +94,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                 */
                mapping = vma->vm_file->f_mapping;
                spin_lock(&mapping->i_mmap_lock);
-                if (new_vma->vm_truncate_count &&
+                new_vma->vm_truncate_count = 0;
-                    new_vma->vm_truncate_count != vma->vm_truncate_count)
-                        new_vma->vm_truncate_count = 0;
        }
        /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index da0fe32059b3..bd7625676a64 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5371,10 +5371,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
        for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
                unsigned long check = pfn + iter;
-                if (!pfn_valid_within(check)) {
+                if (!pfn_valid_within(check))
-                        iter++;
                        continue;
-                }
                page = pfn_to_page(check);
                if (!page_count(page)) {
                        if (PageBuddy(page))
diff --git a/mm/rmap.c b/mm/rmap.c
index f21f4a1d6a1c..941bf82e8961 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -497,41 +497,51 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
        struct mm_struct *mm = vma->vm_mm;
        int referenced = 0;
-        /*
-         * Don't want to elevate referenced for mlocked page that gets this far,
-         * in order that it progresses to try_to_unmap and is moved to the
-         * unevictable list.
-         */
-        if (vma->vm_flags & VM_LOCKED) {
-                *mapcount = 0;  /* break early from loop */
-                *vm_flags |= VM_LOCKED;
-                goto out;
-        }
-        /* Pretend the page is referenced if the task has the
-           swap token and is in the middle of a page fault. */
-        if (mm != current->mm && has_swap_token(mm) &&
-                        rwsem_is_locked(&mm->mmap_sem))
-                referenced++;
        if (unlikely(PageTransHuge(page))) {
                pmd_t *pmd;
                spin_lock(&mm->page_table_lock);
+                /*
+                 * rmap might return false positives; we must filter
+                 * these out using page_check_address_pmd().
+                 */
                pmd = page_check_address_pmd(page, mm, address,
                                             PAGE_CHECK_ADDRESS_PMD_FLAG);
-                if (pmd && !pmd_trans_splitting(*pmd) &&
+                if (!pmd) {
-                    pmdp_clear_flush_young_notify(vma, address, pmd))
+                        spin_unlock(&mm->page_table_lock);
+                        goto out;
+                }
+                if (vma->vm_flags & VM_LOCKED) {
+                        spin_unlock(&mm->page_table_lock);
+                        *mapcount = 0;  /* break early from loop */
+                        *vm_flags |= VM_LOCKED;
+                        goto out;
+                }
+                /* go ahead even if the pmd is pmd_trans_splitting() */
+                if (pmdp_clear_flush_young_notify(vma, address, pmd))
                        referenced++;
                spin_unlock(&mm->page_table_lock);
        } else {
                pte_t *pte;
                spinlock_t *ptl;
+                /*
+                 * rmap might return false positives; we must filter
+                 * these out using page_check_address().
+                 */
                pte = page_check_address(page, mm, address, &ptl, 0);
                if (!pte)
                        goto out;
+                if (vma->vm_flags & VM_LOCKED) {
+                        pte_unmap_unlock(pte, ptl);
+                        *mapcount = 0;  /* break early from loop */
+                        *vm_flags |= VM_LOCKED;
+                        goto out;
+                }
                if (ptep_clear_flush_young_notify(vma, address, pte)) {
                        /*
                         * Don't treat a reference through a sequentially read
@@ -546,6 +556,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
                pte_unmap_unlock(pte, ptl);
        }
+        /* Pretend the page is referenced if the task has the
+           swap token and is in the middle of a page fault. */
+        if (mm != current->mm && has_swap_token(mm) &&
+                        rwsem_is_locked(&mm->mmap_sem))
+                referenced++;
        (*mapcount)--;
        if (referenced)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 07a458d72fa8..0341c5700e34 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1940,7 +1940,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        error = -EINVAL;
        if (S_ISBLK(inode->i_mode)) {
-                bdev = I_BDEV(inode);
+                bdev = bdgrab(I_BDEV(inode));
                error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
                                   sys_swapon);
                if (error < 0) {
diff --git a/mm/truncate.c b/mm/truncate.c
index 49feb46e77b8..d64296be00d3 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -225,6 +225,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
        next = start;
        while (next <= end &&
               pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+                mem_cgroup_uncharge_start();
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];
                        pgoff_t page_index = page->index;
@@ -247,6 +248,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
                        unlock_page(page);
                }
                pagevec_release(&pvec);
+                mem_cgroup_uncharge_end();
                cond_resched();
        }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 17497d0cd8b9..6771ea70bfe7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1841,16 +1841,28 @@ static inline bool should_continue_reclaim(struct zone *zone,
        if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
                return false;
-        /*
+        /* Consider stopping depending on scan and reclaim activity */
-         * If we failed to reclaim and have scanned the full list, stop.
+        if (sc->gfp_mask & __GFP_REPEAT) {
-         * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
+                /*
-         *       faster but obviously would be less likely to succeed
+                 * For __GFP_REPEAT allocations, stop reclaiming if the
-         *       allocation. If this is desirable, use GFP_REPEAT to decide
+                 * full LRU list has been scanned and we are still failing
-         *       if both reclaimed and scanned should be checked or just
+                 * to reclaim pages. This full LRU scan is potentially
-         *       reclaimed
+                 * expensive but a __GFP_REPEAT caller really wants to succeed
-         */
+                 */
-        if (!nr_reclaimed && !nr_scanned)
+                if (!nr_reclaimed && !nr_scanned)
-                return false;
+                        return false;
+        } else {
+                /*
+                 * For non-__GFP_REPEAT allocations which can presumably
+                 * fail without consequence, stop if we failed to reclaim
+                 * any pages from the last SWAP_CLUSTER_MAX number of
+                 * pages that were scanned. This will return to the
+                 * caller faster at the risk reclaim/compaction and
+                 * the resulting allocation attempt fails
+                 */
+                if (!nr_reclaimed)
+                        return false;
+        }
        /*
         * If we have not reclaimed enough pages for compaction and the
author	Ingo Molnar <mingo@elte.hu>	2011-03-15 03:29:44 -0400
committer	Ingo Molnar <mingo@elte.hu>	2011-03-15 03:29:44 -0400
commit	8460b3e5bc64955aeefdd8357b3bf7b5ff79b3f2 (patch)
tree	7e5f6d050b72ab08a4497e82a4a103fefb086e80 /mm
parent	56396e6823fe9b42fe9cf9403d6ed67756255f70 (diff)
parent	521cb40b0c44418a4fd36dc633f575813d59a43d (diff)