Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c  360
1 files changed, 269 insertions, 91 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 02e48aa0ed1..8e8c1832486 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	}
 }
 
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+		pmd_t *pmd, unsigned long address)
 {
 	pgtable_t new = pte_alloc_one(mm, address);
+	int wait_split_huge_page;
 	if (!new)
 		return -ENOMEM;
 
@@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
 
 	spin_lock(&mm->page_table_lock);
-	if (!pmd_present(*pmd)) {	/* Has another populated it ? */
+	wait_split_huge_page = 0;
+	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
 		mm->nr_ptes++;
 		pmd_populate(mm, pmd, new);
 		new = NULL;
-	}
+	} else if (unlikely(pmd_trans_splitting(*pmd)))
+		wait_split_huge_page = 1;
 	spin_unlock(&mm->page_table_lock);
 	if (new)
 		pte_free(mm, new);
+	if (wait_split_huge_page)
+		wait_split_huge_page(vma->anon_vma, pmd);
 	return 0;
 }
 
@@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
 	smp_wmb(); /* See comment in __pte_alloc */
 
 	spin_lock(&init_mm.page_table_lock);
-	if (!pmd_present(*pmd)) {	/* Has another populated it ? */
+	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
 		pmd_populate_kernel(&init_mm, pmd, new);
 		new = NULL;
-	}
+	} else
+		VM_BUG_ON(pmd_trans_splitting(*pmd));
 	spin_unlock(&init_mm.page_table_lock);
 	if (new)
 		pte_free_kernel(&init_mm, new);
@@ -719,9 +726,9 @@ out_set_pte:
 	return 0;
 }
 
-static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
 		unsigned long addr, unsigned long end)
 {
 	pte_t *orig_src_pte, *orig_dst_pte;
 	pte_t *src_pte, *dst_pte;
@@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
 	src_pmd = pmd_offset(src_pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
+		if (pmd_trans_huge(*src_pmd)) {
+			int err;
+			VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
+			err = copy_huge_pmd(dst_mm, src_mm,
+					    dst_pmd, src_pmd, addr, vma);
+			if (err == -ENOMEM)
+				return -ENOMEM;
+			if (!err)
+				continue;
+			/* fall through */
+		}
 		if (pmd_none_or_clear_bad(src_pmd))
 			continue;
 		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
@@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
+		if (pmd_trans_huge(*pmd)) {
+			if (next-addr != HPAGE_PMD_SIZE) {
+				VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
+				split_huge_page_pmd(vma->vm_mm, pmd);
+			} else if (zap_huge_pmd(tlb, vma, pmd)) {
+				(*zap_work)--;
+				continue;
+			}
+			/* fall through */
+		}
 		if (pmd_none_or_clear_bad(pmd)) {
 			(*zap_work)--;
 			continue;
@@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 	pud = pud_offset(pgd, address);
 	if (pud_none(*pud))
 		goto no_page_table;
-	if (pud_huge(*pud)) {
+	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
 		BUG_ON(flags & FOLL_GET);
 		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
 		goto out;
@@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd))
 		goto no_page_table;
-	if (pmd_huge(*pmd)) {
+	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
 		BUG_ON(flags & FOLL_GET);
 		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
 		goto out;
 	}
+	if (pmd_trans_huge(*pmd)) {
+		if (flags & FOLL_SPLIT) {
+			split_huge_page_pmd(mm, pmd);
+			goto split_fallthrough;
+		}
+		spin_lock(&mm->page_table_lock);
+		if (likely(pmd_trans_huge(*pmd))) {
+			if (unlikely(pmd_trans_splitting(*pmd))) {
+				spin_unlock(&mm->page_table_lock);
+				wait_split_huge_page(vma->anon_vma, pmd);
+			} else {
+				page = follow_trans_huge_pmd(mm, address,
+							     pmd, flags);
+				spin_unlock(&mm->page_table_lock);
+				goto out;
+			}
+		} else
+			spin_unlock(&mm->page_table_lock);
+		/* fall through */
+	}
+split_fallthrough:
 	if (unlikely(pmd_bad(*pmd)))
 		goto no_page_table;
 
@@ -1310,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		 */
 		mark_page_accessed(page);
 	}
+	if (flags & FOLL_MLOCK) {
+		/*
+		 * The preliminary mapping check is mainly to avoid the
+		 * pointless overhead of lock_page on the ZERO_PAGE
+		 * which might bounce very badly if there is contention.
+		 *
+		 * If the page is already locked, we don't need to
+		 * handle it now - vmscan will handle it later if and
+		 * when it attempts to reclaim the page.
+		 */
+		if (page->mapping && trylock_page(page)) {
+			lru_add_drain();	 /* push cached pages to LRU */
+			/*
+			 * Because we lock page here and migration is
+			 * blocked by the pte's page reference, we need
+			 * only check for file-cache page truncation.
+			 */
+			if (page->mapping)
+				mlock_vma_page(page);
+			unlock_page(page);
+		}
+	}
unlock:
 	pte_unmap_unlock(ptep, ptl);
out:
@@ -1341,7 +1412,8 @@ no_page_table:
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		     unsigned long start, int nr_pages, unsigned int gup_flags,
-		     struct page **pages, struct vm_area_struct **vmas)
+		     struct page **pages, struct vm_area_struct **vmas,
+		     int *nonblocking)
 {
 	int i;
 	unsigned long vm_flags;
@@ -1386,6 +1458,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			pmd = pmd_offset(pud, pg);
 			if (pmd_none(*pmd))
 				return i ? : -EFAULT;
+			VM_BUG_ON(pmd_trans_huge(*pmd));
 			pte = pte_offset_map(pmd, pg);
 			if (pte_none(*pte)) {
 				pte_unmap(pte);
@@ -1441,10 +1514,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		cond_resched();
 		while (!(page = follow_page(vma, start, foll_flags))) {
 			int ret;
+			unsigned int fault_flags = 0;
+
+			if (foll_flags & FOLL_WRITE)
+				fault_flags |= FAULT_FLAG_WRITE;
+			if (nonblocking)
+				fault_flags |= FAULT_FLAG_ALLOW_RETRY;
 
 			ret = handle_mm_fault(mm, vma, start,
-				(foll_flags & FOLL_WRITE) ?
-				FAULT_FLAG_WRITE : 0);
+				fault_flags);
 
 			if (ret & VM_FAULT_ERROR) {
 				if (ret & VM_FAULT_OOM)
@@ -1460,6 +1538,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			else
 				tsk->min_flt++;
 
+			if (ret & VM_FAULT_RETRY) {
+				*nonblocking = 0;
+				return i;
+			}
+
 			/*
 			 * The VM_FAULT_WRITE bit tells us that
 			 * do_wp_page has broken COW when necessary,
@@ -1559,7 +1642,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	if (force)
 		flags |= FOLL_FORCE;
 
-	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
+	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+				NULL);
 }
 EXPORT_SYMBOL(get_user_pages);
 
@@ -1584,7 +1668,8 @@ struct page *get_dump_page(unsigned long addr)
 	struct page *page;
 
 	if (__get_user_pages(current, current->mm, addr, 1,
-			FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
+			FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
+			NULL) < 1)
 		return NULL;
 	flush_cache_page(vma, addr, page_to_pfn(page));
 	return page;
@@ -1598,8 +1683,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
 	pud_t * pud = pud_alloc(mm, pgd, addr);
 	if (pud) {
 		pmd_t * pmd = pmd_alloc(mm, pud, addr);
-		if (pmd)
+		if (pmd) {
+			VM_BUG_ON(pmd_trans_huge(*pmd));
 			return pte_alloc_map_lock(mm, pmd, addr, ptl);
+		}
 	}
 	return NULL;
 }
@@ -1818,6 +1905,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
 		return -ENOMEM;
+	VM_BUG_ON(pmd_trans_huge(*pmd));
 	do {
 		next = pmd_addr_end(addr, end);
 		if (remap_pte_range(mm, pmd, addr, next,
@@ -2048,19 +2136,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
 	return same;
 }
 
-/*
- * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
- * servicing faults for write access.  In the normal case, do always want
- * pte_mkwrite.  But get_user_pages can cause write faults for mappings
- * that do not have writing enabled, when used by access_process_vm.
- */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
-{
-	if (likely(vma->vm_flags & VM_WRITE))
-		pte = pte_mkwrite(pte);
-	return pte;
-}
-
 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
 {
 	/*
@@ -2112,7 +2187,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *old_page, *new_page;
 	pte_t entry;
-	int reuse = 0, ret = 0;
+	int ret = 0;
 	int page_mkwrite = 0;
 	struct page *dirty_page = NULL;
 
@@ -2144,19 +2219,20 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 							 &ptl);
 			if (!pte_same(*page_table, orig_pte)) {
 				unlock_page(old_page);
-				page_cache_release(old_page);
 				goto unlock;
 			}
 			page_cache_release(old_page);
 		}
-		reuse = reuse_swap_page(old_page);
-		if (reuse)
+		if (reuse_swap_page(old_page)) {
 			/*
 			 * The page is all ours.  Move it to our anon_vma so
 			 * the rmap code will not search our parent or siblings.
 			 * Protected against the rmap code by the page lock.
 			 */
 			page_move_anon_rmap(old_page, vma, address);
+			unlock_page(old_page);
+			goto reuse;
+		}
 		unlock_page(old_page);
 	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
 					(VM_WRITE|VM_SHARED))) {
@@ -2212,7 +2288,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 							  &ptl);
 		if (!pte_same(*page_table, orig_pte)) {
 			unlock_page(old_page);
-			page_cache_release(old_page);
 			goto unlock;
 		}
 
@@ -2220,18 +2295,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 		dirty_page = old_page;
 		get_page(dirty_page);
-		reuse = 1;
-	}
 
-	if (reuse) {
 reuse:
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = pte_mkyoung(orig_pte);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		if (ptep_set_access_flags(vma, address, page_table, entry,1))
 			update_mmu_cache(vma, address, page_table);
+		pte_unmap_unlock(page_table, ptl);
 		ret |= VM_FAULT_WRITE;
-		goto unlock;
+
+		if (!dirty_page)
+			return ret;
+
+		/*
+		 * Yes, Virginia, this is actually required to prevent a race
+		 * with clear_page_dirty_for_io() from clearing the page dirty
+		 * bit after it clear all dirty ptes, but before a racing
+		 * do_wp_page installs a dirty pte.
+		 *
+		 * do_no_page is protected similarly.
+		 */
+		if (!page_mkwrite) {
+			wait_on_page_locked(dirty_page);
+			set_page_dirty_balance(dirty_page, page_mkwrite);
+		}
+		put_page(dirty_page);
+		if (page_mkwrite) {
+			struct address_space *mapping = dirty_page->mapping;
+
+			set_page_dirty(dirty_page);
+			unlock_page(dirty_page);
+			page_cache_release(dirty_page);
+			if (mapping) {
+				/*
+				 * Some device drivers do not set page.mapping
+				 * but still dirty their pages
+				 */
+				balance_dirty_pages_ratelimited(mapping);
+			}
+		}
+
+		/* file_update_time outside page_lock */
+		if (vma->vm_file)
+			file_update_time(vma->vm_file);
+
+		return ret;
 	}
 
 	/*
@@ -2256,16 +2365,6 @@ gotten:
 	}
 	__SetPageUptodate(new_page);
 
-	/*
-	 * Don't let another task, with possibly unlocked vma,
-	 * keep the mlocked page.
-	 */
-	if ((vma->vm_flags & VM_LOCKED) && old_page) {
-		lock_page(old_page);	/* for LRU manipulation */
-		clear_page_mlock(old_page);
-		unlock_page(old_page);
-	}
-
 	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
 		goto oom_free_new;
 
@@ -2333,42 +2432,19 @@ gotten:
 
 	if (new_page)
 		page_cache_release(new_page);
-	if (old_page)
-		page_cache_release(old_page);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
-	if (dirty_page) {
+	if (old_page) {
 		/*
-		 * Yes, Virginia, this is actually required to prevent a race
-		 * with clear_page_dirty_for_io() from clearing the page dirty
-		 * bit after it clear all dirty ptes, but before a racing
-		 * do_wp_page installs a dirty pte.
-		 *
-		 * do_no_page is protected similarly.
+		 * Don't let another task, with possibly unlocked vma,
+		 * keep the mlocked page.
 		 */
-		if (!page_mkwrite) {
-			wait_on_page_locked(dirty_page);
-			set_page_dirty_balance(dirty_page, page_mkwrite);
-		}
-		put_page(dirty_page);
-		if (page_mkwrite) {
-			struct address_space *mapping = dirty_page->mapping;
-
-			set_page_dirty(dirty_page);
-			unlock_page(dirty_page);
-			page_cache_release(dirty_page);
-			if (mapping) {
-				/*
-				 * Some device drivers do not set page.mapping
-				 * but still dirty their pages
-				 */
-				balance_dirty_pages_ratelimited(mapping);
-			}
+		if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+			lock_page(old_page);	/* LRU manipulation */
+			munlock_vma_page(old_page);
+			unlock_page(old_page);
 		}
-
-		/* file_update_time outside page_lock */
-		if (vma->vm_file)
-			file_update_time(vma->vm_file);
+		page_cache_release(old_page);
 	}
 	return ret;
 oom_free_new:
@@ -2975,12 +3051,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			goto out;
 		}
 		charged = 1;
-		/*
-		 * Don't let another task, with possibly unlocked vma,
-		 * keep the mlocked page.
-		 */
-		if (vma->vm_flags & VM_LOCKED)
-			clear_page_mlock(vmf.page);
 		copy_user_highpage(page, vmf.page, address, vma);
 		__SetPageUptodate(page);
 	} else {
@@ -3147,9 +3217,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static inline int handle_pte_fault(struct mm_struct *mm,
+int handle_pte_fault(struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long address,
 		pte_t *pte, pmd_t *pmd, unsigned int flags)
 {
 	pte_t entry;
 	spinlock_t *ptl;
@@ -3228,9 +3298,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pmd = pmd_alloc(mm, pud, address);
 	if (!pmd)
 		return VM_FAULT_OOM;
-	pte = pte_alloc_map(mm, pmd, address);
-	if (!pte)
+	if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
+		if (!vma->vm_ops)
+			return do_huge_pmd_anonymous_page(mm, vma, address,
+							  pmd, flags);
+	} else {
+		pmd_t orig_pmd = *pmd;
+		barrier();
+		if (pmd_trans_huge(orig_pmd)) {
+			if (flags & FAULT_FLAG_WRITE &&
+			    !pmd_write(orig_pmd) &&
+			    !pmd_trans_splitting(orig_pmd))
+				return do_huge_pmd_wp_page(mm, vma, address,
+							   pmd, orig_pmd);
+			return 0;
+		}
+	}
+
+	/*
+	 * Use __pte_alloc instead of pte_alloc_map, because we can't
+	 * run pte_offset_map on the pmd, if an huge pmd could
+	 * materialize from under us from a different thread.
+	 */
+	if (unlikely(__pte_alloc(mm, vma, pmd, address)))
 		return VM_FAULT_OOM;
+	/* if an huge pmd materialized from under us just retry later */
+	if (unlikely(pmd_trans_huge(*pmd)))
+		return 0;
+	/*
+	 * A regular pmd is established and it can't morph into a huge pmd
+	 * from under us anymore at this point because we hold the mmap_sem
+	 * read mode and khugepaged takes it in write mode. So now it's
+	 * safe to run pte_offset_map().
+	 */
+	pte = pte_offset_map(pmd, address);
 
 	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
 }
@@ -3296,7 +3397,12 @@ int make_pages_present(unsigned long addr, unsigned long end)
 	vma = find_vma(current->mm, addr);
 	if (!vma)
 		return -ENOMEM;
-	write = (vma->vm_flags & VM_WRITE) != 0;
+	/*
+	 * We want to touch writable mappings with a write fault in order
+	 * to break COW, except for shared mappings because these don't COW
+	 * and we would not want to dirty them for nothing.
+	 */
+	write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
 	BUG_ON(addr >= end);
 	BUG_ON(end > vma->vm_end);
 	len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
@@ -3368,6 +3474,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address,
 		goto out;
 
 	pmd = pmd_offset(pud, address);
+	VM_BUG_ON(pmd_trans_huge(*pmd));
 	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
 		goto out;
 
@@ -3608,3 +3715,74 @@ void might_fault(void)
 }
 EXPORT_SYMBOL(might_fault);
 #endif
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+static void clear_gigantic_page(struct page *page,
+				unsigned long addr,
+				unsigned int pages_per_huge_page)
+{
+	int i;
+	struct page *p = page;
+
+	might_sleep();
+	for (i = 0; i < pages_per_huge_page;
+	     i++, p = mem_map_next(p, page, i)) {
+		cond_resched();
+		clear_user_highpage(p, addr + i * PAGE_SIZE);
+	}
+}
+void clear_huge_page(struct page *page,
+		     unsigned long addr, unsigned int pages_per_huge_page)
+{
+	int i;
+
+	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+		clear_gigantic_page(page, addr, pages_per_huge_page);
+		return;
+	}
+
+	might_sleep();
+	for (i = 0; i < pages_per_huge_page; i++) {
+		cond_resched();
+		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+	}
+}
+
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
+				    unsigned long addr,
+				    struct vm_area_struct *vma,
+				    unsigned int pages_per_huge_page)
+{
+	int i;
+	struct page *dst_base = dst;
+	struct page *src_base = src;
+
+	for (i = 0; i < pages_per_huge_page; ) {
+		cond_resched();
+		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+
+		i++;
+		dst = mem_map_next(dst, dst_base, i);
+		src = mem_map_next(src, src_base, i);
+	}
+}
+
+void copy_user_huge_page(struct page *dst, struct page *src,
+			 unsigned long addr, struct vm_area_struct *vma,
+			 unsigned int pages_per_huge_page)
+{
+	int i;
+
+	if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+		copy_user_gigantic_page(dst, src, addr, vma,
+					pages_per_huge_page);
+		return;
+	}
+
+	might_sleep();
+	for (i = 0; i < pages_per_huge_page; i++) {
+		cond_resched();
+		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
+	}
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */